def get_menus(self, text, year, week_number):
    """Parse the plain-text weekly plan of the Mediziner Mensa.

    The text is expected to contain seven day sections, each introduced by a
    heading of the form ``<Wochentag>, dd.mm.yyyy``.  Within a day section the
    soup is read from the first 36 characters of every line and the main
    dishes from characters 40 to 100.

    Args:
        text: The menu text (e.g. extracted from the PDF).
        year: Year that is passed on to ``self.get_date``.
        week_number: Week number that is passed on to ``self.get_date``.

    Returns:
        A dict mapping the date returned by ``self.get_date`` to the ``Menu``
        of that day, or ``None`` if the text does not contain exactly seven
        day sections.
    """
    weekday_names = ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"]
    weekday_keys = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]
    menus = {}

    lines = text.replace("Extraessen", "").splitlines()
    # drop everything above the first line that mentions "Montag"
    first_day_index = next((index for index, line in enumerate(lines) if "Montag" in line), len(lines))
    lines = lines[first_day_index:]

    # get rid of Zusatzstoffe and Allergene: everything below the last ***-delimiter is irrelevant
    last_relevant_line = len(lines)
    for index, line in enumerate(lines):
        if "***" in line:
            last_relevant_line = index
    lines = lines[:last_relevant_line]

    # The dots of the date are escaped so that only "dd.mm.yyyy" is accepted as a day heading. The capture group
    # makes re.split() return the weekday names as well, which are filtered out together with empty chunks.
    day_heading_regex = r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag),\s\d{1,2}\.\d{1,2}\.\d{4}"
    ignored_chunks = [""] + weekday_names
    days_list = [
        chunk for chunk in re.split(day_heading_regex, "\n".join(lines).replace("*", "").strip())
        if chunk not in ignored_chunks
    ]
    if len(days_list) != 7:
        # as the Mediziner Mensa is part of hospital, it should serve food on each day
        return None
    days = dict(zip(weekday_keys, days_list))

    for key, day_text in days.items():
        day_lines = unicodedata.normalize("NFKC", day_text).splitlines(True)
        # fixed-width table: soup column first, main dishes column second
        soup_str = "".join(day_line[:36].strip() + "\n" for day_line in day_lines)
        mains_str = "".join(day_line[40:100].strip() + "\n" for day_line in day_lines)

        # re-join words that were hyphenated at a line break, then flatten to a single line
        soup_str = soup_str.replace("-\n", "").strip().replace("\n", " ")
        soup = self.parse_dish(soup_str)
        dishes = []
        if soup.name not in ["", "Feiertag"]:
            dishes.append(soup)

        # https://regex101.com/r/MDFu1Z/1
        # The separators are returned as chunks too (capture group); they end up as empty names and are skipped.
        for dish_str in re.split(r"(\n{2,}|(?<!mit)\n(?=[A-Z]))", mains_str):
            dish_str = dish_str.strip().replace("\n", " ")
            dish = self.parse_dish(dish_str)
            dish.name = dish.name.strip()
            if dish.name not in ["", "Feiertag"]:
                dishes.append(dish)

        date = self.get_date(year, week_number, self.weekday_positions[key])
        menu = Menu(date, dishes)
        # remove duplicates
        menu.remove_duplicates()
        menus[date] = menu

    return menus
def get_menus(self, text, year, week_number):
    """Parse the plain-text weekly plan of the IPP Bistro (Monday to Friday).

    The text is a table laid out with whitespace.  The column of each weekday
    is located through the soup line(s) respectively the "closed" keywords,
    then every following line is cut into five column slices which are
    searched for dishes with ``self.dish_regex``.

    Args:
        text: The menu text (e.g. extracted from the PDF).
        year: Year that is passed on to ``self.get_date``.
        week_number: Week number that is passed on to ``self.get_date``.

    Returns:
        A dict mapping the date returned by ``self.get_date`` to the ``Menu``
        of that day, or ``None`` (after issuing a warning) if the table
        header, the soup line or the five columns cannot be detected.
    """
    menus = {}
    lines = text.splitlines()

    # remove headline etc.
    header_index = None
    for index, line in enumerate(lines):
        # Find the line which is the header of the table and includes the day of week
        line_shrink = line.replace(" ", "").replace("\n", "").lower()
        # Note we do not include 'montag' und 'freitag' since they are also used in the line before the table
        # header to indicate the range of the week “Monday … until Friday _”
        if any(day in line_shrink for day in ('dienstag', 'mittwoch', 'donnerstag')):
            header_index = index
            break
    if header_index is None:
        # `lines` is empty for an empty text, so the first line must not be accessed unconditionally
        first_line = lines[0] if lines else ""
        warn(
            "NotImplemented: IPP parsing failed. Menu text is not a weekly menu. First line: '{}'"
            .format(first_line))
        return None

    lines = lines[header_index:]
    weekdays = lines[0]

    # The column detection is done through the string "Tagessuppe siehe Aushang" which is at the beginning of
    # every column. However, due to center alignment the column do not begin at the 'T' character and broader
    # text in the column might be left of this character, which then gets truncated. But the gap between the 'T'
    # and the '€' character of the previous column¹ — the real beginning of the current column — is always three,
    # which will be subtracted here. Monday is the second column, so the value should never become negative
    # although it is handled here.
    # ¹or 'e' of "Internationale Küche" if it is the monday column

    # find lines which match the regex
    # lines[1:] == exclude the weekday line which also can contain `Geschlossen`
    soup_lines_iter = (line for line in lines[1:] if self.split_days_regex.search(line))
    soup_line1 = next(soup_lines_iter, None)
    if soup_line1 is None:
        # without a soup line the columns cannot be located; fail like the other detection errors
        warn(
            "IPP PDF parsing of week {} in year {} failed. No soup line found."
            .format(week_number, year))
        return None
    soup_line2 = next(soup_lines_iter, '')

    def column_spans(pattern, line, left_pad, right_pad=0):
        # (start, end) of every match, widened to the left/right and clamped at column 0
        return [(max(match.start() - left_pad, 0), match.end() + right_pad)
                for match in re.finditer(pattern, line)]

    # Sometimes on closed days, the keywords are written instead of the week of day instead of the soup line
    positions1 = column_spans(self.split_days_regex_closed, weekdays, 3)
    positions2 = column_spans(self.split_days_regex_soup_one_line, soup_line1, 3)
    # In the second line there is just 'Aushang' (two lines "Tagessuppe siehe Aushang" or
    # closed days ("Geschlossen", "Feiertag")
    positions3 = column_spans(self.split_days_regex_soup_two_line, soup_line2, 14, 3)
    # closed days ("Geschlossen", "Feiertag", …) can be in first line and second line
    positions4 = (column_spans(self.split_days_regex_closed, soup_line1, 3)
                  + column_spans(self.split_days_regex_closed, soup_line2, 3))

    if positions3:  # Two lines "Tagessuppe siehe Aushang"
        soup_line_index = lines.index(soup_line2)
    else:
        soup_line_index = lines.index(soup_line1)

    positions = sorted(positions1 + positions2 + positions3 + positions4)
    if len(positions) != 5:
        warn(
            "IPP PDF parsing of week {} in year {} failed. Only {} of 5 columns detected."
            .format(week_number, year, len(positions)))
        return None

    weekday_keys = ["mon", "tue", "wed", "thu", "fri"]
    column_starts = [position[0] for position in positions]
    # every column ends where the next one starts; the friday column runs to the end of the line
    column_ends = column_starts[1:] + [None]

    # it must be lines[3:] instead of lines[2:] or else the menus would start with "Preis ab 0,90€" (from the
    # soups) instead of the first menu, if there is a day where the bistro is closed.
    menu_lines = lines[soup_line_index + 3:]
    lines_weekdays = {
        key: "".join(" " + line[start:end].replace("\n", " ") for line in menu_lines)
        for key, start, end in zip(weekday_keys, column_starts, column_ends)
    }

    for key in lines_weekdays:
        # Appends `?€` to „Überraschungsmenü“ if it do not have a price. The second '€' is a separator for the
        # later split
        day_text = self.surprise_without_price_regex.sub(r"\g<1>?€ € \g<2>", lines_weekdays[key])
        # get rid of two-character umlauts (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
        day_text = unicodedata.normalize("NFKC", day_text)
        # remove multi-whitespaces
        day_text = ' '.join(day_text.split())
        lines_weekdays[key] = day_text

        # get all dish including name and price
        dish_names_price = re.findall(self.dish_regex, day_text + ' ')

        # create dish types
        # since we have the same dish types every day we can use them if there are 4 dishes available
        if len(dish_names_price) == 4:
            dish_types = ["Veggie", "Traditionelle Küche", "Internationale Küche", "Specials"]
        else:
            dish_types = ["Tagesgericht"] * len(dish_names_price)

        # create ingredients
        # all dishes have the same ingredients
        ingredients = Ingredients("ipp-bistro")
        ingredients.parse_ingredients("Mi,Gl,Sf,Sl,Ei,Se,4")

        # create list of Dish objects
        dishes = [
            Dish(dish_name.strip(),
                 Prices(Price(price.replace(',', '.').strip())),
                 ingredients.ingredient_set,
                 dish_type)
            for (dish_name, price), dish_type in zip(dish_names_price, dish_types)
        ]

        date = self.get_date(year, week_number, self.weekday_positions[key])
        # create new Menu object and add it to dict
        menu = Menu(date, dishes)
        # remove duplicates
        menu.remove_duplicates()
        menus[date] = menu

    return menus
def get_menus(self, text, year, week_number):
    """Parse the plain-text weekly plan of the FMI Bistro (Monday to Friday).

    The text is a table laid out with whitespace.  The column of each weekday
    starts where its name begins in the header line
    "Montag Dienstag Mittwoch Donnerstag Freitag"; every line is cut into
    these five column slices.  Days containing "geschlossen" are skipped.

    Args:
        text: The menu text (e.g. extracted from the PDF).
        year: Year that is passed on to ``self.get_date``; for years before
            2018 a week-wide "Aktion" dish is prepended to every day.
        week_number: Week number that is passed on to ``self.get_date``.

    Returns:
        A dict mapping the date returned by ``self.get_date`` to the ``Menu``
        of that day, or ``None`` (after issuing a warning) if the header line
        with the weekdays is not found.
    """
    from warnings import warn  # local import: this block did not reference `warn` before

    menus = {}
    lines = text.splitlines()

    # remove headline etc.
    header_index = next(
        (index for index, line in enumerate(lines)
         if line.replace(" ", "").replace("\n", "").lower() == "montagdienstagmittwochdonnerstagfreitag"),
        None)
    if header_index is None:
        # without the header line the columns cannot be located (and lines[0] below would not exist)
        warn(
            "FMI Bistro parsing of week {} in year {} failed. Weekday header line not found."
            .format(week_number, year))
        return None
    lines = lines[header_index:]

    # we assume that the weeksdays are now all in the first line
    header = lines[0]
    columns = [("mon", "Montag"), ("tue", "Dienstag"), ("wed", "Mittwoch"), ("thu", "Donnerstag"),
               ("fri", "Freitag")]
    column_starts = [header.find(day_name) for _, day_name in columns]
    # every column ends where the next one starts; the friday column runs to the end of the line
    column_ends = column_starts[1:] + [None]

    # The text is formatted as table using whitespaces. Hence, we need to get those parts of each line that refer
    # to the respective week day. The header line itself is included, hence the weekday name is removed.
    lines_weekdays = {
        key: "".join(" " + line[start:end].replace("\n", " ").replace(day_name, "") for line in lines)
        for (key, day_name), start, end in zip(columns, column_starts, column_ends)
    }

    # currently, up to 5 dishes are on the menu
    num_dishes = 5
    if year < 2018:
        # in older versions of the FMI Bistro menu, the Aktionsgericht was the same for the whole week
        num_dishes = 3
        line_aktion = [line for line in lines if "Aktion" in line]
        if len(line_aktion) == 1:
            # the Aktionsgericht spans the two lines above the "Aktion" line and that line itself
            line_aktion_pos = lines.index(line_aktion[0]) - 2
            aktionsgericht = ' '.join(lines[line_aktion_pos:line_aktion_pos + 3])
            aktionsgericht = aktionsgericht \
                .replace("Montag – Freitag", "") \
                .replace("Tagessuppe täglich wechselndes Angebot", "") \
                .replace("ab € 1,00", "") \
                .replace("Aktion", "")
            num_dishes += aktionsgericht.count('€')
            for key in lines_weekdays:
                lines_weekdays[key] = aktionsgericht + ", " + lines_weekdays[key]

    # Process menus for each day
    for key in lines_weekdays:
        # stop parsing day when bistro is closed at that day
        if "geschlossen" in lines_weekdays[key].lower():
            continue

        # extract all allergens
        dish_allergens = [
            re.sub(r"((Allergene:)|\s|\n)*", "", match[0]) if len(match) > 0 else ""
            for match in re.findall(self.allergens_regex, lines_weekdays[key])
        ]
        day_text = re.sub(self.allergens_regex, "", lines_weekdays[key])
        # get rid of two-character umlauts (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
        day_text = unicodedata.normalize("NFKC", day_text)
        # remove multi-whitespaces
        day_text = ' '.join(day_text.split())
        # remove no allergens indicator
        day_text = day_text.replace("./.", "")
        lines_weekdays[key] = day_text

        # get all dish including name and price
        dish_names = re.findall(self.dish_regex, day_text)
        # get dish prices
        prices = re.findall(self.price_regex, ' '.join(dish_names))
        # convert prices to float
        prices = [
            Prices(Price(float(price.replace("€", "").replace(",", ".").strip())))
            for price in prices
        ]
        # remove price and commas from dish names
        dish_names = [
            re.sub(self.price_regex, "", dish).replace(",", "").strip()
            for dish in dish_names
        ]

        # create list of Dish objects; only take first 3/4 as the following dishes are corrupt and not necessary
        dishes = []
        for dish_name, price, dish_allergen in zip(dish_names, prices, dish_allergens):
            # filter empty dishes
            if not dish_name:
                continue
            ingredients = Ingredients("fmi-bistro")
            ingredients.parse_ingredients(dish_allergen)
            dishes.append(Dish(dish_name, price, ingredients.ingredient_set, "Tagesgericht"))
        dishes = dishes[:num_dishes]

        date = self.get_date(year, week_number, self.weekday_positions[key])
        # create new Menu object and add it to dict
        menu = Menu(date, dishes)
        # remove duplicates
        menu.remove_duplicates()
        menus[date] = menu

    return menus
def get_menus(self, text, year, week_number):
    """Parse the plain-text weekly plan of the IPP Bistro (Monday to Friday).

    The text is a table laid out with whitespace.  The column of each weekday
    is located through the soup line(s) respectively the "closed" keywords,
    then every following line is cut into five column slices which are
    searched for dishes with ``self.dish_regex`` and ``self.price_regex``.

    Args:
        text: The menu text (e.g. extracted from the PDF).
        year: Year that is passed on to ``self.get_date``.
        week_number: Week number that is passed on to ``self.get_date``.

    Returns:
        A dict mapping the date returned by ``self.get_date`` to the ``Menu``
        of that day, or ``None`` (after issuing a warning) if the table
        header, the soup line or the five columns cannot be detected.
    """
    menus = {}
    lines = text.splitlines()

    # remove headline etc.
    header_index = next(
        (index for index, line in enumerate(lines)
         if line.replace(" ", "").replace("\n", "").lower() == "montagdienstagmittwochdonnerstagfreitag"),
        None)
    if header_index is None:
        # without the header line there is no table to parse (and lines[0] would not exist)
        warn(
            "IPP PDF parsing of week {} in year {} failed. Weekday header line not found."
            .format(week_number, year))
        return None
    lines = lines[header_index:]

    # The column detection is done through the string "Tagessuppe siehe Aushang" which is at the beginning of
    # every column. However, due to center alignment the column do not begin at the 'T' character and broader
    # text in the column might be left of this character, which then gets truncated. But the gap between the 'T'
    # and the '€' character of the previous column¹ — the real beginning of the current column — is always three,
    # which will be subtracted here. Monday is the second column, so the value should never become negative
    # although it is handled here.
    # ¹or 'e' of "Internationale Küche" if it is the monday column

    # find lines which match the regex
    soup_lines_iter = (line for line in lines if self.split_days_regex.search(line))
    soup_line1 = next(soup_lines_iter, None)
    if soup_line1 is None:
        # without a soup line the columns cannot be located; fail like the column-count check below
        warn(
            "IPP PDF parsing of week {} in year {} failed. No soup line found."
            .format(week_number, year))
        return None
    soup_line2 = next(soup_lines_iter, '')

    def column_spans(pattern, line, left_pad, right_pad=0):
        # (start, end) of every match, widened to the left/right and clamped at column 0
        return [(max(match.start() - left_pad, 0), match.end() + right_pad)
                for match in re.finditer(pattern, line)]

    positions1 = column_spans(self.split_days_regex_soup_one_line, soup_line1, 3)
    # In the second line there is just 'Aushang' (two lines "Tagessuppe siehe Aushang" or
    # closed days ("Geschlossen", "Feiertag")
    positions2 = column_spans(self.split_days_regex_soup_two_line, soup_line2, 14, 3)
    positions3 = column_spans(self.split_days_regex_closed, soup_line2, 3)

    if positions2:  # Two lines "Tagessuppe siehe Aushang"
        soup_line_index = lines.index(soup_line2)
    else:
        soup_line_index = lines.index(soup_line1)

    positions = sorted(positions1 + positions2 + positions3)
    if len(positions) != 5:
        warn(
            "IPP PDF parsing of week {} in year {} failed. Only {} of 5 columns detected."
            .format(week_number, year, len(positions)))
        return None

    weekday_keys = ["mon", "tue", "wed", "thu", "fri"]
    column_starts = [position[0] for position in positions]
    # every column ends where the next one starts; the friday column runs to the end of the line
    column_ends = column_starts[1:] + [None]

    # it must be lines[3:] instead of lines[2:] or else the menus would start with "Preis ab 0,90€" (from the
    # soups) instead of the first menu, if there is a day where the bistro is closed.
    menu_lines = lines[soup_line_index + 3:]
    lines_weekdays = {
        key: "".join(" " + line[start:end].replace("\n", " ") for line in menu_lines)
        for key, start, end in zip(weekday_keys, column_starts, column_ends)
    }

    for key in lines_weekdays:
        # get rid of two-character umlauts (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
        day_text = unicodedata.normalize("NFKC", lines_weekdays[key])
        # remove multi-whitespaces
        day_text = ' '.join(day_text.split())
        lines_weekdays[key] = day_text

        # get all dish including name and price
        dish_names = re.findall(self.dish_regex, day_text + " ")
        # get dish prices
        prices = re.findall(self.price_regex, ' '.join(dish_names))
        # convert prices to float
        prices = [float(price.replace("€", "").replace(",", ".").strip()) for price in prices]
        # remove price and commas from dish names
        dish_names = [re.sub(self.price_regex, "", dish).strip() for dish in dish_names]

        # create list of Dish objects
        dishes = [Dish(dish_name, price) for dish_name, price in zip(dish_names, prices)]

        date = self.get_date(year, week_number, self.weekday_positions[key])
        # create new Menu object and add it to dict
        menu = Menu(date, dishes)
        # remove duplicates
        menu.remove_duplicates()
        menus[date] = menu

    return menus