Exemple #1
0
    def parse_dish(self, dish_str):
        # ingredients
        dish_ingredients = Ingredients("mediziner-mensa")
        matches = re.findall(self.ingredients_regex, dish_str)
        while len(matches) > 0:
            for x in matches:
                if len(x) > 0:
                    dish_ingredients.parse_ingredients(x[0])
            dish_str = re.sub(self.ingredients_regex, " ", dish_str)
            matches = re.findall(self.ingredients_regex, dish_str)
        dish_str = re.sub(r"\s+", " ", dish_str).strip()
        dish_str = dish_str.replace(" , ", ", ")

        # price
        dish_price = Prices()
        for x in re.findall(self.price_regex, dish_str):
            if len(x) > 0:
                dish_price = Prices(
                    Price(
                        float(x[0].replace("€", "").replace(",",
                                                            ".").strip())))
        dish_str = re.sub(self.price_regex, "", dish_str)

        return Dish(dish_str, dish_price, dish_ingredients.ingredient_set,
                    "Tagesgericht")
Exemple #2
0
    def __parse_dishes(menu_html, location):
        # obtain the names of all dishes in a passed menu
        dish_names = [
            dish.rstrip() for dish in menu_html.xpath(
                "//p[@class='js-schedule-dish-description']/text()")
        ]
        # make duplicates unique by adding (2), (3) etc. to the names
        dish_names = util.make_duplicates_unique(dish_names)
        # obtain the types of the dishes (e.g. 'Tagesgericht 1')
        dish_types = [
            type.text if type.text else ''
            for type in menu_html.xpath("//span[@class='stwm-artname']")
        ]
        # obtain all ingredients
        dish_markers_additional = menu_html.xpath(
            "//span[contains(@class, 'c-schedule__marker--additional')]/@data-essen"
        )
        dish_markers_allergen = menu_html.xpath(
            "//span[contains(@class, 'c-schedule__marker--allergen')]/@data-essen"
        )
        dish_markers_type = menu_html.xpath(
            "//span[contains(@class, 'c-schedule__marker--type')]/@data-essen")

        # create dictionary out of dish name and dish type
        dishes_dict = {}
        dishes_tup = zip(dish_names, dish_types, dish_markers_additional,
                         dish_markers_allergen, dish_markers_type)
        for dish_name, dish_type, dish_marker_additional, dish_marker_allergen, dish_marker_type in dishes_tup:
            dishes_dict[dish_name] = (dish_type, dish_marker_additional,
                                      dish_marker_allergen, dish_marker_type)

        # create Dish objects with correct prices; if price is not available, -1 is used instead
        dishes = []
        for name in dishes_dict:
            if not dishes_dict[name] and dishes:
                # some dishes are multi-row. That means that for the same type the dish is written in multiple rows.
                # From the second row on the type is then just empty. In that case, we just use the price and
                # ingredients of the previous dish.
                dishes.append(
                    Dish(name, dishes[-1].price, dishes[-1].ingredients,
                         dishes[-1].dish_type))
            else:
                dish_ingredients = Ingredients(location)
                dish_ingredients.parse_ingredients(dishes_dict[name][1])
                dish_ingredients.parse_ingredients(dishes_dict[name][2])
                dish_ingredients.parse_ingredients(dishes_dict[name][3])
                dishes.append(
                    Dish(
                        name,
                        StudentenwerkMenuParser.prices.get(
                            dishes_dict[name][0], "N/A"),
                        dish_ingredients.ingredient_set, dishes_dict[name][0]))

        return dishes
Exemple #3
0
    def get_menus(self, text, year, week_number):
        menus = {}
        lines = text.splitlines()
        count = 0
        # remove headline etc.
        for line in lines:
            # Find the line which is the header of the table and includes the day of week
            line_shrink = line.replace(" ", "").replace("\n", "").lower()
            # Note we do not include 'montag' und 'freitag' since they are also used in the line before the table
            # header to indicate the range of the week “Monday … until Friday _”
            if any(x in line_shrink
                   for x in ('dienstag', 'mittwoch', 'donnerstag')):
                break

            count += 1

        else:
            warn(
                "NotImplemented: IPP parsing failed. Menu text is not a weekly menu. First line: '{}'"
                .format(lines[0]))
            return None

        lines = lines[count:]
        weekdays = lines[0]

        # The column detection is done through the string "Tagessuppe siehe Aushang" which is at the beginning of
        # every column. However, due to center alignment the column do not begin at the 'T' character and broader
        # text in the column might be left of this character, which then gets truncated. But the gap between the 'T'
        # and the '€' character of the previous column¹ — the real beginning of the current column — is always three,
        # which will be subtracted here. Monday is the second column, so the value should never become negative
        # although it is handled here.
        # ¹or 'e' of "Internationale Küche" if it is the monday column

        # find lines which match the regex
        # lines[1:] == exclude the weekday line which also can contain `Geschlossen`
        soup_lines_iter = (x for x in lines[1:]
                           if self.split_days_regex.search(x))

        soup_line1 = next(soup_lines_iter)
        soup_line2 = next(soup_lines_iter, '')

        # Sometimes on closed days, the keywords are written instead of the week of day instead of the soup line
        positions1 = [
            (max(a.start() - 3, 0), a.end())
            for a in list(re.finditer(self.split_days_regex_closed, weekdays))
        ]

        positions2 = [(max(a.start() - 3, 0), a.end()) for a in list(
            re.finditer(self.split_days_regex_soup_one_line, soup_line1))]
        # In the second line there is just 'Aushang' (two lines "Tagessuppe siehe Aushang" or
        # closed days ("Geschlossen", "Feiertag")
        positions3 = [(max(a.start() - 14, 0), a.end() + 3) for a in list(
            re.finditer(self.split_days_regex_soup_two_line, soup_line2))]
        # closed days ("Geschlossen", "Feiertag", …) can be in first line and second line
        positions4 = [
            (max(a.start() - 3, 0), a.end()) for a in
            list(re.finditer(self.split_days_regex_closed, soup_line1)) +
            list(re.finditer(self.split_days_regex_closed, soup_line2))
        ]

        if positions3:  # Two lines "Tagessuppe siehe Aushang"
            soup_line_index = lines.index(soup_line2)
        else:
            soup_line_index = lines.index(soup_line1)

        positions = sorted(positions1 + positions2 + positions3 + positions4)

        if len(positions) != 5:
            warn(
                "IPP PDF parsing of week {} in year {} failed. Only {} of 5 columns detected."
                .format(week_number, year, len(positions)))
            return None

        pos_mon = positions[0][0]
        pos_tue = positions[1][0]
        pos_wed = positions[2][0]
        pos_thu = positions[3][0]
        pos_fri = positions[4][0]

        lines_weekdays = {
            "mon": "",
            "tue": "",
            "wed": "",
            "thu": "",
            "fri": ""
        }
        # it must be lines[3:] instead of lines[2:] or else the menus would start with "Preis ab 0,90€" (from the
        # soups) instead of the first menu, if there is a day where the bistro is closed.
        for line in lines[soup_line_index + 3:]:
            lines_weekdays["mon"] += " " + line[pos_mon:pos_tue].replace(
                "\n", " ")
            lines_weekdays["tue"] += " " + line[pos_tue:pos_wed].replace(
                "\n", " ")
            lines_weekdays["wed"] += " " + line[pos_wed:pos_thu].replace(
                "\n", " ")
            lines_weekdays["thu"] += " " + line[pos_thu:pos_fri].replace(
                "\n", " ")
            lines_weekdays["fri"] += " " + line[pos_fri:].replace("\n", " ")

        for key in lines_weekdays:
            # Appends `?€` to „Überraschungsmenü“ if it do not have a price. The second '€' is a separator for the
            # later split
            lines_weekdays[key] = self.surprise_without_price_regex.sub(
                r"\g<1>?€ € \g<2>", lines_weekdays[key])
            # get rid of two-character umlauts (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
            lines_weekdays[key] = unicodedata.normalize(
                "NFKC", lines_weekdays[key])
            # remove multi-whitespaces
            lines_weekdays[key] = ' '.join(lines_weekdays[key].split())
            # get all dish including name and price
            dish_names_price = re.findall(self.dish_regex,
                                          lines_weekdays[key] + ' ')
            # create dish types
            # since we have the same dish types every day we can use them if there are 4 dishes available
            if len(dish_names_price) == 4:
                dish_types = [
                    "Veggie", "Traditionelle Küche", "Internationale Küche",
                    "Specials"
                ]
            else:
                dish_types = ["Tagesgericht"] * len(dish_names_price)

            # create ingredients
            # all dishes have the same ingridients
            ingredients = Ingredients("ipp-bistro")
            ingredients.parse_ingredients("Mi,Gl,Sf,Sl,Ei,Se,4")
            # create list of Dish objects
            counter = 0
            dishes = []
            for (dish_name, price) in dish_names_price:
                dishes.append(
                    Dish(dish_name.strip(),
                         Prices(Price(price.replace(',', '.').strip())),
                         ingredients.ingredient_set, dish_types[counter]))
                counter += 1
            date = self.get_date(year, week_number,
                                 self.weekday_positions[key])
            # create new Menu object and add it to dict
            menu = Menu(date, dishes)
            # remove duplicates
            menu.remove_duplicates()
            menus[date] = menu

        return menus
Exemple #4
0
    def get_menus(self, text, year, week_number):
        menus = {}
        lines = text.splitlines()
        count = 0
        # remove headline etc.
        for line in lines:
            if line.replace(" ", "").replace(
                    "\n",
                    "").lower() == "montagdienstagmittwochdonnerstagfreitag":
                break

            count += 1

        lines = lines[count:]
        # we assume that the weeksdays are now all in the first line
        pos_mon = lines[0].find("Montag")
        pos_tue = lines[0].find("Dienstag")
        pos_wed = lines[0].find("Mittwoch")
        pos_thu = lines[0].find("Donnerstag")
        pos_fri = lines[0].find("Freitag")

        # The text is formatted as table using whitespaces. Hence, we need to get those parts of each line that refer
        #  to the respective week day
        lines_weekdays = {
            "mon": "",
            "tue": "",
            "wed": "",
            "thu": "",
            "fri": ""
        }
        for line in lines:
            lines_weekdays["mon"] += " " + line[pos_mon:pos_tue].replace(
                "\n", " ").replace("Montag", "")
            lines_weekdays["tue"] += " " + line[pos_tue:pos_wed].replace(
                "\n", " ").replace("Dienstag", "")
            lines_weekdays["wed"] += " " + line[pos_wed:pos_thu].replace(
                "\n", " ").replace("Mittwoch", "")
            lines_weekdays["thu"] += " " + line[pos_thu:pos_fri].replace(
                "\n", " ").replace("Donnerstag", "")
            lines_weekdays["fri"] += " " + line[pos_fri:].replace(
                "\n", " ").replace("Freitag", "")

        # currently, up to 5 dishes are on the menu
        num_dishes = 5
        line_aktion = []
        if year < 2018:
            # in older versions of the FMI Bistro menu, the Aktionsgericht was the same for the whole week
            num_dishes = 3
            line_aktion = [s for s in lines if "Aktion" in s]
            if len(line_aktion) == 1:
                line_aktion_pos = lines.index(line_aktion[0]) - 2
                aktionsgericht = ' '.join(
                    lines[line_aktion_pos:line_aktion_pos + 3])
                aktionsgericht = aktionsgericht \
                    .replace("Montag – Freitag", "") \
                    .replace("Tagessuppe täglich wechselndes Angebot", "") \
                    .replace("ab € 1,00", "") \
                    .replace("Aktion", "")
                num_dishes += aktionsgericht.count('€')
                for key in lines_weekdays:
                    lines_weekdays[
                        key] = aktionsgericht + ", " + lines_weekdays[key]

        # Process menus for each day
        for key in lines_weekdays:
            # stop parsing day when bistro is closed at that day
            if "geschlossen" in lines_weekdays[key].lower():
                continue

            # extract all allergens
            dish_allergens = []
            for x in re.findall(self.allergens_regex, lines_weekdays[key]):
                if len(x) > 0:
                    dish_allergens.append(
                        re.sub(r"((Allergene:)|\s|\n)*", "", x[0]))
                else:
                    dish_allergens.append("")
            lines_weekdays[key] = re.sub(self.allergens_regex, "",
                                         lines_weekdays[key])
            # get rid of two-character umlauts (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
            lines_weekdays[key] = unicodedata.normalize(
                "NFKC", lines_weekdays[key])
            # remove multi-whitespaces
            lines_weekdays[key] = ' '.join(lines_weekdays[key].split())

            # remove no allergens indicator
            lines_weekdays[key] = lines_weekdays[key].replace("./.", "")
            # get all dish including name and price
            dish_names = re.findall(self.dish_regex, lines_weekdays[key])
            # get dish prices
            prices = re.findall(self.price_regex, ' '.join(dish_names))
            # convert prices to float
            prices = [
                Prices(
                    Price(
                        float(
                            price.replace("€", "").replace(",", ".").strip())))
                for price in prices
            ]
            # remove price and commas from dish names
            dish_names = [
                re.sub(self.price_regex, "", dish).replace(",", "").strip()
                for dish in dish_names
            ]
            # create list of Dish objects; only take first 3/4 as the following dishes are corrupt and not necessary
            dishes = []
            for (dish_name, price,
                 dish_allergen) in list(zip(dish_names, prices,
                                            dish_allergens)):
                # filter empty dishes
                if dish_name:
                    ingredients = Ingredients("fmi-bistro")
                    ingredients.parse_ingredients(dish_allergen)
                    dishes.append(
                        Dish(dish_name, price, ingredients.ingredient_set,
                             "Tagesgericht"))
            dishes = dishes[:num_dishes]
            date = self.get_date(year, week_number,
                                 self.weekday_positions[key])
            # create new Menu object and add it to dict
            menu = Menu(date, dishes)
            # remove duplicates
            menu.remove_duplicates()
            menus[date] = menu

        return menus