コード例 #1
0
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Fill in a recipe from its detail page and yield it.

        ``overview_recipe`` is deep-copied first, so the object handed in
        by the caller is never modified. The copy receives the photo URL,
        the materials (an optional heading followed by one entry per
        non-blank ``<li>``) and the numbered steps. Exactly one recipe is
        yielded.
        """
        recipe = copy.deepcopy(overview_recipe)

        # The photo address is read from the <img>'s "data-src" attribute.
        photo_src = detail_soup.find("div", "photo").img["data-src"]
        recipe.image_urls.append(
            urllib.parse.urljoin(recipe.detail_url, photo_src))

        material_box = detail_soup.find("div", "material")
        heading = material_box.h4.text.replace("材料", "")
        heading = heading.translate(
            self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
        if heading:
            recipe.materials.append(RecipeText(heading))
        for item in material_box.find_all("li"):
            parts = [span.text for span in item.find_all("span")]
            # Keep only rows where at least one span has visible text.
            if any(part.strip() for part in parts):
                recipe.materials.append(RecipeText(": ".join(parts)))

        step_box = detail_soup.find("div", "make")
        for number, step in enumerate(step_box.find_all("li"), start=1):
            for position, line in enumerate(step.text.splitlines()):
                # Only the first line of each step gets the "(n)" prefix.
                if position == 0:
                    line = "({}){}".format(number, line)
                recipe.recipe_steps.append(RecipeText(line))

        yield recipe
コード例 #2
0
ファイル: ktv_niji.py プロジェクト: yukinext/tools-python
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """
        Collect every recipe found on one entry page.

        The recipes are looked up under the parent of the first ``<h2>``
        whose text matches ``レシピ.*``. Each ``<h3>`` below that parent
        marks one recipe; the parent of the ``<h3>`` supplies the dish name,
        the image (taken from the ``background-image`` of the ``<img>``
        style attribute) and, in its second ``<p>``, a text with a "【材料】"
        (materials) part followed by a "【作り方】" (steps) part.

        Args:
            overview_soup: parsed entry page, queried with ``find`` /
                ``find_all``.
            entry_url: URL of the page. The stem of its last path
                component, prefixed with "20", is parsed as the programme
                date.

        Returns:
            dict mapping ``Recipe.id`` ("<YYYYMMDD>_<index>") to ``Recipe``.
            Empty when the page has no recipe heading.
        """
        recipe_title_node = overview_soup.find("h2", text=re.compile(r"レシピ.*"))
        if recipe_title_node is None:
            logger.info("{} have no recipe.".format(entry_url))
            return dict()

        recipe_root_node = recipe_title_node.parent

        recipes = dict()  # key: Recipe.id, value: Recipe
        for ii, recipe_node in enumerate(
            [h3.parent for h3 in recipe_root_node.find_all("h3")]):
            recipe = Recipe()

            recipe.program_date = dateutil.parser.parse("20{}".format(
                pathlib.Path(entry_url).stem))
            recipe.program_name = self.program_name
            recipe.detail_url = entry_url
            recipe.cooking_name = recipe_node.h3.text
            # Raw string: "\(" is a regex escape, not a Python string escape.
            recipe.image_urls.append(
                urllib.parse.urljoin(
                    entry_url,
                    re.search(r"background-image:url\((.*?)\);",
                              recipe_node.img["style"]).group(1)))

            is_material_area = False
            is_recipe_step_area = False
            for line in recipe_node.find_all("p")[1].text.splitlines():
                if not line.strip():
                    continue

                # A "【材料】" marker seen after the steps have started is not
                # treated as a heading; it falls through to the checks below.
                if "【材料】" in line and not is_recipe_step_area:
                    is_material_area = True
                    line = line.replace("【材料】", "").translate(
                        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                    if line:
                        recipe.materials.append(RecipeText(line))
                    continue
                if "【作り方】" in line:
                    is_material_area = False
                    is_recipe_step_area = True
                    continue

                if is_material_area:
                    material = line.replace(":", ": ")
                    recipe.materials.append(RecipeText(material))
                elif is_recipe_step_area:
                    recipe_step_text = line
                    # Leading digits become a "(n)" prefix.
                    m = re.match(r"^(\d+)(.*)", line)
                    if m:
                        num, recipe_t = m.groups()
                        recipe_step_text = "({}){}".format(
                            num, recipe_t.strip())
                    recipe.recipe_steps.append(RecipeText(recipe_step_text))

            recipe.id = "{:%Y%m%d}_{}".format(recipe.program_date, ii)

            recipes[recipe.id] = recipe

        return recipes
コード例 #3
0
ファイル: oishimeshi.py プロジェクト: yukinext/tools-python
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Fill in materials and steps from a detail page and yield the recipe.

        Operates on a deep copy of ``overview_recipe``. Materials come from
        the element with id ``zairyou_box`` (its ``<p>`` is the heading, its
        ``<tr>`` rows the entries); steps come from ``<table class="recipe">``
        whose rows must each hold exactly three ``<td>`` cells: number,
        text and an optional note.
        """
        recipe = copy.deepcopy(overview_recipe)

        material_box = detail_soup.select_one("#zairyou_box")
        steps_table = detail_soup.find("table", "recipe")

        heading = material_box.p.text.replace("材料", "")
        heading = heading.translate(
            self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
        if heading:
            recipe.materials.append(RecipeText(heading))

        for row in material_box.find_all("tr"):
            # Blank cells are dropped before joining; a row without any
            # filled cell still produces an (empty) entry.
            filled = [
                cell.text for cell in row.find_all("td") if cell.text.strip()
            ]
            recipe.materials.append(RecipeText(": ".join(filled)))

        for row in steps_table.find_all("tr"):
            number_cell, text_cell, note_cell = row.find_all("td")
            numbered = "({}){}".format(number_cell.text.strip(),
                                       text_cell.text.strip())
            recipe.recipe_steps.append(RecipeText(numbered))
            note = note_cell.text.strip()
            if len(note):
                recipe.recipe_steps.append(RecipeText(note))

        yield recipe
コード例 #4
0
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Fill in a recipe from its detail page and yield it.

        Works on a deep copy of ``overview_recipe``. Sets the sub title from
        ``<td class="tema">`` (``None`` when absent), adds the first image
        whose ``src`` ends in "jpg", and reads the two
        ``<table class="text2">`` elements: the first holds the steps, the
        second the materials. The page must contain exactly two such tables.
        """
        recipe = copy.deepcopy(overview_recipe)

        theme_cell = detail_soup.find("td", "tema")
        recipe.cooking_name_sub = theme_cell.text if theme_cell else None

        image_src = detail_soup.select_one('img[src$="jpg"]')["src"]
        recipe.image_urls.append(
            urllib.parse.urljoin(recipe.detail_url, image_src))

        steps_table, materials_table = detail_soup.find_all("table", "text2")

        heading = "({})".format(detail_soup.find("td", "making").text)
        # The parentheses make the heading non-empty, so this always holds.
        if heading:
            recipe.materials.append(RecipeText(heading))
        for row in materials_table.find_all("tr"):
            entry = row.text.strip().replace("\n", ": ")
            recipe.materials.append(RecipeText(entry))

        # The step list is replaced, not extended.
        numbered_steps = []
        for number, row in enumerate(steps_table.find_all("tr"), start=1):
            numbered_steps.append(
                RecipeText("({}){}".format(number, row.text.strip())))
        recipe.recipe_steps = numbered_steps

        yield recipe
コード例 #5
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """
        Collect the recipes listed on one overview page.

        Every ``<section>`` after the first is inspected in order:

        * sections containing an ``<h1>`` are ignored;
        * a section whose ``<h2 class="option-sub-title">`` has no following
          ``<p>`` sibling sets the current sub title;
        * any other section with an ``<h2>`` is a recipe. Its text is split
          into a "<材料>" (materials) part and a "<作り方>" (steps) part.

        The programme date is parsed from the first two digit groups of the
        current sub title, so a recipe section that appears before any sub
        title section raises ``TypeError``, and a sub title without two
        digit groups raises ``AttributeError``. Recipes whose date is not in
        the past are skipped.

        Args:
            overview_soup: parsed overview page.
            entry_url: URL of the page; used as ``detail_url`` and as the
                base for image URLs.

        Returns:
            dict mapping ``Recipe.id`` ("<YYYYMMDD>_<md5 of title>") to
            ``Recipe``.
        """
        recipes = dict()  # key: Recipe.id, value: Recipe

        current_subtitle = None
        for item in overview_soup.find_all("section")[1:]:
            if item.h1:
                continue

            subtitle_node = item.find("h2", "option-sub-title")
            if subtitle_node and subtitle_node.find_next_sibling("p") is None:
                current_subtitle = subtitle_node.text.translate(
                    self.__class__._TABLE_REMOVE_KAKKO).strip()
                continue

            if item.h2:
                title_node = item

                recipe = Recipe()
                recipe.detail_url = entry_url
                recipe.cooking_name = title_node.h2.text.translate(
                    self.__class__._TABLE_REMOVE_KAKKO).strip()
                recipe.cooking_name_sub = current_subtitle
                recipe.program_name = self.program_name
                # Raw string: "\d" / "\D" are regex escapes, not string escapes.
                recipe.program_date = dateutil.parser.parse(
                    "{}/{}".format(*re.search(
                        r"(\d+)\D+(\d+)\D+",
                        recipe.cooking_name_sub).groups()))
                recipe.image_urls.append(
                    urllib.parse.urljoin(entry_url, title_node.img["src"]))

                is_material_area = False
                is_recipe_step_area = False
                body_text = title_node.find(
                    "div", "option-media-row").get_text("\n")
                for line in body_text.splitlines():
                    if not line.strip():
                        continue

                    if "<材料>" in line:
                        is_material_area = True
                        # The remainder of the heading line is kept as the
                        # first material entry, even when it is empty.
                        recipe.materials.append(RecipeText(
                            line.replace("<材料>", "").translate(
                                self.__class__._TABLE_REPLACE_MARUKAKKO)))
                        continue
                    if "<作り方>" in line:
                        is_material_area = False
                        is_recipe_step_area = True
                        continue

                    if is_material_area:
                        # Several materials may share one line, separated by
                        # whitespace.
                        recipe.materials.extend([
                            RecipeText(m.replace(":", ": "))
                            for m in line.split()
                        ])
                    elif is_recipe_step_area:
                        recipe.recipe_steps.append(RecipeText(line))

                if not recipe.program_date < datetime.datetime.now():
                    logger.debug(
                        "{} is invalid date".format(recipe.program_date))
                    continue

                # The id hashes "<sub title>/<title>", or the title alone
                # when there is no sub title.
                id_source = recipe.cooking_name
                if recipe.cooking_name_sub:
                    id_source = "{}/{}".format(recipe.cooking_name_sub,
                                               recipe.cooking_name)
                digest = hashlib.md5(id_source.encode("utf-8")).hexdigest()
                recipe.id = "{:%Y%m%d}_{}".format(recipe.program_date, digest)
                recipes[recipe.id] = recipe

        return recipes
コード例 #6
0
ファイル: nhk_umai.py プロジェクト: yukinext/tools-python
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Yield one recipe per ``<h4>`` heading found on the detail page.

        For every heading a deep copy of ``overview_recipe`` is made, named
        after the heading and given the "jpg" images found under the
        heading's grandparent. The text between the heading and the next
        ``<h4>`` sibling is then split into a materials part (after a line
        containing "材料") and a steps part (after a line containing
        "作り方"). Headings that produce neither materials nor steps yield
        nothing.
        """
        def text_until_next_heading(heading, all_headings):
            # Gather the text of the siblings that follow ``heading``,
            # stopping at the first sibling that is itself a heading.
            pieces = []
            for sibling in heading.next_siblings:
                if sibling in all_headings:
                    break
                if isinstance(sibling, bs4.NavigableString):
                    pieces.append(sibling)
                else:
                    pieces.append(sibling.text)
            return "\n".join([piece for piece in pieces if piece.strip()])

        headings = detail_soup.find_all("h4")
        for heading in headings:
            recipe = copy.deepcopy(overview_recipe)
            recipe.cooking_name = heading.text.strip()
            recipe.image_urls = [
                urllib.parse.urljoin(recipe.detail_url, image["src"])
                for image in heading.parent.parent.select('img[src$="jpg"]')
            ]

            in_materials = False
            in_steps = False
            section_text = text_until_next_heading(heading, headings)
            for line in section_text.splitlines():
                if not line.strip():
                    continue

                # Once the steps have begun, "材料" no longer acts as a
                # section marker.
                if "材料" in line and not in_steps:
                    in_materials = True
                    continue
                if "作り方" in line:
                    in_materials = False
                    in_steps = True
                    continue

                if in_materials:
                    entry = line[1:] if line.startswith("・") else line
                    recipe.materials.append(
                        RecipeText(entry.replace(":", ": ")))
                elif in_steps:
                    # "1)..." / "1...." style numbering becomes "(1)...".
                    numbered = re.match(r"(\d+)[).](.*)", line)
                    if numbered:
                        line = "({}){}".format(*numbered.groups())
                    recipe.recipe_steps.append(RecipeText(line))

            if len(recipe.materials) > 0 or len(recipe.recipe_steps) > 0:
                yield recipe
コード例 #7
0
ファイル: tbs_obigohan.py プロジェクト: yukinext/tools-python
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Yield one recipe per ``<section class="recipe_area">`` with an <h4>.

        Each yielded recipe is a deep copy of ``overview_recipe`` whose name
        is the "/"-joined text of the section's ``<h4>`` elements. Image
        URLs are built from the "photo*" CSS classes of the ``pic_sub``
        div, materials from ``material_box``, steps from the
        ``recipe_text`` spans inside ``recipe_main_box`` and important
        points from ``point_box_wide`` when that div exists.
        """
        for area in detail_soup.find_all("section", "recipe_area"):
            if area.h4 is None:
                continue

            recipe = copy.deepcopy(overview_recipe)
            names = [heading.text.strip() for heading in area.find_all("h4")]
            recipe.cooking_name = "/".join(names)

            photo_holder = area.find("div", "pic_sub")
            if photo_holder:
                for css_class in photo_holder["class"]:
                    if not css_class.lower().startswith("photo"):
                        continue
                    # The image file name is the CSS class itself.
                    relative_path = "../img/recipe/{}/{}.jpg".format(
                        recipe.id, css_class)
                    recipe.image_urls.append(
                        urllib.parse.urljoin(recipe.detail_url,
                                             relative_path))

            material_box = area.find("div", "material_box")
            people_node = material_box.find("span", "people")
            if people_node:
                people_text = people_node.text.translate(
                    self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                recipe.materials.append(RecipeText(people_text))

            for row in material_box.find_all("tr"):
                cells = [cell.text.strip() for cell in row.find_all("td")]
                recipe.materials.append(RecipeText(": ".join(cells)))

            steps_box = area.find("div", "recipe_main_box")
            step_spans = steps_box.find_all("span", "recipe_text")
            for number, span in enumerate(step_spans, start=1):
                step_text = span.text.strip()
                # Blank spans are skipped but still consume a number.
                if not len(step_text):
                    continue
                recipe.recipe_steps.append(
                    RecipeText("({}){}".format(number, step_text)))

            point_box = area.find("div", "point_box_wide")
            if point_box:
                point_text = point_box.find("span", "point").text.strip()
                for point_line in point_text.splitlines():
                    recipe.important_points.append(RecipeText(point_line))

            yield recipe
コード例 #8
0
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Yield one recipe per (materials heading, steps heading) pair.

        All ``<h6>`` elements of the page are split in half: the first half
        is taken as materials headings, the second half as steps headings,
        and the two halves are paired up in order. Every pair yields a deep
        copy of ``overview_recipe``; from the second pair on, the materials
        heading is appended to the dish name.
        """
        h6s = detail_soup.find_all("h6")
        # An alternative selector, detail_soup.select("h5,h6"), was noted in
        # the original code (dated 2020.01.05) but is not used.
        half = len(h6s) // 2
        heading_pairs = zip(h6s[:half], h6s[half:])

        for index, (material_heading, steps_heading) in enumerate(
                heading_pairs):
            recipe = copy.deepcopy(overview_recipe)

            image_src = detail_soup.find(
                "div", "common_contents_box_mini").img["src"]
            recipe.image_urls.append(
                urllib.parse.urljoin(recipe.detail_url, image_src))

            heading_text = material_heading.text.replace("材料", "").strip()
            if heading_text:
                if index:
                    recipe.cooking_name = "%s / %s" % (recipe.cooking_name,
                                                       heading_text)
                recipe.materials.append(RecipeText(heading_text))

            material_list = material_heading.find_next_sibling("ul")
            for item in material_list.find_all("li"):
                parts = [span.text for span in item.find_all("span")]
                recipe.materials.append(RecipeText(": ".join(parts)))

            step_list = steps_heading.find_next_sibling("ul")
            for number, item in enumerate(step_list.find_all("li"), start=1):
                recipe.recipe_steps.append(
                    RecipeText("({}){}".format(number, item.text.strip())))

            yield recipe
コード例 #9
0
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Fill in a recipe from its detail page and yield it.

        Works on a deep copy of ``overview_recipe``. The dish name is the
        text of the first ``<strong>``, or of the second ``<b>`` when the
        page has no ``<strong>``. Materials are the lines of
        ``<div class="zairyo">``; steps are the lines of the second ``<td>``
        of the second-to-last ``<table>``. In both lists, reading stops at
        the first line containing "監修".
        """
        recipe = copy.deepcopy(overview_recipe)

        strong_node = detail_soup.strong
        if strong_node:
            recipe.cooking_name = strong_node.text
        else:
            recipe.cooking_name = detail_soup.find_all("b")[1].text

        image_src = detail_soup.select_one('img[src$="jpg"]')["src"]
        recipe.image_urls.append(
            urllib.parse.urljoin(recipe.detail_url, image_src))

        heading_source = detail_soup.find("td", align="right").b.text
        heading = "({})".format(heading_source.strip())
        # The parentheses make the heading non-empty, so this always holds.
        if heading:
            recipe.materials.append(RecipeText(heading))

        material_text = detail_soup.find("div", "zairyo").text.strip()
        for material_line in material_text.splitlines():
            if "監修" in material_line:
                break
            if not len(material_line):
                continue
            recipe.materials.append(
                RecipeText(material_line.replace("…", ": ")))

        steps_cell = detail_soup.find_all("table")[-2].find_all("td")[1]
        for raw_step in steps_cell.text.strip().splitlines():
            step_line = raw_step.strip()
            if "監修" in step_line:
                break
            if len(step_line):
                recipe.recipe_steps.append(RecipeText(step_line))

        yield recipe
コード例 #10
0
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Fill in a recipe from its detail page and yield it.

        Works on a deep copy of ``overview_recipe``. Sets the dish name,
        programme name and programme date (parsed from the part of
        ``recipe.id`` before the first "_"), adds the ``og:image`` URL and
        reads:

        * materials from the block around the first ``<h2>``;
        * steps from every ``<div class="howto-item">``, in one of two page
          layouts (with or without ``howto-child`` groups), with an empty
          entry between items;
        * "(メモ)" memo entries and "(ポイント)" point entries, appended to
          the steps together with their images;
        * important points from the ``<li>`` items around the fifth-from-last
          ``<h2>``.

        Memo and point images are collected only when the ``<img>`` has a
        ``class`` attribute that does not include the given icon class, has
        a ``src``, and that ``src`` is not a ``data:`` URI.
        """
        def non_icon_image_urls(node, icon_class):
            # Tag.__contains__ looks at child nodes, not attributes, and
            # "class" is a multi-valued attribute (a list), so the attribute
            # must be tested with has_attr() and list membership.
            urls = []
            for img in node.find_all("img"):
                if not img.has_attr("class") or icon_class in img["class"]:
                    continue
                src = img.get("src", "")
                # "data:" for https://www.ntv.co.jp/3min/recipe/20201024/
                if src and not src.startswith("data:"):
                    urls.append(src)
            return urls

        recipe = copy.deepcopy(overview_recipe)
        recipe.cooking_name = detail_soup.find("p", "detail-title-name").text.strip()
        recipe.program_name = self.program_name
        recipe.program_date = dateutil.parser.parse(recipe.id.split("_")[0])

        recipe.image_urls.append(
            detail_soup.find("meta", attrs=dict(property="og:image"))["content"])
        title_nodes = detail_soup.find_all("h2")

        material_title_node = title_nodes[0]
        # NOTE(review): relies on the page having a fixed number of <h2>
        # elements after the advice heading - confirm against the site.
        advice_title_node = title_nodes[-5]

        material_title = material_title_node.text.replace("材料", "").translate(
            self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
        if material_title:
            recipe.materials.append(RecipeText(material_title))

        recipe.materials.extend([
            RecipeText(": ".join(li.text.split()))
            for li in material_title_node.parent.parent.select("h4,li")
        ])

        for i, howto_item in enumerate(detail_soup.find_all("div", "howto-item")):
            if i:
                recipe.recipe_steps.append(RecipeText(""))  # blank line
            if howto_item.find("div", "howto-child") is not None:
                # https://www.ntv.co.jp/3min/recipe/20200704/
                for recipe_item in howto_item.find_all("li"):
                    for j, recipe_step in enumerate(
                            recipe_item.find_all("div", "howto-group-inner")):
                        buf = ""
                        if j:
                            # Every group after the first starts with "【n】".
                            num, step = re.search(
                                r"【(\d+)】(.*)",
                                recipe_step.text.strip()).groups()
                            num = num.strip()
                            if len(num):
                                buf += "({})".format(num)
                            buf += step.strip()
                        else:
                            buf = recipe_step.text.strip()

                        image_urls = [
                            img["src"] for img in recipe_step.find_all("img")
                        ]

                        recipe.recipe_steps.append(
                            RecipeText(buf, image_urls=image_urls))

                    for j, howto_memo_item in enumerate(
                            recipe_item.find_all("div", "howto-memo-item")):
                        if j:
                            recipe.recipe_steps.append(RecipeText(""))  # blank line
                        buf = "(メモ)" + howto_memo_item.text.strip()

                        image_urls = non_icon_image_urls(
                            howto_memo_item, "howto-memo-icon")

                        recipe.recipe_steps.append(
                            RecipeText(buf, image_urls=image_urls))
            else:
                for recipe_step in howto_item.find_all("li"):
                    buf = ""
                    if i:
                        buf = recipe_step.text.strip()
                    else:
                        ps = recipe_step.find_all("p")
                        if len(ps) == 2:
                            num, step = ps
                            num = num.text.strip()
                            if len(num):
                                buf += "({})".format(num)
                        else:
                            # https://www.ntv.co.jp/3min/recipe/20200812/ :no num parts
                            step = ps[0]
                        buf += step.text.strip()

                    image_urls = []
                    for img in recipe_step.find_all("img"):
                        # "data:" for https://www.ntv.co.jp/3min/recipe/20201024/
                        if not img["src"].startswith("data:"):
                            image_urls.append(img["src"])

                    recipe.recipe_steps.append(
                        RecipeText(buf, image_urls=image_urls))

        for i, points_item in enumerate(detail_soup.find_all("div", "points-item")):
            if i:
                recipe.recipe_steps.append(RecipeText(""))  # blank line
            buf = "(ポイント)" + points_item.text.strip()

            image_urls = non_icon_image_urls(points_item, "points-icon")

            recipe.recipe_steps.append(RecipeText(buf, image_urls=image_urls))

        for advice in advice_title_node.parent.parent.find_all("li"):
            recipe.important_points.append(RecipeText(advice.text.strip()))

        yield recipe
コード例 #11
0
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Fill in a recipe from its detail page and yield it.

        Works on a deep copy of ``overview_recipe``. Sets the dish name from
        the first ``<h3>``, the programme name, and the programme date
        (parsed from the part of ``recipe.id`` before the first "_"); adds
        the ``#thumbnail`` image; reads materials from
        ``<div class="ingredient">``, steps from ``<div class="howto">`` and
        important points from every ``<div class="recipe-box">``.

        Each step row must hold exactly two ``<td>`` cells (number, body).
        A body containing ``<li>`` items is treated as a titled group of
        sub steps.
        """
        recipe = copy.deepcopy(overview_recipe)
        recipe.cooking_name = detail_soup.h3.text
        recipe.program_name = self.program_name
        recipe.program_date = dateutil.parser.parse(recipe.id.split("_")[0])

        thumbnail_src = detail_soup.select_one("#thumbnail")["src"]
        recipe.image_urls.append(
            urllib.parse.urljoin(recipe.detail_url, thumbnail_src))

        ingredient_box = detail_soup.find("div", "ingredient")
        howto_box = detail_soup.find("div", "howto")

        heading = ingredient_box.h4.text.replace("材料", "")
        heading = heading.translate(
            self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
        if heading:
            recipe.materials.append(RecipeText(heading))
        for row in ingredient_box.find_all("tr"):
            cells = [cell.text for cell in row.find_all("td")]
            recipe.materials.append(RecipeText(": ".join(cells)))

        for row in howto_box.find_all("tr"):
            number_cell, body_cell = row.find_all("td")

            if body_cell.li is not None:  # No.20190824
                # Sub steps exist: the first node inside the cell is their
                # title, followed by one numbered entry per <li>.
                recipe.recipe_steps.append(RecipeText(body_cell.next))
                sub_steps = body_cell.find_all("li")
                for sub_number, sub_step in enumerate(sub_steps, start=1):
                    sub_images = None
                    if sub_step.img:
                        sub_images = [
                            urllib.parse.urljoin(recipe.detail_url,
                                                 sub_step.img["src"])
                        ]
                    recipe.recipe_steps.append(
                        RecipeText("({}){}".format(sub_number, sub_step.text),
                                   image_urls=sub_images))
                continue

            number = number_cell.text.strip()
            prefix = "({})".format(number) if len(number) else ""
            step_text = prefix + body_cell.text.strip()

            step_images = None
            if body_cell.img:
                step_images = [
                    urllib.parse.urljoin(recipe.detail_url,
                                         body_cell.img["src"])
                ]
            recipe.recipe_steps.append(
                RecipeText(step_text, image_urls=step_images))

        for appendix in detail_soup.find_all("div", "recipe-box"):
            stripped_lines = (
                raw.strip() for raw in appendix.get_text("\n").splitlines())
            # filter(None, ...) drops the lines that were blank.
            for position, note in enumerate(filter(None, stripped_lines)):
                if note.startswith("・"):
                    note = note[1:].strip()
                # Every line after the first is indented by one space.
                if position:
                    note = " {}".format(note)
                recipe.important_points.append(RecipeText(note))

        yield recipe
コード例 #12
0
ファイル: nhk_kiichi.py プロジェクト: yukinext/tools-python
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """
        Collect the recipes listed on one overview page.

        The page is read as the sequence of its ``<section>`` and ``<hr>``
        elements, without the first and the last one. An ``<hr>`` starts a
        new group; the first section of a group is its sub title and every
        following section is one recipe. Each recipe's text is split into a
        "【材料】" (materials) part and a "【作り方】" (steps) part.

        The programme date is parsed from the first two digit groups of the
        sub title; a sub title without them raises ``AttributeError``.
        Recipes whose date is not in the past are skipped.

        Args:
            overview_soup: parsed overview page.
            entry_url: URL of the page; used as ``detail_url`` and as the
                base for image URLs.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``. The id is "YYYYMMDD"
            for the first recipe of a group and "YYYYMMDD_<n>" for the n-th
            recipe (n >= 2).
        """
        recipes = dict()  # key: Recipe.id, value: Recipe
        items = overview_soup.select("section,hr")[1:-1]

        subtitle_node = None
        title_node = None
        title_node_counter = 0
        for item in items:
            if item.name == "hr":
                # A horizontal rule closes the current group.
                subtitle_node = None
                title_node = None
                title_node_counter = 0
                continue
            if subtitle_node is None:
                subtitle_node = item
                continue

            title_node = item
            title_node_counter += 1

            recipe = Recipe()
            recipe.detail_url = entry_url
            # 2020.01.10: the title may be in a <p> instead of an <h2>.
            name_node = title_node.h2 if title_node.h2 else title_node.p
            recipe.cooking_name = name_node.text.translate(
                self.__class__._TABLE_REMOVE_KAKKO).strip()
            recipe.cooking_name_sub = subtitle_node.h2.text.strip()
            recipe.program_name = self.program_name
            # Raw string: "\d" / "\D" are regex escapes, not string escapes.
            recipe.program_date = dateutil.parser.parse(
                "{}/{}".format(*re.search(r"(\d+)\D+(\d+)\D+",
                                          recipe.cooking_name_sub).groups()))
            if title_node.img:
                recipe.image_urls.append(
                    urllib.parse.urljoin(entry_url, title_node.img["src"]))

            is_material_area = False
            is_recipe_step_area = False
            body_text = title_node.find(
                "div", "option-media-row").get_text("\n")
            for line in body_text.splitlines():
                if not line.strip():
                    continue

                # A "【材料】" marker seen after the steps have started is not
                # treated as a heading; it falls through to the checks below.
                if "【材料】" in line and not is_recipe_step_area:
                    is_material_area = True
                    line = line.replace("【材料】", "").translate(
                        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                    if line:
                        recipe.materials.append(RecipeText(line))
                    continue
                if "【作り方】" in line:
                    is_material_area = False
                    is_recipe_step_area = True
                    continue

                if is_material_area:
                    materials = [
                        m.replace("… ", "…").replace("…", ": ")
                        for m in line.split("\n") if m.strip()
                    ]
                    materials = [
                        m[1:] if m.startswith("・") else m for m in materials
                    ]
                    recipe.materials.extend([RecipeText(m) for m in materials])
                elif is_recipe_step_area:
                    recipe.recipe_steps.append(RecipeText(line))

            if not recipe.program_date < datetime.datetime.now():
                logger.debug("{} is invalid date".format(recipe.program_date))
                continue

            recipe.id = "{:%Y%m%d}".format(recipe.program_date)
            if 1 < title_node_counter:
                recipe.id += "_{}".format(title_node_counter)

            recipes[recipe.id] = recipe

        return recipes
コード例 #13
0
ファイル: nhk_kamado.py プロジェクト: yukinext/tools-python
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Fill in a recipe from its detail page and yield it.

        Works on a deep copy of ``overview_recipe`` so the caller's object
        is not modified. Sets the dish name and the main image, then reads:

        * materials, important points ("kimete") and the preparation block,
          all only when ``<div class="sozai_inner">`` exists;
        * steps from every ``<table class="step">``.

        Several expressions remove all whitespace from titles and URLs
        because some pages contain stray tabs or newlines there (see the
        inline notes naming the affected recipe numbers).

        Yields:
            the single completed recipe.
        """
        recipe = copy.deepcopy(overview_recipe)

        # "".join(x.split()) deletes every whitespace character.
        recipe.cooking_name = "".join(detail_soup.h2.text.split()) # No.153 is invalid title ex. "(太陽みたいなでっか~い)\tアンパン"
        recipe.image_urls.append(urllib.parse.urljoin(recipe.detail_url, "".join(detail_soup.find("p", "plat").img["src"].split()))) # No.135 is invalid: 'https://www.nhk.or.jp/kamado/images/135\n\n/recipe_plat.jpg'

        if detail_soup.find("div", "sozai_inner"):
            # exist materials part
            material_title_node = detail_soup.find("div", "sozai_inner").table
            for material in material_title_node.find_all("tr"):
                # The whitespace-separated tokens of a row are joined with ": ".
                recipe.materials.append(RecipeText(": ".join(material.text.strip().split())))

            # Important points: the optional <h4> title first, then one entry
            # per <p> of the kimete_l / kimete_inner2 div.
            kimete = detail_soup.find("div", "kimete")
            if kimete:
                if kimete.h4:
                    recipe.important_points.append(RecipeText(kimete.h4.text))
                kimete_l = kimete.select_one("div.kimete_l,div.kimete_inner2")
                if kimete_l:
                    for p in kimete_l.find_all("p"):
                        # Child nodes with a ``text`` attribute contribute that
                        # text; any other child is used as-is.
                        recipe.important_points.append(RecipeText(": ".join([c.text if hasattr(c, "text") else c for c in p.contents])))
            # Preparation block: a "準備" heading followed by the non-blank
            # children of the node that holds the preparation text.
            recipe_prepare_node = detail_soup.find("table", "prepare")
            if recipe_prepare_node:
                recipe.recipe_steps.append(RecipeText("準備"))
                # Lookup order: <p class="txt">, else the last <p> of the
                # table, else the first <dl> of the whole page.
                recipe_prepare_l = recipe_prepare_node.find("p", "txt")
                if recipe_prepare_l is None:
                    ps = recipe_prepare_node.find_all("p")
                    if ps:
                        recipe_prepare_l = ps[-1]
                    else:
                        recipe_prepare_l = detail_soup.dl # example: id=04
                if recipe_prepare_l:
                    for c in recipe_prepare_l.contents:
                        tmp = c
                        if hasattr(c, "text"):
                            tmp = c.text
                        tmp = tmp.strip()
                        if len(tmp):
                            recipe.recipe_steps.append(RecipeText(tmp))
                else:
                    logger.debug("no prepare: {}".format(recipe.id))

        for step_table in detail_soup.find_all("table", "step"): # No.92 has multiple table(include invalid format)
            # exist recipe steps part
            recipe_steps_title_node = step_table
            if recipe_steps_title_node.tbody:
                recipe_steps_title_node = recipe_steps_title_node.tbody # No.92 has no tbody element

            # recursive=False: only direct rows and cells are visited.
            for recipe_step in recipe_steps_title_node.find_all("tr", recursive=False):
                for td in recipe_step.find_all("td", recursive=False):
                    image_urls = [urllib.parse.urljoin(recipe.detail_url, img["src"]) for img in td.select('img[src$="jpg"]')]

                    text = ""
                    if td.img:
                        # The step number is taken from the file name of the
                        # cell's first image ("...stepNN.png"); int() drops
                        # leading zeros. Other images contribute their alt text.
                        # img_alt = td.img["alt"] # 02 is invalid step number.
                        img_src = td.img["src"]
                        m = re.search(r".*step(\d+)\.png", img_src)
                        if m:
                            num = m.group(1)
                            num = int(num)
                            text += "({})".format(num)
                        else:
                            text += td.img["alt"]

                    if len(td.text.strip()):
                        text += td.text.strip()

                    # Cells that produced no text are skipped, together with
                    # any images collected for them.
                    if len(text):
                        image_urls = ["".join(image_url.split()) for image_url in image_urls] # No.120 is invalid "../images/120\n/recipe_process03.jpg"
                        recipe.recipe_steps.append(RecipeText(text, image_urls=image_urls))

        yield recipe
コード例 #14
0
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Yield one recipe per dish listed under the "レシピ" headline.

        must deepcopy "recipe" before use

        The text of the list that follows the headline is scanned line by
        line:

        * a "...軒目「shop」" line sets the current shop name,
        * a "料理①..." line starts a new recipe (a deep copy of
          ``overview_recipe``) named after the rest of the line,
        * a "材料..." line holds every material of the current recipe,
          separated by "、",
        * a "作り方..." line opens the step area; the lines after it are
          stored as further steps until the next shop or dish line.

        Nothing is yielded when the headline is missing.  A "材料" or
        "作り方" line that appears before any "料理" line raises
        AttributeError because no recipe has been started yet.
        """
        def convert_material(material_s):
            # "name(amount)" -> "name: amount".  Brackets may be ASCII or
            # full-width (U+FF08 / U+FF09).
            m = re.match(r"(.*)[(\uff08](.*)[)\uff09]", material_s)
            if m:
                return ": ".join(m.groups())
            return material_s

        recipe_title_node = detail_soup.find("div",
                                             "headline",
                                             text=re.compile(r".*レシピ.*"))
        if recipe_title_node is None:
            return

        # Turn ideographic (U+3000) and no-break (U+00A0) spaces into plain
        # spaces before the text is split into lines.
        list_text = recipe_title_node.find_next_sibling("ul").ul.text
        list_text = list_text.replace("\u3000", " ").replace("\xa0", " ")

        subtitle = None
        recipe = None
        is_recipe_step_area = False
        for line_ in list_text.splitlines():
            line_ = line_.strip()
            if not line_:
                continue

            # Shop line: flush the recipe in progress and remember the shop.
            m_subtitle = re.match(r".*?軒目\s*「(.*)」", line_)
            if m_subtitle:
                if recipe:
                    yield recipe
                subtitle = m_subtitle.group(1)
                recipe = None
                is_recipe_step_area = False
                continue

            # Dish line: flush the recipe in progress and start a new one.
            # The optional colon after the circled number may be ASCII or
            # full-width (U+FF1A).
            m_title = re.match(r"^料理.*?[①-⑩][:\uff1a]?(.*)", line_)
            if m_title:
                if recipe:
                    yield recipe
                is_recipe_step_area = False
                recipe = copy.deepcopy(overview_recipe)
                recipe.cooking_name = m_title.group(1)
                recipe.cooking_name_sub = "{}/{}".format(
                    recipe.cooking_name_sub, subtitle)
                continue

            m_material = re.match(r"材料\s*(.*)", line_)
            if m_material:
                # material area is 1 line.
                recipe.materials.extend(
                    RecipeText(convert_material(material_s))
                    for material_s in m_material.group(1).split("、"))
                continue

            m_recipe_step = re.match(r"作り方\s*(.*)", line_)
            if m_recipe_step:
                is_recipe_step_area = True
                recipe.recipe_steps.append(
                    RecipeText(m_recipe_step.group(1).strip()))
                continue

            if is_recipe_step_area:
                # line_ is already stripped above.
                recipe.recipe_steps.append(RecipeText(line_))

        if recipe:
            yield recipe
コード例 #15
0
    def _recipe_details_generator(self, converted_content, overview_recipe):
        """
        Yield the recipes found in the plain text ``converted_content``.

        must deepcopy "recipe" before use

        Processing happens in three passes:

        1. The second line of the text becomes
           ``overview_recipe.cooking_name_sub`` and the first line that
           contains "初回放送" supplies ``overview_recipe.program_date``.
           Both are written onto ``overview_recipe`` itself, so every copy
           made afterwards carries them.
        2. The text is cut into one string per shop heading, with the
           "材料" / "作り方" captions moved onto lines of their own and
           wrapped step lines joined back together.
        3. Each shop string is parsed; every "料理..." / "万能調味料..."
           line starts a new recipe that is yielded once the next one
           starts or the shop string ends.

        A "材料" or "作り方" line that precedes the first shop heading
        raises TypeError in pass 2; material or step lines that precede
        the first dish line of a shop raise AttributeError in pass 3.
        """
        # "(" / ")" in ASCII and full-width (U+FF08 / U+FF09) form, used to
        # reshape "name(amount)" materials into "name: amount".
        paren_table = str.maketrans({
            "(": ": ",
            "\uff08": ": ",
            ")": "",
            "\uff09": "",
        })

        def is_shop_heading(s):
            # A shop heading either mentions "軒目" or starts with a circled
            # number / asterisk (ASCII or U+FF0A) and quotes a name in
            # 『』 or 「」.
            return bool(
                re.search("軒目", s)
                or re.match(r"^[①-⑳*\uff0a].*『.*』", s)
                or re.match(r"^[①-⑳*\uff0a].*「.*」", s))

        def get_cooking_shop_strings(lines):
            ret = []
            buf = None
            # NOTE(review): this flag is not reset when a new heading
            # starts, so lines between a heading and its first dish line
            # are glued onto the heading when the previous shop ended in a
            # step area.
            is_recipe_step_area = False
            for l in lines:
                s = l.strip()
                if is_shop_heading(s):
                    if buf:
                        ret.append(buf)
                    buf = s
                    continue

                if re.search("^(料理|万能調味料)", s):
                    is_recipe_step_area = False

                # Put the caption on its own line, followed by whatever
                # shared the line with it.
                if s.startswith("材料"):
                    buf += "\n材料\n" + s[len("材料"):].strip()
                    continue

                if s.startswith("作り方"):
                    is_recipe_step_area = True
                    buf += "\n作り方\n" + s[len("作り方"):].strip()
                    continue

                if buf:
                    if is_recipe_step_area and not re.match(
                            r"^[①-⑳*\uff0a]", s):
                        # Continuation of a wrapped step: join without a
                        # line break.
                        buf += s
                    else:
                        buf += "\n" + s
            if buf:
                ret.append(buf)

            return ret

        for ii, l in enumerate(converted_content.splitlines()):
            if ii == 1:
                overview_recipe.cooking_name_sub = l.strip()
                continue

            if "初回放送" in l:
                overview_recipe.program_date = dateutil.parser.parse("/".join(
                    re.search(r"(\d+)\D+(\d+)\D+(\d+)\D+", l).groups()))
                break

        cooking_shop_strings = get_cooking_shop_strings(
            converted_content.splitlines())

        logger.debug("-" * 20)
        logger.debug(cooking_shop_strings)
        for shop_string in cooking_shop_strings:
            recipe_shop = None
            recipe = None
            is_material_area = False
            is_recipe_step_area = False
            for l in shop_string.splitlines():
                s = l.strip()
                if not s:
                    continue

                if not is_material_area and not is_recipe_step_area:
                    if is_shop_heading(s):
                        recipe_shop = copy.deepcopy(overview_recipe)
                        recipe = None

                        # Prefer a name quoted in 「」, fall back to 『』.
                        m = re.search(r"「(.*)」", l) or re.search(
                            r"『(.*)』", l)
                        if m:
                            recipe_shop.cooking_name_sub += "/" + m.group(1)
                        continue

                if re.search("^(料理|万能調味料)", s):
                    is_material_area = False
                    is_recipe_step_area = False
                    if recipe:
                        yield recipe

                    recipe = copy.deepcopy(
                        recipe_shop if recipe_shop else overview_recipe)

                    # The dish name follows an ASCII or full-width (U+FF1A)
                    # colon, a circled number, or the first whitespace.
                    name_parts = re.split(r"[:\uff1a]", l, maxsplit=1)
                    m_numbered = re.match(r"(料理|万能調味料)[①-⑳]", s)
                    if len(name_parts) == 2:
                        recipe.cooking_name = name_parts[1].strip()
                    elif m_numbered:
                        # https://www.nhk.or.jp/program/manpuku/recipe/dg0_200115.pdf
                        # 料理①カルパッチョ
                        # Slice after the matched prefix so the longer
                        # "万能調味料①" prefix is removed completely as well.
                        recipe.cooking_name = s[m_numbered.end():].strip()
                    else:
                        recipe.cooking_name = l.split(None, 1)[1].strip()
                    continue

                if re.search("^材料", s):
                    is_material_area = True
                    is_recipe_step_area = False
                    if s == "材料":
                        continue

                if re.search("^作り方", s):
                    is_material_area = False
                    is_recipe_step_area = True
                    if s != "作り方":
                        # Deliberately unguarded: a missing recipe here
                        # should surface as an error.
                        recipe.recipe_steps.append(
                            RecipeText(l.replace("作り方", "", 1).strip()))
                    continue

                if is_material_area:
                    for material in s.split("、"):
                        material = material.strip()
                        if not material:
                            continue
                        if material.startswith(("(", "\uff08")):
                            # A bracketed group is kept verbatim.
                            recipe.materials.append(RecipeText(material))
                        else:
                            recipe.materials.append(
                                RecipeText(material.translate(paren_table)))

                if is_recipe_step_area:
                    recipe.recipe_steps.append(RecipeText(s))
            if recipe:
                yield recipe
コード例 #16
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """
        Build the recipes described on one overview page.

        Every ``section`` element after the first is inspected.  Sections
        with a table or an ``h1`` are skipped.  A section with an ``h2``
        sets the current subtitle and clears the collected important
        points.  A section with a paragraph but no image adds the lines of
        that paragraph to the important points.  Any other section with a
        paragraph is parsed as one recipe: its first paragraph is read line
        by line, "◎材料" opens the material area and "<作り方>" opens the
        step area.

        Returns a dict that maps ``Recipe.id`` (md5 of
        "cooking_name_sub/cooking_name", or of cooking_name alone when
        there is no cooking_name_sub) to the Recipe.
        """
        recipes = dict()  # key: Recipe.id, value: Recipe

        current_subtitle = None
        current_recipe_important_points = list()
        for item in overview_soup.find_all("section")[1:]:
            if item.table or item.h1:
                continue

            if item.h2:
                current_subtitle = item.h2.text.translate(
                    self.__class__._TABLE_REMOVE_KAKKO).strip()
                current_recipe_important_points.clear()
                continue

            if item.p is None:
                continue

            if item.img is None:
                # Text-only section: remember its lines for the recipes
                # that follow under the same subtitle.
                current_recipe_important_points.extend(
                    RecipeText(l)
                    for l in item.p.get_text("\n").splitlines())
                continue

            recipe = Recipe()
            recipe.detail_url = entry_url
            recipe.program_name = self.program_name
            recipe.program_date = None

            if item.h3:
                # multiple recipe
                recipe.cooking_name = item.h3.text
                recipe.cooking_name_sub = current_subtitle
            else:
                # single recipe
                recipe.cooking_name = current_subtitle

            recipe.important_points.extend(current_recipe_important_points)
            recipe.image_urls.append(
                urllib.parse.urljoin(entry_url, item.img["src"]))

            is_material_area = False
            is_recipe_step_area = False
            for l in item.p.get_text("\n").splitlines():
                if not l.strip():
                    continue

                if "◎材料" in l:
                    is_material_area = True
                    material_title = l.replace("◎材料", "").translate(
                        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                    if material_title:
                        recipe.materials.append(RecipeText(material_title))
                    continue
                # The caption brackets may be ASCII or full-width
                # (U+FF1C / U+FF1E).
                if "<作り方>" in l or "\uff1c作り方\uff1e" in l:
                    is_material_area = False
                    is_recipe_step_area = True
                    continue

                if is_material_area:
                    # Glue a counter back onto its number ("2 本" -> "2本")
                    # so the whitespace split below keeps them together.
                    l = re.sub(r"[ \u3000](?=[本個])", "", l)
                    recipe.materials.extend(
                        RecipeText(re.sub(r"[:\uff1a]", ": ", m))
                        for m in l.split())
                elif is_recipe_step_area:
                    # Only "<number>." (ASCII or full-width U+FF0E dot)
                    # marks a numbered step.  A bare "." in the pattern
                    # would match any character and swallow it, e.g. the
                    # "度" of "180度".
                    m = re.match(r"(\d+)[.\uff0e]\s*(.*)", l)
                    if m:
                        recipe.recipe_steps.append(
                            RecipeText("({}){}".format(
                                int(m.group(1)), m.group(2))))
                    else:
                        recipe.recipe_steps.append(RecipeText(l))

            if recipe.cooking_name_sub:
                id_source = "{}/{}".format(recipe.cooking_name_sub,
                                           recipe.cooking_name)
            else:
                id_source = recipe.cooking_name
            recipe.id = hashlib.md5(id_source.encode("utf-8")).hexdigest()
            recipes[recipe.id] = recipe

        return recipes
コード例 #17
0
    def _recipe_details_generator(self, detail_soup, overview_recipe):
        """
        Yield one recipe per recipe title found in ``detail_soup``.

        must deepcopy "recipe" before use

        Two page layouts are handled in turn:

        * ``h1`` titles whose text contains 「...」; the recipe text is the
          ``ul`` with class "answers" under the title's grandparent, with
          "■材料" / "■作り方" as captions.  Tabs in step lines become
          spaces.
        * ``span`` titles whose text ends in "レシピ"; the recipe text is
          the ``ul`` that follows the title's parent, with "◎材料" /
          "◎作り方" as captions.

        Each yielded recipe is a deep copy of ``overview_recipe`` with
        cooking_name, materials and recipe_steps filled in.
        """
        marukakko_table = self.__class__._TABLE_REPLACE_MARUKAKKO

        def get_recipe_areas(lines):
            # 1 recipe area in kobara sukimashita ka
            return [lines]

        def build_recipe(cooking_name, lines, material_caption, step_caption,
                         tab_to_space=False):
            # Parse one recipe area into a fresh copy of overview_recipe.
            recipe = copy.deepcopy(overview_recipe)
            recipe.cooking_name = cooking_name

            is_material_area = False
            is_recipe_step_area = False
            for l in lines:
                if not l.strip():
                    continue

                if material_caption in l:
                    is_material_area = True
                    # Whatever follows the caption is kept as the first
                    # material entry; an empty remainder is skipped.
                    material_title = l.replace(
                        material_caption, "").translate(
                            marukakko_table).strip()
                    if material_title:
                        recipe.materials.append(RecipeText(material_title))
                    continue
                if step_caption in l:
                    is_material_area = False
                    is_recipe_step_area = True
                    continue

                if is_material_area:
                    # One entry per whitespace-separated token; ASCII and
                    # full-width (U+FF1A) colons both become ": ".
                    recipe.materials.extend(
                        RecipeText(re.sub(r"[:\uff1a]", ": ", m))
                        for m in l.split())
                elif is_recipe_step_area:
                    if tab_to_space:
                        l = l.replace("\t", " ")
                    recipe.recipe_steps.append(RecipeText(l))

            return recipe

        for recipe_title_node in detail_soup.find_all(
                "h1", text=re.compile(r"「.*」")):
            answers_node = recipe_title_node.parent.parent.find(
                "ul", "answers")
            for recipe_area in get_recipe_areas(
                    answers_node.text.splitlines()):
                yield build_recipe(
                    recipe_title_node.text.translate(
                        self.__class__._TABLE_REMOVE_KAKKO).strip(),
                    recipe_area,
                    "■材料",
                    "■作り方",
                    tab_to_space=True)

        for recipe_title_node in detail_soup.find_all(
                "span", text=re.compile(r".*レシピ")):
            list_node = recipe_title_node.parent.find_next_sibling("ul")
            for recipe_area in get_recipe_areas(list_node.text.splitlines()):
                yield build_recipe(
                    recipe_title_node.text.replace("レシピ", "").strip(),
                    recipe_area,
                    "◎材料",
                    "◎作り方")