def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe built from a detail page.

    The overview recipe is deep-copied first, so the object handed in by the
    caller is never modified.

    Args:
        detail_soup: parsed detail page; ``div.photo``, ``div.material`` and
            ``div.make`` are read from it.
        overview_recipe: recipe collected from the overview page.

    Yields:
        The copied recipe with one image URL, the materials and the numbered
        steps appended.
    """
    result = copy.deepcopy(overview_recipe)

    photo_src = detail_soup.find("div", "photo").img["data-src"]
    result.image_urls.append(
        urllib.parse.urljoin(result.detail_url, photo_src))

    material_box = detail_soup.find("div", "material")
    heading = material_box.h4.text.replace("材料", "")
    heading = heading.translate(
        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
    if heading:
        result.materials.append(RecipeText("{}".format(heading)))
    for item in material_box.find_all("li"):
        parts = [span.text for span in item.find_all("span")]
        # Rows whose spans contain nothing but whitespace are skipped.
        if any(part.strip() for part in parts):
            result.materials.append(RecipeText(": ".join(parts)))

    make_box = detail_soup.find("div", "make")
    for number, step_node in enumerate(make_box.find_all("li"), start=1):
        for position, line in enumerate(step_node.text.splitlines()):
            # Only the first line of a step carries the "(n)" prefix.
            text = line if position else "({}){}".format(number, line)
            result.recipe_steps.append(RecipeText(text))

    yield result
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Collect the recipes listed on an overview page.

    The recipe section is located through the first ``h2`` whose text matches
    ``レシピ.*``; the parent of every ``h3`` below that heading's parent is
    treated as one recipe node.

    Args:
        overview_soup: parsed overview page.
        entry_url: URL of the page. It is stored as ``detail_url``, used as
            the base for image URLs, and its file stem (prefixed with "20")
            is parsed as the program date.

    Returns:
        dict mapping ``Recipe.id`` ("YYYYMMDD_<index>") to ``Recipe``; an
        empty dict when the page has no recipe heading.
    """
    recipe_title_node = overview_soup.find("h2", text=re.compile(r"レシピ.*"))
    if recipe_title_node is None:
        logger.info("{} have no recipe.".format(entry_url))
        return dict()
    recipe_root_node = recipe_title_node.parent

    recipes = dict()  # key: Recipe.id, value: Recipe
    recipe_nodes = [h3.parent for h3 in recipe_root_node.find_all("h3")]
    for ii, recipe_node in enumerate(recipe_nodes):
        recipe = Recipe()
        recipe.program_date = dateutil.parser.parse("20{}".format(
            pathlib.Path(entry_url).stem))
        recipe.program_name = self.program_name
        recipe.detail_url = entry_url
        recipe.cooking_name = recipe_node.h3.text
        # The picture is given as a CSS background image in the style
        # attribute.  Raw string: "\(" is not a valid str escape sequence.
        recipe.image_urls.append(
            urllib.parse.urljoin(
                entry_url,
                re.search(r"background-image:url\((.*?)\);",
                          recipe_node.img["style"]).group(1)))

        is_material_area = False
        is_recipe_step_area = False
        for l in recipe_node.find_all("p")[1].text.splitlines():
            if not l.strip():
                continue
            # A material header only counts before the steps have started;
            # inside the steps such a line is kept as ordinary step text.
            if "【材料】" in l and not is_recipe_step_area:
                is_material_area = True
                l = l.replace("【材料】", "").translate(
                    self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                if l:
                    recipe.materials.append(RecipeText(l))
                continue
            if "【作り方】" in l:
                is_material_area = False
                is_recipe_step_area = True
                continue
            if is_material_area:
                recipe.materials.append(RecipeText(l.replace(":", ": ")))
            elif is_recipe_step_area:
                recipe_step_text = l
                m = re.match(r"^(\d+)(.*)", l)
                if m:
                    # Leading digits become the "(n)" step prefix.
                    num, recipe_t = m.groups()
                    recipe_step_text = "({}){}".format(
                        num, recipe_t.strip())
                recipe.recipe_steps.append(RecipeText(recipe_step_text))

        recipe.id = "{:%Y%m%d}_{}".format(recipe.program_date, ii)
        recipes[recipe.id] = recipe
    return recipes
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe built from a table-based detail page.

    The overview recipe is deep-copied first, so the caller's object stays
    untouched.

    Args:
        detail_soup: parsed detail page; ``#zairyou_box`` holds the
            materials and ``table.recipe`` the steps.
        overview_recipe: recipe collected from the overview page.

    Yields:
        The copied recipe with materials and steps appended.
    """
    result = copy.deepcopy(overview_recipe)
    material_box = detail_soup.select_one("#zairyou_box")
    step_table = detail_soup.find("table", "recipe")

    heading = material_box.p.text.replace("材料", "")
    heading = heading.translate(
        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
    if heading:
        result.materials.append(RecipeText(heading))

    for row in material_box.find_all("tr"):
        # Cells containing only whitespace are left out of the joined text.
        cells = [cell.text for cell in row.find_all("td")
                 if cell.text.strip()]
        result.materials.append(RecipeText(": ".join(cells)))

    for row in step_table.find_all("tr"):
        # Every step row must consist of exactly three cells.
        number_cell, text_cell, point_cell = row.find_all("td")
        label = "({}){}".format(number_cell.text.strip(),
                                text_cell.text.strip())
        result.recipe_steps.append(RecipeText(label))
        point = point_cell.text.strip()
        if point:
            result.recipe_steps.append(RecipeText(point))

    yield result
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe built from a detail page.

    The overview recipe is deep-copied first, so the caller's object stays
    untouched.

    Args:
        detail_soup: parsed detail page. It must contain exactly two
            ``table.text2`` elements (steps first, materials second) and a
            ``td.making`` cell; ``td.tema`` is optional.
        overview_recipe: recipe collected from the overview page.

    Yields:
        The copied recipe with sub title, image URL, materials and steps set.
    """
    recipe = copy.deepcopy(overview_recipe)

    theme_node = detail_soup.find("td", "tema")
    recipe.cooking_name_sub = theme_node.text if theme_node is not None else None

    recipe.image_urls.append(
        urllib.parse.urljoin(
            recipe.detail_url,
            detail_soup.select_one('img[src$="jpg"]')["src"]))

    recipe_steps_title_node, material_title_node = detail_soup.find_all(
        "table", "text2")

    # Test the raw cell text: once wrapped in "(...)" the title is never
    # empty, so checking the wrapped string would always add an entry.
    making_text = detail_soup.find("td", "making").text
    if making_text.strip():
        recipe.materials.append(RecipeText("({})".format(making_text)))
    recipe.materials.extend([
        RecipeText(tr.text.strip().replace("\n", ": "))
        for tr in material_title_node.find_all("tr")
    ])

    recipe.recipe_steps = [
        RecipeText("({}){}".format(i + 1, tr.text.strip()))
        for i, tr in enumerate(recipe_steps_title_node.find_all("tr"))
    ]
    yield recipe
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Collect the recipes embedded in an overview page.

    All ``section`` elements except the first are walked in order.  A section
    whose ``h2.option-sub-title`` has no following ``p`` sibling only sets the
    current subtitle; any other section with an ``h2`` is parsed as a recipe.

    Args:
        overview_soup: parsed overview page.
        entry_url: URL of the page; stored as ``detail_url`` and used as the
            base for image URLs.

    Returns:
        dict mapping ``Recipe.id`` ("YYYYMMDD_<md5 hex>") to ``Recipe``.
        Recipes whose parsed date is not earlier than now are skipped.
    """
    recipes = dict()  # key: Recipe.id, value: Recipe
    current_subtitle = None
    for item in overview_soup.find_all("section")[1:]:
        if item.h1:
            continue
        subtitle_node = item.find("h2", "option-sub-title")
        if subtitle_node and subtitle_node.find_next_sibling("p") is None:
            # Subtitle-only section: remember it for the recipes that follow.
            # The program date below is parsed out of this text.
            current_subtitle = subtitle_node.text.translate(
                self.__class__._TABLE_REMOVE_KAKKO).strip()
            continue
        if not item.h2:
            continue

        title_node = item
        recipe = Recipe()
        recipe.detail_url = entry_url
        recipe.cooking_name = title_node.h2.text.translate(
            self.__class__._TABLE_REMOVE_KAKKO).strip()
        recipe.cooking_name_sub = current_subtitle
        recipe.program_name = self.program_name
        # The first two digit groups of the subtitle are joined with "/" and
        # handed to dateutil.  Raw string: "\d" is not a valid str escape.
        recipe.program_date = dateutil.parser.parse("{}/{}".format(
            *re.search(r"(\d+)\D+(\d+)\D+", recipe.cooking_name_sub).groups()))
        recipe.image_urls.append(
            urllib.parse.urljoin(entry_url, title_node.img["src"]))

        is_material_area = False
        is_recipe_step_area = False
        body = title_node.find("div", "option-media-row").get_text("\n")
        for l in body.splitlines():
            if not l.strip():
                continue
            if "<材料>" in l:
                is_material_area = True
                recipe.materials.append(
                    RecipeText(
                        l.replace("<材料>", "").translate(
                            self.__class__._TABLE_REPLACE_MARUKAKKO)))
                continue
            if "<作り方>" in l:
                is_material_area = False
                is_recipe_step_area = True
                continue
            if is_material_area:
                # Several whitespace-separated materials may share one line.
                recipe.materials.extend(
                    [RecipeText(m.replace(":", ": ")) for m in l.split()])
            elif is_recipe_step_area:
                recipe.recipe_steps.append(RecipeText(l))

        if recipe.program_date >= datetime.datetime.now():
            logger.debug("{} is invalid date".format(recipe.program_date))
            continue

        id_source = ("{}/{}".format(recipe.cooking_name_sub,
                                    recipe.cooking_name)
                     if recipe.cooking_name_sub else recipe.cooking_name)
        # md5 is used only to derive a stable identifier, not for security.
        recipe.id = "{:%Y%m%d}_{}".format(
            recipe.program_date,
            hashlib.md5(id_source.encode("utf-8")).hexdigest())
        recipes[recipe.id] = recipe
    return recipes
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe per ``h4`` heading found on the detail page.

    Each recipe is a deep copy of ``overview_recipe``; the text between one
    heading and the next is scanned line by line for materials and steps.
    Headings that produce neither materials nor steps yield nothing.

    Args:
        detail_soup: parsed detail page.
        overview_recipe: recipe collected from the overview page.

    Yields:
        A copied recipe for every heading that has some content.
    """

    def text_until_next_heading(heading, headings):
        """Join the sibling text after ``heading`` up to the next heading."""
        chunks = []
        for node in heading.next_siblings:
            if node in headings:
                break
            chunks.append(
                node if isinstance(node, bs4.NavigableString) else node.text)
        return "\n".join(chunk for chunk in chunks if len(chunk.strip()))

    headings = detail_soup.find_all("h4")
    for heading in headings:
        result = copy.deepcopy(overview_recipe)
        result.cooking_name = heading.text.strip()
        container = heading.parent.parent
        result.image_urls = [
            urllib.parse.urljoin(result.detail_url, img["src"])
            for img in container.select('img[src$="jpg"]')
        ]

        in_materials = False
        in_steps = False
        for line in text_until_next_heading(heading, headings).splitlines():
            if not line.strip():
                continue
            # "材料" opens the material area only before the steps begin;
            # inside the steps such a line is treated as step text.
            if "材料" in line and not in_steps:
                in_materials = True
                continue
            if "作り方" in line:
                in_materials = False
                in_steps = True
                continue
            if in_materials:
                if line.startswith("・"):
                    line = line[1:]
                result.materials.append(RecipeText(line.replace(":", ": ")))
            elif in_steps:
                numbered = re.match(r"(\d+)[).](.*)", line)
                if numbered:
                    line = "({}){}".format(*numbered.groups())
                result.recipe_steps.append(RecipeText(line))

        if len(result.materials) + len(result.recipe_steps) > 0:
            yield result
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe per ``section.recipe_area`` that has an ``h4``.

    Each recipe is a deep copy of ``overview_recipe``, filled with the name,
    image URLs, materials, numbered steps and important points found in that
    section.

    Args:
        detail_soup: parsed detail page.
        overview_recipe: recipe collected from the overview page.

    Yields:
        One copied recipe for every qualifying section.
    """
    for area in detail_soup.find_all("section", "recipe_area"):
        if area.h4 is None:
            continue
        result = copy.deepcopy(overview_recipe)
        result.cooking_name = "/".join(
            h4.text.strip() for h4 in area.find_all("h4"))

        picture = area.find("div", "pic_sub")
        if picture:
            # Image file names are derived from the div's "photo*" classes.
            for css_class in picture["class"]:
                if not css_class.lower().startswith("photo"):
                    continue
                relative = "../img/recipe/{}/{}.jpg".format(
                    result.id, css_class)
                result.image_urls.append(
                    urllib.parse.urljoin(result.detail_url, relative))

        material_box = area.find("div", "material_box")
        people_node = material_box.find("span", "people")
        if people_node:
            people = people_node.text.translate(
                self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
            result.materials.append(RecipeText(people))
        for row in material_box.find_all("tr"):
            cells = [cell.text.strip() for cell in row.find_all("td")]
            result.materials.append(RecipeText(": ".join(cells)))

        step_box = area.find("div", "recipe_main_box")
        step_nodes = step_box.find_all("span", "recipe_text")
        for number, step_node in enumerate(step_nodes, start=1):
            step_text = step_node.text.strip()
            # Empty spans are skipped but still consume a step number.
            if step_text:
                result.recipe_steps.append(
                    RecipeText("({}){}".format(number, step_text)))

        point_box = area.find("div", "point_box_wide")
        if point_box:
            point_text = point_box.find("span", "point").text.strip()
            for point_line in point_text.splitlines():
                result.important_points.append(RecipeText(point_line))

        yield result
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe per material/steps heading pair on the detail page.

    All ``h6`` headings are collected; the first half is paired with the
    second half as (material heading, steps heading).  Each pair fills a deep
    copy of ``overview_recipe``.

    Args:
        detail_soup: parsed detail page.
        overview_recipe: recipe collected from the overview page.

    Yields:
        One copied recipe per heading pair.
    """
    # NOTE: selecting "h5,h6" instead was tried for the 2020.01.05 page
    # (アンパドラット) and left disabled.
    headings = detail_soup.find_all("h6")
    half = len(headings) // 2
    pairs = zip(headings[:half], headings[half:])

    for index, (material_heading, steps_heading) in enumerate(pairs):
        result = copy.deepcopy(overview_recipe)
        picture_src = detail_soup.find(
            "div", "common_contents_box_mini").img["src"]
        result.image_urls.append(
            urllib.parse.urljoin(result.detail_url, picture_src))

        heading_text = material_heading.text.replace("材料", "").strip()
        if heading_text:
            # From the second pair on, the heading text is appended to the
            # cooking name.
            if index:
                result.cooking_name = "%s / %s" % (result.cooking_name,
                                                   heading_text)
            result.materials.append(RecipeText(heading_text))

        material_list = material_heading.find_next_sibling("ul")
        for item in material_list.find_all("li"):
            spans = [span.text for span in item.find_all("span")]
            result.materials.append(RecipeText(": ".join(spans)))

        step_list = steps_heading.find_next_sibling("ul")
        for number, step_node in enumerate(step_list.find_all("li"), start=1):
            result.recipe_steps.append(
                RecipeText("({}){}".format(number, step_node.text.strip())))

        yield result
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe built from a detail page.

    The overview recipe is deep-copied first, so the caller's object stays
    untouched.  Materials and steps are read line by line; reading of either
    list stops at the first line containing "監修".

    Args:
        detail_soup: parsed detail page.
        overview_recipe: recipe collected from the overview page.

    Yields:
        The copied recipe with name, image URL, materials and steps set.
    """
    recipe = copy.deepcopy(overview_recipe)
    # The name is the first <strong>; pages without one use the second <b>.
    recipe.cooking_name = (detail_soup.strong.text if detail_soup.strong
                           else detail_soup.find_all("b")[1].text)
    recipe.image_urls.append(
        urllib.parse.urljoin(
            recipe.detail_url,
            detail_soup.select_one('img[src$="jpg"]')["src"]))

    # Test the bare text: once wrapped in "(...)" the title is never empty,
    # so checking the wrapped string would always add an entry.
    material_title = detail_soup.find("td", align="right").b.text.strip()
    if material_title:
        recipe.materials.append(RecipeText("({})".format(material_title)))

    for material in detail_soup.find("div",
                                     "zairyo").text.strip().splitlines():
        if "監修" in material:
            break
        if material:
            recipe.materials.append(RecipeText(material.replace("…", ": ")))

    # The steps are in the second cell of the second-to-last table.
    step_cell = detail_soup.find_all("table")[-2].find_all("td")[1]
    for recipe_step in step_cell.text.strip().splitlines():
        recipe_step = recipe_step.strip()
        if "監修" in recipe_step:
            break
        if recipe_step:
            recipe.recipe_steps.append(RecipeText(recipe_step))

    yield recipe
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe built from a detail page with ``howto-item`` blocks.

    The overview recipe is deep-copied first.  Steps, memos and points are
    all appended to ``recipe_steps`` (separated by empty entries); the list
    items under the advice heading go to ``important_points``.

    Args:
        detail_soup: parsed detail page.  It must contain at least five
            ``h2`` headings: the first is the material heading and the fifth
            from the end is used as the advice heading.
        overview_recipe: recipe collected from the overview page; its ``id``
            must start with a date followed by "_".

    Yields:
        The copied recipe.
    """
    recipe = copy.deepcopy(overview_recipe)
    recipe.cooking_name = detail_soup.find("p", "detail-title-name").text.strip()
    recipe.program_name = self.program_name
    recipe.program_date = dateutil.parser.parse(recipe.id.split("_")[0])
    recipe.image_urls.append(
        detail_soup.find("meta", attrs=dict(property="og:image"))["content"])

    title_nodes = detail_soup.find_all("h2")
    material_title_node = title_nodes[0]
    advice_title_node = title_nodes[-5]

    material_title = material_title_node.text.replace("材料", "").translate(
        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
    if material_title:
        recipe.materials.append(RecipeText(material_title))
    recipe.materials.extend([
        RecipeText(": ".join(li.text.split()))
        for li in material_title_node.parent.parent.select("h4,li")
    ])

    for i, howto_item in enumerate(detail_soup.find_all("div", "howto-item")):
        if i:
            recipe.recipe_steps.append(RecipeText(""))  # blank line
        if howto_item.find("div", "howto-child") is not None:
            # Grouped layout, e.g. https://www.ntv.co.jp/3min/recipe/20200704/
            for recipe_item in howto_item.find_all("li"):
                for j, recipe_step in enumerate(
                        recipe_item.find_all("div", "howto-group-inner")):
                    if j:
                        # Every entry after the first starts with "【n】".
                        num, step = re.search(
                            r"【(\d+)】(.*)",
                            recipe_step.text.strip()).groups()
                        num = num.strip()
                        buf = "({})".format(num) if num else ""
                        buf += step.strip()
                    else:
                        buf = recipe_step.text.strip()
                    image_urls = [
                        img["src"] for img in recipe_step.find_all("img")
                    ]
                    recipe.recipe_steps.append(
                        RecipeText(buf, image_urls=image_urls))

                for j, howto_memo_item in enumerate(
                        recipe_item.find_all("div", "howto-memo-item")):
                    if j:
                        recipe.recipe_steps.append(RecipeText(""))  # blank line
                    buf = "(メモ)" + howto_memo_item.text.strip()
                    image_urls = []
                    for img in howto_memo_item.find_all("img"):
                        # `"class" in tag` looks at the tag's children and
                        # `tag["class"]` is a list of class names, so the
                        # attribute has to be tested with has_attr() and a
                        # membership check.
                        # "data:" for https://www.ntv.co.jp/3min/recipe/20201024/
                        if (img.has_attr("class")
                                and "howto-memo-icon" not in img["class"]
                                and not img["src"].startswith("data:")):
                            image_urls.append(img["src"])
                    recipe.recipe_steps.append(
                        RecipeText(buf, image_urls=image_urls))
        else:
            for recipe_step in howto_item.find_all("li"):
                buf = ""
                if i:
                    buf = recipe_step.text.strip()
                else:
                    ps = recipe_step.find_all("p")
                    if len(ps) == 2:
                        num, step = ps
                        num = num.text.strip()
                        if num:
                            buf += "({})".format(num)
                    else:
                        # https://www.ntv.co.jp/3min/recipe/20200812/ :no num parts
                        step = ps[0]
                    buf += step.text.strip()
                image_urls = []
                for img in recipe_step.find_all("img"):
                    # "data:" for https://www.ntv.co.jp/3min/recipe/20201024/
                    if not img["src"].startswith("data:"):
                        image_urls.append(img["src"])
                recipe.recipe_steps.append(
                    RecipeText(buf, image_urls=image_urls))

    for i, points_item in enumerate(detail_soup.find_all("div", "points-item")):
        if i:
            recipe.recipe_steps.append(RecipeText(""))  # blank line
        buf = "(ポイント)" + points_item.text.strip()
        image_urls = []
        for img in points_item.find_all("img"):
            # Same attribute test as for the memo images; inline "data:"
            # sources are skipped here as well.
            if (img.has_attr("class")
                    and "points-icon" not in img["class"]
                    and not img["src"].startswith("data:")):
                image_urls.append(img["src"])
        recipe.recipe_steps.append(RecipeText(buf, image_urls=image_urls))

    for advice in advice_title_node.parent.parent.find_all("li"):
        recipe.important_points.append(RecipeText(advice.text.strip()))

    yield recipe
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe built from a detail page with table-based steps.

    The overview recipe is deep-copied first.  Step rows must have exactly
    two cells (number, text); a text cell that contains ``li`` elements is
    expanded into a title line followed by numbered sub steps.

    Args:
        detail_soup: parsed detail page.
        overview_recipe: recipe collected from the overview page; its ``id``
            must start with a date followed by "_".

    Yields:
        The copied recipe.
    """
    result = copy.deepcopy(overview_recipe)

    def first_image(node):
        """Return the node's first image as a one-element list, else None."""
        if node.img:
            return [urllib.parse.urljoin(result.detail_url, node.img["src"])]
        return None

    result.cooking_name = detail_soup.h3.text
    result.program_name = self.program_name
    result.program_date = dateutil.parser.parse(result.id.split("_")[0])
    thumbnail_src = detail_soup.select_one("#thumbnail")["src"]
    result.image_urls.append(
        urllib.parse.urljoin(result.detail_url, thumbnail_src))

    ingredient_box = detail_soup.find("div", "ingredient")
    howto_box = detail_soup.find("div", "howto")

    heading = ingredient_box.h4.text.replace("材料", "")
    heading = heading.translate(
        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
    if heading:
        result.materials.append(RecipeText(heading))
    for row in ingredient_box.find_all("tr"):
        cells = [cell.text for cell in row.find_all("td")]
        result.materials.append(RecipeText(": ".join(cells)))

    for row in howto_box.find_all("tr"):
        number_cell, step_cell = row.find_all("td")
        if step_cell.li is not None:
            # No.20190824: the cell holds sub steps.  The element directly
            # following the cell's opening tag is used as their title line.
            result.recipe_steps.append(RecipeText(step_cell.next))
            sub_steps = step_cell.find_all("li")
            for sub_number, sub_step in enumerate(sub_steps, start=1):
                result.recipe_steps.append(
                    RecipeText("({}){}".format(sub_number, sub_step.text),
                               image_urls=first_image(sub_step)))
            continue

        number = number_cell.text.strip()
        prefix = "({})".format(number) if len(number) else ""
        result.recipe_steps.append(
            RecipeText(prefix + step_cell.text.strip(),
                       image_urls=first_image(step_cell)))

    for appendix in detail_soup.find_all("div", "recipe-box"):
        lines = [
            raw.strip() for raw in appendix.get_text("\n").splitlines()
            if len(raw.strip())
        ]
        for position, line in enumerate(lines):
            if line.startswith("・"):
                line = line[1:].strip()
            # Every line after the first gets a one-space indent.
            if position:
                line = " {}".format(line)
            result.important_points.append(RecipeText(line))

    yield result
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Collect the recipes embedded in an overview page.

    The ``section`` and ``hr`` elements of the page (without the first and
    last match) are walked in order.  An ``hr`` starts a new group; the first
    section of a group is its subtitle and each following section is one
    recipe.

    Args:
        overview_soup: parsed overview page.
        entry_url: URL of the page; stored as ``detail_url`` and used as the
            base for image URLs.

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``.  The id is "YYYYMMDD" for
        the first recipe of a group and "YYYYMMDD_<n>" for the n-th one
        (n > 1).  Recipes whose parsed date is not earlier than now are
        skipped.
    """
    recipes = dict()  # key: Recipe.id, value: Recipe
    subtitle_node = None
    title_node_counter = 0
    for item in overview_soup.select("section,hr")[1:-1]:
        if item.name == "hr":
            subtitle_node = None
            title_node_counter = 0
            continue
        if subtitle_node is None:
            subtitle_node = item
            continue
        title_node = item
        title_node_counter += 1

        recipe = Recipe()
        recipe.detail_url = entry_url
        # 2020.01.10: fall back to the first <p> when there is no <h2>.
        name_node = title_node.h2 if title_node.h2 else title_node.p
        recipe.cooking_name = name_node.text.translate(
            self.__class__._TABLE_REMOVE_KAKKO).strip()
        recipe.cooking_name_sub = subtitle_node.h2.text.strip()
        recipe.program_name = self.program_name
        # The first two digit groups of the subtitle are joined with "/" and
        # handed to dateutil.  Raw string: "\d" is not a valid str escape.
        recipe.program_date = dateutil.parser.parse("{}/{}".format(
            *re.search(r"(\d+)\D+(\d+)\D+", recipe.cooking_name_sub).groups()))
        if title_node.img:
            recipe.image_urls.append(
                urllib.parse.urljoin(entry_url, title_node.img["src"]))

        is_material_area = False
        is_recipe_step_area = False
        body = title_node.find("div", "option-media-row").get_text("\n")
        for l in body.splitlines():
            if not l.strip():
                continue
            # A material header only counts before the steps have started;
            # inside the steps such a line is kept as ordinary step text.
            if "【材料】" in l and not is_recipe_step_area:
                is_material_area = True
                l = l.replace("【材料】", "").translate(
                    self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                if l:
                    recipe.materials.append(RecipeText(l))
                continue
            if "【作り方】" in l:
                is_material_area = False
                is_recipe_step_area = True
                continue
            if is_material_area:
                # splitlines() already produced single lines, so each line is
                # exactly one material entry.
                material = l.replace("… ", "…").replace("…", ": ")
                if material.startswith("・"):
                    material = material[1:]
                recipe.materials.append(RecipeText(material))
            elif is_recipe_step_area:
                recipe.recipe_steps.append(RecipeText(l))

        if recipe.program_date >= datetime.datetime.now():
            logger.debug("{} is invalid date".format(recipe.program_date))
            continue

        recipe.id = "{:%Y%m%d}".format(recipe.program_date)
        if 1 < title_node_counter:
            recipe.id += "_{}".format(title_node_counter)
        recipes[recipe.id] = recipe
    return recipes
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield one recipe built from a detail page.

    The overview recipe is deep-copied first.  Materials, key points, the
    preparation text and the step tables are all optional parts of the page;
    whitespace inside the title and inside image URLs is removed because some
    pages contain stray tabs or line breaks there.

    Args:
        detail_soup: parsed detail page.
        overview_recipe: recipe collected from the overview page.

    Yields:
        The copied recipe.
    """
    result = copy.deepcopy(overview_recipe)

    # No.153 is invalid title ex. "(太陽みたいなでっか~い)\tアンパン"
    result.cooking_name = "".join(detail_soup.h2.text.split())
    # No.135 is invalid:
    # 'https://www.nhk.or.jp/kamado/images/135\n\n/recipe_plat.jpg'
    plate_src = "".join(detail_soup.find("p", "plat").img["src"].split())
    result.image_urls.append(
        urllib.parse.urljoin(result.detail_url, plate_src))

    # Materials part (optional).
    material_box = detail_soup.find("div", "sozai_inner")
    if material_box:
        for row in material_box.table.find_all("tr"):
            result.materials.append(
                RecipeText(": ".join(row.text.strip().split())))

    # Key points part (optional).
    key_box = detail_soup.find("div", "kimete")
    if key_box:
        if key_box.h4:
            result.important_points.append(RecipeText(key_box.h4.text))
        key_body = key_box.select_one("div.kimete_l,div.kimete_inner2")
        if key_body:
            for paragraph in key_body.find_all("p"):
                pieces = [
                    child.text if hasattr(child, "text") else child
                    for child in paragraph.contents
                ]
                result.important_points.append(RecipeText(": ".join(pieces)))

    # Preparation part (optional).
    prepare_table = detail_soup.find("table", "prepare")
    if prepare_table:
        result.recipe_steps.append(RecipeText("準備"))
        prepare_body = prepare_table.find("p", "txt")
        if prepare_body is None:
            paragraphs = prepare_table.find_all("p")
            # example: id=04 has no <p> at all; the page's first <dl> is used.
            prepare_body = paragraphs[-1] if paragraphs else detail_soup.dl
        if prepare_body:
            for child in prepare_body.contents:
                piece = child.text if hasattr(child, "text") else child
                piece = piece.strip()
                if piece:
                    result.recipe_steps.append(RecipeText(piece))
    else:
        logger.debug("no prepare: {}".format(result.id))

    # No.92 has multiple table(include invalid format)
    for step_table in detail_soup.find_all("table", "step"):
        # No.92 has no tbody element
        rows_parent = step_table.tbody if step_table.tbody else step_table
        for row in rows_parent.find_all("tr", recursive=False):
            for cell in row.find_all("td", recursive=False):
                photo_urls = [
                    urllib.parse.urljoin(result.detail_url, img["src"])
                    for img in cell.select('img[src$="jpg"]')
                ]
                label = ""
                if cell.img:
                    # The step number is taken from the image file name; the
                    # alt text is not used for it (02 is invalid step number).
                    numbered = re.search(r".*step(\d+)\.png", cell.img["src"])
                    if numbered:
                        label += "({})".format(int(numbered.group(1)))
                    else:
                        label += cell.img["alt"]
                label += cell.text.strip()
                if not label:
                    continue
                # No.120 is invalid "../images/120\n/recipe_process03.jpg"
                photo_urls = ["".join(url.split()) for url in photo_urls]
                result.recipe_steps.append(
                    RecipeText(label, image_urls=photo_urls))

    yield result
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield the recipes listed in the text block below the recipe headline.

    The nested ``ul`` following the ``div.headline`` that mentions "レシピ" is
    read line by line.  A shop line ("…軒目「…」") sets the subtitle, a dish
    line ("料理…") starts a new deep copy of ``overview_recipe``, and
    "材料"/"作り方" lines fill that copy.  Nothing is yielded when the
    headline is missing.

    Args:
        detail_soup: parsed detail page.
        overview_recipe: recipe collected from the overview page.

    Yields:
        One copied recipe per dish line.
    """

    def convert_material(material_s):
        """Turn "name(amount)" into "name: amount"; other text is unchanged."""
        matched = re.match(r"(.*)[\((](.*)[\))]", material_s)
        return ": ".join(matched.groups()) if matched else material_s

    headline = detail_soup.find("div", "headline",
                                text=re.compile(r".*レシピ.*"))
    if headline is None:
        return

    shop_name = None
    current = None
    in_steps = False
    # NOTE(review): as written, replace(" ", " ") swaps a space for a space;
    # presumably the first argument was a different space character -
    # confirm against the original file.
    raw_text = headline.find_next_sibling("ul").ul.text.replace(" ", " ")
    for raw_line in raw_text.splitlines():
        line_ = raw_line.strip()
        if not line_:
            continue

        shop_match = re.match(r".*?軒目\s*「(.*)」", line_)
        if shop_match:
            if current:
                yield current
            shop_name = shop_match.group(1)
            current = None
            in_steps = False
            continue

        dish_match = re.match(r"^料理.*?[①-⑩][::]?(.*)", line_)
        if dish_match:
            if current:
                yield current
            in_steps = False
            current = copy.deepcopy(overview_recipe)
            current.cooking_name = dish_match.group(1)
            current.cooking_name_sub = "{}/{}".format(
                current.cooking_name_sub, shop_name)
            continue

        material_match = re.match(r"材料\s*(.*)", line_)
        if material_match:
            # The whole material list sits on this single line.
            for material_s in material_match.group(1).split("、"):
                current.materials.append(
                    RecipeText(convert_material(material_s)))
            continue

        steps_match = re.match(r"作り方\s*(.*)", line_)
        if steps_match:
            in_steps = True
            current.recipe_steps.append(
                RecipeText(steps_match.group(1).strip()))
            continue

        if in_steps:
            current.recipe_steps.append(RecipeText(line_.strip()))

    if current:
        yield current
def _recipe_details_generator(self, converted_content, overview_recipe):
    """Yield the recipes found in a text document.

    The text is processed in two passes.  First it is regrouped into one
    string per shop block (see ``get_cooking_shop_strings``); then each block
    is scanned line by line, starting a new recipe at every dish line
    ("料理"/"万能調味料") and filling it from the "材料" and "作り方" areas.

    Every yielded recipe is a deep copy, but ``overview_recipe`` itself is
    also modified here: its ``cooking_name_sub`` is set from the second line
    of the text and its ``program_date`` from the first line containing
    "初回放送".

    Args:
        converted_content: the document as one string (it is split with
            ``splitlines()``).
        overview_recipe: recipe collected from the overview page.

    Yields:
        One recipe per dish line.
    """

    def get_cooking_shop_strings(lines):
        """Regroup ``lines`` into one newline-joined string per shop block.

        A block starts at a shop header line.  "材料"/"作り方" lines are split
        into a title line and a content line, and inside the steps area lines
        that do not start with a circled number (or "*") are glued onto the
        previous line.
        """
        ret = []
        buf = None
        is_recipe_step_area = False
        for l in lines:
            # Shop header: starts a new block.
            if re.search("軒目", l.strip()) or re.match(
                    r"^[①-⑳*].*『.*』", l.strip()) or re.match(
                        r"^[①-⑳*].*「.*」", l.strip()):
                if buf:
                    ret.append(buf)
                buf = l.strip()
                continue
            # A dish line ends the steps area; the line itself is appended
            # further below like any other line.
            if re.search("^(料理|万能調味料)", l.strip()):
                is_recipe_step_area = False
            # NOTE(review): buf is still None if a "材料"/"作り方" line comes
            # before the first shop header, so "+=" would raise TypeError.
            if re.search("^材料", l.strip()):
                title, materials = re.search("(材料)(.*)",
                                             l.strip()).groups()
                buf += "\n" + title + "\n" + materials.strip()
                continue
            if re.search("^作り方", l.strip()):
                is_recipe_step_area = True
                title, recipe_steps = re.search("(作り方)(.*)",
                                                l.strip()).groups()
                buf += "\n" + title + "\n" + recipe_steps.strip()
                continue
            if buf:
                if is_recipe_step_area:
                    if re.match(r"^[①-⑳*]", l.strip()):
                        buf += "\n" + l.strip()
                    else:
                        # Continuation of the previous step: no line break.
                        buf += l.strip()
                else:
                    buf += "\n" + l.strip()
        if buf:
            ret.append(buf)
        return ret

    # Header scan: line index 1 is the sub title, the first "初回放送" line
    # carries the date (three digit groups joined with "/").
    for ii, l in enumerate(converted_content.splitlines()):
        if ii == 1:
            overview_recipe.cooking_name_sub = l.strip()
            continue
        if -1 < l.find("初回放送"):
            overview_recipe.program_date = dateutil.parser.parse("/".join(
                re.search(r"(\d+)\D+(\d+)\D+(\d+)\D+", l).groups()))
            break

    cooking_shop_strings = get_cooking_shop_strings(
        converted_content.splitlines())
    logger.debug("-" * 20)
    logger.debug(cooking_shop_strings)

    for shop_string in cooking_shop_strings:
        recipe_shop = None
        recipe = None
        is_material_area = False
        is_recipe_step_area = False
        for l in shop_string.splitlines():
            if len(l.strip()) == 0:
                continue
            # Shop headers are only recognised outside the material and
            # steps areas.
            if is_material_area == False and is_recipe_step_area == False:
                if re.search("軒目", l.strip()) or re.match(
                        r"^[①-⑳*].*『.*』", l.strip()) or re.match(
                            r"^[①-⑳*].*「.*」", l.strip()):
                    recipe_shop = copy.deepcopy(overview_recipe)
                    recipe = None
                    # The quoted shop name is appended to the sub title.
                    m = re.search(r"「(.*)」", l)
                    if m:
                        recipe_shop.cooking_name_sub += "/" + m.group(1)
                    else:
                        m2 = re.search(r"『(.*)』", l)
                        if m2:
                            recipe_shop.cooking_name_sub += "/" + m2.group(
                                1)
                    continue
            # Dish line: emit the previous recipe and start a new one.
            if re.search("^(料理|万能調味料)", l.strip()):
                is_material_area = False
                is_recipe_step_area = False
                if recipe:
                    yield recipe
                if recipe_shop:
                    recipe = copy.deepcopy(recipe_shop)
                else:
                    recipe = copy.deepcopy(overview_recipe)
                if -1 < l.find(":"):
                    recipe.cooking_name = l.split(":")[1].strip()
                # NOTE(review): this branch repeats the condition above and
                # is unreachable as written; presumably one of the two was
                # meant to test a different colon character - confirm.
                elif -1 < l.find(":"):
                    recipe.cooking_name = l.split(":")[1].strip()
                elif re.search(r"^(料理|万能調味料)[①-⑳]", l.strip()):
                    # https://www.nhk.or.jp/program/manpuku/recipe/dg0_200115.pdf
                    # e.g. 料理①カルパッチョ
                    recipe.cooking_name = l.strip()[3:].strip()
                else:
                    recipe.cooking_name = l.split(None, 1)[1].strip()
                continue
            if re.search("^材料", l.strip()):
                is_material_area = True
                is_recipe_step_area = False
                if l.strip() == "材料":
                    continue
            if re.search("^作り方", l.strip()):
                is_material_area = False
                is_recipe_step_area = True
                if l.strip() == "作り方":
                    pass
                else:
                    l = l.replace("作り方", "", 1)
                    # recipe being None here should surface as an error.
                    recipe.recipe_steps.append(RecipeText(l.strip()))
                continue
            if is_material_area:
                for material in l.strip().split("、"):
                    material = material.strip()
                    if len(material):
                        if material.startswith("("):
                            recipe.materials.append(RecipeText(material))
                        else:
                            # "name(amount)" becomes "name: amount".
                            recipe.materials.append(
                                RecipeText(
                                    material.replace("(", ": ").replace(
                                        ")", "")))
            if is_recipe_step_area:
                recipe.recipe_steps.append(RecipeText(l.strip()))
        if recipe:
            yield recipe
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Collect the recipes embedded in an overview page.

    All ``section`` elements except the first are walked in order.  Sections
    with a table or an ``h1`` are ignored, an ``h2`` section sets the current
    subtitle, a picture-less section contributes important points for the
    recipes that follow, and every remaining section with a ``p`` is one
    recipe.

    Args:
        overview_soup: parsed overview page.
        entry_url: URL of the page; stored as ``detail_url`` and used as the
            base for image URLs.

    Returns:
        dict mapping ``Recipe.id`` (an md5 hex digest of the recipe's names)
        to ``Recipe``.  ``program_date`` is left as ``None``.
    """
    collected = dict()  # key: Recipe.id, value: Recipe
    subtitle = None
    pending_points = list()

    for section in overview_soup.find_all("section")[1:]:
        if section.table or section.h1:
            continue
        if section.h2:
            subtitle = section.h2.text.translate(
                self.__class__._TABLE_REMOVE_KAKKO).strip()
            pending_points.clear()
            continue
        if section.p is None:
            continue

        entry = Recipe()
        entry.detail_url = entry_url
        entry.program_name = self.program_name
        entry.program_date = None

        if section.img is None:
            # Text-only section: keep its lines as important points.
            pending_points.extend(
                RecipeText(text)
                for text in section.p.get_text("\n").splitlines())
            continue

        if section.h3:
            # multiple recipe
            entry.cooking_name = section.h3.text
            entry.cooking_name_sub = subtitle
        else:
            # single recipe
            entry.cooking_name = subtitle
        entry.important_points.extend(pending_points)
        entry.image_urls.append(
            urllib.parse.urljoin(entry_url, section.img["src"]))

        in_materials = False
        in_steps = False
        for line in section.p.get_text("\n").splitlines():
            if not line.strip():
                continue
            if "◎材料" in line:
                in_materials = True
                heading = line.replace("◎材料", "").translate(
                    self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                if heading:
                    entry.materials.append(RecipeText(heading))
                continue
            if "<作り方>" in line:
                in_materials = False
                in_steps = True
                continue
            if in_materials:
                # Re-attach the counters so that split() keeps each amount
                # in one piece.
                line = line.replace(" 本", "本").replace(" 個", "個")
                for piece in line.split():
                    entry.materials.append(
                        RecipeText(piece.replace(":", ": ")))
            elif in_steps:
                numbered = re.match(r"(\d+).\s*(.*)", line)
                if numbered:
                    digits, rest = numbered.groups()
                    line = "({}){}".format(int(digits), rest)
                entry.recipe_steps.append(RecipeText(line))

        if entry.cooking_name_sub:
            id_source = "{}/{}".format(entry.cooking_name_sub,
                                       entry.cooking_name)
        else:
            id_source = entry.cooking_name
        entry.id = hashlib.md5(id_source.encode("utf-8")).hexdigest()
        collected[entry.id] = entry

    return collected
def _recipe_details_generator(self, detail_soup, overview_recipe):
    """Yield the recipes found under the two kinds of recipe headings.

    First every ``h1`` whose text matches ``「.*」`` is handled (sections
    marked "■材料"/"■作り方"), then every ``span`` whose text matches
    ``.*レシピ`` (sections marked "◎材料"/"◎作り方").  Each heading yields one
    deep copy of ``overview_recipe``.

    Args:
        detail_soup: parsed detail page.
        overview_recipe: recipe collected from the overview page.

    Yields:
        One copied recipe per matching heading.
    """

    def build_recipe(cooking_name, lines, material_mark, steps_mark,
                     tabs_to_spaces):
        """Fill a fresh copy of the overview recipe from one text area."""
        result = copy.deepcopy(overview_recipe)
        result.cooking_name = cooking_name
        in_materials = False
        in_steps = False
        for line in lines:
            if not line.strip():
                continue
            if material_mark in line:
                in_materials = True
                result.materials.append(
                    RecipeText(
                        line.replace(material_mark, "").translate(
                            self.__class__._TABLE_REPLACE_MARUKAKKO)))
                continue
            if steps_mark in line:
                in_materials = False
                in_steps = True
                continue
            if in_materials:
                # Several whitespace-separated materials may share one line.
                for piece in line.split():
                    result.materials.append(
                        RecipeText(piece.replace(":", ": ")))
            elif in_steps:
                if tabs_to_spaces:
                    line = line.replace("\t", " ")
                result.recipe_steps.append(RecipeText(line))
        return result

    # Each heading has exactly one recipe area.
    for heading in detail_soup.find_all("h1", text=re.compile(r"「.*」")):
        area = heading.parent.parent.find("ul", "answers").text.splitlines()
        name = heading.text.translate(
            self.__class__._TABLE_REMOVE_KAKKO).strip()
        yield build_recipe(name, area, "■材料", "■作り方", True)

    for heading in detail_soup.find_all("span", text=re.compile(r".*レシピ")):
        area = heading.parent.find_next_sibling("ul").text.splitlines()
        name = heading.text.replace("レシピ", "").strip()
        yield build_recipe(name, area, "◎材料", "◎作り方", False)