import json
import urllib.parse

from bs4 import BeautifulSoup

# get_cached, clean_str, clean_tags, and Recipe are project helpers defined
# elsewhere in the repository.


def get_recipe_urls(self):
    # Crawl tag/category listing pages, following any further tag or category
    # links we discover, and yield every page that contains a WP Recipe Maker
    # snippet (i.e. an actual recipe).
    recipe_urls = []
    list_urls = [
        "https://aseasyasapplepie.com/tag/cumin/",
        "https://aseasyasapplepie.com/recipe-index/",
    ]
    # list_urls grows while we iterate over it, so it doubles as a crawl queue.
    for url in list_urls:
        content = get_cached(url)
        soup = BeautifulSoup(content.decode("utf-8"), "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href in recipe_urls:
                continue
            if not href.startswith("https://aseasyasapplepie.com"):
                continue
            if href.startswith("https://aseasyasapplepie.com/tag") or href.startswith(
                "https://aseasyasapplepie.com/category"
            ):
                if href not in list_urls:
                    list_urls.append(href)
                continue
            page_content = get_cached(href)
            page_soup = BeautifulSoup(page_content.decode("utf-8"), "html.parser")
            # Only pages with a WP Recipe Maker snippet are real recipe pages.
            snippets = page_soup.find_all("div", {"class": "wprm-recipe-snippets"})
            if len(snippets) > 0:
                recipe_urls.append(href)
                yield href

def get_recipe_urls(self):
    # Breadth-first crawl: /recipe/ links are yielded, /recipes/ links are
    # queued as further listing pages to visit.
    visited_list_urls = {}
    recipe_urls = {}
    while len(self.to_visit_list_urls) > 0:
        list_url = self.to_visit_list_urls.pop(0)
        try:
            content = get_cached(list_url)
        except Exception:
            # Retry without the query string if the full URL fails.
            content = get_cached(list_url.split("?")[0])
        soup = BeautifulSoup(content.decode("utf-8"), "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if not href.startswith("http"):
                continue
            if "/recipe/" in href and not recipe_urls.get(href, False):
                recipe_urls[href] = True
                yield href
            if "/recipes/" in href and not visited_list_urls.get(href, False):
                visited_list_urls[href] = True
                self.to_visit_list_urls.append(href)

def get_recipe(self, url):
    # Parse the recipe out of the page's JSON-LD structured-data block.
    content = get_cached(url)
    if content is None:
        return None
    soup = BeautifulSoup(content.decode("utf-8"), "html.parser")
    script = soup.find("script", {"type": "application/ld+json"})
    if script is None:
        return None
    try:
        recipe_json = json.loads(script.contents[0])
    except (json.JSONDecodeError, IndexError):
        return None
    try:
        title = recipe_json["name"]
        image_urls = [recipe_json["image"]]
    except KeyError:
        return None
    ingredient_strings = [clean_str(i) for i in recipe_json["recipeIngredient"]]
    ingredients = {
        "ingredients": ingredient_strings,
    }
    steps = {}
    for i, section in enumerate(recipe_json["recipeInstructions"]):
        # Each section's text packs several steps separated by non-breaking spaces.
        steps[f"steps {i}"] = section["text"].split("\xa0")
    tags = [word.lower() for word in title.split(" ")]
    servings = recipe_json.get("recipeYield", 1)
    return Recipe(
        url=url,
        title=title,
        subtitle="",
        ingredients=ingredients,
        instructions=steps,
        servings=servings,
        tags=clean_tags(tags),
        images=image_urls,
    )

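# For context, the rough JSON-LD shape the parser above expects (abridged,
# with hypothetical values; the real pages embed schema.org/Recipe markup):
#
#   {
#     "name": "Cumin Roasted Carrots",
#     "image": "https://example.com/carrots.jpg",
#     "recipeIngredient": ["500 g carrots", "1 tsp cumin"],
#     "recipeInstructions": [{"text": "Peel the carrots.\xa0Roast at 200C."}],
#     "recipeYield": 4
#   }
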
def get_recipe(self, url):
    content = get_cached(url)
    soup = BeautifulSoup(content.decode("utf-8"), "html.parser")

    def _get_span(li, classname):
        span = li.find("span", {"class": classname})
        if span is None:
            return ""
        return span.text

    div = soup.find("div", {"class": "wprm-recipe-name wprm-color-header"})
    title = div.text
    ingredients = {"ingredients": []}
    for li in soup.find_all("li", {"class": "wprm-recipe-ingredient"}):
        amount = _get_span(li, "wprm-recipe-ingredient-amount")
        unit = _get_span(li, "wprm-recipe-ingredient-unit")
        name = _get_span(li, "wprm-recipe-ingredient-name")
        notes = _get_span(li, "wprm-recipe-ingredient-notes")
        ingredients["ingredients"].append(f"{amount} {unit} {name} ({notes})")
    instructions = {"instructions": []}
    for div in soup.find_all("div", {"class": "wprm-recipe-instruction-text"}):
        instructions["instructions"].append(div.text)
    tags = []
    tags_div = soup.find("div", {"class": "meta-bottom"})
    for a in tags_div.find_all("a", {"rel": "tag"}):
        tags.append(a.text)
    img_tags = soup.find_all("img", {"class": "size-full"})
    images = [
        img.attrs["src"]
        for img in img_tags
        if img.attrs["src"].startswith("https://aseasyasapplepie.com")
    ]
    return Recipe(
        url=url,
        title=title,
        subtitle="",
        servings=0,
        ingredients=ingredients,
        instructions=instructions,
        tags=tags,
        images=images,
    )

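# For reference, a minimal sketch of how a scraper's two methods compose.
# The scrape_all driver below is hypothetical, not part of the project; only
# the get_recipe_urls / get_recipe pairing above is assumed.
def scrape_all(scraper):
    # get_recipe_urls is a generator, so recipes stream in as the crawl runs.
    recipes = []
    for recipe_url in scraper.get_recipe_urls():
        recipe = scraper.get_recipe(recipe_url)
        if recipe is not None:  # parsers return None on pages they can't handle
            recipes.append(recipe)
    return recipes
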
def get_recipe_urls(self):
    # Walk the paginated search index, yielding each new /recipe/ link.
    # There is no explicit stopping condition; the loop presumably relies on
    # get_cached raising once the pagination runs out.
    recipe_urls = {}
    i = 1
    while True:
        list_url = f"https://www.bonappetit.com/search/?page={i}"
        content = get_cached(list_url)
        soup = BeautifulSoup(content.decode("utf-8"), "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith("/"):
                href = f"https://www.bonappetit.com{href}"
            if "/recipe/" in href and not recipe_urls.get(href, False):
                recipe_urls[href] = True
                yield href
        i += 1

def get_recipe_urls(self):
    # Walk the paginated search index until Epicurious serves its
    # "DON'T CRY!" empty-results page.
    recipe_urls = {}
    i = 1
    while True:
        list_url = f"https://www.epicurious.com/search?page={i}"
        content = get_cached(list_url)
        if "DON'T CRY!" in str(content):
            break
        soup = BeautifulSoup(content.decode("utf-8"), "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith("/"):
                href = f"https://www.epicurious.com{href}"
            if "/recipes/food/views/" in href and not recipe_urls.get(href, False):
                recipe_urls[href] = True
                yield href
        i += 1

def get_recipe(self, url):
    original_url = url
    content = get_cached(url).decode("utf-8")
    soup = BeautifulSoup(content, "html.parser")

    # The site has two page layouts; fall back to the legacy selectors.
    title = soup.find("h1", {"class": "headline"})
    if title is None:
        title = soup.find("h1", {"id": "recipe-main-content"})
    if title is None:
        return None
    title = str(title.text)

    ingredients = {"ingredients": []}
    lis = soup.find_all("li", {"class": "ingredients-item"})
    if len(lis) == 0:
        lis = soup.find_all("span", {"itemprop": "recipeIngredient"})
    for li in lis:
        ingredients["ingredients"].append(clean_str(li.text))

    steps = {"steps": []}
    divs = soup.find_all("div", {"class": "paragraph"})
    if len(divs) == 0:
        divs = soup.find_all("span", {"class": "recipe-directions__list--item"})
    for div in divs:
        steps["steps"].append(clean_str(div.text))

    servings = None
    for div in soup.find_all("div", {"class": "recipe-meta-item"}):
        key = clean_str(div.find("div", {"class": "recipe-meta-item-header"}).text)
        val = clean_str(div.find("div", {"class": "recipe-meta-item-body"}).text)
        if key.lower() == "yield:":
            servings = val
            break

    images = []
    div = soup.find("div", {"class": "hero-photo__wrap"})
    if div is not None:
        # The hero photo links to a gallery page; image URLs are embedded in
        # its inline script as  urls: '...'  lines.
        a = div.find("a")
        imgs_content = get_cached(a.attrs["href"]).decode("utf-8")
        for line in imgs_content.split("\n"):
            if "urls:" in line:
                images.append(line.split("'")[1])
    for div in soup.find_all("div", {"class": "lazy-image"}):
        btn = div.find("button")
        if btn is not None:
            img_url = btn.attrs["data-image"].split("url=")[1]
            images.append(urllib.parse.unquote(img_url))
    recipe_div = soup.find("div", {"class": "recipe-content-container"})
    if recipe_div is not None:
        img = recipe_div.find("img")
        if img is not None:
            img_url = img.attrs["src"]
            try:
                img_url = urllib.parse.unquote(img_url.split("url=")[1].split("?")[0])
            except IndexError:
                pass
            images.append(img_url)
    images = [img for img in images if "media-allrecipes.com" in img]

    spans = soup.find_all("span", {"class": "breadcrumbs__title"})
    # Drop the leading "Home" / "Recipes" breadcrumb entries.
    category = [span.text.strip("\n ") for span in spans][2:]

    return Recipe(
        url=original_url,
        title=title,
        subtitle="",
        ingredients=ingredients,
        instructions=steps,
        servings=servings,
        tags=clean_tags(category),
        images=images,
    )

def get_recipe(self, url):
    content = get_cached(url)
    if content is None:
        return None
    soup = BeautifulSoup(content.decode("utf-8"), "html.parser")

    title = soup.find("h1", {"itemprop": "name"})
    try:
        title = str(title.text)
    except AttributeError:
        # Fall back to a title derived from the URL slug.
        title = url.split("/")[-1].replace("-", " ")

    ingredients = {}
    for group in soup.find_all("ol", {"class": "ingredient-groups"}):
        # Look for the group heading inside this group, not the whole page.
        group_title = group.find("strong").text
        group_ingredients = []
        for li in group.find_all("li", {"class": "ingredient"}):
            group_ingredients.append(clean_str(li.text))
        ingredients[group_title] = group_ingredients

    steps = {}
    for group in soup.find_all("ol", {"class": "preparation-groups"}):
        group_title = group.find("strong").text
        group_steps = []
        for li in group.find_all("li", {"class": "preparation-step"}):
            group_steps.append(clean_str(li.text))
        steps[group_title] = group_steps

    tags = []
    dl = soup.find("dl", {"class": "tags"})
    if dl is not None:
        tags = [str(dt.text) for dt in dl.find_all("dt")]

    servings = 1
    dd = soup.find("dd", {"class": "yield"})
    if dd is not None:
        servings = dd.text

    try:
        div = soup.find("div", {"class": "recipe-image"})
        img = div.find("meta", {"itemprop": "image"})
        image_urls = [img.attrs["content"]]
    except AttributeError:
        image_urls = []

    return Recipe(
        url=url,
        title=title,
        subtitle="",
        ingredients=ingredients,
        instructions=steps,
        servings=servings,
        tags=clean_tags(tags),
        images=image_urls,
    )

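# The scrapers above lean on four helpers that live elsewhere in the project:
# get_cached, clean_str, clean_tags, and Recipe. Below is a minimal sketch of
# plausible implementations, purely illustrative -- the real signatures are
# only inferred from the call sites (get_cached returns raw bytes, clean_str
# and clean_tags normalize text, Recipe is a plain record type).
import hashlib
import pathlib
import urllib.request
from dataclasses import dataclass, field

CACHE_DIR = pathlib.Path(".cache")  # hypothetical cache location


def get_cached(url):
    """Fetch a URL, caching the raw bytes on disk keyed by a hash of the URL."""
    CACHE_DIR.mkdir(exist_ok=True)
    cache_file = CACHE_DIR / hashlib.sha256(url.encode("utf-8")).hexdigest()
    if cache_file.exists():
        return cache_file.read_bytes()
    with urllib.request.urlopen(url) as resp:
        content = resp.read()
    cache_file.write_bytes(content)
    return content


def clean_str(s):
    """Collapse runs of whitespace and strip non-breaking spaces."""
    return " ".join(s.replace("\xa0", " ").split())


def clean_tags(tags):
    """Normalize tags: lowercase, strip, drop empties and duplicates."""
    return sorted({t.strip().lower() for t in tags if t.strip()})


@dataclass
class Recipe:
    url: str
    title: str
    subtitle: str
    ingredients: dict
    instructions: dict
    servings: object
    tags: list = field(default_factory=list)
    images: list = field(default_factory=list)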