Example #1
    def get_recipe_urls(self):
        recipe_urls = []
        list_urls = [
            "https://aseasyasapplepie.com/tag/cumin/",
            "https://aseasyasapplepie.com/recipe-index/",
        ]
        # Appending to list_urls while iterating is deliberate: tag and
        # category pages discovered below are crawled in the same loop.
        for url in list_urls:
            content = get_cached(url)
            soup = BeautifulSoup(content.decode('utf-8'), 'html.parser')

            anchors = soup.find_all("a", href=True)
            for a in anchors:
                href = a["href"]

                # Skip links already recorded and off-site links.
                if href in recipe_urls:
                    continue

                if not href.startswith("https://aseasyasapplepie.com"):
                    continue

                if href.startswith("https://aseasyasapplepie.com/tag") or href.startswith("https://aseasyasapplepie.com/category"):
                    if href not in list_urls:
                        list_urls.append(href)
                    continue

                # Fetch the candidate page; only pages containing the WPRM
                # recipe-snippet block are treated as actual recipes.
                page_content = get_cached(href)
                page_soup = BeautifulSoup(page_content.decode("utf-8"), "html.parser")
                snippets = page_soup.find_all("div", {"class": "wprm-recipe-snippets"})
                if snippets:
                    recipe_urls.append(href)
                    yield href
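
All of these snippets rely on a get_cached helper that is not shown; some expect it to raise on failure, others check for None. Its real implementation is unknown, but a minimal sketch, assuming requests and an on-disk cache of raw response bytes, might look like this:

import hashlib
import os

import requests

CACHE_DIR = "cache"  # hypothetical cache location

def get_cached(url):
    # Cache each URL's raw bytes under a hash of the URL.
    os.makedirs(CACHE_DIR, exist_ok=True)
    path = os.path.join(CACHE_DIR, hashlib.sha256(url.encode()).hexdigest())
    if os.path.exists(path):
        with open(path, "rb") as f:
            return f.read()
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        return None
    with open(path, "wb") as f:
        f.write(resp.content)
    return resp.content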
Example #2
    def get_recipe_urls(self):
        visited_list_urls = {}
        recipe_urls = {}
        while self.to_visit_list_urls:
            # Breadth-first crawl: take the next listing page off the queue.
            list_url = self.to_visit_list_urls.pop(0)

            try:
                content = get_cached(list_url)
            except Exception:
                # Retry with the query string stripped if the fetch fails.
                content = get_cached(list_url.split("?")[0])

            soup = BeautifulSoup(content.decode('utf-8'), 'html.parser')

            anchors = soup.find_all("a", href=True)
            for a in anchors:
                href = a["href"]
                if not href.startswith("http"):
                    continue
                if "/recipe/" in href and not recipe_urls.get(href, False):
                    recipe_urls[href] = True
                    yield href
                if "/recipes/" in href and not visited_list_urls.get(href, False):
                    visited_list_urls[href] = True
                    self.to_visit_list_urls.append(href)
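
Example #2 reads self.to_visit_list_urls, so the scraper instance must seed that queue before the crawl starts. A hypothetical skeleton (the class name and seed URL are made up, not from the original source):

class RecipeScraper:
    def __init__(self):
        # Seed the crawl frontier; the real scraper's seed URLs are not shown.
        self.to_visit_list_urls = ["https://example.com/recipes/"]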
Example #3
    def get_recipe(self, url):
        content = get_cached(url)
        if content is None:
            return None
        soup = BeautifulSoup(content.decode('utf-8'), 'html.parser')
        # Recipe data is embedded as JSON-LD structured data.
        script = soup.find("script", {"type": "application/ld+json"})
        if script is None:
            return None
        recipe_json = script.contents[0]
        try:
            recipe_json = json.loads(recipe_json)
        except (ValueError, TypeError):
            return None

        try:
            title = recipe_json["name"]
            # Note: "image" in JSON-LD may be a single URL or a list.
            image_urls = [recipe_json["image"]]
        except KeyError:
            return None

        ingredient_strings = []
        for i in recipe_json["recipeIngredient"]:
            ingredient = clean_str(i)
            ingredient_strings.append(ingredient)
        ingredients = {
            "ingredients": ingredient_strings,
        }

        # Group instructions by section; sub-steps within a section are
        # separated by non-breaking spaces in the source JSON.
        steps = {}
        for i, section in enumerate(recipe_json["recipeInstructions"]):
            steps[f"steps {i}"] = section["text"].split("\xa0")

        tags = [word.lower() for word in title.split(" ")]
        servings = recipe_json.get("recipeYield", 1)

        recipe = Recipe(
            url=url,
            title=title,
            subtitle="",
            ingredients=ingredients,
            instructions=steps,
            servings=servings,
            tags=clean_tags(tags),
            images=image_urls,
        )
        return recipe
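
The snippets also call clean_str and clean_tags, which are not defined here. Plausible minimal versions, assuming they just normalize whitespace and tag casing (the real helpers may do more):

import re

def clean_str(s):
    # Collapse runs of whitespace (including non-breaking spaces) and trim.
    return re.sub(r"\s+", " ", s).strip()

def clean_tags(tags):
    # Lowercase, trim, and deduplicate tag strings.
    return sorted({t.strip().lower() for t in tags if t.strip()})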
Example #4
    def get_recipe(self, url):
        content = get_cached(url)
        soup = BeautifulSoup(content.decode('utf-8'), 'html.parser')

        def _get_span(li, classname):
            # Return the text of the span with the given class, or "".
            # (The original closed over the loop variable li; passing it
            # explicitly is less fragile.)
            span = li.find("span", {"class": classname})
            if span is None:
                return ""
            return span.text

        div = soup.find("div", {"class": "wprm-recipe-name wprm-color-header"})
        if div is None:
            return None
        title = div.text

        ingredients = {"ingredients": []}
        lis = soup.findAll("li", {"class": "wprm-recipe-ingredient"})
        for li in lis:
            amount = _get_span("wprm-recipe-ingredient-amount")
            unit = _get_span("wprm-recipe-ingredient-unit")
            name = _get_span("wprm-recipe-ingredient-name")
            notes = _get_span("wprm-recipe-ingredient-notes")
            ingredient = f"{amount} {unit} {name}({notes})"
            ingredients["ingredients"].append(ingredient)

        instructions = {"instructions": []}
        divs = soup.find_all("div", {"class": "wprm-recipe-instruction-text"})
        for div in divs:
            instructions["instructions"].append(div.text)

        tags = []
        tags_div = soup.find("div", {"class": "meta-bottom"})
        if tags_div is not None:
            for a in tags_div.find_all("a", {"rel": "tag"}):
                tags.append(a.text)

        img_tags = soup.find_all("img", {"class": "size-full"})
        # Keep only images hosted on the site itself.
        images = [img.attrs["src"] for img in img_tags if img.attrs["src"].startswith("https://aseasyasapplepie.com")]

        return Recipe(
            url=url,
            title=title,
            subtitle="",
            servings=0,
            ingredients=ingredients,
            instructions=instructions,
            tags=tags,
            images=images,
        )
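
Every get_recipe builds a Recipe from the same keyword arguments, so the class presumably looks roughly like the dataclass below. The field types are inferred from the call sites, not taken from the original source:

from dataclasses import dataclass

@dataclass
class Recipe:
    url: str
    title: str
    subtitle: str
    ingredients: dict   # section name -> list of ingredient strings
    instructions: dict  # section name -> list of step strings
    servings: object    # int, str, or None depending on the scraper
    tags: list
    images: list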
Example #5
    def get_recipe_urls(self):
        recipe_urls = {}
        i = 1
        while True:
            list_url = f"https://www.bonappetit.com/search/?page={i}"
            content = get_cached(list_url)
            if content is None:
                break
            soup = BeautifulSoup(content.decode('utf-8'), 'html.parser')

            anchors = soup.find_all("a", href=True)
            new_urls = 0
            for a in anchors:
                href = a["href"]
                # Resolve relative links against the site root.
                if href.startswith("/"):
                    href = f"https://www.bonappetit.com{href}"
                if "/recipe/" in href and not recipe_urls.get(href, False):
                    recipe_urls[href] = True
                    new_urls += 1
                    yield href
            # The original loop never terminated; stop once a results page
            # yields no new recipe links.
            if new_urls == 0:
                break
            i += 1
Example #6
    def get_recipe_urls(self):
        recipe_urls = {}
        i = 1
        while True:
            list_url = f"https://www.epicurious.com/search?page={i}"
            content = get_cached(list_url)
            if "DON'T CRY!" in str(content):
                break
            soup = BeautifulSoup(content.decode('utf-8'), 'html.parser')

            anchors = soup.find_all("a", href=True)
            for a in anchors:
                href = a["href"]
                if href.startswith("/"):
                    href = f"https://www.epicurious.com{href}"
                if "/recipes/food/views/" in href and not recipe_urls.get(
                        href, False):
                    recipe_urls[href] = True
                    yield href
            i += 1
Example #7
    def get_recipe(self, url):
        original_url = url
        content = get_cached(url)
        content = content.decode('utf-8')
        soup = BeautifulSoup(content, 'html.parser')

        title = soup.find("h1", {"class": "headline"})
        if title is None:
            title = soup.find("h1", {"id": "recipe-main-content"})
        if title is None:
            return None
        title = str(title.text)

        ingredients = {"ingredients": []}
        lis = soup.findAll("li", {"class": "ingredients-item"})
        if len(lis) == 0:
            lis = soup.findAll("span", {"itemprop": "recipeIngredient"})
        for li in lis:
            s = clean_str(li.text)
            ingredients["ingredients"].append(s)

        steps = {"steps": []}
        divs = soup.findAll("div", {"class": "paragraph"})
        if len(divs) == 0:
            divs = soup.findAll("span", {"class": "recipe-directions__list--item"})
        for div in divs:
            step = clean_str(div.text)
            steps["steps"].append(step)

        servings = None
        divs = soup.find_all("div", {"class": "recipe-meta-item"})
        for div in divs:
            # Clean the text of each node, not the node itself (the original
            # passed Tags to clean_str, then called .text on the result).
            key = clean_str(div.find("div", {"class": "recipe-meta-item-header"}).text)
            val = clean_str(div.find("div", {"class": "recipe-meta-item-body"}).text)
            if key.lower() == "yield:":
                servings = val
                break

        images = []
        div = soup.find("div", {"class": "hero-photo__wrap"})
        if div is not None:
            a = div.find("a")
            imgs_content = get_cached(a.attrs["href"])
            imgs_content = imgs_content.decode('utf-8')
            # Image URLs appear on lines containing "urls:"; take the
            # first single-quoted string on each such line.
            for line in imgs_content.split("\n"):
                if "urls:" in line:
                    url = line.split("'")[1]
                    images.append(url)
        divs = soup.find_all("div", {"class": "lazy-image"})
        for div in divs:
            btn = div.find("button")
            if btn is not None:
                url = btn.attrs["data-image"]
                url = url.split("url=")[1]
                url = urllib.parse.unquote(url)
                images.append(url)
        recipe_div = soup.find("div", {"class": "recipe-content-container"})
        if recipe_div is not None:
            img = recipe_div.find("img")
            if img is not None:
                url = img.attrs["src"]
                try:
                    url = url.split("url=")[1]
                    url = url.split("?")[0]
                    url = urllib.parse.unquote(url)
                except IndexError:
                    # Not a proxied URL; keep it as-is.
                    pass
                images.append(url)
        images = [img for img in images if "media-allrecipes.com" in img]

        spans = soup.find_all("span", {"class": "breadcrumbs__title"})
        category = [span.text.strip("\n ") for span in spans]
        # Skip the first two breadcrumb entries (site root and section).
        category = category[2:]
        return Recipe(
            url=original_url,
            title=title,
            subtitle="",
            ingredients=ingredients,
            instructions=steps,
            servings=servings,
            tags=clean_tags(category),
            images=images,
        )
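
Example #7's image handling unwraps URLs that the site proxies through a url= query parameter. A quick standalone illustration of that split-and-unquote pattern (the wrapped URL below is invented for the example):

import urllib.parse

wrapped = ("https://images.example.com/thumbor/photo.jpg"
           "?url=https%3A%2F%2Fmedia-allrecipes.com%2Fimages%2F123.jpg")
inner = wrapped.split("url=")[1].split("?")[0]
print(urllib.parse.unquote(inner))
# https://media-allrecipes.com/images/123.jpg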
Example #8
    def get_recipe(self, url):
        content = get_cached(url)
        if content is None:
            return None
        soup = BeautifulSoup(content.decode('utf-8'), 'html.parser')

        title = soup.find("h1", {"itemprop": "name"})
        try:
            title = str(title.text)
        except:
            title = url.split("/")[-1].replace("-", " ")

        ingredients = {}
        ingredient_groups = soup.find_all("ol", {"class": "ingredient-groups"})
        for group in ingredient_groups:
            # Look up the group title within the group itself, not the whole
            # page (the original searched soup and always got the first
            # <strong> on the page).
            group_title = group.find("strong").text
            group_ingredients = []
            lis = group.find_all("li", {"class": "ingredient"})
            for li in lis:
                s = clean_str(li.text)
                group_ingredients.append(s)
            ingredients[group_title] = group_ingredients

        steps = {}
        preparation_groups = soup.find_all("ol", {"class": "preparation-groups"})
        for group in preparation_groups:
            # Same fix as above: search within the group for its title.
            group_title = group.find("strong").text
            group_steps = []
            lis = group.find_all("li", {"class": "preparation-step"})
            for li in lis:
                step = clean_str(li.text)
                group_steps.append(step)
            steps[group_title] = group_steps

        tags = []
        dl = soup.find("dl", {"class": "tags"})
        if dl is not None:
            tags = [str(dt.text) for dt in dl.find_all("dt")]

        servings = 1
        dd = soup.find("dd", {"class": "yield"})
        if dd is not None:
            servings = dd.text

        try:
            div = soup.find("div", {"class": "recipe-image"})
            img = div.find("meta", {"itemprop": "image"})
            image_urls = [img.attrs['content']]
        except (AttributeError, KeyError):
            image_urls = []

        recipe = Recipe(
            url=url,
            title=title,
            subtitle="",
            ingredients=ingredients,
            instructions=steps,
            servings=servings,
            tags=clean_tags(tags),
            images=image_urls,
        )
        return recipe
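
Putting the pieces together, a scraper presumably pairs one get_recipe_urls generator with one get_recipe parser. A hypothetical driver loop (error handling kept minimal; the function name is made up):

def scrape_site(scraper):
    # Walk the URL generator and parse each page into a Recipe.
    recipes = []
    for url in scraper.get_recipe_urls():
        recipe = scraper.get_recipe(url)
        if recipe is not None:
            recipes.append(recipe)
    return recipes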