Exemple #1
0
def parse_json_from_data(data, url):
    """Parse recipe JSON data. Sometimes the recipe data is incomplete
    and is missing the instructions, so try up to three times.

    The first attempt uses the given data; every further attempt
    downloads the page again from the given URL. Nothing is downloaded
    after the last attempt, since that data would never be parsed.

    @param data: the HTML webpage data
    @ptype data: string
    @param url: the webpage URL
    @ptype url: string
    @return: the parsed JSON object, or None if no JSON was found or the
        instructions were still missing after the last attempt
    @rtype: dict or None
    @raise ValueError: if the embedded JSON text cannot be decoded
    """
    max_tries = 3
    for attempt in range(1, max_tries + 1):
        match = JSON_RO.search(data)
        if not match:
            # no embedded JSON at all, fetching again is not attempted
            return None
        json_data = json.loads(match.group(1).strip())
        if 'recipeInstructions' in json_data:
            return json_data
        if attempt < max_tries:
            # incomplete recipe data: fetch the page again for the next try
            data, unused_url = getdatafromurl(url)
    return None
Exemple #2
0
def test_parser(url):
    """Try to parse a recipe for testing purposes.

    Downloads the page, parses the HTML with BeautifulSoup and
    pretty-prints the URL returned by getdatafromurl, the recipe dict
    and every non-empty ingredient group.

    @param url: the webpage URL of the recipe
    @ptype url: string
    @return: nothing, the results are printed
    @rtype: None
    """
    from pprint import pprint
    data, url = getdatafromurl(url)
    pprint(url)
    soup = BeautifulSoup(data, "html.parser")
    recipe = {}
    parse_schema_recipe(soup, recipe)
    # The recipe has no 'image' entry when the page has no image, so
    # only shorten the image data (to keep the output readable) if present.
    if 'image' in recipe:
        recipe['image'] = recipe['image'][:10] + '...'
    pprint(recipe, indent=2)
    ingredients = parse_schema_ingredients(soup)
    # consume the ingredient groups in insertion order
    while ingredients:
        group, inglist = ingredients.popitem(last=False)
        if inglist:
            pprint((group, inglist), indent=2)
Exemple #3
0
def test_parser(url):
    """Try to parse a recipe for testing purposes.

    Downloads the page, extracts the recipe JSON and pretty-prints the
    URL returned by getdatafromurl, the recipe dict and the ingredients.

    @param url: the webpage URL of the recipe
    @ptype url: string
    @return: nothing, the results are printed
    @rtype: None
    @raise ValueError: if no usable recipe JSON could be parsed
    """
    from pprint import pprint
    data, url = getdatafromurl(url)
    pprint(url)
    json_data = parse_json_from_data(data, url)
    if not json_data:
        raise ValueError("error parsing %s with data %r" % (url, data))
    recipe = {}
    parse_schema_recipe(json_data, recipe)
    # The recipe has no 'image' entry when the JSON has no image, so
    # only shorten the image data (to keep the output readable) if present.
    if 'image' in recipe:
        recipe['image'] = recipe['image'][:40] + '...'
    pprint(recipe, indent=2)
    ingredients = parse_schema_ingredients(json_data)
    pprint(ingredients, indent=2)
Exemple #4
0
def parse_schema_recipe(json_data, recipe):
    """Fill given recipe dict with data from JSON.

    The keys 'name' and 'recipeInstructions' are required. All other
    keys (recipeCategory, cookTime, prepTime, recipeYield,
    aggregateRating, image) are optional and skipped when missing.

    @param json_data: the parsed JSON recipe schema data
    @ptype json_data: dict (with various keys and values)
    @param recipe: the gourmet recipe to fill
    @ptype recipe: dict
    @return: nothing, the recipe will be modified instead
    @rtype: None
    @raise KeyError: if 'name' or 'recipeInstructions' is missing
    """
    recipe['title'] = json_data['name']
    if 'recipeCategory' in json_data:
        categories = json_data["recipeCategory"]
        if categories:
            if not isinstance(categories, list):
                # a single category may be given as a plain value; wrap it
                # so that it is not indexed and joined character by character
                categories = [categories]
            # gourmet only has one category per recipe, so get the first one in the list
            recipe['category'] = categories[0]
            if len(categories) > 1:
                # If there are several categories add them to the modifications.
                recipe['modifications'] = "Kategorien: %s" % (
                    ", ".join(categories))
    # both durations are handled the same way: (JSON key, recipe key)
    for schema_key, recipe_key in (('cookTime', 'cooktime'),
                                   ('prepTime', 'preptime')):
        if schema_key not in json_data:
            continue
        try:
            duration = parse_iso8601_duration(json_data[schema_key])
        except ValueError:
            warn("could not parse %s %r" % (schema_key, json_data[schema_key]))
        else:
            recipe[recipe_key] = duration
    recipe['instructions'] = json_data['recipeInstructions']
    if 'recipeYield' in json_data:
        recipe['yields'] = json_data['recipeYield']
    if 'aggregateRating' in json_data:
        rating = float(json_data['aggregateRating']['ratingValue'])
        # adjust "1 to 5" rating of chefkoch to "1 to 10" of gourmet
        recipe['rating'] = int(rating * 2)
    image = json_data.get('image')
    if image:
        if isinstance(image, list):
            # several images given, only the first one is used
            image = image[0]
        recipe['image'], unused = getdatafromurl(image,
                                                 content_type_check="image/")
Exemple #5
0
def parse_schema_recipe(soup, recipe):
    """Fill given recipe dict with data from HTML.

    Each value is looked up by its itemprop attribute (or by CSS class
    or id for tips and categories) and is only stored in the recipe
    when the corresponding tag is found.

    @param soup: the parsed HTML data from BeautifulSoup
    @ptype soup: BeautifulSoup.Tag
    @param recipe: the gourmet recipe to fill
    @ptype recipe: dict
    @return: nothing, the recipe will be modified instead
    @rtype: None
    """
    # matches only attributes that are present and not empty
    nonempty = re.compile(r".+")
    tag = soup.find(itemprop="name", content=nonempty)
    if tag:
        recipe['title'] = tag["content"]
    tag = soup.find(itemprop="recipeCategory")
    if tag:
        recipe['category'] = tag.text
    tag = soup.find(itemprop="ratingValue", content=nonempty)
    if tag:
        rating = float(tag["content"])
        # adjust "1 to 5" rating to "1 to 10" of gourmet
        recipe['rating'] = int(rating * 2)
    tag = soup.find(itemprop="image", src=nonempty)
    if tag:
        image = tag["src"]
        recipe['image'], unused = getdatafromurl(image,
                                                 content_type_check="image/")
    # stays zero if the preparation time is missing or cannot be parsed,
    # so the cooking time below then equals the total time
    preptime = 0
    tag = soup.find(itemprop="performTime", content=nonempty)
    if tag:
        try:
            preptime = parse_iso8601_duration(tag['content'])
        except ValueError:
            warn("could not parse prepTime %r" % tag['content'])
        else:
            recipe['preptime'] = preptime
    tag = soup.find(itemprop="totalTime", content=nonempty)
    if tag:
        try:
            totaltime = parse_iso8601_duration(tag['content'])
        except ValueError:
            # report the field that actually failed to parse
            warn("could not parse totalTime %r" % tag['content'])
        else:
            # the cooking time is the difference between total and prep time
            recipe['cooktime'] = totaltime - preptime
    tag = soup.find(itemprop="recipeYield")
    if tag and tag.text:
        recipe['yields'] = tag.text.strip()
    tag = soup.find(itemprop="description")
    if tag:
        # replace images in the text with alternative text since they
        # are sometimes used in place
        for img in tag.find_all('img'):
            img.string = translate_image_text("[" + get_alt_text(img) + "]")
        recipe['instructions'] = tag.get_text()
    tag = soup.find("div", attrs={"class": "tips"})
    if tag:
        p = tag.find("p")
        if p:
            recipe['modifications'] = p.text.strip()
    categories = soup.find_all(id="recipesCatFilterLink")
    if categories:
        cattext = ", ".join(cat.text.strip() for cat in categories)
        text = "%s %s" % (_("Category:"), cattext)
        # append the category list to already found tips, if any
        if "modifications" in recipe:
            recipe["modifications"] += "\n\n" + text
        else:
            recipe["modifications"] = text