Python scrape_url Examples, scrape_schema_recipe.scrape_url Python Examples

Example #1

0

Show file

File: scraper.py Project: jeefberkey/mealie

def extract_recipe_from_html(html: str, url: str) -> dict:
    """Build a recipe from a page's HTML, fetching ``url`` when that is empty.

    Schema.org recipe data is parsed out of ``html`` first; when nothing is
    found, ``url`` itself is scraped.  The whole sequence is attempted with
    ``python_objects=True`` and, if anything in it raises, the error is
    printed and the sequence is repeated once with the scraper's default
    options.  When no structured recipe turns up at all, a basic recipe is
    built from the page's opengraph metadata instead.

    Returns the cleaned recipe, or the string ``"fail"`` when the first
    scraped entry is empty.
    """

    def _scrape(**options):
        # Parse the supplied HTML, record the raw result, then fall back to
        # fetching the URL when the HTML yielded nothing.
        found = scrape_schema_recipe.loads(html, **options)
        dump_last_json(found)
        if not found:
            found = scrape_schema_recipe.scrape_url(url, **options)
        return found

    try:
        candidates = _scrape(python_objects=True)
    except Exception as err:
        print(err)
        candidates = _scrape()

    if not candidates:
        fallback = open_graph.basic_recipe_from_opengraph(html, url)
        logger.info(f"Recipe Scraped from opengraph metadata: {fallback}")
        return fallback

    first = candidates[0]
    logger.info(f"Recipe Scraped From Web: {first}")

    if not first:
        return "fail"  # TODO: Return Better Error Here

    return Cleaner.clean(first, url)

Example #2

0

Show file

def process_recipe_url(url: str) -> dict:
    """Scrape the recipe at ``url`` and decorate it with mealie bookkeeping fields.

    Args:
        url: Address of the page to scrape.

    Returns:
        The first scraped recipe, updated with slug/orgURL/categories/tags/
        dateAdded/notes/extras and with ``image`` replaced by the downloaded
        image's file name (or ``None`` when the download fails).  The string
        ``"fail"`` is returned when the page yields no recipe.
    """
    scraped = scrape_url(url, python_objects=True)
    # An empty result list used to raise IndexError on ``[0]`` before the
    # "fail" branch below could ever run; treat it as "no recipe" instead.
    new_recipe: dict = scraped[0] if scraped else {}
    logger.info(f"Recipe Scraped From Web: {new_recipe}")

    if not new_recipe:
        return "fail"  # TODO: Return Better Error Here

    slug = slugify(new_recipe["name"])
    mealie_tags = {
        "slug": slug,
        "orgURL": url,
        "categories": [],
        "tags": [],
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }

    new_recipe.update(mealie_tags)

    # The image download is best effort: any ordinary failure leaves the
    # recipe without an image, but interpreter-exit signals still propagate.
    try:
        img_path = scrape_image(new_recipe.get("image"), slug)
        new_recipe["image"] = img_path.name
    except Exception:
        new_recipe["image"] = None

    return new_recipe

Example #3

0

Show file

def get_recipe():
    """Return the first recipe scraped from the ``url`` query argument as JSON.

    When the page yields no recipes the request is aborted with a plain-text
    apology response.
    """
    recipe_list = scrape_schema_recipe.scrape_url(request.args.get('url'))
    if len(recipe_list) >= 1:
        return jsonify(recipe_list[0])
    abort(Response("Sorry couldn't find a recipe on this page."))

Example #4

0

Show file

def extract(url, _):
    """Scrape ``url`` and convert its first schema.org recipe into a ``Recipe``.

    Args:
        url: Address of the page to scrape.
        _: Unused second positional argument, kept for interface compatibility.

    Returns:
        A ``Recipe`` built from the first scraped entry, or ``None`` when
        scraping fails or the page contains no recipe.

    Raises:
        KeyError: If the scraped entry lacks ``name``, ``recipeIngredient``,
            ``recipeInstructions`` or ``url``.
    """
    try:
        json_recipes = scrape_schema_recipe.scrape_url(url,
                                                       python_objects=True)
    except Exception:
        # Scraping is best effort; ordinary failures mean "nothing found".
        return None

    if not json_recipes:
        return None
    json_recipe = json_recipes[0]

    tags = []
    if "cookingMethod" in json_recipe:
        tags.append(json_recipe["cookingMethod"])
    if "recipeCategory" in json_recipe:
        append_or_extend(tags, json_recipe["recipeCategory"])
    if "recipeCuisine" in json_recipe:
        tags.append(json_recipe["recipeCuisine"])
    if "keywords" in json_recipe:
        kw = json_recipe["keywords"]
        if isinstance(kw, str):
            kw = kw.split(',')
        append_or_extend(tags, kw)

    description_parts = []
    if "description" in json_recipe:
        description_parts.append(json_recipe["description"])
    if "image" in json_recipe:
        image = json_recipe["image"]
        if isinstance(image, list):
            image = image[0]
        # Markdown image syntax is ![](target); the previous code emitted a
        # stray double quote before the closing parenthesis.
        description_parts.append(f'![]({image})')

    yields = []
    if "recipeYield" in json_recipe:
        yields.append(RecipeParser.parse_amount(json_recipe["recipeYield"]))

    recipe = Recipe(
        title=json_recipe["name"],
        description="\n\n".join(description_parts),
        tags=tags,
        yields=yields,
        ingredients=[
            Ingredient(name=ingred)
            for ingred in json_recipe["recipeIngredient"]
        ],
        # The source URL is appended after the instructions.
        instructions=
        f'{create_instructions(json_recipe["recipeInstructions"])}\n\n{json_recipe["url"]}',
    )

    return recipe

Example #5

0

Show file

    def getReccRecipes(self, content):
        """Recommend recipes based on the items in a user's inventory.

        Looks up the inventory belonging to ``content['userID']``, builds a
        foodnetwork.com search URL from up to five of its item names (ordered
        by expiration), and scrapes every recipe link on the result page.

        Args:
            content: Mapping that contains the key ``'userID'``.

        Returns:
            A ``(json_body, 200)`` tuple whose body is ``{"data": [...]}``
            with one dict per scraped recipe (name, ingredients,
            instructions and, when available, cook time).

        Raises:
            IndexError: If no ``Users`` row matches the given user id.
        """
        self.ensureConnected()
        sql = "SELECT (inventoryID) FROM Users WHERE id = %s"
        val = (content['userID'], )
        self.cursor.execute(sql, val)
        result = self.cursor.fetchall()

        inventoryID = result[0][0]

        sql = "SELECT (itemname) FROM Items WHERE inventoryID = %s ORDER BY expiration"
        val = (inventoryID, )
        self.cursor.execute(sql, val)
        result = self.cursor.fetchall()

        searchUrl = 'https://www.foodnetwork.com/search/'

        # Slice instead of indexing 0..4 so inventories with fewer than five
        # items no longer raise IndexError.
        for row in result[:5]:
            # str.replace returns a new string; the previous code discarded
            # it, so spaces were never actually replaced.
            searchUrl = searchUrl + "-" + row[0].replace(' ', '-')
        searchUrl = searchUrl + '-'

        searchRequest = requests.get(searchUrl)
        soup = BeautifulSoup(searchRequest.text)

        temp = []
        for link in soup.find_all('h3', 'm-MediaBlock__a-Headline'):
            anchor = link.a
            recipeUrl = anchor.get('href') if anchor is not None else None
            # Skip headlines without a usable link and non-recipe results.
            if not recipeUrl or 'recipes' not in recipeUrl:
                continue

            # hrefs on the results page are protocol-relative.
            url = "https:" + recipeUrl
            recipe_list = scrape_schema_recipe.scrape_url(
                url, python_objects=True)
            if len(recipe_list) == 0:
                continue

            scraped = recipe_list[0]
            recipe = {
                'name': scraped['name'],
                'recipeIngredient': scraped['recipeIngredient'],
                'recipeInstructions': scraped['recipeInstructions'],
            }
            if 'cookTime' in scraped:
                recipe['cookTime'] = scraped['cookTime']

            temp.append(recipe)

        payload = {'data': temp}

        # default=str stringifies values json cannot encode natively.
        return (json.dumps(payload, default=str), 200)

Example #6

0

Show file

File: application.py Project: sherina0922/flask-aws-tutorial

def get_recipe(url):
    """Scrape ``url`` and return a summary of its first recipe.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with the keys ``name``, ``ingredients``, ``instructions``,
        ``prepTime``, ``calories`` and ``author``; any field missing from the
        scraped data is ``None``.  An empty dict is returned when the page
        cannot be scraped or contains no recipe.
    """
    try:
        recipe_list = scrape_schema_recipe.scrape_url(url, python_objects=True)
    except Exception:
        print('Could not scrape URL {}'.format(url))
        return {}

    # An empty result used to raise IndexError on ``[0]``.
    if not recipe_list:
        print('Could not scrape URL {}'.format(url))
        return {}

    recipe = recipe_list[0]

    # Missing dict keys raise KeyError, not AttributeError as the previous
    # handlers assumed, so absent fields used to crash instead of becoming
    # None.  ``dict.get`` expresses the intended default directly.
    name = recipe.get('name')
    if 'name' in recipe:
        print(name)

    try:
        # TypeError covers a ``nutrition``/``properties`` value that is not
        # a mapping.
        calories = recipe['nutrition']['properties']['calories']
    except (KeyError, TypeError):
        calories = None

    return {
        'name': name,
        'ingredients': recipe.get('recipeIngredient'),
        'instructions': recipe.get('recipeInstructions'),
        'prepTime': recipe.get('prepTime'),
        'calories': calories,
        'author': recipe.get('author'),
    }

Example #7

0

Show file

File: scrape_services.py Project: djbr22/mealie

def process_recipe_url(url: str) -> dict:
    """Scrape the recipe at ``url``, post-process it and download its image.

    Args:
        url: Address of the page to scrape.

    Returns:
        The processed recipe with ``image`` replaced by the downloaded
        image's file name (or ``None`` when the download fails).  The string
        ``"fail"`` is returned when the page yields no recipe.
    """
    scraped = scrape_url(url, python_objects=True)
    # An empty result list used to raise IndexError on ``[0]`` before the
    # "fail" branch below could ever run; treat it as "no recipe" instead.
    new_recipe: dict = scraped[0] if scraped else {}
    logger.info(f"Recipe Scraped From Web: {new_recipe}")

    if not new_recipe:
        return "fail"  # TODO: Return Better Error Here

    new_recipe = process_recipe_data(new_recipe, url)

    # The image download is best effort: any ordinary failure leaves the
    # recipe without an image, but interpreter-exit signals still propagate.
    try:
        img_path = scrape_image(
            normalize_image_url(new_recipe.get("image")), new_recipe.get("slug")
        )
        new_recipe["image"] = img_path.name
    except Exception:
        new_recipe["image"] = None

    return new_recipe

Example #8

0

Show file

File: application.py Project: sherina0922/flask-aws-tutorial

def scrape_search(list_link):
    '''
    Scrape the first recipe from every URL in ``list_link``.

    URLs that cannot be scraped, or that contain no recipe, are reported on
    stdout and skipped.

    Input:  (1) iterable of recipe page URLs
    Output: (1) list of data to be stored in MongoDB
    '''

    mongo_update_lst = []
    for recipe_url in list_link:
        try:
            r = scrape_schema_recipe.scrape_url(recipe_url,
                                                python_objects=True)
        except Exception:
            print('Could not scrape URL {}'.format(recipe_url))
            # Previously execution fell through with r = None and crashed on
            # r[0]; move on to the next URL instead.
            continue

        if not r:
            print('No recipe found at URL {}'.format(recipe_url))
            continue

        mongo_update_lst.append(r[0])
    return mongo_update_lst

Example #9

0

Show file

File: scrape_services.py Project: richardmitic/mealie

def extract_recipe_from_html(html: str, url: str) -> dict:
    """Build a recipe from a page's HTML, fetching ``url`` when that is empty.

    Schema.org recipe data is parsed out of ``html`` first; when nothing is
    found, ``url`` itself is scraped.  If neither yields a recipe, a basic
    recipe is built from the page's opengraph metadata.

    Returns the processed and normalized recipe, or the string ``"fail"``
    when the first scraped entry is empty.
    """
    candidates = scrape_schema_recipe.loads(html, python_objects=True)
    if not candidates:
        candidates = scrape_schema_recipe.scrape_url(url, python_objects=True)

    if not candidates:
        fallback = basic_recipe_from_opengraph(html, url)
        logger.info(f"Recipe Scraped from opengraph metadata: {fallback}")
        return fallback

    first = candidates[0]
    logger.info(f"Recipe Scraped From Web: {first}")

    if not first:
        return "fail"  # TODO: Return Better Error Here

    return normalize_data(process_recipe_data(first, url=url))

Example #10

0

Show file

File: scraping.py Project: TobReg/NLP_LegalDocs

from rdflib import Graph, plugin
import json, rdflib_jsonld
from rdflib.plugin import register, Serializer

# Register the 'json-ld' serializer plugin with rdflib.
register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')

# Candidate recipe pages tried during exploration.  Each assignment overwrites
# the previous one, so only the last URL is the one scraped below.
url = 'https://www.foodnetwork.com/recipes/alton-brown/honey-mustard-dressing-recipe-1939031'
url = 'https://foodnetwork.co.uk/recipes/honey-mustard-dressing/?utm_source=foodnetwork.com&utm_medium=domestic'
url = 'https://www.chefkoch.de/rezepte/2569781402262652/Buchweizen-mit-Pilzen.html'
url = 'https://www.allrecipes.com/recipe/246628/spaghetti-cacio-e-pepe/'
url = 'https://dieseekocht.com/2019/04/30/pasta-con-le-sarde-original-rezept-aus-sizilien/'
url = 'https://sz-magazin.sueddeutsche.de/das-rezept/kuerbis-mangold-quiche-mit-speck-und-apfel-89284/'
url = 'https://www.brigitte.de/rezepte/herbstkueche--pilzrezepte--wir-feiern-den-herbst--10651814.html'
url = 'https://www.epicurious.com/recipes/food/views/arepas-51245240'

# NOTE(review): scrape_schema_recipe is not imported in the visible lines;
# confirm it is imported elsewhere before running this script.
recipe_list = scrape_schema_recipe.scrape_url(url, python_objects=True)
# Bare expressions such as this one are evaluated and discarded when the file
# runs as a script; they only display a value in an interactive session.
len(recipe_list)

# Work with the first recipe found on the page.
recipe = recipe_list[0]
#test = recipe_list[1]
type(recipe)

len(recipe)

# Print every key available on the scraped recipe.
for key in recipe:
    print(key)

# Name of the recipe
recipe['name']

# List of the Ingredients

Example #11

0

Show file

 def test_scrape_url(self):
     """The first recipe scraped from ``self.url`` is named "Irish Coffee"."""
     scraped = scrape_url(self.url)
     self.recipes = scraped
     self.recipe = scraped[0]
     assert self.recipe["name"] == "Irish Coffee"

Example #12

0

Show file

 def test_scrape_url(self):
     """Passing an integer instead of a URL raises ``SSRTypeError``."""
     self.assertRaises(SSRTypeError, scrape_url, 0xC0FFEE)

Example #13

0

Show file

File: scrape_recipe.py Project: jeremyandrews/mealie

"""
Helper script to download raw recipe data from a URL and dump it to disk.
The resulting files can be used as test input data.
"""

import sys, json
from scrape_schema_recipe import scrape_url

# Every command-line argument is treated as a recipe URL; a failure on one
# URL is reported and does not stop the remaining downloads.
for url in sys.argv[1:]:
    try:
        data = scrape_url(url)[0]
        # Name the file after the last non-empty path segment of the URL.
        slug = [part for part in url.split("/") if part][-1]
        filename = f"{slug}.json"
        with open(filename, "w") as f:
            json.dump(data, f, indent=4, default=str)
    except Exception as err:
        print(f"Error for {url}: {err}")
    else:
        print(f"Saved {filename}")

Example #14

0

Show file

File: test_scrape.py Project: throttleup/scrape-schema-recipe

 def test_scrape_url(self):
     """The first recipe scraped from ``self.url`` is named 'Irish Coffee'."""
     scraped = scrape_url(self.url)
     self.recipes = scraped
     self.recipe = scraped[0]
     assert self.recipe['name'] == 'Irish Coffee'

Example #15

0

Show file

File: app.py Project: boonepeter/recipe-parser

def parse_recipe():
    """Parse the recipe at the ``url`` query argument into a flat dict.

    The page is first scraped for schema.org recipe data.  When exactly one
    recipe is found it is normalized in place: instructions become a list of
    HTML-escaped strings, keywords a list of escaped strings, image and
    author are reduced to a single value, the yield to its first entry, and
    the cook/prep/total times are converted with ``get_minutes``.

    If that path fails or finds anything other than exactly one recipe, the
    page is handed to ``scrape_me`` and a dict marked ``"@type": "noSchema"``
    is built from its result.

    Returns:
        The recipe dict, a 404 response when no ``url`` argument was given,
        or a 500 response when both scraping strategies fail.
    """
    recipe_url = request.args.get('url')
    if not recipe_url:
        # ``args.get`` returns None for a missing argument instead of
        # raising, so the try/except that used to wrap it could never
        # produce this response.
        return make_response(
            'Need a url in parameters. See <a href="/api">/api</a> for more info',
            404)
    try:
        recipes = scrape_schema_recipe.scrape_url(recipe_url)
        if len(recipes) == 1 and recipes[0] is not None:
            recipe = recipes[0]
            if 'recipeInstructions' in recipe:
                ins = recipe['recipeInstructions']
                if isinstance(ins, str):
                    recipe['recipeInstructions'] = [html.escape(ins)]
                elif isinstance(ins, list) and len(ins) > 0:
                    if isinstance(ins[0], dict):
                        # Keep only the ``text`` value of each step object.
                        recipe['recipeInstructions'] = []
                        for item in ins:
                            for k, v in item.items():
                                if k == 'text':
                                    recipe['recipeInstructions'].append(
                                        html.escape(v))
                    else:
                        recipe['recipeInstructions'] = [
                            html.escape(i)
                            for i in recipe['recipeInstructions']
                        ]
            if 'keywords' in recipe:
                recipe['keywords'] = [
                    html.escape(i.strip())
                    for i in recipe['keywords'].split(',')
                ]
            if 'image' in recipe:
                if isinstance(recipe['image'], dict):
                    if 'url' in recipe['image']:
                        recipe['image'] = recipe['image']['url']
            if 'image' in recipe:
                # Of several images, the last one is kept.
                if isinstance(recipe['image'], list):
                    recipe['image'] = recipe['image'][-1]
            if 'author' in recipe:
                if isinstance(recipe['author'],
                              dict) and 'name' in recipe['author']:
                    recipe['author'] = html.escape(recipe['author']['name'])
            if 'recipeYield' in recipe:
                rYield = recipe['recipeYield']
                if isinstance(rYield, str):
                    recipe['recipeYield'] = [
                        i.strip() for i in rYield.split(',')
                    ][0]
                if isinstance(rYield, list) and len(rYield) > 0:
                    recipe['recipeYield'] = rYield[0]
            if 'cookTime' in recipe:
                recipe['cookTime'] = get_minutes(recipe['cookTime'])
            if 'prepTime' in recipe:
                recipe['prepTime'] = get_minutes(recipe['prepTime'])
            if 'totalTime' in recipe:
                recipe['totalTime'] = get_minutes(recipe['totalTime'])
            return recipe
    except Exception as e:
        # Any failure on the schema.org path falls through to the
        # ``scrape_me`` fallback below.
        print(e.args)

    try:
        recipe = scrape_me(recipe_url)
        to_return = {
            "@type":
            "noSchema",
            "name":
            recipe.title(),
            "url":
            recipe.url(),
            "recipeIngredients":
            recipe.ingredients(),
            "recipeInstructions":
            [i for i in recipe.instructions().split('\n') if i != ""],
            "review":
            recipe.reviews(),
            "aggregateRating":
            recipe.ratings(),
            "totalTime":
            recipe.total_time(),
            "recipeYield":
            recipe.yields(),
            "image":
            recipe.image()
        }
        return to_return
    except Exception as e:
        return make_response(
            f'Error processing request. That domain might not be in the list\
             See <a href="/api">/api</a> for more info. Error: {e.args}', 500)

Example #16

0

Show file

# Read the recipe links, one per line, from the URL list file.
with open(r'./data/allrecipes_urls.txt') as f:
    urls = list(f)

# Strip the newline characters from both ends of every link.
urls_cleaned = [line.strip('\n') for line in urls]
# urls_subset = urls_cleaned[:5]

# Collects the recipes scraped by the loop below.
recipes = []

# loop through list of cleaned urls
for url in urls_cleaned:

    try:
        # scrape url and obtain page information
        recipe_list = scrape_schema_recipe.scrape_url(url)

        # get relevant information out of the page
        recipe = recipe_list[0]

        # transform time duration
        transform_time(recipe)

        # transform steps in recipe instructions
        transform_steps(recipe)

        # transform the data type for aggregate review
        transform_aggregate_review(recipe)

        # transform the data type for individual review
        transform_individual_review(recipe)

Example #17

0

Show file

def enterlink():
    """Handle the "enter link" form: scrape a recipe URL and store the result.

    On a valid submission the URL is rejected if a recipe with that source
    already exists; otherwise it is scraped and the first recipe found is
    saved together with its ingredients and directions.  Missing times
    default to a zero ``timedelta`` and a missing yield to ``'N/A'``.

    Returns:
        A redirect to ``recipeentered`` on success, to ``linkfailed`` when
        scraping raises, or back to ``enterlink`` (with a flash message) for
        duplicate or unparsable links; otherwise the rendered form.
    """
    form = EnterLinkForm()
    if form.validate_on_submit():
        url = str(form.url.data)
        if Recipe.query.filter_by(source=url).all():
            flash('That link has already been entered')
            return redirect(url_for('enterlink'))
        # ``except Exception`` (rather than a bare ``except``) keeps the
        # best-effort behaviour without swallowing SystemExit or
        # KeyboardInterrupt.
        try:
            recipe_list = scrape_schema_recipe.scrape_url(url,
                                                          python_objects=True)
        except Exception:
            return redirect(url_for('linkfailed'))
        if recipe_list:
            link_recipe = recipe_list[0]
            try:
                recipe = Recipe(title=link_recipe['name'])
            except Exception:
                flash('Something went wrong, try a different link')
                return redirect(url_for('enterlink'))
            recipe.source = url

            # Optional fields fall back to a neutral default when absent.
            no_time = datetime.timedelta(minutes=0)
            recipe.preptime = link_recipe.get('prepTime', no_time)
            recipe.cooktime = link_recipe.get('cookTime', no_time)
            recipe.totaltime = link_recipe.get('totalTime', no_time)
            recipe.serves = link_recipe.get('recipeYield', 'N/A')

            try:
                for line in link_recipe['recipeIngredient']:
                    recipe.ingredients.append(Ingredient(line=line))
                instructions = link_recipe['recipeInstructions']
                if isinstance(instructions, str):
                    recipe.directions.append(Directions(line=instructions))
                else:
                    for step in instructions:
                        # A step is either a plain string or an object that
                        # carries its wording under ``text``.
                        if isinstance(step, str):
                            text = step
                        else:
                            try:
                                text = step['text']
                            except (KeyError, TypeError, IndexError):
                                text = 'Could not parse directions.'
                        recipe.directions.append(Directions(line=text))
            except Exception:
                flash('Something went wrong, try a different link')
                return redirect(url_for('enterlink'))
            db.session.add(recipe)
            db.session.commit()
            return redirect(
                url_for('recipeentered', title=recipe.title, id=recipe.id))
    return render_template('enterlink.html', form=form)