def _load_schema_recipes(html: str, url: str, **scrape_options) -> list:
    """Return the recipes found in ``html``, re-fetching ``url`` if there are none.

    ``scrape_options`` are forwarded unchanged to both
    ``scrape_schema_recipe.loads`` and ``scrape_schema_recipe.scrape_url``.
    The result of ``loads`` is always passed to ``dump_last_json``.
    """
    recipes = scrape_schema_recipe.loads(html, **scrape_options)
    dump_last_json(recipes)

    if not recipes:
        recipes = scrape_schema_recipe.scrape_url(url, **scrape_options)

    return recipes


def extract_recipe_from_html(html: str, url: str) -> dict:
    """Build a recipe from a page's schema data, falling back to OpenGraph.

    Args:
        html: Markup of the recipe page.
        url: Address the markup came from; re-scraped when ``html`` yields
            no recipe, and passed on to the cleaner / OpenGraph fallback.

    Returns:
        The first scraped recipe after ``Cleaner.clean``, or the result of
        ``open_graph.basic_recipe_from_opengraph`` when nothing was scraped.
        The string ``"fail"`` is returned when the first scraped recipe is
        empty (kept for backward compatibility with existing callers).
    """
    try:
        scraped_recipes = _load_schema_recipes(html, url, python_objects=True)
    except Exception:
        # Retry without ``python_objects``; presumably the object conversion
        # is what failed -- TODO confirm. The error goes through the logger
        # (instead of ``print``) so it is not lost.
        logger.exception(f"Scraping with python_objects=True failed for {url}")
        scraped_recipes = _load_schema_recipes(html, url)

    if not scraped_recipes:
        new_recipe = open_graph.basic_recipe_from_opengraph(html, url)
        logger.info(f"Recipe Scraped from opengraph metadata: {new_recipe}")
        return new_recipe

    new_recipe = scraped_recipes[0]
    logger.info(f"Recipe Scraped From Web: {new_recipe}")

    if not new_recipe:
        return "fail"  # TODO: Return Better Error Here

    return Cleaner.clean(new_recipe, url)
def process_recipe_url(url: str) -> dict:
    """Scrape ``url`` and decorate the first recipe with the app's own fields.

    Args:
        url: Address of the recipe page.

    Returns:
        The scraped recipe dict, updated with ``slug``, ``orgURL`` and empty
        ``categories`` / ``tags`` / ``notes`` / ``extras`` entries, and with
        ``image`` replaced by the stored image's file name (``None`` when the
        image could not be stored). The string ``"fail"`` is returned when
        the page yields no recipe or an empty one.
    """
    scraped_recipes = scrape_url(url, python_objects=True)
    # Indexing an empty result used to raise IndexError before the "fail"
    # guard below could run.
    new_recipe: dict = scraped_recipes[0] if scraped_recipes else {}
    logger.info(f"Recipe Scraped From Web: {new_recipe}")

    if not new_recipe:
        return "fail"  # TODO: Return Better Error Here

    slug = slugify(new_recipe["name"])
    mealie_tags = {
        "slug": slug,
        "orgURL": url,
        "categories": [],
        "tags": [],
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }
    new_recipe.update(mealie_tags)

    try:
        img_path = scrape_image(new_recipe.get("image"), slug)
        new_recipe["image"] = img_path.name
    except Exception as e:
        # Best effort: a recipe without an image is still usable. A bare
        # ``except:`` would also have swallowed KeyboardInterrupt/SystemExit.
        logger.info(f"Could not store image for {slug}: {e}")
        new_recipe["image"] = None

    return new_recipe
def get_recipe():
    """Return the first recipe found at the ``url`` query parameter as JSON.

    Aborts with a 400 response when ``url`` is missing or empty, and with the
    "couldn't find a recipe" response when the page contains no recipe.
    """
    url = request.args.get('url')
    if not url:
        # Without this guard ``None`` is handed to the scraper and the
        # request ends in an unhandled error instead of a client error.
        abort(Response("Missing required 'url' query parameter.", status=400))

    recipe_list = scrape_schema_recipe.scrape_url(url)
    if not recipe_list:
        # NOTE(review): this Response carries the default status code; it is
        # kept as-is so existing clients see the same reply -- confirm
        # whether an error status is wanted here.
        abort(Response("Sorry couldn't find a recipe on this page."))

    return jsonify(recipe_list[0])
def extract(url, _):
    """Scrape ``url`` and convert its first schema recipe into a ``Recipe``.

    Args:
        url: Address of the recipe page.
        _: Unused; present only to satisfy the extractor call signature.

    Returns:
        A ``Recipe`` built from the first scraped entry, or ``None`` when
        scraping fails or the page contains no recipe.

    Raises:
        KeyError: If the scraped entry lacks ``name``, ``recipeIngredient``
            or ``recipeInstructions``.
    """
    try:
        json_recipes = scrape_schema_recipe.scrape_url(url, python_objects=True)
    except Exception:
        return None

    if not json_recipes:
        return None

    json_recipe = json_recipes[0]

    tags = []
    if "cookingMethod" in json_recipe:
        tags.append(json_recipe["cookingMethod"])
    if "recipeCategory" in json_recipe:
        append_or_extend(tags, json_recipe["recipeCategory"])
    if "recipeCuisine" in json_recipe:
        tags.append(json_recipe["recipeCuisine"])
    if "keywords" in json_recipe:
        kw = json_recipe["keywords"]
        if isinstance(kw, str):
            # Strip the blanks that follow each comma and drop empty entries
            # so tags do not start with a space.
            kw = [part.strip() for part in kw.split(',') if part.strip()]
        append_or_extend(tags, kw)

    description_parts = []
    if "description" in json_recipe:
        description_parts.append(json_recipe["description"])

    image = json_recipe.get("image")
    if isinstance(image, list):
        image = image[0] if image else None
    if isinstance(image, dict):
        image = image.get("url")
    if image:
        # The link previously ended in `")`, which is not a valid Markdown
        # image reference.
        description_parts.append(f'![]({image})')

    yields = []
    if "recipeYield" in json_recipe:
        yields.append(RecipeParser.parse_amount(json_recipe["recipeYield"]))

    # Fall back to the requested address when the page does not state its own.
    source_url = json_recipe.get("url", url)

    recipe = Recipe(
        title=json_recipe["name"],
        description="\n\n".join(description_parts),
        tags=tags,
        yields=yields,
        ingredients=[
            Ingredient(name=ingred)
            for ingred in json_recipe["recipeIngredient"]
        ],
        instructions=
        f'{create_instructions(json_recipe["recipeInstructions"])}\n\n{source_url}',
    )

    return recipe
def getReccRecipes(self, content):
    """Recommend Food Network recipes based on the user's inventory items.

    Looks up the inventory of ``content['userID']``, takes up to five item
    names in ``ORDER BY expiration`` order, searches foodnetwork.com for
    them and scrapes every result link that contains ``recipes``.

    Args:
        content: Mapping with a ``'userID'`` entry.

    Returns:
        A ``(json_string, 200)`` tuple whose JSON has a ``data`` list of
        dicts with ``name``, ``recipeIngredient``, ``recipeInstructions``
        and, when the page provides it, ``cookTime``.

    Raises:
        IndexError: If no ``Users`` row matches the id (unchanged behaviour).
    """
    self.ensureConnected()

    sql = "SELECT (inventoryID) FROM Users WHERE id = %s"
    self.cursor.execute(sql, (content['userID'], ))
    inventoryID = self.cursor.fetchall()[0][0]

    sql = "SELECT (itemname) FROM Items WHERE inventoryID = %s ORDER BY expiration"
    self.cursor.execute(sql, (inventoryID, ))
    items = self.cursor.fetchall()

    if not items:
        # Nothing to search for; previously this raised IndexError.
        return (json.dumps({'data': []}, default=str), 200)

    # Slice instead of ``range(0, 5)`` so inventories with fewer than five
    # items work, and actually use the result of ``replace`` (strings are
    # immutable, so the original call had no effect).
    searchTerms = ''.join("-" + row[0].replace(' ', '-') for row in items[:5])
    searchUrl = 'https://www.foodnetwork.com/search/' + searchTerms + '-'

    searchRequest = requests.get(searchUrl)
    soup = BeautifulSoup(searchRequest.text)

    temp = []
    for link in soup.find_all('h3', 'm-MediaBlock__a-Headline'):
        # Headlines without an anchor or without an href are skipped.
        recipeUrl = link.a.get('href') if link.a is not None else None
        if not recipeUrl or 'recipes' not in recipeUrl:
            continue

        url = "https:" + recipeUrl
        recipe_list = scrape_schema_recipe.scrape_url(url, python_objects=True)
        if not recipe_list:
            continue

        scraped = recipe_list[0]
        recipe = {
            'name': scraped['name'],
            'recipeIngredient': scraped['recipeIngredient'],
            'recipeInstructions': scraped['recipeInstructions']
        }
        if 'cookTime' in scraped:
            recipe['cookTime'] = scraped['cookTime']
        temp.append(recipe)

    payload = {'data': temp}
    return (json.dumps(payload, default=str), 200)
def get_recipe(url):
    """Scrape ``url`` and return a trimmed-down view of its first recipe.

    Args:
        url: Address of the recipe page.

    Returns:
        A dict with ``name``, ``ingredients``, ``instructions``, ``prepTime``,
        ``calories`` and ``author``; each value is ``None`` when the page
        does not provide it. An empty dict is returned when the page cannot
        be scraped or contains no recipe.
    """
    try:
        recipe_list = scrape_schema_recipe.scrape_url(url, python_objects=True)
    except Exception:
        print('Could not scrape URL {}'.format(url))
        return {}

    if not recipe_list:
        print('No recipe found at URL {}'.format(url))
        return {}

    recipe = recipe_list[0]

    # Missing dict keys raise KeyError, not AttributeError, so the original
    # handlers never fired; ``dict.get`` gives the intended ``None`` default.
    name = recipe.get('name')
    if 'name' in recipe:
        print(name)

    try:
        calories = recipe['nutrition']['properties']['calories']
    except (AttributeError, KeyError, TypeError):
        # Absent at any nesting level, or an intermediate value that is not
        # subscriptable by key.
        calories = None

    return {
        'name': name,
        'ingredients': recipe.get('recipeIngredient'),
        'instructions': recipe.get('recipeInstructions'),
        'prepTime': recipe.get('prepTime'),
        'calories': calories,
        'author': recipe.get('author'),
    }
def process_recipe_url(url: str) -> dict:
    """Scrape ``url``, post-process the first recipe and store its image.

    Args:
        url: Address of the recipe page.

    Returns:
        The result of ``process_recipe_data`` for the first scraped recipe,
        with ``image`` replaced by the stored image's file name (``None``
        when the image could not be stored). The string ``"fail"`` is
        returned when the page yields no recipe or an empty one.
    """
    scraped_recipes = scrape_url(url, python_objects=True)
    # Indexing an empty result used to raise IndexError before the "fail"
    # guard below could run.
    new_recipe: dict = scraped_recipes[0] if scraped_recipes else {}
    logger.info(f"Recipe Scraped From Web: {new_recipe}")

    if not new_recipe:
        return "fail"  # TODO: Return Better Error Here

    new_recipe = process_recipe_data(new_recipe, url)

    try:
        img_path = scrape_image(
            normalize_image_url(new_recipe.get("image")),
            new_recipe.get("slug"))
        new_recipe["image"] = img_path.name
    except Exception as e:
        # Best effort: a recipe without an image is still usable. A bare
        # ``except:`` would also have swallowed KeyboardInterrupt/SystemExit.
        logger.info(f"Could not store image for {new_recipe.get('slug')}: {e}")
        new_recipe["image"] = None

    return new_recipe
def scrape_search(list_link):
    '''
    Scrape the first recipe from every URL in ``list_link``.

    Input:  (1) iterable of recipe page URLs
    Output: (1) list with the first scraped recipe of each URL that could be
                scraped; URLs that fail or contain no recipe are skipped
    '''
    mongo_update_lst = []
    for recipe_url in list_link:
        try:
            r = scrape_schema_recipe.scrape_url(recipe_url,
                                                python_objects=True)
        except Exception:
            print('Could not scrape URL {}'.format(recipe_url))
            # Previously execution continued with ``r = None`` and crashed
            # on ``r[0]``.
            continue

        if not r:
            print('No recipe found at URL {}'.format(recipe_url))
            continue

        mongo_update_lst.append(r[0])
    return mongo_update_lst
def extract_recipe_from_html(html: str, url: str) -> dict:
    """Build a recipe from a page's schema data, falling back to OpenGraph.

    The markup in ``html`` is parsed first; only when it yields nothing is
    ``url`` scraped again. The first recipe found is run through
    ``process_recipe_data`` and ``normalize_data``. When no recipe is found
    at all, ``basic_recipe_from_opengraph`` supplies the result. The string
    ``"fail"`` is returned when the first scraped recipe is empty.
    """
    candidates: List[dict] = (
        scrape_schema_recipe.loads(html, python_objects=True)
        or scrape_schema_recipe.scrape_url(url, python_objects=True)
    )

    if not candidates:
        fallback = basic_recipe_from_opengraph(html, url)
        logger.info(f"Recipe Scraped from opengraph metadata: {fallback}")
        return fallback

    first: dict = candidates[0]
    logger.info(f"Recipe Scraped From Web: {first}")

    if not first:
        return "fail"  # TODO: Return Better Error Here

    processed = process_recipe_data(first, url=url)
    return normalize_data(processed)
# Exploratory script: scrape one recipe page and inspect the first result.
# NOTE(review): ``scrape_schema_recipe`` is used below but its import is not
# visible in this excerpt -- confirm it is imported elsewhere.
from rdflib import Graph, plugin
import json, rdflib_jsonld
from rdflib.plugin import register, Serializer
# Register the 'json-ld' serializer name with rdflib, pointing it at
# rdflib_jsonld.serializer.JsonLDSerializer.
register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')

# Candidate recipe pages. Each assignment overwrites the previous one, so
# only the LAST url is scraped; reorder the lines to try another page.
url = 'https://www.foodnetwork.com/recipes/alton-brown/honey-mustard-dressing-recipe-1939031'
url = 'https://foodnetwork.co.uk/recipes/honey-mustard-dressing/?utm_source=foodnetwork.com&utm_medium=domestic'
url = 'https://www.chefkoch.de/rezepte/2569781402262652/Buchweizen-mit-Pilzen.html'
url = 'https://www.allrecipes.com/recipe/246628/spaghetti-cacio-e-pepe/'
url = 'https://dieseekocht.com/2019/04/30/pasta-con-le-sarde-original-rezept-aus-sizilien/'
url = 'https://sz-magazin.sueddeutsche.de/das-rezept/kuerbis-mangold-quiche-mit-speck-und-apfel-89284/'
url = 'https://www.brigitte.de/rezepte/herbstkueche--pilzrezepte--wir-feiern-den-herbst--10651814.html'
url = 'https://www.epicurious.com/recipes/food/views/arepas-51245240'

# Scrape the selected page.
recipe_list = scrape_schema_recipe.scrape_url(url, python_objects=True)
# Bare expressions like the ones below only display a value in an
# interactive session; they have no effect when run as a script.
len(recipe_list)
# First recipe on the page (raises IndexError when none was found).
recipe = recipe_list[0]
#test = recipe_list[1]
type(recipe)
len(recipe)
# List every field the scraped recipe provides.
for key in recipe:
    print(key)

# Name of the recipe
recipe['name']

# List of the ingredients
# NOTE(review): the excerpt ends here; the statement this comment introduces
# is not visible.
def test_scrape_url(self):
    """Scraping ``self.url`` yields a first recipe named "Irish Coffee".

    The scraped list and its first entry are kept on the instance as
    ``self.recipes`` and ``self.recipe``.
    """
    scraped = scrape_url(self.url)
    self.recipes = scraped
    self.recipe = scraped[0]

    expected_name = "Irish Coffee"
    assert self.recipe["name"] == expected_name
def test_scrape_url(self):
    """``scrape_url`` raises ``SSRTypeError`` when handed an integer."""
    not_a_url = 0xC0FFEE
    with self.assertRaises(SSRTypeError):
        scrape_url(not_a_url)
"""
Helper script to download raw recipe data from a URL and dump it to disk.
The resulting files can be used as test input data.
"""
import json
import sys
from urllib.parse import urlparse

from scrape_schema_recipe import scrape_url


def _slug_from_url(url: str) -> str:
    """Return the last non-empty path segment of ``url`` for use as a file name.

    Query string and fragment are ignored so they cannot leak into the file
    name; a URL without any path segment falls back to its host name.
    """
    parsed = urlparse(url)
    segments = [part for part in parsed.path.split("/") if part]
    if segments:
        return segments[-1]
    return parsed.netloc or "recipe"


def main(urls) -> None:
    """Scrape each URL and write its first recipe to ``<slug>.json``.

    Failures are reported on stdout and do not stop the remaining URLs.
    """
    for url in urls:
        try:
            recipes = scrape_url(url)
            if not recipes:
                # Clearer than the IndexError text the old ``[0]`` produced.
                print(f"Error for {url}: no recipe found")
                continue

            filename = f"{_slug_from_url(url)}.json"
            with open(filename, "w", encoding="utf-8") as f:
                # ``default=str`` stringifies values json cannot encode.
                json.dump(recipes[0], f, indent=4, default=str)
            print(f"Saved {filename}")
        except Exception as e:
            print(f"Error for {url}: {e}")


if __name__ == "__main__":
    main(sys.argv[1:])
def test_scrape_url(self):
    """Scraping ``self.url`` yields a first recipe named 'Irish Coffee'.

    The scraped list and its first entry are kept on the instance as
    ``self.recipes`` and ``self.recipe``.
    """
    scraped = scrape_url(self.url)
    self.recipes = scraped
    self.recipe = scraped[0]

    expected_name = 'Irish Coffee'
    assert self.recipe['name'] == expected_name
def _normalize_schema_recipe(recipe):
    """Flatten and HTML-escape selected fields of a scraped recipe, in place.

    Returns the same ``recipe`` mapping for convenience.
    """
    ins = recipe.get('recipeInstructions')
    if isinstance(ins, str):
        recipe['recipeInstructions'] = [html.escape(ins)]
    elif isinstance(ins, list) and ins:
        steps = []
        for item in ins:
            if isinstance(item, dict):
                # Dict steps contribute their 'text' value only.
                if 'text' in item:
                    steps.append(html.escape(item['text']))
            else:
                steps.append(html.escape(item))
        recipe['recipeInstructions'] = steps

    keywords = recipe.get('keywords')
    if isinstance(keywords, str):
        keywords = keywords.split(',')
    if isinstance(keywords, list):
        # Keywords that already arrive as a list used to raise on
        # ``.split`` and silently push the request into the fallback scraper.
        recipe['keywords'] = [
            html.escape(word.strip()) for word in keywords
            if isinstance(word, str)
        ]

    image = recipe.get('image')
    if isinstance(image, dict) and 'url' in image:
        image = image['url']
    if isinstance(image, list) and image:
        image = image[-1]
        if isinstance(image, dict) and 'url' in image:
            image = image['url']
    if 'image' in recipe:
        recipe['image'] = image

    author = recipe.get('author')
    if isinstance(author, dict) and 'name' in author:
        recipe['author'] = html.escape(author['name'])

    rYield = recipe.get('recipeYield')
    if isinstance(rYield, str):
        recipe['recipeYield'] = rYield.split(',')[0].strip()
    elif isinstance(rYield, list) and rYield:
        recipe['recipeYield'] = rYield[0]

    for field in ('cookTime', 'prepTime', 'totalTime'):
        if field in recipe:
            recipe[field] = get_minutes(recipe[field])

    return recipe


def parse_recipe():
    """Return the recipe found at the ``url`` query parameter.

    The page's schema data is tried first and used when it contains exactly
    one recipe; otherwise (or when that attempt raises) ``scrape_me`` is
    used as a fallback.

    Returns:
        The normalized schema recipe dict, or a ``"@type": "noSchema"`` dict
        built from ``scrape_me``. A 404 response is returned when ``url`` is
        missing and a 500 response when the fallback fails too.
    """
    # ``request.args.get`` returns None instead of raising, so the original
    # try/except could never produce the "Need a url" response.
    recipe_url = request.args.get('url')
    if not recipe_url:
        return make_response(
            'Need a url in parameters. See <a href="/api">/api</a> for more info',
            404)

    try:
        recipes = scrape_schema_recipe.scrape_url(recipe_url)
        # NOTE(review): pages with more than one recipe deliberately(?) fall
        # through to the fallback scraper -- confirm this is intended.
        if len(recipes) == 1 and recipes[0] is not None:
            return _normalize_schema_recipe(recipes[0])
    except Exception as e:
        # Best effort: any problem with the schema data falls through to
        # the fallback scraper below.
        print(e.args)

    try:
        recipe = scrape_me(recipe_url)
        # NOTE(review): the key is "recipeIngredients" here while the schema
        # path exposes "recipeIngredient"; kept for existing clients.
        to_return = {
            "@type": "noSchema",
            "name": recipe.title(),
            "url": recipe.url(),
            "recipeIngredients": recipe.ingredients(),
            "recipeInstructions":
            [i for i in recipe.instructions().split('\n') if i != ""],
            "review": recipe.reviews(),
            "aggregateRating": recipe.ratings(),
            "totalTime": recipe.total_time(),
            "recipeYield": recipe.yields(),
            "image": recipe.image()
        }
        return to_return
    except Exception as e:
        return make_response(
            'Error processing request. That domain might not be in the list. '
            f'See <a href="/api">/api</a> for more info. Error: {e.args}',
            500)
with open(r'./data/allrecipes_urls.txt') as f: urls = list(f.readlines()) # remove '\n' at end of each link and put in new list urls_cleaned = [url.strip('\n') for url in urls] # urls_subset = urls_cleaned[:5] # initalize empty list to store recipes recipes = [] # loop through list of cleaned urls for url in urls_cleaned: try: # scrape url and obtain page information recipe_list = scrape_schema_recipe.scrape_url(url) # get relevant information out of the page recipe = recipe_list[0] # transform time duration transform_time(recipe) # transform steps in recipe instructions transform_steps(recipe) # transform the data type for aggregate review transform_aggregate_review(recipe) # transform the data type for individual review transform_individual_review(recipe)
def _direction_text(step):
    """Return the text of one scraped instruction step, or a placeholder."""
    if isinstance(step, str):
        return step
    try:
        return step['text']
    except (KeyError, TypeError, IndexError):
        return 'Could not parse directions.'


def enterlink():
    """Handle the "enter a recipe link" form.

    On a valid submission the link is scraped and its first recipe is stored
    as a ``Recipe`` with its ingredients and directions, followed by a
    redirect to ``recipeentered``. Duplicate links and unusable recipes flash
    a message and redirect back to this view; a failing scrape redirects to
    ``linkfailed``. Otherwise the form page is rendered.
    """
    form = EnterLinkForm()
    if not form.validate_on_submit():
        return render_template('enterlink.html', form=form)

    url = str(form.url.data)
    # Only existence matters, so fetch at most one row instead of all.
    if Recipe.query.filter_by(source=url).first() is not None:
        flash('That link has already been entered')
        return redirect(url_for('enterlink'))

    try:
        recipe_list = scrape_schema_recipe.scrape_url(url, python_objects=True)
    except Exception:
        return redirect(url_for('linkfailed'))

    if not recipe_list:
        # Previously the form was re-rendered without any feedback.
        flash('Something went wrong, try a different link')
        return render_template('enterlink.html', form=form)

    link_recipe = recipe_list[0]
    try:
        recipe = Recipe(title=link_recipe['name'])
    except Exception:
        flash('Something went wrong, try a different link')
        return redirect(url_for('enterlink'))

    recipe.source = url

    # Optional fields fall back to neutral defaults when the page omits them.
    no_time = datetime.timedelta(minutes=0)
    recipe.preptime = link_recipe.get('prepTime', no_time)
    recipe.cooktime = link_recipe.get('cookTime', no_time)
    recipe.totaltime = link_recipe.get('totalTime', no_time)
    recipe.serves = link_recipe.get('recipeYield', 'N/A')

    try:
        for line in link_recipe['recipeIngredient']:
            recipe.ingredients.append(Ingredient(line=line))

        instructions = link_recipe['recipeInstructions']
        if isinstance(instructions, str):
            # A single block of text becomes one direction.
            instructions = [instructions]
        for step in instructions:
            recipe.directions.append(Directions(line=_direction_text(step)))
    except Exception:
        flash('Something went wrong, try a different link')
        return redirect(url_for('enterlink'))

    db.session.add(recipe)
    db.session.commit()
    return redirect(
        url_for('recipeentered', title=recipe.title, id=recipe.id))