def clean_item(old_dict):
    """Rebuild a stored recipe record through ``RecipeItemLoader``.

    A shallow copy of ``old_dict`` is made, its ``ts`` and ``_id`` keys are
    removed, and every remaining key/value pair is passed to ``set_value``
    together with the loader.

    Returns a ``(new_item, source_dict)`` tuple: the result of
    ``loader.load_item()`` and the trimmed copy that fed the loader.
    Raises ``KeyError`` when ``old_dict`` has no ``ts`` or ``_id`` key.
    """
    # work on a copy so the caller's dict stays unmodified
    source_dict = dict(old_dict)
    for unwanted_key in ('ts', '_id'):
        del source_dict[unwanted_key]

    if VERBOSE:
        message = "Examining '%s' from '%s' (%s)..." % (
            old_dict['name'], old_dict['source'], old_dict['_id'])
        print(message)

    loader = RecipeItemLoader(RecipeItem())
    for field_name, field_value in source_dict.iteritems():
        # set_value returns the loader to keep using
        loader = set_value(loader, field_name, field_value)

    return loader.load_item(), source_dict
def parse_item(self, response):
    """Build one recipe item from the response.

    The item is seeded from ``parse_recipe(hxs, data)`` and then receives
    the image, description and name found in the page markup.

    Returns a one-element list holding the loaded item.
    """
    selector = HtmlXPathSelector(response)
    seed = {"url": response.url, "source": self.source}
    item_loader = RecipeItemLoader(
        item=RecipeItem.from_dict(parse_recipe(selector, seed)))

    image_srcs = select_class(selector, "post_image").select("@src").extract()
    item_loader.add_value("image", image_srcs)

    meta_description = selector.select(
        '//meta[@name="description"]/@content').extract()
    item_loader.add_value("description", meta_description)

    title_texts = select_class(
        selector, "entry-title").select("text()").extract()
    item_loader.add_value("name", title_texts)

    return [item_loader.load_item()]
def parse_item(self, response):
    """Build one recipe item from the response and return it directly.

    The item is seeded from ``parse_recipe``; image, description and name
    are then added from the page markup, in that order.
    """
    page = HtmlXPathSelector(response)
    base_fields = {'url': response.url, 'source': self.source}
    parsed = parse_recipe(page, base_fields)
    loader = RecipeItemLoader(item=RecipeItem.from_dict(parsed))

    # each finder is only evaluated right before its value is added
    extras = (
        ('image',
         lambda: select_class(page, 'post_image').select('@src')),
        ('description',
         lambda: page.select('//meta[@name="description"]/@content')),
        ('name',
         lambda: select_class(page, 'entry-title').select('text()')),
    )
    for field, find in extras:
        loader.add_value(field, find().extract())

    return loader.load_item()
def clean_item(old_dict):
    """Run a stored recipe dict back through ``RecipeItemLoader``.

    ``old_dict`` itself is left untouched: a copy is stripped of its ``ts``
    and ``_id`` entries and each remaining pair is applied with
    ``set_value``.

    Returns ``(new_item, source_dict)`` where ``new_item`` comes from
    ``loader.load_item()`` and ``source_dict`` is the stripped copy.
    Raises ``KeyError`` if ``ts`` or ``_id`` is missing.
    """
    source_dict = dict(old_dict)
    # these two fields must not reach the loader
    source_dict.pop('ts')
    source_dict.pop('_id')

    if VERBOSE:
        details = (old_dict['name'], old_dict['source'], old_dict['_id'])
        print("Examining '%s' from '%s' (%s)..." % details)

    loader = RecipeItemLoader(RecipeItem())
    for key, value in source_dict.iteritems():
        loader = set_value(loader, key, value)

    new_item = loader.load_item()
    return new_item, source_dict
def parse_item(self, response):
    """Load a single recipe item from the response.

    Starts from the dict produced by ``parse_recipe`` and adds the image,
    description and name taken from the page.

    Returns a list containing just that item.
    """
    doc = HtmlXPathSelector(response)
    recipe_item = RecipeItem.from_dict(
        parse_recipe(doc, {'url': response.url, 'source': self.source}))
    item_loader = RecipeItemLoader(item=recipe_item)

    images = select_class(doc, 'post_image').select('@src').extract()
    item_loader.add_value('image', images)

    descriptions = doc.select(
        '//meta[@name="description"]/@content').extract()
    item_loader.add_value('description', descriptions)

    names = select_class(doc, 'entry-title').select('text()').extract()
    item_loader.add_value('name', names)

    loaded = item_loader.load_item()
    return [loaded]
def parse_item(self, response):
    """Load one recipe item per ``blockquote.recipe.hrecipe`` element.

    Returns a list of loaded items, empty when the page has no element
    matching ``//blockquote[@class="recipe hrecipe"]``.
    """
    document = HtmlXPathSelector(response)

    # NOTE(review): these field XPaths start with '//', which addresses the
    # whole document rather than the recipe scope -- confirm this is intended
    # for pages that hold more than one recipe.
    xpaths = {
        'name': '//*[@class="fn"]/text()',
        'description': '//*[@class="summary"]/p/text()',
        'image': '//img[@class="photo"]/@src',
        'prepTime': '//*[@class="preptime"]/text()',
        'cookTime': '//*[@class="cooktime"]/text()',
        'recipeYield': '//*[@class="yield"]/text()',
        'ingredients': '//*[@class="ingredient"]/p/text()',
    }

    def build_item(scope):
        """Load a single RecipeItem from one recipe scope."""
        loader = RecipeItemLoader(item=RecipeItem())

        def add_from_xpath(field):
            loader.add_value(field, scope.select(xpaths[field]).extract())

        loader.add_value('source', self.source)
        add_from_xpath('name')
        add_from_xpath('image')
        loader.add_value('url', response.url)
        add_from_xpath('description')
        add_from_xpath('prepTime')
        add_from_xpath('cookTime')
        add_from_xpath('recipeYield')
        add_from_xpath('ingredients')
        return loader.load_item()

    scopes = document.select('//blockquote[@class="recipe hrecipe"]')
    return [build_item(scope) for scope in scopes]
def parse_item(self, response):
    """Load one recipe item per ``//div[@class="innerrecipe"]`` element.

    Returns the list of loaded items (empty when nothing matches).
    """
    document = HtmlXPathSelector(response)

    leading_fields = (
        ("name", '*//h2[@class="fn"]/text()'),
        ("image", '*//img[@class="photo"]/@src'),
    )
    detail_fields = (
        ("prepTime", '*//span[@class="preptime"]/text()'),
        ("cookTime", '*//span[@class="cooktime"]/text()'),
        ("totalTime", '*//span[@class="duration"]/text()'),
        ("recipeYield", '*//span[@class="yield"]/text()'),
    )
    ingredient_xpath = '*//*[@class="ingredient"]/p'
    published_xpath = '//div[@class="post fullpost singlepost"]//div[@class="postmeta"]/text()[normalize-space()]'

    items = []
    for block in document.select('//div[@class="innerrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value("source", self.source)
        for field, xpath in leading_fields:
            loader.add_value(field, block.select(xpath).extract())
        loader.add_value("url", response.url)
        for field, xpath in detail_fields:
            loader.add_value(field, block.select(xpath).extract())

        # the text nodes of each ingredient <p> are glued into one string
        loader.add_value("ingredients", [
            "".join(paragraph.select("text()").extract())
            for paragraph in block.select(ingredient_xpath)
        ])
        loader.add_value(
            "datePublished", block.select(published_xpath).extract())
        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Load one recipe item per ``//*[@class="recipe"]`` element.

    Name, image and URL come from the page's Open Graph meta tags; the
    yield and ingredients come from the first paragraphs of the scope.

    Returns the list of loaded items.
    """
    page = HtmlXPathSelector(response)

    og_fields = (
        ('name', '//meta[@property="og:title"]/@content'),
        ('image', '//meta[@property="og:image"]/@content'),
        ('url', '//meta[@property="og:url"]/@content'),
    )
    yield_xpath = './p[1]/text()'
    ingredient_xpath = './p[1][br]|./p[2][br]'
    published_xpath = '//time[@class="entry-date"]/@datetime'

    loaded = []
    for scope in page.select("""//*[@class="recipe"]"""):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in og_fields:
            loader.add_value(field, scope.select(xpath).extract())

        # the first paragraph is only a yield when it mentions "serves"
        first_paragraph = "".join(scope.select(yield_xpath).extract())
        if 'serves' in first_paragraph:
            loader.add_value('recipeYield', first_paragraph)

        # NOTE(review): each entry is itself a list of text nodes, so the
        # loader receives a list of lists -- confirm that is what it expects.
        ingredient_groups = [
            paragraph.select('./text()').extract()
            for paragraph in scope.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', ingredient_groups)

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())
        loaded.append(loader.load_item())

    return loaded
def parse_item(self, response):
    """Extract every schema.org Recipe block on the page into an item.

    Returns a list with one loaded item per
    ``//div[@itemtype="http://schema.org/Recipe"]`` element; the list is
    empty when the page has none.
    """
    # we use this to run XPath commands against the HTML in the response
    hxs = HtmlXPathSelector(response)

    # base XPath for the element that contains the recipe info
    base_path = """//div[@itemtype="http://schema.org/Recipe"]"""

    # select() returns a list of selector objects, one per matching element
    recipes_scopes = hxs.select(base_path)

    # it's easier to define these XPath strings outside of the loop below
    name_path = '//div[@itemprop="name"]/text() | //*[@itemprop="name"]//*[@class="fn"]/text()'
    description_path = '//div[@itemprop="description"]/text()'
    image_path = '//img[1]/@src'
    prepTime_path = '//time[@itemprop="prepTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="prepTime"]//*[@class="value-title"]/@title'
    cookTime_path = '//time[@itemprop="cookTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="cookTime"]//*[@class="value-title"]/@title'
    recipeYield_path = '//span[@itemprop="recipeYield"]/text()'
    ingredients_path = '//li[@itemprop="ingredients"]/text()'
    datePublished = '//abbr[@class="published"]/text()'

    recipes = []

    # loop through our recipe scopes and extract the recipe data from each
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())

        # image_path matches a bunch of images; the one used is the second
        # match (index 1). When fewer than two images match, the image is
        # left out instead of raising IndexError and losing the recipe.
        images = r_scope.select(image_path).extract()
        if len(images) > 1:
            il.add_value('image', images[1])

        il.add_value('url', response.url)
        il.add_value('description',
                     r_scope.select(description_path).extract())
        il.add_value('prepTime', r_scope.select(prepTime_path).extract())
        il.add_value('cookTime', r_scope.select(cookTime_path).extract())
        il.add_value('recipeYield',
                     r_scope.select(recipeYield_path).extract())
        il.add_value('ingredients',
                     r_scope.select(ingredients_path).extract())
        il.add_value('datePublished',
                     r_scope.select(datePublished).extract())

        # stick this RecipeItem in the list of recipes we will return
        recipes.append(il.load_item())

    # more processing is done by the openrecipes.pipelines. Look at that
    # file to see transforms that are applied to each RecipeItem
    return recipes
def parse_item(self, response):
    """Load one recipe item per ``//*[@class="recipe hrecipe"]`` element.

    Ingredient lines that start with ``For `` or end with ``:`` are treated
    as section labels and skipped. No description is collected.

    Returns the list of loaded items.
    """
    page = HtmlXPathSelector(response)

    name_xpath = './/*[@class="fn"]/text()'
    image_xpath = '//div/p[1]//img/@src'
    detail_fields = (
        ('prepTime', '//*[@class="preptime"]/text()'),
        ('cookTime', '//*[@class="cooktime"]/text()'),
        ('recipeYield', '//*[@class="yield"]/text()'),
    )
    ingredient_xpath = './/div[@class="ingredient"]/p/text()'
    # this selects the whole postmeta text, not just the date
    published_xpath = '//*[@class="postmeta"]/text()'

    section_label = re.compile(r'^For ')

    loaded = []
    for scope in page.select('//*[@class="recipe hrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('url', response.url)
        for field, xpath in detail_fields:
            loader.add_value(field, scope.select(xpath).extract())

        stripped_lines = (
            node.extract().strip() for node in scope.select(ingredient_xpath))
        loader.add_value('ingredients', [
            line for line in stripped_lines
            if not (section_label.match(line) or line.endswith(':'))
        ])

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())
        loaded.append(loader.load_item())

    return loaded
def parse_item(self, response):
    """Load one recipe item per ``//div[@id="content"]`` element.

    Ingredient lines that start with ``For `` or end with ``:`` are skipped
    as section labels.

    Returns the list of loaded items.
    """
    document = HtmlXPathSelector(response)

    xpaths = {
        'name': './/span[@class="item"]/h2[@class="fn"]/text()',
        'image': "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' size-full ')][1]/@src",
        'prepTime': './/span[@class="preptime"]/text()',
        'cookTime': './/span[@class="cooktime"]/text()',
        'recipeYield': './/span[@class="yield"]/text()',
        'ingredients': './/div[@class="ingredient"]/p/text()',
    }
    label_pattern = re.compile(r'^For ')

    def is_label(text):
        """Tell whether an ingredient line is really a section heading."""
        return bool(label_pattern.match(text)) or text.endswith(':')

    results = []
    for content in document.select('//div[@id="content"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        def add_from_xpath(field):
            loader.add_value(field, content.select(xpaths[field]).extract())

        loader.add_value('source', self.source)
        add_from_xpath('name')
        add_from_xpath('image')
        loader.add_value('url', response.url)
        add_from_xpath('prepTime')
        add_from_xpath('cookTime')
        add_from_xpath('recipeYield')

        kept = []
        for node in content.select(xpaths['ingredients']):
            text = node.extract().strip()
            if is_label(text):
                continue
            kept.append(text)
        loader.add_value('ingredients', kept)

        results.append(loader.load_item())

    return results
def parse_item(self, response):
    """Parse recipes via ``parse_recipes``, with a hand-rolled fallback.

    When ``parse_recipes`` yields data, every parsed recipe gets the page
    image and a list of ``RecipeItem`` objects is returned. Otherwise a
    single item is loaded from the page markup and returned on its own
    (not wrapped in a list).
    """
    page = HtmlXPathSelector(response)
    image_values = page.select("descendant-or-self::img[@class and contains(@class, 'wp-image')][1]/@data-lazy-src").extract()
    parsed_recipes = parse_recipes(
        page, {'source': self.source, 'url': response.url})

    if parsed_recipes:
        # structured data was found: just attach the image to each recipe
        for parsed in parsed_recipes:
            parsed['image'] = image_values
        return [RecipeItem.from_dict(parsed) for parsed in parsed_recipes]

    # no structured data: scrape the fields by hand
    loader = RecipeItemLoader(item=RecipeItem())
    loader.add_value('source', self.source)
    loader.add_value('url', response.url)
    loader.add_value('image', image_values)
    loader.add_value(
        'name', page.select('//*[@class="post-title"]/h1/text()').extract())

    # ingredients may sit in paragraphs or in list items; try both
    container_xpaths = (
        '//div[@id="recipe" or @class="span9"]/p',
        '//*[@class="span9"]//ul/li',
    )
    for container_xpath in container_xpaths:
        for node in page.select(container_xpath):
            if is_ingredient_container(node):
                loader.add_value(
                    'ingredients', node.select('text()').extract())

    # ...or in explicitly marked ingredient list items
    for text_node in page.select('//li[@class="ingredient"]/text()'):
        loader.add_value('ingredients', text_node.extract())

    return loader.load_item()
def parse_item(self, response):
    """Load one item per data-vocabulary.org Recipe ``div`` on the page.

    Name, description, URL and image come from Open Graph meta tags; times,
    yield, ingredients and publication date come from ``itemprop``
    attributes below the recipe element. Each ingredient is rendered as
    ``"<amount> <name>"``.

    Returns the list of loaded items.
    """
    document = HtmlXPathSelector(response)

    fields_before_ingredients = (
        ('name', '//meta[@property="og:title"]/@content'),
        ('image', '//meta[@property="og:image"][1]/@content'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//meta[@property="og:description"]/@content'),
        ('prepTime', '*//*[@itemprop="prepTime"]/@datetime'),
        ('cookTime', '*//*[@itemprop="cookTime"]/@datetime'),
        ('recipeYield', '*//*[@itemprop="yield"]/text()'),
    )
    ingredient_xpath = '*//*[@itemprop="ingredient"]'
    published_xpath = '*/*[@itemprop="published"]/@datetime'

    def joined_text(node, xpath):
        """Concatenate the extracted strings for ``xpath`` and trim them."""
        return "".join(node.select(xpath).extract()).strip()

    loaded = []
    scopes = document.select(
        """//div[@itemtype="http://data-vocabulary.org/Recipe"]""")
    for scope in scopes:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in fields_before_ingredients:
            loader.add_value(field, scope.select(xpath).extract())

        # build one "amount name" string per ingredient element
        ingredient_lines = []
        for entry in scope.select(ingredient_xpath):
            quantity = joined_text(entry, '*[@itemprop="amount"]/text()')
            label = joined_text(entry, '*[@itemprop="name"]/text()')
            ingredient_lines.append("%s %s" % (quantity, label))
        loader.add_value('ingredients', ingredient_lines)

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())
        loaded.append(loader.load_item())

    # further transforms are applied by the openrecipes pipelines
    return loaded
def parse_item(self, response):
    """Load one recipe item per ``//*[@class="recipe hrecipe"]`` element.

    Ingredient text that begins with ``For `` or ends with ``:`` is dropped
    as a section label. The description is not scraped.

    Returns the list of loaded items.
    """
    document = HtmlXPathSelector(response)
    scopes = document.select('//*[@class="recipe hrecipe"]')

    xpaths = {
        'name': './/*[@class="fn"]/text()',
        'image': '//div/p[1]//img/@src',
        'prepTime': '//*[@class="preptime"]/text()',
        'cookTime': '//*[@class="cooktime"]/text()',
        'recipeYield': '//*[@class="yield"]/text()',
        'ingredients': './/div[@class="ingredient"]/p/text()',
        # selects all of the postmeta text, which is more than the date
        'datePublished': '//*[@class="postmeta"]/text()',
    }
    heading = re.compile(r'^For ')

    def build_item(scope):
        """Load a single RecipeItem from one recipe scope."""
        loader = RecipeItemLoader(item=RecipeItem())

        def add_from_xpath(field):
            loader.add_value(field, scope.select(xpaths[field]).extract())

        loader.add_value('source', self.source)
        add_from_xpath('name')
        add_from_xpath('image')
        loader.add_value('url', response.url)
        add_from_xpath('prepTime')
        add_from_xpath('cookTime')
        add_from_xpath('recipeYield')

        wanted = []
        for node in scope.select(xpaths['ingredients']):
            line = node.extract().strip()
            if heading.match(line) or line.endswith(':'):
                continue
            wanted.append(line)
        loader.add_value('ingredients', wanted)

        add_from_xpath('datePublished')
        return loader.load_item()

    return [build_item(scope) for scope in scopes]
def parse_item(self, response):
    """Load one recipe item per ``//*[@class="hrecipe"]`` element.

    Only hrecipe-marked recipes are handled. Each ingredient is rendered as
    ``"<amount> <name>"``; the date is taken from the page as a whole
    rather than from inside the hrecipe element.

    Returns the list of loaded items.
    """
    page = HtmlXPathSelector(response)

    simple_fields = (
        ('name', './/*[@class="fn"]/text()'),
        ('image', '//meta[@property="og:image"][1]/@content'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('recipeYield', './/*[@class="yield"]/text()'),
    )
    ingredient_xpath = '*//*[@class="ingredient"]'
    published_xpath = '//*[@class="date"][1]'

    def describe(entry):
        """Render one ingredient element as an "amount name" string."""
        quantity = "".join(
            entry.select('*[@class="amount"]/text()').extract()).strip()
        label = "".join(
            entry.select('*[@class="name"]/text()').extract()).strip()
        return "%s %s" % (quantity, label)

    loaded = []
    for scope in page.select("""//*[@class="hrecipe"]"""):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in simple_fields:
            loader.add_value(field, scope.select(xpath).extract())

        loader.add_value(
            'ingredients',
            [describe(entry) for entry in scope.select(ingredient_xpath)])
        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())
        loaded.append(loader.load_item())

    return loaded
def parse_item_alt1(self, response):
    """Alternate parser: load one recipe item per ``<blockquote>``.

    Yield, prep time and cook time are pulled with regular expressions out
    of the text that starts with "Yields"; the ingredients are taken from
    the paragraph following it.

    Returns the list of loaded items.
    """
    page = HtmlXPathSelector(response)

    head_fields = (
        ('name', '//meta[@property="og:title"]/@content'),
        # just the first image we can find in the post
        ('image', '//div[@class="post"]/p[1]/img/@src'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//meta[@property="og:description"]/@content'),
    )
    summary_xpath = './/p/text()[starts-with(.,"Yields")]'
    summary_patterns = (
        ('recipeYield', r'Yields?:?\s([^|]+)'),
        ('prepTime', r'.+Prep(?: Time)?:?\s([^|]+)'),
        ('cookTime', r'.+Cook Time:?\s([^|]+)'),
    )
    tail_fields = (
        # ingredients always seem to follow the yield/prep/cook block
        ('ingredients',
         './/p[starts-with(text(),"Yields")]/following-sibling::p[1]'),
        ('datePublished',
         '//meta[@property="article:published_time"]/@content'),
        ('dateModified',
         '//meta[@property="article:modified_time"]/@content'),
    )

    loaded = []
    for quote in page.select("""//blockquote"""):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in head_fields:
            loader.add_value(field, quote.select(xpath).extract())

        # this matching could be made more robust to catch more recipes
        summary = "".join(quote.select(summary_xpath).extract())
        for field, pattern in summary_patterns:
            found = re.match(pattern, summary, re.I)
            if found:
                loader.add_value(field, found.group(1))

        for field, xpath in tail_fields:
            loader.add_value(field, quote.select(xpath).extract())

        loaded.append(loader.load_item())

    # further transforms are applied by the openrecipes pipelines
    return loaded
def parse_item(self, response):
    """Load one item per ``blockquote`` with class ``recipe hrecipe``.

    The site's formatting is very inconsistent. When no such blockquote is
    present, the work is handed to ``parse_item_alt1`` and its result is
    returned; no further fallbacks are attempted for older recipes.

    Returns the list of loaded items.
    """
    page = HtmlXPathSelector(response)
    scopes = page.select("""//blockquote[@class="recipe hrecipe"]""")

    # nothing in the expected markup: try the alternate parser instead
    if len(scopes) == 0:
        self.log('calling alternate delishhh.com scraper')
        return self.parse_item_alt1(response)

    # (field, xpath) pairs, listed in the order they are loaded
    field_xpaths = (
        ('name', '//meta[@property="og:title"]/@content'),
        ('image', '//meta[@property="og:image"][1]/@content'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//meta[@property="og:description"]/@content'),
        ('prepTime', './/*[@class="preptime"]/text()'),
        ('cookTime', './/*[@class="cooktime"]/text()'),
        ('recipeYield', './/*[@class="yield"]/text()'),
        ('ingredients', './/div[@class="ingredient"]/*'),
        ('datePublished',
         '//meta[@property="article:published_time"]/@content'),
        ('dateModified',
         '//meta[@property="article:modified_time"]/@content'),
    )

    loaded = []
    for scope in scopes:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in field_xpaths:
            loader.add_value(field, scope.select(xpath).extract())
        loaded.append(loader.load_item())

    # further transforms are applied by the openrecipes pipelines
    return loaded
def parse_item(self, response):
    """Load one recipe item per ``//span[@class="hrecipe"]`` element.

    The image URL is resolved against ``response.url`` with ``urljoin``.
    Prep time, cook time and publication date are not available here.

    Returns the list of loaded items (empty when nothing matches).
    """
    # we use this to run XPath commands against the HTML in the response
    hxs = HtmlXPathSelector(response)

    # base XPath for the element that contains the recipe info
    base_path = """//span[@class="hrecipe"]"""

    # select() returns a list of selector objects, one per matching element
    recipes_scopes = hxs.select(base_path)

    # it's easier to define these XPath strings outside of the loop below
    name_path = '//div[@class="content"]/header/h1[@class="fn"]/text()'
    description_path = '//article[@class="recipe_description"]//text()'
    image_path = '//div[@class="recipe_image_main"]/p/img/@src'
    recipeYield_path = '//div[@class="recipe_meta"]/p/span[contains(@class,"yield")]/text()'
    ingredients_path = '//article[@class="ingredients"]//ul//li/p[@class="ingredient"]/span[@class="value"]/text()'

    recipes = []

    # loop through our recipe scopes and extract the recipe data from each
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())

        # Use the first main image, made absolute. A recipe without one is
        # loaded with no image instead of raising IndexError.
        image_srcs = r_scope.select(image_path).extract()
        if image_srcs:
            il.add_value('image', urljoin(response.url, image_srcs[0]))

        il.add_value('url', response.url)
        il.add_value('description',
                     r_scope.select(description_path).extract())

        # prepTime not available
        il.add_value('prepTime', None)
        # cookTime not available
        il.add_value('cookTime', None)

        il.add_value('recipeYield',
                     r_scope.select(recipeYield_path).extract())
        il.add_value('ingredients',
                     r_scope.select(ingredients_path).extract())

        # datePublished not available
        il.add_value('datePublished', None)

        # stick this RecipeItem in the list of recipes we will return
        recipes.append(il.load_item())

    # more processing is done by the openrecipes.pipelines. Look at that
    # file to see transforms that are applied to each RecipeItem
    return recipes
def parse_item(self, response):
    """Load one item per schema.org Recipe element on the page.

    The source is the fixed string ``'allrecipes'``. Prep and cook times
    are passed through ``parse_iso_date``; each ingredient is the
    space-joined text of its child nodes.

    Returns the list of loaded items.
    """
    document = HtmlXPathSelector(response)

    text_fields = (
        ('name', '//*[@itemprop="name"]/text()'),
        ('image', '//*[@itemprop="image"]/@src'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//*[@itemprop="description"]/text()'),
    )
    duration_fields = (
        ('prepTime', '//*[@itemprop="prepTime"]'),
        ('cookTime', '//*[@itemprop="cookTime"]'),
    )
    yield_xpath = '*//*[@itemprop="recipeYield"]/text()'
    ingredient_xpath = '//*[@itemprop="ingredients"]'

    loaded = []
    for scope in document.select('//*[@itemtype="http://schema.org/Recipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', 'allrecipes')
        for field, xpath in text_fields:
            loader.add_value(field, scope.select(xpath).extract())

        # parse_iso_date receives the selected elements, not extracted text
        for field, xpath in duration_fields:
            matches = scope.select(xpath)
            loader.add_value(field, parse_iso_date(matches))

        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        loader.add_value('ingredients', [
            ' '.join(entry.select('node()/text()').extract())
            for entry in scope.select(ingredient_xpath)
        ])
        loaded.append(loader.load_item())

    return loaded
def parse_item(self, response):
    """Load one item per ``blockquote`` whose classes include ``hrecipe``.

    The blockquote carries several classes, so the base XPath tests for the
    ``hrecipe`` token instead of comparing the whole attribute.

    Returns the list of loaded items.
    """
    page = HtmlXPathSelector(response)
    scopes = page.select("""//blockquote[contains(concat(' ', normalize-space(@class), ' '), ' hrecipe ')]""")

    xpaths = {
        'name': '//*[@class="fn"]/text()',
        'description': '//*[@class="summary"]/p/text()',
        'image': '//img[@class="photo"]/@src',
        'prepTime': '//*[@class="preptime"]/text()',
        'cookTime': '//*[@class="cooktime"]/text()',
        'totalTime': '//*[@class="duration"]/text()',
        'recipeYield': '//*[@class="yield"]/text()',
        'ingredients': '//*[@class="ingredient"]/p/text() | //*[@class="ingredient"]/span/text()',
    }
    fields_after_url = (
        'image', 'description', 'prepTime', 'cookTime', 'totalTime',
        'recipeYield', 'ingredients',
    )

    loaded = []
    for scope in scopes:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(xpaths['name']).extract())
        loader.add_value('url', response.url)
        for field in fields_after_url:
            loader.add_value(field, scope.select(xpaths[field]).extract())
        loaded.append(loader.load_item())

    return loaded
def parse_item(self, response):
    """Load one item per element whose classes include ``hrecipe``.

    The yield is not marked up separately, so it is searched for in the
    directions text as a 'Serves n.' phrase.

    Returns the list of loaded items.
    """
    document = HtmlXPathSelector(response)

    name_xpath = '//*[@class="fn"]/text()'
    image_xpath = '//img[@class="photo"]/@src'
    description_xpath = '//*[@class="recipe-description summary"]/p/text()'
    directions_xpath = '//*[@class="directions"]/p/text()'
    ingredient_xpath = '//*[@class="ingredient"]/text()'

    def build_item(scope):
        """Load a single RecipeItem from one hrecipe scope."""
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value(
            'description', scope.select(description_xpath).extract())
        # only a single digit followed by a full stop is matched
        serves = scope.select(directions_xpath).re('Serves \d\.')
        loader.add_value('recipeYield', serves)
        loader.add_value(
            'ingredients', scope.select(ingredient_xpath).extract())
        return loader.load_item()

    scopes = document.select("""//*[contains(concat(' ', normalize-space(@class), ' '), ' hrecipe ')]""")
    return [build_item(scope) for scope in scopes]
def parse_item(self, response):
    """Load one recipe item per content ``div`` matched by the base XPath.

    Ingredients and garnishes both live in ``<p>`` elements split by
    ``<br>``; paragraphs with ``<strong>``, ``<a>`` or ``<img>`` children
    are left out, which skips the
    "EVENT VENUE PARTY SIZE TYPE MENU" paragraph.

    Returns the list of loaded items.
    """
    page = HtmlXPathSelector(response)
    scopes = page.select("""//*[@id="container"]/*[@class="onepage"]/div[1]/div[@class="content"]""")

    name_xpath = '//h1[@class="title"]/text() | //*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[strong]/strong/text()'
    image_xpath = '//*[@class="content"]/p[1]/img[contains(@class, "size-full")]/@src'
    yield_xpath = '//*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[em and strong]/em/text()'
    published_xpath = '//*[@class="phn-date"]/a[@rel="author"]/following-sibling::text()'
    ingredient_xpath = '//*[@class="content"]/p[not(strong or a or img) and br]/text()'

    loaded = []
    for scope in scopes:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # the raw text looks like "ON SATURDAY NOV 28TH, 2009 |"; drop the
        # first 'on' and the pipe. Nothing is added when there is no date.
        date_texts = scope.select(published_xpath).extract()
        if date_texts:
            cleaned = date_texts[0].replace('on', '', 1)
            cleaned = cleaned.replace('|', '').strip()
            loader.add_value('datePublished', cleaned)

        loader.add_value(
            'ingredients', scope.select(ingredient_xpath).extract())
        loaded.append(loader.load_item())

    # further transforms are applied by the openrecipes pipelines
    return loaded
def parse_item(self, response):
    """Build recipe items from a page carrying data-vocabulary.org Recipe markup.

    Each ``div`` with the Recipe ``itemtype`` yields one item.  The items
    are returned as a list; later transforms happen in the openrecipes
    pipelines.
    """
    selector = HtmlXPathSelector(response)
    recipe_nodes = selector.select("""//div[@itemtype="http://data-vocabulary.org/Recipe"]""")

    # (field, xpath) pairs whose extracted values are handed to the loader
    # unchanged, in this order.
    direct_fields = (
        ('name', '//meta[@property="og:title"]/@content'),
        ('image', '//meta[@property="og:image"][1]/@content'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//meta[@property="og:description"]/@content'),
        ('prepTime', '*//*[@itemprop="prepTime"]/@datetime'),
        ('cookTime', '*//*[@itemprop="cookTime"]/@datetime'),
        ('recipeYield', '*//*[@itemprop="yield"]/text()'),
    )
    ingredient_xpath = '*//*[@itemprop="ingredient"]'
    published_xpath = '*/*[@itemprop="published"]/@datetime'

    items = []
    for node in recipe_nodes:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in direct_fields:
            loader.add_value(field, node.select(xpath).extract())

        # Every ingredient element holds separate "amount" and "name"
        # children; merge them into a single "<amount> <name>" string.
        lines = []
        for entry in node.select(ingredient_xpath):
            qty = "".join(entry.select('*[@itemprop="amount"]/text()').extract()).strip()
            label = "".join(entry.select('*[@itemprop="name"]/text()').extract()).strip()
            lines.append("%s %s" % (qty, label))
        loader.add_value('ingredients', lines)

        loader.add_value('datePublished', node.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from a page with a ``recipe-details`` block.

    Returns a list with one loaded item per ``div.recipe-details`` found.
    Further processing is done by the openrecipes pipelines.
    """
    selector = HtmlXPathSelector(response)
    detail_nodes = selector.select("""//div[@class="recipe-details"]""")

    title_xpath = '//h1[@itemprop="name"]/text()'
    servings_xpath = '//label[@for="set_servings"]/input/@value'
    summary_xpath = '//span[@itemprop="summary"]/p/text()'
    photo_xpath = '//img[@class="the_recipe_image"]/@src'
    cook_xpath = '//form/p/time[@itemprop="cookTime"]/@datetime'
    prep_xpath = '//form/p/time[@itemprop="prepTime"]/@datetime'
    ingredient_xpath = '//span[@itemprop="ingredient"]'
    amount_xpath = './span[@itemprop="amount"]/text()'
    label_xpath = './span[@itemprop="name"]/text()'
    published_xpath = '//span[@itemprop="published"]/@datetime'

    items = []
    for node in detail_nodes:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', node.select(title_xpath).extract())
        loader.add_value('image', node.select(photo_xpath).extract())
        loader.add_value('description', node.select(summary_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('prepTime', node.select(prep_xpath).extract())
        loader.add_value('cookTime', node.select(cook_xpath).extract())
        loader.add_value('recipeYield', node.select(servings_xpath).extract())
        loader.add_value('datePublished', node.select(published_xpath).extract())

        # Collect all amounts and all names as two parallel lists, then pair
        # them positionally into UTF-8 encoded "<amount> <name>" strings.
        ingredient_nodes = node.select(ingredient_xpath)
        amounts = ingredient_nodes.select(amount_xpath).extract()
        labels = ingredient_nodes.select(label_xpath).extract()
        lines = []
        for pair in zip(amounts, labels):
            lines.append(" ".join(pair).encode('utf-8'))
        loader.add_value('ingredients', lines)

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from each ``div.post`` on the page.

    Paragraphs inside the post entry are treated as ingredient lists only
    when ``ingredient_heuristic`` scores them above ``RECIPE_THRESHOLD``.
    Returns the list of loaded items.
    """
    selector = HtmlXPathSelector(response)
    posts = selector.select('//div[@class="post"]')

    title_xpath = 'h2/a[@rel="bookmark"]/text()'
    photo_xpath = '(//div[@class="entry"]/p/a[@title]/img/@src)[1]'
    body_text_xpath = 'div[@class="entry"]/text()'
    paragraph_xpath = 'div[@class="entry"]/p'
    published_xpath = 'div[@class="date"]/text()'

    items = []
    for post in posts:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', post.select(title_xpath).extract())
        loader.add_value('image', post.select(photo_xpath).extract())
        loader.add_value('url', response.url)

        # The description is the entry's direct text nodes glued together.
        description = ''.join(post.select(body_text_xpath).extract()).strip()
        loader.add_value('description', description)

        # Keep the text nodes of every paragraph that looks like an
        # ingredient list according to the heuristic.
        lines = [
            text_node.extract().strip()
            for paragraph in post.select(paragraph_xpath)
            if ingredient_heuristic(paragraph) > RECIPE_THRESHOLD
            for text_node in paragraph.select('text()')
        ]
        loader.add_value('ingredients', lines)

        loader.add_value('datePublished', post.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from the ``primary_content`` block of a page.

    Prep and cook times are not in semantic markup on this site, so they
    are pulled out of a free-text summary line with regular expressions.
    Returns the list of loaded items.
    """
    selector = HtmlXPathSelector(response)
    content_nodes = selector.select("""//div[@id="primary_content"]""")

    title_xpath = '//h1[@class="fn"]/text()'
    summary_xpath = '//meta[@property="og:description"]/@content'
    canonical_xpath = '//meta[@property="og:url"]/@content'
    photo_xpath = '//meta[@property="og:image"][1]/@content'
    times_xpath = './/p[@class="summary_data"][contains(text(), "Prep Time")]/text()'
    yield_xpath = '//span[@class="yield"]/text()'
    ingredient_xpath = '*//*[@class="ingredient"]'
    published_xpath = '//p[@id="mag_info"]/text()'

    # Compiled once; both are anchored at the start of the text via match().
    prep_re = re.compile(r"\s?Prep Time:\s?(\d{1,}\s(?:second|minute|hour|day)s?)", re.I)
    cook_re = re.compile(r".+\s?Cook Time:\s?(\d{1,}\s(?:second|minute|hour|day)s?)", re.I)

    items = []
    for node in content_nodes:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value("source", self.source)
        loader.add_value("name", node.select(title_xpath).extract())
        loader.add_value("image", node.select(photo_xpath).extract())
        loader.add_value("url", node.select(canonical_xpath).extract())
        loader.add_value("description", node.select(summary_xpath).extract())

        # Times are only looked for when the summary line has real content.
        times_text = "".join(node.select(times_xpath).extract())
        if times_text.strip():
            prep_hit = prep_re.match(times_text)
            if prep_hit:
                loader.add_value("prepTime", prep_hit.group(1))

            cook_hit = cook_re.match(times_text)
            if cook_hit:
                loader.add_value("cookTime", cook_hit.group(1))

        loader.add_value("recipeYield", node.select(yield_xpath).extract())

        # Each ingredient is the concatenation of the element's direct text
        # nodes; no whitespace trimming is applied at this point.
        lines = [
            "".join(entry.select("text()").extract())
            for entry in node.select(ingredient_xpath)
        ]
        loader.add_value("ingredients", lines)

        # The text is formatted as "[Category] | MMM YYYY"; keep what
        # follows the first "|".
        published_text = "".join(node.select(published_xpath).extract())
        loader.add_value("datePublished", published_text.partition("|")[2])

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from the element with id ``recipe``.

    Ingredients come from table rows; the raw ``<td>`` markup of each row
    is joined into one string (the original author notes that HTML is
    stripped later in the pipeline).  Returns the list of loaded items.
    """
    selector = HtmlXPathSelector(response)
    recipe_nodes = selector.select('//*[@id="recipe"]')

    title_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
    summary_xpath = '//*[@class="entry"]/p//text()'
    photo_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
    yield_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
    row_xpath = '//*[@class="ingredients"]/tr'

    items = []
    for node in recipe_nodes:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', node.select(title_xpath).extract())
        loader.add_value('image', node.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('description', node.select(summary_xpath).extract())
        loader.add_value('recipeYield', node.select(yield_xpath).extract())

        # One ingredient per <tr>: its <td> cells, space-separated.
        lines = [
            " ".join(row.select('td').extract()).strip()
            for row in node.select(row_xpath)
        ]
        loader.add_value('ingredients', lines)

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from the ``primary_content`` block of a page.

    Prep and cook times are parsed out of a free-text summary line because
    the site does not expose them in semantic markup.  Returns the list of
    loaded items.
    """
    page = HtmlXPathSelector(response)
    blocks = page.select("""//div[@id="primary_content"]""")

    name_xp = '//h1[@class="fn"]/text()'
    description_xp = '//meta[@property="og:description"]/@content'
    url_xp = '//meta[@property="og:url"]/@content'
    image_xp = '//meta[@property="og:image"][1]/@content'
    time_xp = './/p[@class="summary_data"][contains(text(), "Prep Time")]/text()'
    yield_xp = '//span[@class="yield"]/text()'
    ingredient_xp = '*//*[@class="ingredient"]'
    published_xp = '//p[@id="mag_info"]/text()'

    # Field name -> pattern, tried in this order against the summary text.
    time_patterns = (
        ('prepTime', r'\s?Prep Time:\s?(\d{1,}\s(?:second|minute|hour|day)s?)'),
        ('cookTime', r'.+\s?Cook Time:\s?(\d{1,}\s(?:second|minute|hour|day)s?)'),
    )

    results = []
    for block in blocks:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', block.select(name_xp).extract())
        loader.add_value('image', block.select(image_xp).extract())
        loader.add_value('url', block.select(url_xp).extract())
        loader.add_value('description', block.select(description_xp).extract())

        summary_line = "".join(block.select(time_xp).extract())
        if summary_line.strip():
            for field, pattern in time_patterns:
                found = re.match(pattern, summary_line, re.I)
                if found:
                    loader.add_value(field, found.group(1))

        loader.add_value('recipeYield', block.select(yield_xp).extract())

        # Direct text nodes of each ingredient element, concatenated as-is.
        ingredient_lines = []
        for element in block.select(ingredient_xp):
            pieces = element.select('text()').extract()
            ingredient_lines.append("".join(pieces))
        loader.add_value('ingredients', ingredient_lines)

        # "[Category] | MMM YYYY" -> keep the part after the first "|".
        _, _, published = "".join(block.select(published_xp).extract()).partition("|")
        loader.add_value('datePublished', published)

        results.append(loader.load_item())

    return results
def parse_item(self, response):
    """Extract recipe items from ``recipe hrecipe`` elements.

    Ingredient lines that are section labels (starting with "For " or
    ending with ":") are dropped.  No publication date is collected.
    Returns the list of loaded items.
    """
    selector = HtmlXPathSelector(response)
    recipe_nodes = selector.select('//*[@class="recipe hrecipe"]')

    direct_fields = (
        ('name', '//*[@class="fn"]/text()'),
        ('image', '//p[1]/span/img/@src'),
    )
    summary_xpath = '//*[@class="summary"]/p/text()'
    prep_xpath = '//*[@class="preptime"]/text()'
    cook_xpath = './/*[@class="cooktime"]/text()'
    yield_xpath = '//*[@class="yield"]/text()'
    ingredient_xpath = './/*[@class="ingredient"]/p/text()'

    section_label = re.compile(r'^For ')

    items = []
    for node in recipe_nodes:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        for field, xpath in direct_fields:
            loader.add_value(field, node.select(xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('description', node.select(summary_xpath).extract())
        loader.add_value('prepTime', node.select(prep_xpath).extract())
        loader.add_value('cookTime', node.select(cook_xpath).extract())
        loader.add_value('recipeYield', node.select(yield_xpath).extract())

        lines = []
        for text_node in node.select(ingredient_xpath):
            line = text_node.extract().strip()
            # Skip headings such as "For the sauce" or "Topping:".
            if section_label.match(line) or line.endswith(':'):
                continue
            lines.append(line)
        loader.add_value('ingredients', lines)

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from ``article`` elements with Recipe microdata.

    Ingredient amounts and names are gathered as two parallel lists and
    paired positionally.  Returns the list of loaded items.
    """
    selector = HtmlXPathSelector(response)
    articles = selector.select('//article[@itemtype="http://data-vocabulary.org/Recipe"]')

    title_xpath = '//h1[@itemprop="name"]/text()'
    summary_xpath = '//meta[@name="description"]/@content'
    photo_xpath = '//img[@itemprop="photo"]/@src'
    prep_xpath = '//span[@itemprop="prepTime"]/text()'
    cook_xpath = '//span[@itemprop="cookTime"]/text()'
    yield_xpath = '//span[@itemprop="yield"]/text()'
    ingredient_xpath = '//li[@itemprop="ingredient"]'
    amount_xpath = './span[@itemprop="amount"]/span/text()'
    label_xpath = './span[@itemprop="name"]/text()'

    items = []
    for article in articles:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', article.select(title_xpath).extract())
        loader.add_value('image', article.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('description', article.select(summary_xpath).extract())
        loader.add_value('prepTime', article.select(prep_xpath).extract())
        loader.add_value('cookTime', article.select(cook_xpath).extract())
        loader.add_value('recipeYield', article.select(yield_xpath).extract())

        # Pull amounts and names separately, then combine each pair into a
        # UTF-8 encoded "<amount> <name>" string.
        entries = article.select(ingredient_xpath)
        amounts = entries.select(amount_xpath).extract()
        labels = entries.select(label_xpath).extract()
        lines = []
        for pair in zip(amounts, labels):
            lines.append(" ".join(pair).encode('utf-8'))
        loader.add_value('ingredients', lines)

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from ``post hrecipe`` elements.

    Only source, name, image, url and ingredients are collected.
    Returns the list of loaded items.
    """
    selector = HtmlXPathSelector(response)
    posts = selector.select('//*[@class="post hrecipe"]')

    title_xpath = '//*[@class="title fn"]/text()'
    photo_xpath = '//*[@class="photo"]/@src'
    ingredient_xpath = '//ul[@class="ingredient_list"]/li/text()'

    items = []
    for post in posts:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', post.select(title_xpath).extract())
        loader.add_value('image', post.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('ingredients', post.select(ingredient_xpath).extract())
        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items using the whole ``<body>`` as the recipe scope.

    Ingredient markup is inconsistent on this site, so a primary XPath is
    tried first and an alternative one is used only when the first yields
    nothing.  Returns the list of loaded items; later transforms happen in
    the openrecipes pipelines.
    """
    selector = HtmlXPathSelector(response)
    bodies = selector.select("""//body""")

    title_xpath = '//h1[@itemprop="name"]/text()'
    yield_xpath = '//span[@itemprop="yield"]/text()'
    summary_xpath = '//meta[@name="description"]/@content'
    photo_xpath = '//img[@class="recipe_image"]/@src'
    cook_xpath = '//time[@itemprop="totalTime"]'
    prep_xpath = '//time[@itemprop="activeTime"]'
    ingredient_xpath = '//span[@itemprop="ingredient"]'
    fallback_ingredient_xpath = '//div[@id="ingredients"]/ul/li/text()'

    items = []
    for body in bodies:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', body.select(title_xpath).extract())
        loader.add_value('image', body.select(photo_xpath).extract())
        loader.add_value('description', body.select(summary_xpath).extract())
        loader.add_value('url', response.url)
        # The <time> selectors themselves are passed to parse_iso_date.
        loader.add_value('prepTime', parse_iso_date(body.select(prep_xpath)))
        loader.add_value('cookTime', parse_iso_date(body.select(cook_xpath)))
        loader.add_value('recipeYield', body.select(yield_xpath).extract())

        # Primary format: text of the span and of its child nodes, each
        # piece trimmed and joined with single spaces.
        lines = []
        for entry in body.select(ingredient_xpath):
            pieces = entry.select('node()/text() | text()').extract()
            joined = ' '.join([piece.strip() for piece in pieces])
            lines.append(joined.encode('utf-8'))

        # Fallback format: plain <li> text, used only if nothing was found.
        if not lines:
            lines.extend(
                text_node.extract().strip().encode('utf-8')
                for text_node in body.select(fallback_ingredient_xpath)
            )

        loader.add_value('ingredients', lines)

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from ``blockquote.recipe`` elements.

    Each ingredient is the extracted markup of an ``itemprop="ingredients"``
    element, trimmed and followed by a single space.  Returns the list of
    loaded items.
    """
    selector = HtmlXPathSelector(response)
    quotes = selector.select("""//blockquote[@class="recipe"]""")

    # (field, xpath) pairs loaded verbatim, in this order.
    direct_fields = (
        ('name', '//meta[@property="og:title"]/@content'),
        ('image', '//meta[@property="og:image"][1]/@content'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//meta[@property="og:description"]/@content'),
        ('prepTime', '*//*[@itemprop="prepTime"]/@content'),
        ('cookTime', '*//*[@itemprop="cookTime"]/@content'),
        ('recipeYield', '*//*[@itemprop="recipeYield"]/text()'),
    )
    ingredient_xpath = '*//*[@itemprop="ingredients"]'
    published_xpath = '//p[@class="date"]/text()'

    items = []
    for quote in quotes:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in direct_fields:
            loader.add_value(field, quote.select(xpath).extract())

        lines = [
            "%s " % (entry.extract().strip())
            for entry in quote.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', lines)

        loader.add_value('datePublished', quote.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from the element with id ``recipe``.

    The site only offers a total time, so prep/cook times are not
    collected, and no publication date is available.

    Two defects of the earlier version are corrected here:

    * the ``source`` field was never populated;
    * the per-ingredient XPaths were absolute (``//...``), so every
      "ingredient" was the concatenation of all amounts and names in the
      document.  Rows are now read one at a time with relative XPaths.

    Returns the list of loaded items.
    """
    hxs = HtmlXPathSelector(response)

    base_path = '//*[@id="recipe"]'
    recipes_scopes = hxs.select(base_path)

    name_path = '//*[@class="row page_title clearfix"]/h2/text()'
    description_path = '//*[@class="entry"]/p//text()'
    image_path = '//*[@class="featured_image"]/img[@class="image"]/@src'
    recipeYield_path = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
    ingredients_path = '*//*[@class="ingredients"]'

    recipes = []
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())
        il.add_value('image', r_scope.select(image_path).extract())
        il.add_value('url', response.url)
        il.add_value('description', r_scope.select(description_path).extract())
        il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

        ingredients = []
        for table in r_scope.select(ingredients_path):
            # ".//tr" matches rows whether or not a <tbody> wrapper is
            # present in the parsed markup.
            for row in table.select('.//tr'):
                # The amount is the bold part of a cell, the name is the
                # cell's plain text.
                amount = "".join(row.select('td/strong/text()').extract()).strip()
                name = "".join(row.select('td/text()').extract()).strip()
                ingredient = ("%s %s" % (amount, name)).strip()
                if ingredient:
                    ingredients.append(ingredient)
        il.add_value('ingredients', ingredients)

        recipes.append(il.load_item())

    return recipes
def parse_item(self, response):
    """Extract recipe items from ``article.hrecipe`` elements.

    Returns a list with one loaded item per matching article; later
    transforms are applied by the openrecipes pipelines.
    """
    selector = HtmlXPathSelector(response)
    articles = selector.select("""//article[@class="hrecipe"]""")

    title_xpath = '//h1/text()'
    yield_xpath = '//span[@class="info yield"]/text()'
    photo_xpath = '//section[@class="content-unit"]/img/@src'
    prep_xpath = '//span[@class="info preptime"]/text()'
    cook_xpath = '//span[@class="info duration"]/text()'
    ingredient_xpath = '//div[@class="ingredients-section"]/ul/li/span/text()'
    published_xpath = '//footer/time/text()'

    items = []
    for article in articles:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', article.select(title_xpath).extract())
        loader.add_value('image', article.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('prepTime', article.select(prep_xpath).extract())
        loader.add_value('cookTime', article.select(cook_xpath).extract())
        loader.add_value('recipeYield', article.select(yield_xpath).extract())

        # One entry per matched text node, unmodified.
        lines = [text_node.extract() for text_node in article.select(ingredient_xpath)]
        loader.add_value('ingredients', lines)

        loader.add_value('datePublished', article.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from the ``zlrecipe-innerdiv`` container.

    Each ingredient is the trimmed concatenation of all descendant text
    nodes of an ``itemprop="ingredients"`` element.  Returns the list of
    loaded items.
    """
    selector = HtmlXPathSelector(response)
    containers = selector.select("""//div[@id="zlrecipe-innerdiv"]""")

    # (field, xpath) pairs loaded verbatim, in this order.
    direct_fields = (
        ('name', '*//*[@itemprop="name"]/text()'),
        ('image', '//meta[@property="og:image"][1]/@content'),
        ('url', '//link[@rel="canonical"]/@href'),
        ('prepTime', '*//*[@itemprop="prepTime"]/@content'),
        ('cookTime', '*//*[@itemprop="cookTime"]/@content'),
        ('recipeYield', '*//*[@itemprop="recipeYield"]/text()'),
    )
    ingredient_xpath = '*//*[@itemprop="ingredients"]'
    published_xpath = '//*[@class="time_stamp_month"]'

    items = []
    for container in containers:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in direct_fields:
            loader.add_value(field, container.select(xpath).extract())

        lines = []
        for entry in container.select(ingredient_xpath):
            text_parts = entry.select('.//text()').extract()
            lines.append(''.join(text_parts).strip())
        loader.add_value('ingredients', lines)

        loader.add_value('datePublished', container.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from the ``blq-main`` container.

    Each ingredient paragraph contributes its first text node (the
    amount) and the text of its links (the name), combined as
    "<amount> <name>".  Returns the list of loaded items.
    """
    selector = HtmlXPathSelector(response)
    containers = selector.select("""//div[@id="blq-main"]""")

    title_xpath = '//h1/text()'
    summary_xpath = '//div[@id="description"]//span[@class="summary"]/text()'
    photo_xpath = '//img[@id="food-image"]/@src'
    prep_xpath = '//span[@class="prepTime"]/span[@class="value-title"]/@title'
    cook_xpath = '//span[@class="cookTime"]/span[@class="value-title"]/@title'
    yield_xpath = '//h3[@class="yield"]/text()'
    ingredient_xpath = '//p[@class="ingredient"]'

    items = []
    for container in containers:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', container.select(title_xpath).extract())
        loader.add_value('image', container.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('description', container.select(summary_xpath).extract())
        loader.add_value('prepTime', container.select(prep_xpath).extract())
        loader.add_value('cookTime', container.select(cook_xpath).extract())
        loader.add_value('recipeYield', container.select(yield_xpath).extract())

        lines = []
        for paragraph in container.select(ingredient_xpath):
            qty = "".join(paragraph.select('text()[1]').extract()).strip()
            label = "".join(paragraph.select('a/text()').extract()).strip()
            lines.append("%s %s" % (qty, label))
        loader.add_value('ingredients', lines)

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from a page with a ``recipe-details`` block.

    Fix over the earlier version: the ingredient *name* XPath selected the
    ``itemprop="amount"`` span, so every ingredient came out as
    "<amount> <amount>".  It now selects ``itemprop="name"``.

    Returns a list with one loaded item per ``div.recipe-details`` found;
    further transforms are applied by the openrecipes pipelines.
    """
    # used to run XPath queries against the HTML in the response
    hxs = HtmlXPathSelector(response)

    # element that contains the recipe info; usually zero or one per page
    base_path = """//div[@class="recipe-details"]"""
    recipes_scopes = hxs.select(base_path)

    name_path = '//h1[@itemprop="name"]/text()'
    recipeYield_path = '//label[@for="set_servings"]/input/@value'
    description_path = '//span[@itemprop="summary"]/p/text()'
    image_path = '//img[@class="the_recipe_image"]/@src'
    cookTime_path = '//form/p/time[@itemprop="cookTime"]/@datetime'
    prepTime_path = '//form/p/time[@itemprop="prepTime"]/@datetime'
    ingredients_path = '//span[@itemprop="ingredient"]'
    ingredients_amounts_path = './span[@itemprop="amount"]/text()'
    ingredients_names_path = './span[@itemprop="name"]/text()'
    datePublished_path = '//span[@itemprop="published"]/@datetime'

    recipes = []
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())
        il.add_value('image', r_scope.select(image_path).extract())
        il.add_value('description', r_scope.select(description_path).extract())
        il.add_value('url', response.url)
        il.add_value('prepTime', r_scope.select(prepTime_path).extract())
        il.add_value('cookTime', r_scope.select(cookTime_path).extract())
        il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())
        il.add_value('datePublished', r_scope.select(datePublished_path).extract())

        # Grab the amount and name spans as two parallel lists, then pair
        # them positionally into UTF-8 encoded "<amount> <name>" strings.
        ingredient_scopes = r_scope.select(ingredients_path)
        amount = ingredient_scopes.select(ingredients_amounts_path).extract()
        name = ingredient_scopes.select(ingredients_names_path).extract()
        ingredients = [" ".join(ing).encode('utf-8') for ing in zip(amount, name)]
        il.add_value('ingredients', ingredients)

        recipes.append(il.load_item())

    return recipes
def parse_item(self, response):
    """Extract recipe items from ``div#recipe`` elements.

    The yield is formatted inconsistently on this site, so several
    alternative XPaths are combined into one union expression.
    Returns the list of loaded items.
    """
    selector = HtmlXPathSelector(response)
    recipe_nodes = selector.select("""//div[@id="recipe"]""")

    # Alternatives for the yield: an <i>, <em> or bare <p> starting with
    # "Makes" or "Serves".
    yield_alternatives = (
        '//div[@id="recipe"]/p[starts-with(i,"Makes")]/i',
        '//div[@id="recipe"]/p[starts-with(i,"Serves")]/i',
        '//div[@id="recipe"]/p[starts-with(em,"Makes")]/em',
        '//div[@id="recipe"]/p[starts-with(em,"Serves")]/em',
        '//div[@id="recipe"][starts-with(p,"Makes")]/p',
        '//div[@id="recipe"][starts-with(p,"Serves")]/p',
    )

    # (field, xpath) pairs loaded verbatim, in this order.
    field_xpaths = (
        ('name', 'h1/text()'),
        ('image', '//meta[@property="og:image"][1]/@content'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//meta[@property="og:description"]/@content'),
        ('prepTime', './/span[@class="preptime"]/span[@class="value-title"]/@title'),
        ('cookTime', './/span[@class="cooktime"]/span[@class="value-title"]/@title'),
        ('recipeYield', "|".join(yield_alternatives)),
        ('ingredients', 'blockquote/*'),
        ('datePublished', '//span[@class="published"]/span[@class="value-title"]/@title'),
    )

    items = []
    for node in recipe_nodes:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in field_xpaths:
            loader.add_value(field, node.select(xpath).extract())
        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from ``post hrecipe`` elements.

    Fix over the earlier version: the ingredient loop body was ``pass``,
    so the ``ingredients`` field was always loaded with an empty list.
    The text of each ``ul.ingredient_list`` item is now extracted.

    Description, prep time and cook time are not collected (the original
    author marked them as not available / still to do).  Returns the list
    of loaded items.
    """
    hxs = HtmlXPathSelector(response)

    base_path = '//*[@class="post hrecipe"]'
    recipes_scopes = hxs.select(base_path)

    name_path = '//*[@class="title fn"]/text()'
    image_path = '//*[@class="photo"]/@src'
    # NOTE(review): positional guess at where the yield lives -- the
    # original author flagged this as needing a check on other pages.
    recipeYield_path = '//blockquote/p[2]/text()'
    ingredients_path = '//ul[@class="ingredient_list"]/li/text()'
    datePublished = 'normalize-space(//*[@class="postmeta"]/text())'

    recipes = []
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())
        il.add_value('image', r_scope.select(image_path).extract())
        il.add_value('url', response.url)
        il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())
        # one ingredient per <li> text node
        il.add_value('ingredients', r_scope.select(ingredients_path).extract())
        il.add_value('datePublished', r_scope.select(datePublished).extract())

        recipes.append(il.load_item())

    return recipes
def parse_item(self, response):
    """Build one recipe item for every element whose class contains ``hrecipe``.

    Each ingredient is assembled as ``"<quantity> <name>"`` from the
    ``quantity`` and ``name`` spans inside an ingredient span.

    Args:
        response: the response handed to ``HtmlXPathSelector``.

    Returns:
        A list with one loaded item per matched recipe scope (empty when
        nothing matches).
    """
    def joined_text(node, xpath):
        # Concatenate every match of ``xpath`` under ``node`` and trim it.
        return "".join(node.select(xpath).extract()).strip()

    selector = HtmlXPathSelector(response)

    # Simple fields: (item field, XPath), applied in this order.
    leading_fields = [
        ('name', '//meta[@property="og:title"]/@content'),
        ('image', '//*[@itemprop="image"]/@src'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//meta[@name="description"]/@content'),
        ('recipeYield',
         '//div[@class="time-and-yield"]/*/span[@class="yield"]/text()'),
    ]
    ingredient_xpath = '//ul[@class="ingredients"]/li/span[@class="ingredient"]'
    # text()[last()] skips the HTML comment that precedes the date.
    date_xpath = ('//div[@class="intro"]/div[@class="display-date"]'
                  '/text()[last()]')

    items = []
    for scope in selector.select("//*[contains(@class,'hrecipe')]"):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in leading_fields:
            loader.add_value(field, scope.select(xpath).extract())

        ingredients = [
            "%s %s" % (joined_text(node, 'span[@class="quantity"]/text()'),
                       joined_text(node, 'span[@class="name"]/text()'))
            for node in scope.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', ingredients)

        loader.add_value('datePublished', scope.select(date_xpath).extract())
        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item for every ``div.post`` element in the page.

    Ingredients are taken from the text nodes of each entry paragraph whose
    ``ingredient_heuristic`` score is greater than ``RECIPE_THRESHOLD``.

    Args:
        response: the response handed to ``HtmlXPathSelector``; its ``url``
            becomes the item's ``url``.

    Returns:
        A list with one loaded item per matched post (empty when nothing
        matches).
    """
    selector = HtmlXPathSelector(response)

    items = []
    for post in selector.select('//div[@class="post"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value(
            'name', post.select('h2/a[@rel="bookmark"]/text()').extract())
        loader.add_value(
            'image',
            post.select(
                '(//div[@class="entry"]/p/a[@title]/img/@src)[1]').extract())
        loader.add_value('url', response.url)

        # The description is the entry's own text nodes glued together.
        description = ''.join(
            post.select('div[@class="entry"]/text()').extract())
        loader.add_value('description', description.strip())

        ingredients = []
        for paragraph in post.select('div[@class="entry"]/p'):
            # Skip paragraphs that do not score as ingredient lists.
            if not (ingredient_heuristic(paragraph) > RECIPE_THRESHOLD):
                continue
            ingredients.extend(
                text_node.extract().strip()
                for text_node in paragraph.select('text()'))
        loader.add_value('ingredients', ingredients)

        loader.add_value(
            'datePublished', post.select('div[@class="date"]/text()').extract())

        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item for every ``div.innerrecipe`` element in the page.

    Each ingredient is the concatenated text of one ``<p>`` found under an
    element with class ``ingredient``.

    Args:
        response: the response handed to ``HtmlXPathSelector``; its ``url``
            becomes the item's ``url``.

    Returns:
        A list with one loaded item per matched recipe scope (empty when
        nothing matches).
    """
    selector = HtmlXPathSelector(response)

    # Fields loaded before and after ``url``, as (item field, XPath) pairs.
    fields_before_url = [
        ('name', '*//h2[@class="fn"]/text()'),
        ('image', '*//img[@class="photo"]/@src'),
    ]
    fields_after_url = [
        ('prepTime', '*//span[@class="preptime"]/text()'),
        ('cookTime', '*//span[@class="cooktime"]/text()'),
        ('totalTime', '*//span[@class="duration"]/text()'),
        ('recipeYield', '*//span[@class="yield"]/text()'),
    ]
    ingredient_xpath = '*//*[@class="ingredient"]/p'
    date_xpath = ('//div[@class="post fullpost singlepost"]'
                  '//div[@class="postmeta"]/text()[normalize-space()]')

    items = []
    for scope in selector.select('//div[@class="innerrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        for field, xpath in fields_before_url:
            loader.add_value(field, scope.select(xpath).extract())
        loader.add_value('url', response.url)
        for field, xpath in fields_after_url:
            loader.add_value(field, scope.select(xpath).extract())

        ingredients = [
            "".join(paragraph.select('text()').extract())
            for paragraph in scope.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', ingredients)

        loader.add_value('datePublished', scope.select(date_xpath).extract())

        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item for every ``#recipe`` element in the page.

    Ingredients are read row by row from the elements with class
    ``ingredients`` inside each recipe scope, producing one
    ``"<amount> <name>"`` string per row.

    Prep time, cook time and publication date are not extracted: the
    original notes say the site only offers a combined total time and no
    publication date.

    NOTE(review): unlike the sibling parsers, ``source`` is not set here
    (the call was already disabled in the original) -- confirm whether this
    spider defines ``self.source`` before enabling it.

    Args:
        response: the response handed to ``HtmlXPathSelector``; its ``url``
            becomes the item's ``url``.

    Returns:
        A list with one loaded item per matched recipe scope (empty when
        nothing matches).
    """
    hxs = HtmlXPathSelector(response)

    base_path = '//*[@id="recipe"]'
    recipes_scopes = hxs.select(base_path)

    name_path = '//*[@class="row page_title clearfix"]/h2/text()'
    description_path = '//*[@class="entry"]/p//text()'
    image_path = '//*[@class="featured_image"]/img[@class="image"]/@src'
    recipeYield_path = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
    ingredients_path = '*//*[@class="ingredients"]'

    recipes = []
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('name', r_scope.select(name_path).extract())
        il.add_value('image', r_scope.select(image_path).extract())
        il.add_value('url', response.url)
        il.add_value('description',
                     r_scope.select(description_path).extract())
        il.add_value('recipeYield',
                     r_scope.select(recipeYield_path).extract())

        ingredients = []
        for i_scope in r_scope.select(ingredients_path):
            # Use XPaths relative to the current node.  The previous
            # absolute ``//`` paths searched the whole document on every
            # iteration, gluing all amounts (with their <strong> markup)
            # and all names into a single string.
            # Assumes each row holds the amount in <strong> and the name
            # as the cell's own text, as the original XPaths imply --
            # TODO confirm against the live markup.
            for row in i_scope.select('.//tr'):
                amount = "".join(
                    row.select('td/strong/text()').extract()).strip()
                name = "".join(row.select('td/text()').extract()).strip()
                ingredient = ("%s %s" % (amount, name)).strip()
                if ingredient:
                    ingredients.append(ingredient)
        il.add_value('ingredients', ingredients)

        recipes.append(il.load_item())

    return recipes
def parse_item(self, response):
    """Build one recipe item for every ``div#blq-main`` element in the page.

    Each ingredient is assembled as ``"<amount> <name>"``: the amount is the
    first text node of an ingredient paragraph and the name is the text of
    the links inside it.

    Args:
        response: the response handed to ``HtmlXPathSelector``; its ``url``
            becomes the item's ``url``.

    Returns:
        A list with one loaded item per matched scope (empty when nothing
        matches).
    """
    def joined_text(node, xpath):
        # Concatenate every match of ``xpath`` under ``node`` and trim it.
        return "".join(node.select(xpath).extract()).strip()

    selector = HtmlXPathSelector(response)

    # Fields loaded after ``url``, as (item field, XPath) pairs.
    fields_after_url = [
        ("description",
         '//div[@id="description"]//span[@class="summary"]/text()'),
        ("prepTime",
         '//span[@class="prepTime"]/span[@class="value-title"]/@title'),
        ("cookTime",
         '//span[@class="cookTime"]/span[@class="value-title"]/@title'),
        ("recipeYield", '//h3[@class="yield"]/text()'),
    ]

    items = []
    for scope in selector.select('//div[@id="blq-main"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value("source", self.source)
        loader.add_value("name", scope.select("//h1/text()").extract())
        loader.add_value(
            "image", scope.select('//img[@id="food-image"]/@src').extract())
        loader.add_value("url", response.url)
        for field, xpath in fields_after_url:
            loader.add_value(field, scope.select(xpath).extract())

        ingredients = [
            "%s %s" % (joined_text(paragraph, "text()[1]"),
                       joined_text(paragraph, "a/text()"))
            for paragraph in scope.select('//p[@class="ingredient"]')
        ]
        loader.add_value("ingredients", ingredients)

        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item for every element whose class contains ``hrecipe``.

    Name, url and description come from meta tags.  Each ingredient is
    rendered as ``"<quantity> <name>"`` from the two spans nested in an
    ingredient span.

    Args:
        response: the response handed to ``HtmlXPathSelector``.

    Returns:
        A list with one loaded item per matched recipe scope (empty when
        nothing matches).
    """
    page = HtmlXPathSelector(response)
    matched_recipes = page.select("//*[contains(@class,'hrecipe')]")

    xpath_for = {
        'name': '//meta[@property="og:title"]/@content',
        'image': '//*[@itemprop="image"]/@src',
        'url': '//meta[@property="og:url"]/@content',
        'description': '//meta[@name="description"]/@content',
        'recipeYield':
            '//div[@class="time-and-yield"]/*/span[@class="yield"]/text()',
        # text()[last()] skips the HTML comment that precedes the date.
        'datePublished':
            '//div[@class="intro"]/div[@class="display-date"]/text()[last()]',
    }
    # Order in which the simple fields are fed to the loader.
    simple_fields = ('name', 'image', 'url', 'description', 'recipeYield')

    loaded = []
    for recipe in matched_recipes:
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field in simple_fields:
            loader.add_value(field, recipe.select(xpath_for[field]).extract())

        lines = []
        for entry in recipe.select(
                '//ul[@class="ingredients"]/li/span[@class="ingredient"]'):
            quantity_parts = entry.select(
                'span[@class="quantity"]/text()').extract()
            name_parts = entry.select('span[@class="name"]/text()').extract()
            lines.append("%s %s" % ("".join(quantity_parts).strip(),
                                    "".join(name_parts).strip()))
        loader.add_value('ingredients', lines)

        loader.add_value(
            'datePublished',
            recipe.select(xpath_for['datePublished']).extract())
        loaded.append(loader.load_item())
    return loaded
def parse_item(self, response):
    """Build one recipe item for every schema.org ``Recipe`` div in the page.

    Ingredient text nodes are stripped and blank ones are dropped.

    Args:
        response: the response handed to ``HtmlXPathSelector``.

    Returns:
        A list with one loaded item per matched recipe scope (empty when
        nothing matches).
    """
    selector = HtmlXPathSelector(response)

    # Fields loaded before the ingredients, as (item field, XPath) pairs.
    fields_before_ingredients = [
        ('image', '(//div[@class="entry"]//img/@src)[1]'),
        ('name', '//div[@itemprop="name"]/text()'),
        ('url', '//h2[@class="title"]/a/@href'),
    ]
    # Fields loaded after the ingredients.
    fields_after_ingredients = [
        ('recipeYield', '//span[@itemprop="servingSize"]/text()'),
        ('totalTime', '//span[@itemprop="totalTime"]/@content'),
    ]

    items = []
    for scope in selector.select(
            '//div[@itemtype="http://schema.org/Recipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        for field, xpath in fields_before_ingredients:
            loader.add_value(field, scope.select(xpath).extract())

        stripped = (
            node.extract().strip()
            for node in scope.select('//li[@itemprop="ingredients"]/text()'))
        loader.add_value('ingredients', [text for text in stripped if text])

        for field, xpath in fields_after_ingredients:
            loader.add_value(field, scope.select(xpath).extract())

        items.append(loader.load_item())
    return items