def parse_item(self, response):
    """Turn each schema.org Recipe element of the page into a recipe item.

    ``response`` is wrapped in an ``HtmlXPathSelector``; every element whose
    ``itemtype`` is ``http://schema.org/Recipe`` becomes one scope, and one
    item is loaded per scope through ``RecipeItemLoader``.

    Returns the list of objects produced by ``load_item()``, one per scope
    (an empty list when the page has no Recipe element).
    """
    page = HtmlXPathSelector(response)

    # Fields whose value is simply the extracted result of a single XPath,
    # listed in the order they are added to the loader.
    # NOTE(review): paths that begin with '//' are evaluated against the
    # whole document even when run on a nested selector, so every scope
    # sees the same matches for them -- confirm that is intended.
    extracted_fields = (
        ('name', '//*[@itemprop="name"]/text()'),
        ('image', '//*[@itemprop="image"]/@src'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//*[@itemprop="description"]/text()'),
    )
    prep_time_xpath = '//*[@itemprop="prepTime"]'
    cook_time_xpath = '//*[@itemprop="cookTime"]'
    yield_xpath = '*//*[@itemprop="recipeYield"]/text()'
    ingredients_xpath = '//*[@itemprop="ingredients"]'

    loaded_items = []
    for scope in page.select('//*[@itemtype="http://schema.org/Recipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', 'allrecipes')

        for field, xpath in extracted_fields:
            loader.add_value(field, scope.select(xpath).extract())

        # parse_iso_date is handed the selector list itself, not extracted
        # text.
        loader.add_value('prepTime',
                         parse_iso_date(scope.select(prep_time_xpath)))
        loader.add_value('cookTime',
                         parse_iso_date(scope.select(cook_time_xpath)))

        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # One string per ingredient element: the text of its child nodes,
        # joined with single spaces.
        loader.add_value('ingredients', [
            ' '.join(node.select('node()/text()').extract())
            for node in scope.select(ingredients_xpath)
        ])

        loaded_items.append(loader.load_item())

    return loaded_items
def parse_item(self, response):
    """Build a RecipeItem for each schema.org Recipe element in the page.

    ``response`` is wrapped in an ``HtmlXPathSelector``; every element whose
    ``itemtype`` is ``http://schema.org/Recipe`` is treated as one recipe
    scope and yields one ``RecipeItem``.

    Returns a list of the populated ``RecipeItem`` objects (empty when the
    page contains no Recipe element). Extracted fields are stored as the
    lists returned by ``extract()``; ``ingredients`` is a list of strings.
    """
    hxs = HtmlXPathSelector(response)

    base_path = '//*[@itemtype="http://schema.org/Recipe"]'
    recipes_scopes = hxs.select(base_path)

    # NOTE(review): paths that begin with '//' are evaluated against the
    # whole document even when run on a nested selector, so every scope
    # sees the same matches for them -- confirm that is intended.
    name_path = '//*[@itemprop="name"]/text()'
    description_path = '//*[@itemprop="description"]/text()'
    url_path = '//meta[@property="og:url"]/@content'
    image_path = '//*[@itemprop="image"]/@src'
    recipeYield_path = '*//*[@itemprop="recipeYield"]/text()'
    prepTime_path = '//*[@itemprop="prepTime"]'
    cookTime_path = '//*[@itemprop="cookTime"]'
    ingredients_path = '//*[@itemprop="ingredients"]'

    recipes = []
    for r_scope in recipes_scopes:
        item = RecipeItem()
        item['source'] = 'allrecipes'
        item['name'] = r_scope.select(name_path).extract()
        item['image'] = r_scope.select(image_path).extract()
        item['url'] = r_scope.select(url_path).extract()
        item['description'] = r_scope.select(description_path).extract()

        # parse_iso_date is handed the selector list itself, not extracted
        # text.
        item['prepTime'] = parse_iso_date(r_scope.select(prepTime_path))
        item['cookTime'] = parse_iso_date(r_scope.select(cookTime_path))

        item['recipeYield'] = r_scope.select(recipeYield_path).extract()

        # One string per ingredient element: the text of its child nodes,
        # joined with single spaces.
        item['ingredients'] = [
            ' '.join(i_scope.select('node()/text()').extract())
            for i_scope in r_scope.select(ingredients_path)
        ]

        recipes.append(item)

    return recipes
def parse_item(self, response):
    """Scrape the recipe described by the page body into loaded items.

    The whole ``<body>`` is used as the recipe scope, so a page normally
    yields a single item. Each item is filled through ``RecipeItemLoader``;
    its ``source`` comes from ``self.source`` and its ``url`` from
    ``response.url``.

    Returns the list of objects produced by ``load_item()``.
    """
    # Wrapper used to run XPath queries against the response HTML.
    page = HtmlXPathSelector(response)

    # Fields whose value is the extracted result of a single XPath, in the
    # order they are added to the loader.
    extracted_fields = (
        ('name', '//h1[@itemprop="name"]/text()'),
        ('image', '//img[@class="recipe_image"]/@src'),
        ('description', '//meta[@name="description"]/@content'),
    )
    yield_xpath = '//span[@itemprop="yield"]/text()'

    # prepTime is read from the "activeTime" element and cookTime from the
    # "totalTime" element.
    prep_time_xpath = '//time[@itemprop="activeTime"]'
    cook_time_xpath = '//time[@itemprop="totalTime"]'

    # Ingredient markup is inconsistent across pages: try the microdata
    # spans first and fall back to the plain list items when they yield
    # nothing.
    ingredient_xpath = '//span[@itemprop="ingredient"]'
    ingredient_fallback_xpath = '//div[@id="ingredients"]/ul/li/text()'

    loaded_items = []
    # select() returns a list of selectors; for '//body' that is normally
    # one.
    for scope in page.select('//body'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)

        for field, xpath in extracted_fields:
            loader.add_value(field, scope.select(xpath).extract())

        loader.add_value('url', response.url)
        loader.add_value('prepTime',
                         parse_iso_date(scope.select(prep_time_xpath)))
        loader.add_value('cookTime',
                         parse_iso_date(scope.select(cook_time_xpath)))
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # Primary form: join the stripped text fragments of each span with
        # single spaces and encode the result as UTF-8.
        ingredient_lines = [
            ' '.join(
                fragment.strip()
                for fragment in node.select('node()/text() | text()').extract()
            ).encode('utf-8')
            for node in scope.select(ingredient_xpath)
        ]

        # Fallback form: each <li> text node is one ingredient.
        if not ingredient_lines:
            ingredient_lines = [
                text_node.extract().strip().encode('utf-8')
                for text_node in scope.select(ingredient_fallback_xpath)
            ]

        loader.add_value('ingredients', ingredient_lines)
        loaded_items.append(loader.load_item())

    # Further transforms are applied to each item by openrecipes.pipelines.
    return loaded_items
def parse_item(self, response):
    """Scrape the recipe described by the page body into RecipeItem objects.

    The whole ``<body>`` is used as the recipe scope, so a page normally
    yields a single item. ``source`` is taken from ``self.source`` and
    ``url`` from ``response.url``; the other fields are filled from XPath
    queries on the scope.

    Returns the list of populated ``RecipeItem`` objects.
    """
    # Wrapper used to run XPath queries against the response HTML.
    page = HtmlXPathSelector(response)

    # Fields whose value is the extracted result of a single XPath, in the
    # order they are assigned on the item.
    extracted_fields = (
        ('name', '//h1[@itemprop="name"]/text()'),
        ('image', '//img[@class="recipe_image"]/@src'),
        ('description', '//meta[@name="description"]/@content'),
    )
    yield_xpath = '//span[@itemprop="yield"]/text()'

    # prepTime is read from the "activeTime" element and cookTime from the
    # "totalTime" element.
    prep_time_xpath = '//time[@itemprop="activeTime"]'
    cook_time_xpath = '//time[@itemprop="totalTime"]'

    # Ingredient markup is inconsistent across pages: try the microdata
    # spans first and fall back to the plain list items when they yield
    # nothing.
    ingredient_xpath = '//span[@itemprop="ingredient"]'
    ingredient_fallback_xpath = '//div[@id="ingredients"]/ul/li/text()'

    scraped = []
    # select() returns a list of selectors; for '//body' that is normally
    # one.
    for scope in page.select('//body'):
        recipe = RecipeItem()
        recipe['source'] = self.source

        for field, xpath in extracted_fields:
            recipe[field] = scope.select(xpath).extract()

        recipe['url'] = response.url
        recipe['prepTime'] = parse_iso_date(scope.select(prep_time_xpath))
        recipe['cookTime'] = parse_iso_date(scope.select(cook_time_xpath))
        recipe['recipeYield'] = scope.select(yield_xpath).extract()

        # Primary form: join the stripped text fragments of each span with
        # single spaces and encode the result as UTF-8.
        ingredient_lines = [
            ' '.join(
                fragment.strip()
                for fragment in node.select('node()/text() | text()').extract()
            ).encode('utf-8')
            for node in scope.select(ingredient_xpath)
        ]

        # Fallback form: each <li> text node is one ingredient.
        if not ingredient_lines:
            ingredient_lines = [
                text_node.extract().strip().encode('utf-8')
                for text_node in scope.select(ingredient_fallback_xpath)
            ]

        recipe['ingredients'] = ingredient_lines
        scraped.append(recipe)

    # Further transforms are applied to each item by openrecipes.pipelines.
    return scraped