def parse_item(self, response):
    """Turn a crawled response into a list of ``RecipeItem`` objects.

    Responses whose URL ends with ``/review`` yield an empty list. For every
    other response the page is handed to ``parse_recipes`` together with
    ``self.source``; each resulting recipe has its ``photo`` and ``image``
    entries (when present) passed through ``flatten`` before being converted
    with ``RecipeItem.from_dict``.
    """
    # Review pages are hard to tell apart from recipe pages in the link
    # extractor regex, so they are filtered out here instead.
    is_review_page = response.url.endswith('/review')
    if is_review_page:
        return []

    selector = HtmlXPathSelector(response)
    extra_fields = {'source': self.source}
    raw_recipes = parse_recipes(selector, extra_fields)

    # First pass: normalise the picture fields in place on every recipe.
    for entry in raw_recipes:
        for key in ('photo', 'image'):
            if key in entry:
                entry[key] = flatten(entry[key])

    # Second pass: wrap each raw recipe in an item.
    return list(map(RecipeItem.from_dict, raw_recipes))
def parse_item(self, response):
    """Turn a crawled response into a list of ``RecipeItem`` objects.

    Responses whose URL contains ``/reviews/`` yield an empty list. For every
    other response the page is handed to ``parse_recipes`` together with
    ``self.source`` and the response URL; each resulting recipe has its
    ``photo`` and ``image`` entries (when present) passed through ``flatten``
    and then has ``_med.`` replaced by ``_lg.`` before being converted with
    ``RecipeItem.from_dict``.
    """
    # Review pages are hard to tell apart from recipe pages in the link
    # extractor regex, so they are filtered out here instead.
    page_url = response.url
    if '/reviews/' in page_url:
        return []

    selector = HtmlXPathSelector(response)
    extra_fields = {'source': self.source, 'url': response.url}
    raw_recipes = parse_recipes(selector, extra_fields)

    # First pass: normalise the picture fields in place on every recipe.
    for entry in raw_recipes:
        for key in ('photo', 'image'):
            if key not in entry:
                continue
            entry[key] = flatten(entry[key])
            # Swap the "_med." marker in the URL for "_lg.".
            # NOTE(review): presumably selects a larger image variant; confirm.
            entry[key] = entry[key].replace('_med.', '_lg.')

    # Second pass: wrap each raw recipe in an item.
    return list(map(RecipeItem.from_dict, raw_recipes))
def parse_item(self, response):
    """Turn a crawled response into a list of ``RecipeItem`` objects.

    The page is handed to ``self.parse_recipes`` together with ``self.source``
    and the response URL. Each resulting recipe has its ``photo`` and
    ``image`` entries (when present) passed through ``flatten``; a value that
    starts with ``//`` is prefixed with ``http:``. The recipes are then
    converted with ``RecipeItem.from_dict``.
    """
    selector = HtmlXPathSelector(response)
    extra_fields = {'source': self.source, 'url': response.url}
    raw_recipes = self.parse_recipes(selector, extra_fields)

    # First pass: normalise the picture fields in place on every recipe.
    for entry in raw_recipes:
        for key in ('photo', 'image'):
            if key not in entry:
                continue
            link = flatten(entry[key])
            # Scheme-less ("//host/path") links get an explicit http scheme.
            entry[key] = 'http:' + link if link.startswith('//') else link

    # Second pass: wrap each raw recipe in an item.
    return list(map(RecipeItem.from_dict, raw_recipes))
def parse_item(self, response):
    """Build ``RecipeItem`` objects from the recipes found in ``response``.

    ``self.parse_recipes`` receives a selector for the response plus a dict
    carrying ``self.source`` and the response URL. For each recipe returned,
    the ``photo`` and ``image`` entries (when present) are run through
    ``flatten`` and, if the result starts with ``//``, prefixed with
    ``http:``. Every recipe is finally converted via ``RecipeItem.from_dict``
    and the conversions are returned as a list.
    """

    def with_scheme(address):
        # Scheme-less ("//host/path") addresses get an explicit http scheme.
        if address.startswith('//'):
            return 'http:' + address
        return address

    metadata = {
        'source': self.source,
        'url': response.url,
    }
    raw_recipes = self.parse_recipes(HtmlXPathSelector(response), metadata)

    # Normalise the picture fields on all recipes before converting any.
    for raw in raw_recipes:
        if 'photo' in raw:
            raw['photo'] = with_scheme(flatten(raw['photo']))
        if 'image' in raw:
            raw['image'] = with_scheme(flatten(raw['image']))

    items = []
    for raw in raw_recipes:
        items.append(RecipeItem.from_dict(raw))
    return items