Beispiel #1
0
    def parse_item(self, response):
        """Build the list of ``RecipeItem`` objects for one crawled page.

        Responses whose URL ends in ``/review`` produce an empty list: the
        link extractor regex cannot easily tell review pages apart from
        recipe pages, so they are filtered out here instead.
        """
        is_review_page = response.url.endswith('/review')
        if is_review_page:
            return []

        selector = HtmlXPathSelector(response)
        scraped = parse_recipes(selector, {'source': self.source})

        # Both picture fields are passed through ``flatten`` when present
        # (presumably collapsing a multi-valued field into a single value --
        # confirm against the helper's definition).
        for entry in scraped:
            for field in ('photo', 'image'):
                if field in entry:
                    entry[field] = flatten(entry[field])

        return [RecipeItem.from_dict(entry) for entry in scraped]
Beispiel #2
0
    def parse_item(self, response):
        """Turn a crawled response into a list of ``RecipeItem`` objects.

        Review pages (URL ending in ``/review``) are hard to exclude in the
        link extractor regex, so they are dropped here by returning an empty
        list.
        """
        page_url = response.url
        if page_url.endswith('/review'):
            return []

        context = {'source': self.source}
        found = parse_recipes(HtmlXPathSelector(response), context)

        for item in found:
            # Normalise whichever picture fields the recipe carries; the
            # exact effect of ``flatten`` is defined elsewhere (TODO confirm
            # it reduces the field to a single value).
            for key in ('photo', 'image'):
                if key not in item:
                    continue
                item[key] = flatten(item[key])

        return [RecipeItem.from_dict(item) for item in found]
    def parse_item(self, response):
        """Build the list of ``RecipeItem`` objects for one crawled page.

        Any response whose URL contains ``/reviews/`` yields an empty list,
        because the link extractor regex cannot easily separate review pages
        from recipe pages.
        """
        page_url = response.url
        if '/reviews/' in page_url:
            return []

        selector = HtmlXPathSelector(response)
        defaults = {'source': self.source, 'url': page_url}
        scraped = parse_recipes(selector, defaults)

        for entry in scraped:
            for field in ('photo', 'image'):
                if field not in entry:
                    continue
                flat_url = flatten(entry[field])
                # Swap the '_med.' marker for '_lg.' (presumably selecting a
                # larger rendition of the picture -- confirm with the site's
                # URL scheme).
                entry[field] = flat_url.replace('_med.', '_lg.')

        return [RecipeItem.from_dict(entry) for entry in scraped]
Beispiel #4
0
    def parse_item(self, response):
        """Turn a crawled response into a list of ``RecipeItem`` objects.

        Review pages (``/reviews/`` anywhere in the URL) are hard to exclude
        in the link extractor regex, so they are dropped here by returning an
        empty list.
        """
        current_url = response.url
        if '/reviews/' in current_url:
            return []

        found = parse_recipes(
            HtmlXPathSelector(response),
            {'source': self.source, 'url': current_url},
        )

        for item in found:
            for key in ('photo', 'image'):
                if key in item:
                    # Flatten first, then rewrite '_med.' to '_lg.' in the
                    # resulting string (looks like a medium -> large picture
                    # swap; TODO confirm).
                    picture = flatten(item[key])
                    item[key] = picture.replace('_med.', '_lg.')

        return [RecipeItem.from_dict(item) for item in found]
  def parse_item(self, response):
    """Build the list of ``RecipeItem`` objects for one crawled page.

    Recipes come from ``self.parse_recipes``; their ``photo`` and ``image``
    fields, when present, are flattened and any URL beginning with ``//`` is
    given an explicit ``http:`` scheme.
    """
    selector = HtmlXPathSelector(response)
    defaults = {'source': self.source, 'url': response.url}
    scraped = self.parse_recipes(selector, defaults)

    for entry in scraped:
      for field in ('photo', 'image'):
        if field not in entry:
          continue
        picture_url = flatten(entry[field])
        # A leading '//' means the URL has no scheme; prepend 'http:'.
        if picture_url.startswith('//'):
          picture_url = 'http:' + picture_url
        entry[field] = picture_url

    return [RecipeItem.from_dict(entry) for entry in scraped]
Beispiel #6
0
    def parse_item(self, response):
        """Turn a crawled response into a list of ``RecipeItem`` objects.

        The recipes returned by ``self.parse_recipes`` have their ``photo``
        and ``image`` fields (if any) flattened, and scheme-less URLs that
        start with ``//`` are prefixed with ``http:``.
        """
        context = {'source': self.source, 'url': response.url}
        found = self.parse_recipes(HtmlXPathSelector(response), context)

        for item in found:
            for key in ('photo', 'image'):
                if key in item:
                    link = flatten(item[key])
                    # Only URLs starting with '//' are rewritten; everything
                    # else is stored exactly as flattened.
                    item[key] = 'http:' + link if link.startswith('//') else link

        return [RecipeItem.from_dict(item) for item in found]