Example #1
0
    def parse_item(self, response):
        """Extract every schema.org Recipe found in ``response``.

        Each element whose ``itemtype`` is ``http://schema.org/Recipe`` is
        treated as one recipe scope.  The properties found inside that scope
        are pushed through a ``RecipeItemLoader`` and the loaded items are
        returned as a list (empty when the page has no recipe scope).
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@itemtype="http://schema.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        # Per-recipe paths are relative (".//"): an XPath that starts with
        # "//" is absolute to the document even when it is run on a nested
        # selector, so it would give every recipe the values of *all*
        # recipes on the page.
        name_path = './/*[@itemprop="name"]/text()'
        description_path = './/*[@itemprop="description"]/text()'
        image_path = './/*[@itemprop="image"]/@src'
        recipeYield_path = '*//*[@itemprop="recipeYield"]/text()'

        prepTime_path = './/*[@itemprop="prepTime"]'
        cookTime_path = './/*[@itemprop="cookTime"]'

        ingredients_path = './/*[@itemprop="ingredients"]'

        # The og:url <meta> tag is page-level metadata rather than part of a
        # recipe scope, so this lookup deliberately stays document-wide.
        url_path = '//meta[@property="og:url"]/@content'

        recipes = []
        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', 'allrecipes')
            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', r_scope.select(url_path).extract())
            il.add_value('description',
                         r_scope.select(description_path).extract())

            # parse_iso_date receives the selector result itself, not the
            # extracted text.
            prepTime = r_scope.select(prepTime_path)
            il.add_value('prepTime', parse_iso_date(prepTime))

            cookTime = r_scope.select(cookTime_path)
            il.add_value('cookTime', parse_iso_date(cookTime))
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())

            # One string per ingredient element: the text of its child
            # nodes joined with single spaces.
            ingredients = [
                ' '.join(i_scope.select('node()/text()').extract())
                for i_scope in r_scope.select(ingredients_path)
            ]
            il.add_value('ingredients', ingredients)

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Extract every schema.org Recipe found in ``response``.

        Each element whose ``itemtype`` is ``http://schema.org/Recipe`` is
        treated as one recipe scope.  A ``RecipeItem`` is filled directly
        (no item loader) from the properties found inside that scope, and
        the items are returned as a list (empty when the page has no recipe
        scope).
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@itemtype="http://schema.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        # Per-recipe paths are relative (".//"): an XPath that starts with
        # "//" is absolute to the document even when it is run on a nested
        # selector, so it would give every recipe the values of *all*
        # recipes on the page.
        name_path = './/*[@itemprop="name"]/text()'
        description_path = './/*[@itemprop="description"]/text()'
        image_path = './/*[@itemprop="image"]/@src'
        recipeYield_path = '*//*[@itemprop="recipeYield"]/text()'

        prepTime_path = './/*[@itemprop="prepTime"]'
        cookTime_path = './/*[@itemprop="cookTime"]'

        ingredients_path = './/*[@itemprop="ingredients"]'

        # The og:url <meta> tag is page-level metadata rather than part of a
        # recipe scope, so this lookup deliberately stays document-wide.
        url_path = '//meta[@property="og:url"]/@content'

        recipes = []
        for r_scope in recipes_scopes:
            item = RecipeItem()
            item['source'] = 'allrecipes'
            item['name'] = r_scope.select(name_path).extract()
            item['image'] = r_scope.select(image_path).extract()
            item['url'] = r_scope.select(url_path).extract()
            item['description'] = r_scope.select(description_path).extract()

            # parse_iso_date receives the selector result itself, not the
            # extracted text.
            prepTime = r_scope.select(prepTime_path)
            item['prepTime'] = parse_iso_date(prepTime)

            cookTime = r_scope.select(cookTime_path)
            item['cookTime'] = parse_iso_date(cookTime)
            item['recipeYield'] = r_scope.select(recipeYield_path).extract()

            # One string per ingredient element: the text of its child
            # nodes joined with single spaces.
            item['ingredients'] = [
                ' '.join(i_scope.select('node()/text()').extract())
                for i_scope in r_scope.select(ingredients_path)
            ]

            recipes.append(item)

        return recipes
    def parse_item(self, response):
        """Extract every schema.org Recipe found in ``response``.

        Each element whose ``itemtype`` is ``http://schema.org/Recipe`` is
        treated as one recipe scope.  The properties found inside that scope
        are pushed through a ``RecipeItemLoader`` and the loaded items are
        returned as a list (empty when the page has no recipe scope).
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@itemtype="http://schema.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        # Paths evaluated against a single recipe scope must be relative
        # (".//").  A leading "//" searches the whole document even when the
        # query is issued from a nested selector, which would mix the data
        # of every recipe on the page into each item.
        name_path = './/*[@itemprop="name"]/text()'
        description_path = './/*[@itemprop="description"]/text()'
        image_path = './/*[@itemprop="image"]/@src'
        recipeYield_path = '*//*[@itemprop="recipeYield"]/text()'

        prepTime_path = './/*[@itemprop="prepTime"]'
        cookTime_path = './/*[@itemprop="cookTime"]'

        ingredients_path = './/*[@itemprop="ingredients"]'

        # The og:url <meta> tag describes the page, not one recipe scope, so
        # this lookup is intentionally left document-wide.
        url_path = '//meta[@property="og:url"]/@content'

        recipes = []
        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', 'allrecipes')
            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', r_scope.select(url_path).extract())
            il.add_value('description', r_scope.select(description_path).extract())

            # The time selectors are handed to parse_iso_date unextracted.
            prepTime = r_scope.select(prepTime_path)
            il.add_value('prepTime', parse_iso_date(prepTime))

            cookTime = r_scope.select(cookTime_path)
            il.add_value('cookTime', parse_iso_date(cookTime))
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

            # Collapse each ingredient element into one space-joined string
            # built from the text of its child nodes.
            ingredients = [
                ' '.join(i_scope.select('node()/text()').extract())
                for i_scope in r_scope.select(ingredients_path)
            ]
            il.add_value('ingredients', ingredients)

            recipes.append(il.load_item())

        return recipes
Example #4
0
    def parse_item(self, response):
        """Scrape the recipe on a page into a list of loaded recipe items.

        The page ``<body>`` is used as the recipe scope, so in practice the
        returned list holds at most one item.  Values are fed through a
        ``RecipeItemLoader``; per the original author's note, further
        transforms are applied later by ``openrecipes.pipelines``.
        """
        selector = HtmlXPathSelector(response)

        # The element(s) that contain the recipe info.
        scopes = selector.select('//body')

        # XPath expressions, spelled out once here rather than inside the loop.
        xp_name = '//h1[@itemprop="name"]/text()'
        xp_yield = '//span[@itemprop="yield"]/text()'
        xp_description = '//meta[@name="description"]/@content'
        xp_image = '//img[@class="recipe_image"]/@src'
        xp_cook_time = '//time[@itemprop="totalTime"]'
        xp_prep_time = '//time[@itemprop="activeTime"]'

        # Ingredient markup is inconsistent, so there are two candidate
        # expressions: the second is only tried when the first finds nothing.
        xp_ingredients = '//span[@itemprop="ingredient"]'
        xp_ingredients_fallback = '//div[@id="ingredients"]/ul/li/text()'

        loaded_items = []

        for scope in scopes:
            # a loader wrapping a fresh, empty RecipeItem
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', scope.select(xp_name).extract())
            loader.add_value('image', scope.select(xp_image).extract())
            loader.add_value('description',
                             scope.select(xp_description).extract())
            loader.add_value('url', response.url)

            # parse_iso_date is given the selector results, not extracted text
            prep_nodes = scope.select(xp_prep_time)
            loader.add_value('prepTime', parse_iso_date(prep_nodes))
            cook_nodes = scope.select(xp_cook_time)
            loader.add_value('cookTime', parse_iso_date(cook_nodes))

            loader.add_value('recipeYield', scope.select(xp_yield).extract())

            # Primary form: one entry per ingredient element, made of its
            # stripped text fragments joined with spaces, as UTF-8 bytes.
            ingredients = []
            for node in scope.select(xp_ingredients):
                fragments = node.select('node()/text() | text()').extract()
                joined = ' '.join(piece.strip() for piece in fragments)
                ingredients.append(joined.encode('utf-8'))

            # Fallback form: plain <li> text nodes.
            if not ingredients:
                ingredients = [
                    node.extract().strip().encode('utf-8')
                    for node in scope.select(xp_ingredients_fallback)
                ]

            loader.add_value('ingredients', ingredients)

            loaded_items.append(loader.load_item())

        return loaded_items
Example #5
0
    def parse_item(self, response):
        """Scrape the recipe on a page into a list of ``RecipeItem`` objects.

        The page ``<body>`` is used as the recipe scope, so in practice the
        returned list holds at most one item.  Fields are assigned straight
        onto the item (no item loader); per the original author's note,
        further transforms are applied later by ``openrecipes.pipelines``.
        """
        selector = HtmlXPathSelector(response)

        # The element(s) that contain the recipe info.
        scopes = selector.select('//body')

        # XPath expressions, spelled out once here rather than inside the loop.
        xp_name = '//h1[@itemprop="name"]/text()'
        xp_yield = '//span[@itemprop="yield"]/text()'
        xp_description = '//meta[@name="description"]/@content'
        xp_image = '//img[@class="recipe_image"]/@src'
        xp_cook_time = '//time[@itemprop="totalTime"]'
        xp_prep_time = '//time[@itemprop="activeTime"]'

        # Ingredient markup is inconsistent, so there are two candidate
        # expressions: the second is only tried when the first finds nothing.
        xp_ingredients = '//span[@itemprop="ingredient"]'
        xp_ingredients_fallback = '//div[@id="ingredients"]/ul/li/text()'

        found = []

        for scope in scopes:
            recipe = RecipeItem()

            recipe['source'] = self.source

            recipe['name'] = scope.select(xp_name).extract()
            recipe['image'] = scope.select(xp_image).extract()
            recipe['description'] = scope.select(xp_description).extract()
            recipe['url'] = response.url

            # parse_iso_date is given the selector results, not extracted text
            prep_nodes = scope.select(xp_prep_time)
            recipe['prepTime'] = parse_iso_date(prep_nodes)
            cook_nodes = scope.select(xp_cook_time)
            recipe['cookTime'] = parse_iso_date(cook_nodes)

            recipe['recipeYield'] = scope.select(xp_yield).extract()

            # Primary form: one entry per ingredient element, made of its
            # stripped text fragments joined with spaces, as UTF-8 bytes.
            ingredients = []
            for node in scope.select(xp_ingredients):
                fragments = node.select('node()/text() | text()').extract()
                joined = ' '.join(piece.strip() for piece in fragments)
                ingredients.append(joined.encode('utf-8'))

            # Fallback form: plain <li> text nodes.
            if not ingredients:
                ingredients = [
                    node.extract().strip().encode('utf-8')
                    for node in scope.select(xp_ingredients_fallback)
                ]

            recipe['ingredients'] = ingredients

            found.append(recipe)

        return found