def parse_item(self, response):
        """Extract one recipe item per hRecipe blockquote in ``response``.

        Returns a list of loaded ``RecipeItem`` objects; the list is empty
        when no matching blockquote exists on the page.
        """
        selector = HtmlXPathSelector(response)

        # The recipe blockquote carries several classes, so "hrecipe" is
        # matched as a whole token inside the class attribute.
        scope_xpath = """//blockquote[contains(concat(' ', normalize-space(@class), ' '), ' hrecipe ')]"""

        title_xpath = '//*[@class="fn"]/text()'

        # (item field, XPath) pairs loaded after the URL, in loading order.
        # NOTE: every path starts with '//' and is therefore absolute: it is
        # matched against the whole document, not only the current scope.
        trailing_fields = (
            ('image', '//img[@class="photo"]/@src'),
            ('description', '//*[@class="summary"]/p/text()'),
            ('prepTime', '//*[@class="preptime"]/text()'),
            ('cookTime', '//*[@class="cooktime"]/text()'),
            ('totalTime', '//*[@class="duration"]/text()'),
            ('recipeYield', '//*[@class="yield"]/text()'),
            ('ingredients', '//*[@class="ingredient"]/p/text() | //*[@class="ingredient"]/span/text()'),
        )

        def build_item(scope):
            # Populate a fresh loader from a single recipe scope.
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('url', response.url)
            for field, xpath in trailing_fields:
                loader.add_value(field, scope.select(xpath).extract())
            return loader.load_item()

        return [build_item(scope) for scope in selector.select(scope_xpath)]
Beispiel #2
0
    def parse_item(self, response):
        """Parse a recipe page, preferring ``parse_recipes`` over manual XPaths.

        When ``parse_recipes`` returns something truthy (the case the original
        author labelled "schema.org"), every parsed recipe gets the page image
        attached and a list of ``RecipeItem`` objects is returned. Otherwise a
        single item is assembled by hand with a loader and returned on its
        own (not wrapped in a list).
        """
        hxs = HtmlXPathSelector(response)

        # Lazily-loaded image URL(s) taken from the data-lazy-src attribute.
        lazy_images = hxs.select("descendant-or-self::img[@class and contains(@class, 'wp-image')][1]/@data-lazy-src").extract()

        parsed = parse_recipes(hxs, {'source': self.source, 'url': response.url})
        if not parsed:
            # Fallback: nothing was parsed, so scrape the markup directly.
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('url', response.url)
            loader.add_value('image', lazy_images)

            title = hxs.select('//*[@class="post-title"]/h1/text()').extract()
            loader.add_value('name', title)

            # Ingredients may sit in paragraphs or in list items; both kinds
            # of container are vetted by is_ingredient_container().
            container_xpaths = (
                '//div[@id="recipe" or @class="span9"]/p',
                '//*[@class="span9"]//ul/li',
            )
            for container_xpath in container_xpaths:
                for node in hxs.select(container_xpath):
                    if is_ingredient_container(node):
                        loader.add_value('ingredients', node.select('text()').extract())

            # ...or in explicitly marked-up <li class="ingredient"> elements.
            for text_node in hxs.select('//li[@class="ingredient"]/text()'):
                loader.add_value('ingredients', text_node.extract())

            return loader.load_item()

        # Attach the image to every parsed recipe before converting them.
        for entry in parsed:
            entry['image'] = lazy_images

        return [RecipeItem.from_dict(entry) for entry in parsed]
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each element with class "post hrecipe".

        Returns the list of loaded items; it is empty when the page has no
        matching element.
        """
        page = HtmlXPathSelector(response)

        # NOTE: these paths start with '//' and so are absolute: they match
        # against the whole document, not only the recipe scope.
        title_xpath = '//*[@class="title fn"]/text()'
        photo_xpath = '//*[@class="photo"]/@src'
        ingredient_xpath = '//ul[@class="ingredient_list"]/li/text()'

        items = []
        for scope in page.select('//*[@class="post hrecipe"]'):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('ingredients', scope.select(ingredient_xpath).extract())
            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Build recipe items from every "post hrecipe" element on the page.

        Each matching element yields one loaded ``RecipeItem`` holding the
        source, name, image, URL and ingredients. Returns them as a list.
        """
        selector = HtmlXPathSelector(response)

        def extract(scope, xpath):
            # Run an XPath on the scope and return the extracted strings.
            return scope.select(xpath).extract()

        def to_item(scope):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', extract(scope, '//*[@class="title fn"]/text()'))
            loader.add_value('image', extract(scope, '//*[@class="photo"]/@src'))
            loader.add_value('url', response.url)
            loader.add_value('ingredients',
                             extract(scope, '//ul[@class="ingredient_list"]/li/text()'))
            return loader.load_item()

        return [to_item(scope) for scope in selector.select('//*[@class="post hrecipe"]')]
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each ``<div class="recipe-details">``.

        Ingredients are built by pairing every amount with the ingredient
        name and joining the two with a space. Returns the list of loaded
        items (empty when the page has no recipe-details element).
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the recipe
        # info
        base_path = """//div[@class="recipe-details"]"""

        # select() returns a list of selectors, one per matching element
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below
        name_path = '//h1[@itemprop="name"]/text()'
        recipeYield_path = '//label[@for="set_servings"]/input/@value'
        description_path = '//span[@itemprop="summary"]/p/text()'
        image_path = '//img[@class="the_recipe_image"]/@src'
        cookTime_path = '//form/p/time[@itemprop="cookTime"]/@datetime'
        prepTime_path = '//form/p/time[@itemprop="prepTime"]/@datetime'
        ingredients_path = '//span[@itemprop="ingredient"]'
        ingredients_amounts_path = './span[@itemprop="amount"]/text()'
        # Fixed: this previously repeated the "amount" path, so every
        # ingredient was rendered as "<amount> <amount>" without its name.
        ingredients_names_path = './span[@itemprop="name"]/text()'
        datePublished_path = '//span[@itemprop="published"]/@datetime'

        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('description', r_scope.select(description_path).extract())
            il.add_value('url', response.url)
            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())
            il.add_value('datePublished', r_scope.select(datePublished_path).extract())

            # Simpler to grab the amount and name spans separately,
            # then combine them into a string.
            # NOTE(review): zip() pairs by position, so an ingredient without
            # an amount (or name) would shift the pairing - confirm the
            # markup always provides both.
            ingredient_scopes = r_scope.select(ingredients_path)
            amount = ingredient_scopes.select(ingredients_amounts_path).extract()
            name = ingredient_scopes.select(ingredients_names_path).extract()
            ingredients = [" ".join(ing).encode('utf-8') for ing in zip(amount, name)]

            il.add_value('ingredients', ingredients)

            # stick this RecipeItem in the array of recipes we will return
            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each schema.org Recipe ``<div>``.

        Returns the list of loaded items (empty when the page has no
        matching element). A recipe whose page yields fewer than two image
        matches is still returned, just without an image.
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the
        # recipe info
        base_path = """//div[@itemtype="http://schema.org/Recipe"]"""

        # select() returns a list of selectors, one per matching element
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below
        name_path = '//div[@itemprop="name"]/text() | //*[@itemprop="name"]//*[@class="fn"]/text()'
        description_path = '//div[@itemprop="description"]/text()'
        image_path = '//img[1]/@src'
        prepTime_path = '//time[@itemprop="prepTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="prepTime"]//*[@class="value-title"]/@title'
        cookTime_path = '//time[@itemprop="cookTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="cookTime"]//*[@class="value-title"]/@title'
        recipeYield_path = '//span[@itemprop="recipeYield"]/text()'
        ingredients_path = '//li[@itemprop="ingredients"]/text()'
        datePublished = '//abbr[@class="published"]/text()'

        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())

            # There's a bunch of images for each recipe; the match at index 1
            # (the second one) is used. Slicing instead of indexing avoids an
            # IndexError, which used to abort the whole page when fewer than
            # two images matched.
            # NOTE(review): the original comment said "grab the first" while
            # the code took index 1 - confirm which image is wanted.
            image_urls = r_scope.select(image_path).extract()
            il.add_value('image', image_urls[1:2])
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())
            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())
            il.add_value('ingredients',
                         r_scope.select(ingredients_path).extract())
            il.add_value('datePublished',
                         r_scope.select(datePublished).extract())

            # stick this RecipeItem in the array of recipes we will return
            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
Beispiel #7
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each ``<article class="hrecipe">`` element.

        Returns the list of loaded items; it is empty when the page has no
        such article.
        """
        # selector used to run XPath queries against the response HTML
        page = HtmlXPathSelector(response)

        # XPath strings are defined once, outside the per-recipe loop.
        # NOTE: they start with '//' and so are absolute: they match against
        # the whole document, not only the recipe scope.
        title_xpath = '//h1/text()'
        yield_xpath = '//span[@class="info yield"]/text()'
        photo_xpath = '//section[@class="content-unit"]/img/@src'
        prep_xpath = '//span[@class="info preptime"]/text()'
        cook_xpath = '//span[@class="info duration"]/text()'
        ingredient_xpath = '//div[@class="ingredients-section"]/ul/li/span/text()'
        published_xpath = '//footer/time/text()'

        items = []

        # one scope per recipe container found on the page
        for scope in page.select("""//article[@class="hrecipe"]"""):
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)

            loader.add_value('prepTime', scope.select(prep_xpath).extract())
            loader.add_value('cookTime', scope.select(cook_xpath).extract())
            loader.add_value('recipeYield', scope.select(yield_xpath).extract())

            # extract each ingredient text node individually
            ingredient_texts = [
                node.extract() for node in scope.select(ingredient_xpath)
            ]
            loader.add_value('ingredients', ingredient_texts)

            loader.add_value('datePublished', scope.select(published_xpath).extract())

            items.append(loader.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return items
    def parse_item_alt1(self, response):
        """Load a ``RecipeItem`` for every ``<blockquote>`` on the page.

        Name, description, URL and dates come from ``<meta>`` tags; yield,
        prep time and cook time are pulled with regular expressions out of
        the paragraph text that starts with "Yields". Returns the list of
        loaded items.
        """
        selector = HtmlXPathSelector(response)

        # XPath strings are defined once, outside the per-recipe loop
        title_xpath = '//meta[@property="og:title"]/@content'
        summary_xpath = '//meta[@property="og:description"]/@content'
        canonical_xpath = '//meta[@property="og:url"]/@content'
        # just grab the first image we can find
        photo_xpath = '//div[@class="post"]/p[1]/img/@src'
        ypc_xpath = './/p/text()[starts-with(.,"Yields")]'
        # ingredients always seem to follow the "Yields" paragraph
        ingredient_xpath = './/p[starts-with(text(),"Yields")]/following-sibling::p[1]'
        published_xpath = '//meta[@property="article:published_time"]/@content'
        modified_xpath = '//meta[@property="article:modified_time"]/@content'

        # (item field, pattern) pairs applied to the "Yields ..." text; the
        # captured group runs up to the next "|" separator.
        ypc_patterns = (
            ('recipeYield', r'Yields?:?\s([^|]+)'),
            ('prepTime', r'.+Prep(?: Time)?:?\s([^|]+)'),
            ('cookTime', r'.+Cook Time:?\s([^|]+)'),
        )

        items = []

        for scope in selector.select("""//blockquote"""):
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', scope.select(canonical_xpath).extract())
            loader.add_value('description', scope.select(summary_xpath).extract())

            # might be able to make this bit more robust, which would probably
            # let us hit more recipes on this site
            ypc_text = "".join(scope.select(ypc_xpath).extract())
            for field, pattern in ypc_patterns:
                found = re.match(pattern, ypc_text, re.I)
                if found:
                    loader.add_value(field, found.group(1))

            loader.add_value('ingredients', scope.select(ingredient_xpath).extract())

            loader.add_value('datePublished', scope.select(published_xpath).extract())
            loader.add_value('dateModified', scope.select(modified_xpath).extract())

            items.append(loader.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return items
Beispiel #9
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each schema.org Recipe element.

        Ingredients are assembled per ``ingredients-list`` ``<ul>`` from the
        amount, unit and name texts. Returns the list of loaded items (empty
        when the page has no matching element).
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@itemtype="http://schema.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="recipe-title"]/text()'
        # TODO: description is not extracted; no consistent selector is known
        # for it on this site.
        image_path = '//*[@itemprop="image"]/@src'
        # Fixed: the path previously ended in "/number()", which is not a
        # valid XPath 1.0 location step; text() matches the other fields.
        prepTime_path = '//*[@class="prep-time tooltip-element"]/text()'
        cookTime_path = '//*[@class="total-time tooltip-element"]/text()'
        recipeYield_path = '//*[@itemprop="recipeYield"]/text()'
        # may have to make ingredients more generic
        ingredients_path = '//*[@class="ingredients-list"]/ul'
        datePublished = '//*[@class="date published time"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)

            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())

            ingredient_scopes = r_scope.select(ingredients_path)
            ingredients = []
            for i_scope in ingredient_scopes:
                # NOTE(review): the amount and name paths start with '//' and
                # are therefore matched against the whole document, while the
                # unit path is relative to this <ul> - confirm whether all
                # three were meant to be relative.
                amount = i_scope.select(
                    '//*[@class="ingredient-n"]/text()').extract()
                ingredient_unit = i_scope.select(
                    '*//*[@class="ingredient-unit"]/text()').extract()
                name = i_scope.select(
                    '//*[@class="ingredient-name"]/text()').extract()
                amount = "".join(amount).strip()
                ingredient_unit = "".join(ingredient_unit).strip()
                name = "".join(name).strip()
                # Fixed: the old format string had two placeholders for three
                # values and raised TypeError. Empty parts are skipped so a
                # missing unit does not leave a double space.
                ingredient = " ".join(
                    part for part in (amount, ingredient_unit, name) if part)
                if ingredient:
                    ingredients.append(ingredient)
            il.add_value('ingredients', ingredients)

            il.add_value('datePublished',
                         r_scope.select(datePublished).extract())

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Return a one-element list holding the recipe found on the page.

        The item is seeded from the result of ``parse_recipe`` and then
        topped up with image, description and name values taken directly
        from the markup.
        """
        selector = HtmlXPathSelector(response)

        # Seed the item from parse_recipe(), giving it the URL and source.
        seed = parse_recipe(selector, {"url": response.url, "source": self.source})
        il = RecipeItemLoader(item=RecipeItem.from_dict(seed))

        image_urls = select_class(selector, "post_image").select("@src").extract()
        il.add_value("image", image_urls)

        meta_description = selector.select('//meta[@name="description"]/@content').extract()
        il.add_value("description", meta_description)

        titles = select_class(selector, "entry-title").select("text()").extract()
        il.add_value("name", titles)

        return [il.load_item()]
    def parse_item(self, response):
        """Return the single recipe item found on the page.

        The item starts from the result of ``parse_recipe`` and is completed
        with image, description and name values read from the markup. The
        loaded item is returned on its own, not wrapped in a list.
        """
        page = HtmlXPathSelector(response)

        context = {'url': response.url, 'source': self.source}
        base_item = RecipeItem.from_dict(parse_recipe(page, context))
        il = RecipeItemLoader(item=base_item)

        # Each entry supplies the extracted values for one item field.
        il.add_value('image', select_class(page, 'post_image').select('@src').extract())
        il.add_value('description', page.select('//meta[@name="description"]/@content').extract())
        il.add_value('name', select_class(page, 'entry-title').select('text()').extract())

        return il.load_item()
Beispiel #12
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each content block of the one-page layout.

        Returns the list of loaded items (empty when the container XPath
        matches nothing).
        """
        # selector used to run XPath queries against the response HTML
        page = HtmlXPathSelector(response)

        # element that contains the recipe info
        container_xpath = """//*[@id="container"]/*[@class="onepage"]/div[1]/div[@class="content"]"""

        # XPath strings are defined once, outside the per-recipe loop
        title_xpath = '//h1[@class="title"]/text() | //*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[strong]/strong/text()'
        photo_xpath = '//*[@class="content"]/p[1]/img[contains(@class, "size-full")]/@src'
        yield_xpath = '//*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[em and strong]/em/text()'
        published_xpath = '//*[@class="phn-date"]/a[@rel="author"]/following-sibling::text()'

        # This site contains Ingredients and Garnishes; both "lists" live in
        # a <p> and are separated by <br>s. The <p> containing "EVENT VENUE
        # PARTY SIZE TYPE MENU" is skipped by only taking <p>s that have no
        # <strong>, <a>, or <img> child elements.
        ingredient_xpath = '//*[@class="content"]/p[not(strong or a or img) and br]/text()'

        items = []

        for scope in page.select(container_xpath):
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)

            loader.add_value('recipeYield', scope.select(yield_xpath).extract())

            # date returns something like this: "ON SATURDAY NOV 28TH, 2009 |"
            # NOTE(review): the replace() below is case-sensitive and only
            # strips a lowercase "on" - confirm the casing of the raw text.
            date_parts = scope.select(published_xpath).extract()
            if date_parts:
                cleaned = date_parts[0].replace('on', '', 1).replace('|', '').strip()
                loader.add_value('datePublished', cleaned)

            loader.add_value('ingredients', scope.select(ingredient_xpath).extract())

            items.append(loader.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return items
    def parse_item(self, response):
        """Build recipe items from every ``<article class="hrecipe">``.

        Returns a list with one loaded ``RecipeItem`` per matching article.
        """
        # selector used to run XPath queries against the response HTML
        selector = HtmlXPathSelector(response)

        # (item field, XPath) pairs, grouped by where they fall relative to
        # the URL so that values are added in a fixed order.
        # NOTE: the paths start with '//' and so are absolute: they match
        # against the whole document, not only the recipe scope.
        fields_before_url = (
            ('name', '//h1/text()'),
            ('image', '//section[@class="content-unit"]/img/@src'),
        )
        fields_after_url = (
            ('prepTime', '//span[@class="info preptime"]/text()'),
            ('cookTime', '//span[@class="info duration"]/text()'),
            ('recipeYield', '//span[@class="info yield"]/text()'),
        )
        ingredient_xpath = '//div[@class="ingredients-section"]/ul/li/span/text()'
        published_xpath = '//footer/time/text()'

        def build_item(scope):
            # Fill a fresh loader from one recipe container.
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in fields_before_url:
                loader.add_value(field, scope.select(xpath).extract())
            loader.add_value('url', response.url)
            for field, xpath in fields_after_url:
                loader.add_value(field, scope.select(xpath).extract())

            # extract each ingredient text node individually
            loader.add_value(
                'ingredients',
                [node.extract() for node in scope.select(ingredient_xpath)])

            loader.add_value('datePublished', scope.select(published_xpath).extract())
            return loader.load_item()

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return [build_item(scope) for scope in selector.select("""//article[@class="hrecipe"]""")]
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each schema.org Recipe ``<div>``.

        Descriptive fields come from Open Graph ``<meta>`` tags, timing
        fields from ``<time>`` elements inside the recipe scope. Blank
        ingredient strings are dropped. Returns the list of loaded items.
        """
        selector = HtmlXPathSelector(response)

        # (item field, XPath) pairs loaded before the ingredients, in order.
        leading_fields = (
            ('description', '//meta[@property="og:description"]/@content'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('name', '//meta[@property="og:title"]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('datePublished', '//div[@class="metabar-pad"]//time/@datetime'),
            ('creator', '//span[@itemprop="author"]/text()'),
        )

        ingredient_xpath = '//li[@itemprop="ingredients"]/text()'

        # (item field, XPath) pairs loaded after the ingredients, in order.
        # The time paths accept either a nested "PT..." title attribute or
        # the element's own datetime attribute.
        trailing_fields = (
            ('cookTime', './/time[@itemprop="cookTime"]//*[starts-with(@title, "PT")]/@title | .//time[@itemprop="cookTime"]/@datetime'),
            ('prepTime', './/time[@itemprop="prepTime"]//*[starts-with(@title, "PT")]/@title | .//time[@itemprop="prepTime"]/@datetime'),
            ('recipeCategory', '//span[@itemprop="recipeCategory"]/text()'),
            ('recipeYield', '//span[@itemprop="recipeYield"]/text()'),
            ('totalTime', './/time[@itemprop="totalTime"]//*[starts-with(@title, "PT")]/@title | .//time[@itemprop="totalTime"]/@datetime'),
        )

        items = []
        for scope in selector.select("""//div[@itemtype="http://schema.org/Recipe"]"""):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in leading_fields:
                loader.add_value(field, scope.select(xpath).extract())

            # strip every ingredient text and keep only the non-empty ones
            stripped = (node.extract().strip() for node in scope.select(ingredient_xpath))
            loader.add_value('ingredients', [text for text in stripped if text])

            for field, xpath in trailing_fields:
                loader.add_value(field, scope.select(xpath).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each schema.org Recipe ``<div>``.

        Returns the list of loaded items (empty when the page has no
        matching element). A recipe whose page yields fewer than two image
        matches is still returned, just without an image.
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the
        # recipe info
        base_path = """//div[@itemtype="http://schema.org/Recipe"]"""

        # select() returns a list of selectors, one per matching element
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below
        name_path = '//div[@itemprop="name"]/text() | //*[@itemprop="name"]//*[@class="fn"]/text()'
        description_path = '//div[@itemprop="description"]/text()'
        image_path = '//img[1]/@src'
        prepTime_path = '//time[@itemprop="prepTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="prepTime"]//*[@class="value-title"]/@title'
        cookTime_path = '//time[@itemprop="cookTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="cookTime"]//*[@class="value-title"]/@title'
        recipeYield_path = '//span[@itemprop="recipeYield"]/text()'
        ingredients_path = '//li[@itemprop="ingredients"]/text()'
        datePublished = '//abbr[@class="published"]/text()'

        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())

            # There's a bunch of images for each recipe; the match at index 1
            # (the second one) is used. Slicing instead of indexing avoids an
            # IndexError, which used to abort the whole page when fewer than
            # two images matched.
            # NOTE(review): the original comment said "grab the first" while
            # the code took index 1 - confirm which image is wanted.
            image_urls = r_scope.select(image_path).extract()
            il.add_value('image', image_urls[1:2])
            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())
            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())
            il.add_value('ingredients', r_scope.select(ingredients_path).extract())
            il.add_value('datePublished', r_scope.select(datePublished).extract())

            # stick this RecipeItem in the array of recipes we will return
            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each element with class "recipe hrecipe".

        Ingredient lines that start with "For " or end with a colon are
        treated as section labels and left out. Returns the list of loaded
        items.
        """
        page = HtmlXPathSelector(response)

        title_xpath = './/*[@class="fn"]/text()'
        # TODO: description is not extracted; its markup is irregular on
        # this site.
        photo_xpath = '//div/p[1]//img/@src'
        prep_xpath = '//*[@class="preptime"]/text()'
        cook_xpath = '//*[@class="cooktime"]/text()'
        yield_xpath = '//*[@class="yield"]/text()'
        ingredient_xpath = './/div[@class="ingredient"]/p/text()'
        # the formatting is odd; this grabs the whole postmeta text rather
        # than just the date
        published_xpath = '//*[@class="postmeta"]/text()'

        section_label = re.compile(r'^For ')

        items = []

        for scope in page.select('//*[@class="recipe hrecipe"]'):
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)

            loader.add_value('prepTime', scope.select(prep_xpath).extract())
            loader.add_value('cookTime', scope.select(cook_xpath).extract())
            loader.add_value('recipeYield', scope.select(yield_xpath).extract())

            kept = []
            for node in scope.select(ingredient_xpath):
                text = node.extract().strip()
                # skip section labels such as "For the sauce" or "Topping:"
                if section_label.match(text) or text.endswith(':'):
                    continue
                kept.append(text)
            loader.add_value('ingredients', kept)

            loader.add_value('datePublished', scope.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Extract recipes from a post page.

        Args:
            response: the crawled page. A URL containing ``/ingredients/`` or
                ``/category/`` is skipped.

        Returns:
            A list of loaded RecipeItems, one per ``div`` with class ``blog``;
            an empty list for skipped URLs.
        """
        if '/ingredients/' in response.url or '/category/' in response.url:
            return []

        hxs = HtmlXPathSelector(response)

        base_path = '//div[@class="blog"]'

        recipes_scopes = hxs.select(base_path)

        name_path = 'h1/a[@rel="bookmark"]/text()'
        description_path = '//meta[@property="og:description"]/@content'
        image_path = '//meta[@property="og:image"][1]/@content'
        recipeYield_path = './/*[@class="yield"]//text()[normalize-space()]'
        ingredients_path = './/*[@class="ingredient"]'
        datePublished_path = '//div[@class="blurb"]/strong/text()[1]'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())

            # the non-blank text nodes under the yield element are joined
            # into a single string
            il.add_value('recipeYield',
                         ' '.join(r_scope.select(recipeYield_path).extract()))

            ingredients = []
            for ingredient_node in r_scope.select(ingredients_path):
                pieces = [
                    piece.strip() for piece in ingredient_node.select(
                        './/text()[normalize-space()]').extract()
                ]
                ingredients.append(' '.join(pieces))

            il.add_value('ingredients', ingredients)

            # The extracted text gets its own name so the XPath stays intact
            # for the next scope, and a page without the blurb is skipped
            # instead of raising IndexError.
            published = r_scope.select(datePublished_path).extract()
            if published:
                il.add_value(
                    'datePublished',
                    published[0].replace('Posted on', '').replace('in',
                                                                  '').strip())

            recipes.append(il.load_item())

        return recipes
Beispiel #18
0
    def parse_item(self, response):
        """Turn each ``hrecipe`` element of the page into a loaded RecipeItem.

        Prep time, cook time and yield are not scraped. Ingredient text that
        begins with "For " or ends with ":" is left out of the list.

        Returns a list of loaded items.
        """
        page = HtmlXPathSelector(response)
        scopes = page.select('//*[@class="hrecipe"]')

        xpaths = {
            'name': '//h1[@class="fn"]/text()',
            'description': '//*[@class="format_text entry-content jpibfi_container"]/p/text()',
            # per the original author's note, the src URL carries 150x150
            # dimensions that are not stripped here
            'image': '//*[@class="photo"]/@src',
            'ingredients': './/*[@class="ingredient"]/p/text()',
            'datePublished': '//span[@class="published"]/text()',
        }

        results = []
        for_prefix = re.compile(r'^For ')

        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            for field in ('name', 'image'):
                loader.add_value(field, scope.select(xpaths[field]).extract())
            loader.add_value('url', response.url)
            loader.add_value(
                'description', scope.select(xpaths['description']).extract())

            kept = []
            for text_node in scope.select(xpaths['ingredients']):
                text = text_node.extract().strip()
                if for_prefix.match(text) or text.endswith(':'):
                    continue
                kept.append(text)
            loader.add_value('ingredients', kept)

            loader.add_value(
                'datePublished',
                scope.select(xpaths['datePublished']).extract())

            results.append(loader.load_item())

        return results
Beispiel #19
0
    def parse_item(self, response):
        """Extract recipes from ``span`` elements with class ``hrecipe``.

        The image ``src`` is resolved against the response URL. prepTime,
        cookTime and datePublished are not available and are added as None.

        Args:
            response: the crawled page.

        Returns:
            A list of loaded RecipeItems, one per matching element.
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the
        # recipe info
        base_path = """//span[@class="hrecipe"]"""

        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below
        name_path = '//div[@class="content"]/header/h1[@class="fn"]/text()'
        description_path = '//article[@class="recipe_description"]//text()'
        image_path = '//div[@class="recipe_image_main"]/p/img/@src'
        recipeYield_path = '//div[@class="recipe_meta"]/p/span[contains(@class,"yield")]/text()'
        ingredients_path = '//article[@class="ingredients"]//ul//li/p[@class="ingredient"]/span[@class="value"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())

            # A recipe without a main image used to raise IndexError on
            # pop(0); now the image field is simply left unset.
            image_srcs = r_scope.select(image_path).extract()
            if image_srcs:
                il.add_value('image', urljoin(response.url, image_srcs[0]))

            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())

            # prepTime not available
            il.add_value('prepTime', None)
            # cookTime not available
            il.add_value('cookTime', None)
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())
            il.add_value('ingredients', r_scope.select(ingredients_path).extract())

            # datePublished not available
            il.add_value('datePublished', None)

            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
    def parse_item(self, response):
        """Load a RecipeItem for every ``recipe hrecipe`` element on the page.

        Ingredient strings starting with "For " or ending with ":" are
        dropped. datePublished is not scraped.

        Returns a list of loaded items.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//*[@class="recipe hrecipe"]')

        def pull(scope, xpath):
            # run the XPath through the scope and return the extracted strings
            return scope.select(xpath).extract()

        items = []
        section_pattern = re.compile(r'^For ')

        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', pull(scope, '//*[@class="fn"]/text()'))
            loader.add_value('image', pull(scope, '//p[1]/span/img/@src'))
            loader.add_value('url', response.url)
            loader.add_value(
                'description', pull(scope, '//*[@class="summary"]/p/text()'))
            loader.add_value(
                'prepTime', pull(scope, '//*[@class="preptime"]/text()'))
            loader.add_value(
                'cookTime', pull(scope, './/*[@class="cooktime"]/text()'))
            loader.add_value(
                'recipeYield', pull(scope, '//*[@class="yield"]/text()'))

            stripped = [
                node.extract().strip()
                for node in scope.select('.//*[@class="ingredient"]/p/text()')
            ]
            loader.add_value('ingredients', [
                line for line in stripped
                if not (section_pattern.match(line) or line.endswith(':'))
            ])

            items.append(loader.load_item())

        return items
Beispiel #21
0
    def parse_item(self, response):
        """Load a RecipeItem for each element with id ``recipe``.

        Each ``ingredients`` element contributes one "<amount> <name>" string
        built by concatenating everything its two inner XPaths extract. The
        source, prep/cook time and datePublished fields are not set here.

        Returns a list of loaded items.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//*[@id="recipe"]')

        def joined(node, xpath):
            # concatenate every extracted string, then trim the ends
            return "".join(node.select(xpath).extract()).strip()

        collected = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value(
                'name',
                scope.select(
                    '//*[@class="row page_title clearfix"]/h2/text()').extract())
            loader.add_value(
                'image',
                scope.select(
                    '//*[@class="featured_image"]/img[@class="image"]/@src'
                ).extract())
            loader.add_value('url', response.url)
            loader.add_value(
                'description',
                scope.select('//*[@class="entry"]/p//text()').extract())
            loader.add_value(
                'recipeYield',
                scope.select(
                    '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
                ).extract())

            lines = []
            for table in scope.select('*//*[@class="ingredients"]'):
                quantity = joined(table, '//td/strong')
                label = joined(
                    table, '//*[@class="ingredients"]/tbody/tr/td/text()')
                lines.append("%s %s" % (quantity, label))
            loader.add_value('ingredients', lines)

            collected.append(loader.load_item())

        return collected
    def parse_item(self, response):
        """Extract recipes from elements marked up as schema.org/Recipe.

        Args:
            response: the crawled page.

        Returns:
            A list of loaded RecipeItems, one per matching element. No
            description is scraped.
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@itemtype="http://schema.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="recipe-title"]/text()'
        image_path = '//*[@itemprop="image"]/@src'
        # "number()" is not a valid XPath 1.0 location step, so the old
        # expression could not be evaluated; read the text node instead,
        # as cookTime_path does.
        prepTime_path = '//*[@class="prep-time tooltip-element"]/text()'
        cookTime_path = '//*[@class="total-time tooltip-element"]/text()'
        recipeYield_path = '//*[@itemprop="recipeYield"]/text()'
        # may have to make ingredients more generic
        ingredients_path = '//*[@class="ingredients-list"]/ul'
        datePublished = '//*[@class="date published time"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)

            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

            # NOTE(review): each <ul> yields a single string made of all the
            # amounts, units and names the XPaths match - confirm this is the
            # intended granularity.
            ingredients = []
            for i_scope in r_scope.select(ingredients_path):
                amount = "".join(i_scope.select(
                    '//*[@class="ingredient-n"]/text()').extract()).strip()
                ingredient_unit = "".join(i_scope.select(
                    '*//*[@class="ingredient-unit"]/text()').extract()).strip()
                name = "".join(i_scope.select(
                    '//*[@class="ingredient-name"]/text()').extract()).strip()
                # three values need three placeholders; with two the
                # formatting raised TypeError
                ingredients.append(
                    "%s %s %s" % (amount, ingredient_unit, name))
            il.add_value('ingredients', ingredients)

            il.add_value('datePublished', r_scope.select(datePublished).extract())

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Load a RecipeItem for each ``div`` with id ``recipe``.

        Description, URL and image are taken from the page's Open Graph meta
        tags. The yield XPath is a union of several "Makes"/"Serves"
        patterns. Raw ``blockquote`` children are stored as the ingredients.

        Returns a list of loaded items.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select("""//div[@id="recipe"]""")

        # the yield is formatted inconsistently, so try every known variant
        yield_variants = (
            '//div[@id="recipe"]/p[starts-with(i,"Makes")]/i',
            '//div[@id="recipe"]/p[starts-with(i,"Serves")]/i',
            '//div[@id="recipe"]/p[starts-with(em,"Makes")]/em',
            '//div[@id="recipe"]/p[starts-with(em,"Serves")]/em',
            '//div[@id="recipe"][starts-with(p,"Makes")]/p',
            '//div[@id="recipe"][starts-with(p,"Serves")]/p',
        )

        # field name -> XPath, listed in the order the loader is filled
        field_xpaths = (
            ('name', 'h1/text()'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@property="og:description"]/@content'),
            ('prepTime',
             './/span[@class="preptime"]/span[@class="value-title"]/@title'),
            ('cookTime',
             './/span[@class="cooktime"]/span[@class="value-title"]/@title'),
            ('recipeYield', "|".join(yield_variants)),
            ('ingredients', 'blockquote/*'),
            ('datePublished',
             '//span[@class="published"]/span[@class="value-title"]/@title'),
        )

        found = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            for field, xpath in field_xpaths:
                loader.add_value(field, scope.select(xpath).extract())
            found.append(loader.load_item())

        return found
    def parse_item(self, response):
        """Scrape each ``recipe hrecipe`` element into a loaded RecipeItem.

        Ingredient text starting with "For " or ending with ":" is skipped.
        No description is collected.

        Returns a list of loaded items.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select('//*[@class="recipe hrecipe"]')

        def values(node, xpath):
            # extracted strings for the XPath, evaluated through the node
            return node.select(xpath).extract()

        output = []
        label_pattern = re.compile(r'^For ')

        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', values(node, './/*[@class="fn"]/text()'))
            loader.add_value('image', values(node, '//div/p[1]//img/@src'))
            loader.add_value('url', response.url)
            loader.add_value(
                'prepTime', values(node, '//*[@class="preptime"]/text()'))
            loader.add_value(
                'cookTime', values(node, '//*[@class="cooktime"]/text()'))
            loader.add_value(
                'recipeYield', values(node, '//*[@class="yield"]/text()'))

            accepted = []
            for text_node in node.select('.//div[@class="ingredient"]/p/text()'):
                line = text_node.extract().strip()
                if label_pattern.match(line):
                    continue
                if line.endswith(':'):
                    continue
                accepted.append(line)
            loader.add_value('ingredients', accepted)

            # raw postmeta text; not narrowed down to just the date
            loader.add_value(
                'datePublished', values(node, '//*[@class="postmeta"]/text()'))

            output.append(loader.load_item())

        return output
    def parse_item(self, response):
        """Scrape each ``hrecipe`` element into a loaded RecipeItem.

        Prep time, cook time and yield are not collected. Ingredient text
        starting with "For " or ending with ":" is skipped.

        Returns a list of loaded items.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select('//*[@class="hrecipe"]')

        # (field, XPath) pairs stored before the URL is added
        leading_fields = (
            ('name', '//h1[@class="fn"]/text()'),
            # per the original author's note, the src URL carries 150x150
            # dimensions that are not stripped here
            ('image', '//*[@class="photo"]/@src'),
        )
        description_xpath = (
            '//*[@class="format_text entry-content jpibfi_container"]/p/text()')

        output = []
        label_pattern = re.compile(r'^For ')

        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            for field, xpath in leading_fields:
                loader.add_value(field, node.select(xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value(
                'description', node.select(description_xpath).extract())

            cleaned = [
                text_node.extract().strip()
                for text_node in node.select(
                    './/*[@class="ingredient"]/p/text()')
            ]
            loader.add_value('ingredients', [
                line for line in cleaned
                if not (label_pattern.match(line) or line.endswith(':'))
            ])

            loader.add_value(
                'datePublished',
                node.select('//span[@class="published"]/text()').extract())

            output.append(loader.load_item())

        return output
    def parse_item(self, response):
        """Scrape each ``recipe hrecipe`` element into a loaded RecipeItem.

        Ingredient text starting with "For " or ending with ":" is skipped.
        datePublished is not collected.

        Returns a list of loaded items.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select('//*[@class="recipe hrecipe"]')

        before_url = (
            ('name', '//*[@class="fn"]/text()'),
            ('image', '//p[1]/span/img/@src'),
        )
        after_url = (
            ('description', '//*[@class="summary"]/p/text()'),
            ('prepTime', '//*[@class="preptime"]/text()'),
            ('cookTime', './/*[@class="cooktime"]/text()'),
            ('recipeYield', '//*[@class="yield"]/text()'),
        )

        output = []
        label_pattern = re.compile(r'^For ')

        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            for field, xpath in before_url:
                loader.add_value(field, node.select(xpath).extract())
            loader.add_value('url', response.url)
            for field, xpath in after_url:
                loader.add_value(field, node.select(xpath).extract())

            accepted = []
            for text_node in node.select('.//*[@class="ingredient"]/p/text()'):
                line = text_node.extract().strip()
                if label_pattern.match(line) or line.endswith(':'):
                    continue
                accepted.append(line)
            loader.add_value('ingredients', accepted)

            output.append(loader.load_item())

        return output
    def parse_item(self, response):
        """Scrape each element with id ``recipe`` into a loaded RecipeItem.

        Every ``ingredients`` element produces one "<amount> <name>" string,
        each half being the concatenation of what its XPath extracts. The
        source, prep/cook time and datePublished fields are not set here.

        Returns a list of loaded items.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select('//*[@id="recipe"]')

        name_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
        image_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
        description_xpath = '//*[@class="entry"]/p//text()'
        yield_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
        amount_xpath = '//td/strong'
        label_xpath = '//*[@class="ingredients"]/tbody/tr/td/text()'

        output = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('name', node.select(name_xpath).extract())
            loader.add_value('image', node.select(image_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value(
                'description', node.select(description_xpath).extract())
            loader.add_value('recipeYield', node.select(yield_xpath).extract())

            entries = []
            for table in node.select('*//*[@class="ingredients"]'):
                amount_parts = table.select(amount_xpath).extract()
                label_parts = table.select(label_xpath).extract()
                entries.append("%s %s" % ("".join(amount_parts).strip(),
                                          "".join(label_parts).strip()))
            loader.add_value('ingredients', entries)

            output.append(loader.load_item())

        return output
Beispiel #28
0
    def parse_item(self, response):
        """Load a RecipeItem for each ``div`` with class ``innerrecipe``.

        Each ingredient paragraph's direct text nodes are concatenated into
        one string. The publish date is read from the post's ``postmeta``
        block via a document-wide XPath.

        Returns a list of loaded items.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//div[@class="innerrecipe"]')

        before_url = (
            ('name', '*//h2[@class="fn"]/text()'),
            ('image', '*//img[@class="photo"]/@src'),
        )
        after_url = (
            ('prepTime', '*//span[@class="preptime"]/text()'),
            ('cookTime', '*//span[@class="cooktime"]/text()'),
            ('totalTime', '*//span[@class="duration"]/text()'),
            ('recipeYield', '*//span[@class="yield"]/text()'),
        )
        published_xpath = '//div[@class="post fullpost singlepost"]//div[@class="postmeta"]/text()[normalize-space()]'

        loaded = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            for field, xpath in before_url:
                loader.add_value(field, scope.select(xpath).extract())
            loader.add_value('url', response.url)
            for field, xpath in after_url:
                loader.add_value(field, scope.select(xpath).extract())

            loader.add_value('ingredients', [
                "".join(paragraph.select('text()').extract())
                for paragraph in scope.select('*//*[@class="ingredient"]/p')
            ])

            loader.add_value(
                'datePublished', scope.select(published_xpath).extract())

            loaded.append(loader.load_item())

        return loaded
    def parse_item(self, response):
        """Extract recipes from elements with class ``post hrecipe``.

        Description, prep time and cook time are not scraped.

        Args:
            response: the crawled page.

        Returns:
            A list of loaded RecipeItems, one per matching element.
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@class="post hrecipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="title fn"]/text()'
        image_path = '//*[@class="photo"]/@src'
        # check on diff sites
        recipeYield_path = '//blockquote/p[2]/text()'
        ingredients_path = '//ul[@class="ingredient_list"]/li/text()'
        datePublished = 'normalize-space(//*[@class="postmeta"]/text())'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)

            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

            # The loop here used to be a bare "pass", so the ingredient list
            # was always empty. Collect the list-item text, skipping nodes
            # that are only whitespace.
            ingredients = []
            for i_scope in r_scope.select(ingredients_path):
                ingredient = i_scope.extract().strip()
                if ingredient:
                    ingredients.append(ingredient)
            il.add_value('ingredients', ingredients)

            il.add_value('datePublished', r_scope.select(datePublished).extract())

            recipes.append(il.load_item())

        return recipes
Beispiel #30
0
    def parse_item(self, response):
        """Load a RecipeItem for each ``hrecipe`` element on the page.

        Only hrecipe markup is handled. URL and image come from Open Graph
        meta tags, and the date is taken from the first ``date`` element
        match anywhere in the document. Each ingredient becomes
        "<amount> <name>" built from its ``amount`` and ``name`` children.

        Returns a list of loaded items.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select("""//*[@class="hrecipe"]""")

        def child_text(node, css_class):
            # concatenated, trimmed text of the direct children carrying the class
            xpath = '*[@class="%s"]/text()' % css_class
            return "".join(node.select(xpath).extract()).strip()

        loaded = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value(
                'name', scope.select('.//*[@class="fn"]/text()').extract())
            loader.add_value(
                'image',
                scope.select(
                    '//meta[@property="og:image"][1]/@content').extract())
            loader.add_value(
                'url',
                scope.select('//meta[@property="og:url"]/@content').extract())
            loader.add_value(
                'recipeYield',
                scope.select('.//*[@class="yield"]/text()').extract())

            lines = []
            for entry in scope.select('*//*[@class="ingredient"]'):
                quantity = child_text(entry, 'amount')
                label = child_text(entry, 'name')
                lines.append("%s %s" % (quantity, label))
            loader.add_value('ingredients', lines)

            # the date lives outside the hrecipe element
            loader.add_value(
                'datePublished',
                scope.select('//*[@class="date"][1]').extract())

            loaded.append(loader.load_item())

        return loaded
Beispiel #31
0
    def parse_item(self, response):
        """Return a one-element list holding the recipe loaded from the page.

        The item is seeded from ``parse_recipe`` (given the page selector plus
        the URL and source), then image, description and name are added.
        """
        selector = HtmlXPathSelector(response)
        seed = parse_recipe(
            selector, {'url': response.url, 'source': self.source})
        item_loader = RecipeItemLoader(item=RecipeItem.from_dict(seed))

        image_srcs = select_class(selector, 'post_image').select('@src').extract()
        item_loader.add_value('image', image_srcs)

        meta_description = selector.select(
            '//meta[@name="description"]/@content').extract()
        item_loader.add_value('description', meta_description)

        title_text = select_class(
            selector, 'entry-title').select('text()').extract()
        item_loader.add_value('name', title_text)

        return [item_loader.load_item()]
    def parse_item(self, response):
        """Scrape each ``div`` with id ``recipe`` into a loaded RecipeItem.

        Description, URL and image are read from Open Graph meta tags. The
        yield XPath is the union of several "Makes"/"Serves" patterns. The
        ``blockquote`` children are stored unprocessed as the ingredients.

        Returns a list of loaded items.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select("""//div[@id="recipe"]""")

        def values(node, xpath):
            # extracted strings for the XPath, evaluated through the node
            return node.select(xpath).extract()

        # the yield is formatted inconsistently, so every variant is unioned
        yield_xpath = "|".join(
            '//div[@id="recipe"]' + tail for tail in (
                '/p[starts-with(i,"Makes")]/i',
                '/p[starts-with(i,"Serves")]/i',
                '/p[starts-with(em,"Makes")]/em',
                '/p[starts-with(em,"Serves")]/em',
                '[starts-with(p,"Makes")]/p',
                '[starts-with(p,"Serves")]/p',
            ))
        time_xpath = './/span[@class="%s"]/span[@class="value-title"]/@title'

        output = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            loader.add_value('name', values(node, 'h1/text()'))
            loader.add_value(
                'image',
                values(node, '//meta[@property="og:image"][1]/@content'))
            loader.add_value(
                'url', values(node, '//meta[@property="og:url"]/@content'))
            loader.add_value(
                'description',
                values(node, '//meta[@property="og:description"]/@content'))

            loader.add_value('prepTime', values(node, time_xpath % 'preptime'))
            loader.add_value('cookTime', values(node, time_xpath % 'cooktime'))
            loader.add_value('recipeYield', values(node, yield_xpath))

            loader.add_value('ingredients', values(node, 'blockquote/*'))

            loader.add_value(
                'datePublished',
                values(
                    node,
                    '//span[@class="published"]/span[@class="value-title"]/@title'))

            output.append(loader.load_item())

        return output
Beispiel #33
0
    def parse_item(self, response):
        """Load a recipe item from each schema.org/Recipe scope on the page.

        Prep and cook times are passed to ``parse_iso_date`` as selector
        results (not extracted strings). Each ingredient is the
        space-joined text of the child nodes of one ``ingredients`` element.

        Returns:
            A list of loaded items, one per recipe scope.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//*[@itemtype="http://schema.org/Recipe"]')

        # Fields whose extracted values go straight into the loader.
        plain_fields = (
            ('name', '//*[@itemprop="name"]/text()'),
            ('image', '//*[@itemprop="image"]/@src'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//*[@itemprop="description"]/text()'),
        )
        # Fields that are converted by parse_iso_date before loading.
        duration_fields = (
            ('prepTime', '//*[@itemprop="prepTime"]'),
            ('cookTime', '//*[@itemprop="cookTime"]'),
        )
        yield_path = '*//*[@itemprop="recipeYield"]/text()'
        ingredient_path = '//*[@itemprop="ingredients"]'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', 'allrecipes')

            for field, path in plain_fields:
                loader.add_value(field, scope.select(path).extract())

            for field, path in duration_fields:
                loader.add_value(field, parse_iso_date(scope.select(path)))

            loader.add_value('recipeYield', scope.select(yield_path).extract())

            ingredients = [
                ' '.join(node.select('node()/text()').extract())
                for node in scope.select(ingredient_path)
            ]
            loader.add_value('ingredients', ingredients)

            items.append(loader.load_item())

        return items
Beispiel #34
0
    def parse_item(self, response):
        """Spider template: every XPath below is still the 'TODO' placeholder.

        Fill in the base path and the per-field paths for the target site.
        The ingredient loop is a stub, so the ingredient list is always empty.

        Returns:
            A list with one loaded item per scope matched by the base path.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('TODO')

        # (item field, XPath) pairs loaded verbatim from each recipe scope.
        leading_fields = (
            ('name', 'TODO'),
            ('image', 'TODO'),
        )
        trailing_fields = (
            ('description', 'TODO'),
            ('prepTime', 'TODO'),
            ('cookTime', 'TODO'),
            ('recipeYield', 'TODO'),
        )
        ingredient_path = 'TODO'
        published_path = 'TODO'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, path in leading_fields:
                loader.add_value(field, scope.select(path).extract())
            loader.add_value('url', response.url)
            for field, path in trailing_fields:
                loader.add_value(field, scope.select(path).extract())

            ingredients = []
            for _ingredient_scope in scope.select(ingredient_path):
                # Placeholder: build one ingredient string per scope here.
                pass
            loader.add_value('ingredients', ingredients)

            loader.add_value('datePublished',
                             scope.select(published_path).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load a recipe item from each ``div#zlrecipe-innerdiv`` element.

        Each ingredient is the concatenated, whitespace-stripped text found
        under one ``itemprop="ingredients"`` element.

        Returns:
            A list of loaded items, one per matching element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select("""//div[@id="zlrecipe-innerdiv"]""")

        # (item field, XPath) pairs, loaded in this order before ingredients.
        field_paths = (
            ('name', '*//*[@itemprop="name"]/text()'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//link[@rel="canonical"]/@href'),
            ('prepTime', '*//*[@itemprop="prepTime"]/@content'),
            ('cookTime', '*//*[@itemprop="cookTime"]/@content'),
            ('recipeYield', '*//*[@itemprop="recipeYield"]/text()'),
        )
        ingredient_path = '*//*[@itemprop="ingredients"]'
        published_path = '//*[@class="time_stamp_month"]'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, path in field_paths:
                loader.add_value(field, scope.select(path).extract())

            ingredients = [
                ''.join(node.select('.//text()').extract()).strip()
                for node in scope.select(ingredient_path)
            ]
            loader.add_value('ingredients', ingredients)

            loader.add_value('datePublished',
                             scope.select(published_path).extract())

            items.append(loader.load_item())

        return items
Beispiel #36
0
def clean_item(old_dict):
    """Re-run a stored recipe record through ``RecipeItemLoader``.

    Args:
        old_dict: mapping holding the stored record; it must contain the
            'ts' and '_id' keys (and 'name'/'source' when VERBOSE is set).
            It is not modified.

    Returns:
        A ``(new_item, source_dict)`` tuple: the freshly loaded item and a
        shallow copy of ``old_dict`` without its 'ts' and '_id' keys.

    Raises:
        KeyError: if 'ts' or '_id' is missing from ``old_dict``.
    """
    # Work on a copy so the caller keeps an unmodified version.
    source_dict = dict(old_dict)
    # 'ts' and '_id' must not be passed on to the loader.
    for bookkeeping_key in ('ts', '_id'):
        source_dict.pop(bookkeeping_key)

    if VERBOSE:
        message = "Examining '%s' from '%s' (%s)..." % (
            old_dict['name'], old_dict['source'], old_dict['_id'])
        print(message)

    loader = RecipeItemLoader(RecipeItem())
    for field, value in source_dict.iteritems():
        # set_value hands back the loader to use for the next field.
        loader = set_value(loader, field, value)

    return loader.load_item(), source_dict
    def parse_item(self, response):
        """Load a recipe item from every element whose class is ``hrecipe``.

        Many recipes on the site lack the semantic markup; those are
        deliberately not handled here. Each ingredient is rendered as
        "<amount> <name>" from the ``amount`` and ``name`` children of an
        ``ingredient`` element.

        Returns:
            A list of loaded items, one per hrecipe element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select("""//*[@class="hrecipe"]""")

        # (item field, XPath) pairs loaded verbatim before the ingredients.
        field_paths = (
            ('name', './/*[@class="fn"]/text()'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('recipeYield', './/*[@class="yield"]/text()'),
        )
        ingredient_path = '*//*[@class="ingredient"]'
        # The date is taken from the rest of the page, not from under hrecipe.
        published_path = '//*[@class="date"][1]'

        def joined_text(node, path):
            # Concatenate every match of `path` under `node` and trim it.
            return "".join(node.select(path).extract()).strip()

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, path in field_paths:
                loader.add_value(field, scope.select(path).extract())

            ingredients = []
            for node in scope.select(ingredient_path):
                quantity = joined_text(node, '*[@class="amount"]/text()')
                label = joined_text(node, '*[@class="name"]/text()')
                ingredients.append("%s %s" % (quantity, label))
            loader.add_value('ingredients', ingredients)

            loader.add_value('datePublished',
                             scope.select(published_path).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load a recipe item from each ``div.innerrecipe`` element.

        The URL comes from the response itself. Each ingredient is the
        concatenated direct text of one ``<p>`` under an ``ingredient``
        element.

        Returns:
            A list of loaded items, one per matching element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//div[@class="innerrecipe"]')

        # (item field, XPath) pairs extracted verbatim from the recipe scope.
        leading_fields = (
            ("name", '*//h2[@class="fn"]/text()'),
            ("image", '*//img[@class="photo"]/@src'),
        )
        timing_fields = (
            ("prepTime", '*//span[@class="preptime"]/text()'),
            ("cookTime", '*//span[@class="cooktime"]/text()'),
            ("totalTime", '*//span[@class="duration"]/text()'),
            ("recipeYield", '*//span[@class="yield"]/text()'),
        )
        ingredient_path = '*//*[@class="ingredient"]/p'
        published_path = (
            '//div[@class="post fullpost singlepost"]'
            '//div[@class="postmeta"]/text()[normalize-space()]'
        )

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value("source", self.source)

            for field, path in leading_fields:
                loader.add_value(field, scope.select(path).extract())
            loader.add_value("url", response.url)
            for field, path in timing_fields:
                loader.add_value(field, scope.select(path).extract())

            ingredients = [
                "".join(paragraph.select("text()").extract())
                for paragraph in scope.select(ingredient_path)
            ]
            loader.add_value("ingredients", ingredients)

            loader.add_value("datePublished",
                             scope.select(published_path).extract())

            items.append(loader.load_item())

        return items
Beispiel #39
0
    def parse_item(self, response):
        """Load a recipe item from each ``div#blq-main`` element.

        Each ingredient is "<amount> <name>", where the amount is the first
        direct text node of a ``p.ingredient`` and the name is the text of
        its ``<a>`` children.

        Returns:
            A list of loaded items, one per matching element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select("""//div[@id="blq-main"]""")

        # (item field, XPath) pairs extracted verbatim from the recipe scope.
        leading_fields = (
            ('name', '//h1/text()'),
            ('image', '//img[@id="food-image"]/@src'),
        )
        trailing_fields = (
            ('description',
             '//div[@id="description"]//span[@class="summary"]/text()'),
            ('prepTime',
             '//span[@class="prepTime"]/span[@class="value-title"]/@title'),
            ('cookTime',
             '//span[@class="cookTime"]/span[@class="value-title"]/@title'),
            ('recipeYield', '//h3[@class="yield"]/text()'),
        )
        ingredient_path = '//p[@class="ingredient"]'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, path in leading_fields:
                loader.add_value(field, scope.select(path).extract())
            loader.add_value('url', response.url)
            for field, path in trailing_fields:
                loader.add_value(field, scope.select(path).extract())

            ingredients = []
            for node in scope.select(ingredient_path):
                quantity = "".join(node.select('text()[1]').extract()).strip()
                label = "".join(node.select('a/text()').extract()).strip()
                ingredients.append("%s %s" % (quantity, label))
            loader.add_value('ingredients', ingredients)

            items.append(loader.load_item())

        return items
Beispiel #40
0
    def parse_item(self, response):
        """Load a recipe item from each ``div#content`` element.

        Ingredient lines that start with "For " or end with a colon are
        dropped; every other stripped line is kept as an ingredient.

        Returns:
            A list of loaded items, one per matching element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//div[@id="content"]')

        # (item field, XPath) pairs extracted verbatim from the recipe scope.
        leading_fields = (
            ('name', './/span[@class="item"]/h2[@class="fn"]/text()'),
            ('image',
             "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' size-full ')][1]/@src"),
        )
        trailing_fields = (
            ('prepTime', './/span[@class="preptime"]/text()'),
            ('cookTime', './/span[@class="cooktime"]/text()'),
            ('recipeYield', './/span[@class="yield"]/text()'),
        )
        ingredient_path = './/div[@class="ingredient"]/p/text()'

        # Matches section labels such as lines beginning with "For ".
        section_label = re.compile(r'^For ')

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, path in leading_fields:
                loader.add_value(field, scope.select(path).extract())
            loader.add_value('url', response.url)
            for field, path in trailing_fields:
                loader.add_value(field, scope.select(path).extract())

            lines = [node.extract().strip()
                     for node in scope.select(ingredient_path)]
            ingredients = [
                line for line in lines
                if not (section_label.match(line) or line.endswith(':'))
            ]
            loader.add_value('ingredients', ingredients)

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load a recipe item from each ``div.blog`` element of a post page.

        URLs containing '/ingredients/' or '/category/' are skipped and
        yield an empty list. The yield and each ingredient are built by
        space-joining the non-blank text nodes found under the matching
        elements. The publication date is the first extracted text of the
        blurb with 'Posted on' and 'in' removed; when the page has no such
        text the field is simply left unset.

        Returns:
            A list of loaded items, one per ``div.blog`` element.
        """
        if '/ingredients/' in response.url or '/category/' in response.url:
            return []

        hxs = HtmlXPathSelector(response)

        base_path = '//div[@class="blog"]'

        recipes_scopes = hxs.select(base_path)

        name_path = 'h1/a[@rel="bookmark"]/text()'
        description_path = '//meta[@property="og:description"]/@content'
        image_path = '//meta[@property="og:image"][1]/@content'
        recipeYield_path = './/*[@class="yield"]//text()[normalize-space()]'
        ingredients_path = './/*[@class="ingredient"]'
        datePublished_path = '//div[@class="blurb"]/strong/text()[1]'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())

            il.add_value('recipeYield', ' '.join(r_scope.select(recipeYield_path).extract()))

            ingredient_scopes = r_scope.select(ingredients_path)
            ingredients = []
            for ingredient_node in ingredient_scopes:
                ingredient = [i.strip() for i in ingredient_node.select('.//text()[normalize-space()]').extract()]
                ingredients.append(' '.join(ingredient))

            il.add_value('ingredients', ingredients)

            # The extracted text gets its own name so the XPath is never
            # overwritten and stays valid for every scope in the loop.
            published = r_scope.select(datePublished_path).extract()
            # A page without a date must not abort the whole parse.
            if published:
                il.add_value('datePublished', published[0].replace('Posted on', '').replace('in', '').strip())

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Load a recipe item from each schema.org/Recipe scope on the page.

        ``parse_iso_date`` receives the selector results for the prep and
        cook times. Each ingredient is the space-joined text of the child
        nodes of one ``ingredients`` element.

        Returns:
            A list of loaded items, one per recipe scope.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select(
            '//*[@itemtype="http://schema.org/Recipe"]')

        def extracted(node, path):
            # Run `path` from `node` and return the extracted values.
            return node.select(path).extract()

        loaded = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', 'allrecipes')
            loader.add_value(
                'name', extracted(node, '//*[@itemprop="name"]/text()'))
            loader.add_value(
                'image', extracted(node, '//*[@itemprop="image"]/@src'))
            loader.add_value(
                'url', extracted(node, '//meta[@property="og:url"]/@content'))
            loader.add_value(
                'description',
                extracted(node, '//*[@itemprop="description"]/text()'))

            loader.add_value(
                'prepTime',
                parse_iso_date(node.select('//*[@itemprop="prepTime"]')))
            loader.add_value(
                'cookTime',
                parse_iso_date(node.select('//*[@itemprop="cookTime"]')))
            loader.add_value(
                'recipeYield',
                extracted(node, '*//*[@itemprop="recipeYield"]/text()'))

            ingredient_lines = []
            for part in node.select('//*[@itemprop="ingredients"]'):
                ingredient_lines.append(
                    ' '.join(extracted(part, 'node()/text()')))
            loader.add_value('ingredients', ingredient_lines)

            loaded.append(loader.load_item())

        return loaded
    def parse_item(self, response):
        """Load a recipe item from each ``div#zlrecipe-innerdiv`` element.

        Each ingredient is the stripped extraction of one
        ``itemprop="ingredients"`` element followed by a single space.

        Returns:
            A list of loaded items, one per matching element.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select("""//div[@id="zlrecipe-innerdiv"]""")

        def extracted(node, path):
            # Run `path` from `node` and return the extracted values.
            return node.select(path).extract()

        loaded = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            loader.add_value(
                'name', extracted(node, '*//*[@itemprop="name"]/text()'))
            loader.add_value(
                'image',
                extracted(node, '//meta[@property="og:image"][1]/@content'))
            loader.add_value(
                'url', extracted(node, '//link[@rel="canonical"]/@href'))

            loader.add_value(
                'prepTime',
                extracted(node, '*//*[@itemprop="prepTime"]/@content'))
            loader.add_value(
                'cookTime',
                extracted(node, '*//*[@itemprop="cookTime"]/@content'))
            loader.add_value(
                'recipeYield',
                extracted(node, '*//*[@itemprop="recipeYield"]/text()'))

            ingredient_lines = [
                "%s " % (part.extract().strip())
                for part in node.select('*//*[@itemprop="ingredients"]')
            ]
            loader.add_value('ingredients', ingredient_lines)

            loader.add_value(
                'datePublished',
                extracted(node, '//*[@class="time_stamp_month"]'))

            loaded.append(loader.load_item())

        return loaded
    def parse_item(self, response):
        """Load a recipe item from each element with ``id="recipe"``.

        Ingredients come from table rows: the extracted ``<td>`` cells of
        each row are space-joined into one string. The cells are extracted
        as markup; per the original author's note it is stripped later in
        the pipeline.

        Returns:
            A list of loaded items, one per matching element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//*[@id="recipe"]')

        # (item field, XPath) pairs extracted verbatim from the recipe scope.
        leading_fields = (
            ('name', '//*[@class="row page_title clearfix"]/h2/text()'),
            ('image', '//*[@class="featured_image"]/img[@class="image"]/@src'),
        )
        trailing_fields = (
            ('description', '//*[@class="entry"]/p//text()'),
            ('recipeYield', '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'),
        )
        # Selects the <tr> rows of the ingredients table.
        row_path = '//*[@class="ingredients"]/tr'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, path in leading_fields:
                loader.add_value(field, scope.select(path).extract())
            loader.add_value('url', response.url)
            for field, path in trailing_fields:
                loader.add_value(field, scope.select(path).extract())

            # One ingredient per row: combine that row's cells.
            ingredients = [
                " ".join(row.select('td').extract()).strip()
                for row in scope.select(row_path)
            ]
            loader.add_value('ingredients', ingredients)

            items.append(loader.load_item())

        return items
Beispiel #45
0
    def parse_item(self, response):
        """Load a recipe item from each element whose class contains 'hrecipe'.

        Each ingredient is "<quantity> <name>", built from the ``quantity``
        and ``name`` spans of one ``span.ingredient`` list entry.

        Returns:
            A list of loaded items, one per matching element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select("""//*[contains(@class,'hrecipe')]""")

        # (item field, XPath) pairs loaded verbatim before the ingredients.
        field_paths = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//*[@itemprop="image"]/@src'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@name="description"]/@content'),
            ('recipeYield',
             '//div[@class="time-and-yield"]/*/span[@class="yield"]/text()'),
        )
        ingredient_path = (
            '//ul[@class="ingredients"]/li/span[@class="ingredient"]')
        # text()[last()] skips the HTML comment that precedes the date.
        published_path = (
            '//div[@class="intro"]/div[@class="display-date"]/text()[last()]')

        def joined_text(node, path):
            # Concatenate every match of `path` under `node` and trim it.
            return "".join(node.select(path).extract()).strip()

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, path in field_paths:
                loader.add_value(field, scope.select(path).extract())

            ingredients = []
            for node in scope.select(ingredient_path):
                amount = joined_text(node, 'span[@class="quantity"]/text()')
                label = joined_text(node, 'span[@class="name"]/text()')
                ingredients.append("%s %s" % (amount, label))
            loader.add_value('ingredients', ingredients)

            loader.add_value('datePublished',
                             scope.select(published_path).extract())

            items.append(loader.load_item())

        return items
Beispiel #46
0
def clean_item(old_dict):
    """Rebuild a recipe item from a stored record.

    The record is copied, its 'ts' and '_id' entries are removed from the
    copy, and every remaining key/value pair is applied to a new
    ``RecipeItemLoader`` through ``set_value``.

    Args:
        old_dict: the stored record; left unmodified. Must contain 'ts'
            and '_id' (plus 'name' and 'source' when VERBOSE is enabled).

    Returns:
        ``(new_item, source_dict)``: the loaded item and the trimmed copy.

    Raises:
        KeyError: if 'ts' or '_id' is absent.
    """
    # Copy first so an unmodified version of the record survives.
    source_dict = dict(old_dict)
    # The loader must not receive the ts and _id fields.
    del source_dict['ts'], source_dict['_id']

    if VERBOSE:
        print("Examining '%s' from '%s' (%s)..." % (old_dict['name'],
                                                    old_dict['source'],
                                                    old_dict['_id']))

    item_loader = RecipeItemLoader(RecipeItem())
    for key in source_dict:
        # Keep whatever loader set_value returns for the following fields.
        item_loader = set_value(item_loader, key, source_dict[key])

    rebuilt = item_loader.load_item()
    return rebuilt, source_dict
Beispiel #47
0
    def parse_item(self, response):
        """Load a recipe item from each data-vocabulary.org Recipe article.

        Each ingredient string is assembled per ``<li itemprop="ingredient">``
        by space-joining that entry's amount text followed by its name text,
        then UTF-8 encoding the result. Pairing inside each entry keeps
        amounts and names aligned even when an entry has no amount or has
        several amount text nodes. Entries with neither are skipped.

        Returns:
            A list of loaded items, one per recipe article.
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//article[@itemtype="http://data-vocabulary.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//h1[@itemprop="name"]/text()'
        description_path = '//meta[@name="description"]/@content'
        image_path = '//img[@itemprop="photo"]/@src'
        prepTime_path = '//span[@itemprop="prepTime"]/text()'
        cookTime_path = '//span[@itemprop="cookTime"]/text()'
        recipeYield_path = '//span[@itemprop="yield"]/text()'
        ingredients_path = '//li[@itemprop="ingredient"]'
        ingredients_amounts_path = './span[@itemprop="amount"]/span/text()'
        ingredients_names_path = './span[@itemprop="name"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())

            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

            # Pair amount and name inside each <li>; zipping two page-wide
            # lists would shift every later pair as soon as one entry
            # lacked an amount.
            ingredients = []
            for i_scope in r_scope.select(ingredients_path):
                parts = (i_scope.select(ingredients_amounts_path).extract() +
                         i_scope.select(ingredients_names_path).extract())
                if parts:
                    ingredients.append(" ".join(parts).encode('utf-8'))
            il.add_value('ingredients', ingredients)

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Load a recipe item from each ``div.post`` element.

        The description is the joined, stripped direct text of the entry
        div. Ingredients are taken from the entry's paragraphs: a paragraph
        contributes its stripped direct text nodes only when
        ``ingredient_heuristic`` scores it above ``RECIPE_THRESHOLD``.

        Returns:
            A list of loaded items, one per matching element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//div[@class="post"]')

        title_path = 'h2/a[@rel="bookmark"]/text()'
        picture_path = '(//div[@class="entry"]/p/a[@title]/img/@src)[1]'
        summary_path = 'div[@class="entry"]/text()'
        paragraph_path = 'div[@class="entry"]/p'
        published_path = 'div[@class="date"]/text()'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            loader.add_value('name', scope.select(title_path).extract())
            loader.add_value('image', scope.select(picture_path).extract())
            loader.add_value('url', response.url)

            summary = ''.join(scope.select(summary_path).extract())
            loader.add_value('description', summary.strip())

            ingredients = []
            for paragraph in scope.select(paragraph_path):
                # Skip paragraphs that do not score as ingredient lists.
                if not ingredient_heuristic(paragraph) > RECIPE_THRESHOLD:
                    continue
                ingredients.extend(
                    text_node.extract().strip()
                    for text_node in paragraph.select('text()'))
            loader.add_value('ingredients', ingredients)

            loader.add_value('datePublished',
                             scope.select(published_path).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load a recipe item from each ``blockquote.recipe`` element.

        Each ingredient is the stripped extraction of one
        ``itemprop="ingredients"`` element followed by a single space.

        Returns:
            A list of loaded items, one per matching element.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select("""//blockquote[@class="recipe"]""")

        # (item field, XPath) pairs loaded verbatim before the ingredients.
        field_paths = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@property="og:description"]/@content'),
            ('prepTime', '*//*[@itemprop="prepTime"]/@content'),
            ('cookTime', '*//*[@itemprop="cookTime"]/@content'),
            ('recipeYield', '*//*[@itemprop="recipeYield"]/text()'),
        )
        ingredient_path = '*//*[@itemprop="ingredients"]'
        published_path = '//p[@class="date"]/text()'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, path in field_paths:
                loader.add_value(field, scope.select(path).extract())

            ingredients = []
            for node in scope.select(ingredient_path):
                markup = node.extract().strip()
                ingredients.append("%s " % (markup))
            loader.add_value('ingredients', ingredients)

            loader.add_value('datePublished',
                             scope.select(published_path).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Unfinished spider skeleton: all XPaths are the "TODO" placeholder.

        Replace the placeholders with real paths for the target site and
        implement the ingredient loop, which currently collects nothing.

        Returns:
            A list with one loaded item per scope matched by the base path.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select("TODO")

        def extracted(node, path):
            # Run `path` from `node` and return the extracted values.
            return node.select(path).extract()

        title_path = "TODO"
        summary_path = "TODO"
        picture_path = "TODO"
        prep_path = "TODO"
        cook_path = "TODO"
        yield_path = "TODO"
        ingredient_path = "TODO"
        published_path = "TODO"

        loaded = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value("source", self.source)

            loader.add_value("name", extracted(node, title_path))
            loader.add_value("image", extracted(node, picture_path))
            loader.add_value("url", response.url)
            loader.add_value("description", extracted(node, summary_path))

            loader.add_value("prepTime", extracted(node, prep_path))
            loader.add_value("cookTime", extracted(node, cook_path))
            loader.add_value("recipeYield", extracted(node, yield_path))

            ingredient_lines = []
            for _part in node.select(ingredient_path):
                # Stub: turn each ingredient scope into a string here.
                pass
            loader.add_value("ingredients", ingredient_lines)

            loader.add_value("datePublished", extracted(node, published_path))

            loaded.append(loader.load_item())

        return loaded
    def parse_item(self, response):
        """Load a recipe item from each ``div#blq-main`` element.

        An ingredient is formatted as "<amount> <name>": the amount is the
        first direct text node of a ``p.ingredient`` and the name is the
        text of its ``<a>`` children, both joined and stripped.

        Returns:
            A list of loaded items, one per matching element.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select("""//div[@id="blq-main"]""")

        def extracted(node, path):
            # Run `path` from `node` and return the extracted values.
            return node.select(path).extract()

        def trimmed(node, path):
            # Join all matches of `path` into one stripped string.
            return "".join(extracted(node, path)).strip()

        loaded = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value("source", self.source)

            loader.add_value("name", extracted(node, "//h1/text()"))
            loader.add_value(
                "image", extracted(node, '//img[@id="food-image"]/@src'))
            loader.add_value("url", response.url)
            loader.add_value(
                "description",
                extracted(
                    node,
                    '//div[@id="description"]//span[@class="summary"]/text()'))

            loader.add_value(
                "prepTime",
                extracted(
                    node,
                    '//span[@class="prepTime"]/span[@class="value-title"]/@title'))
            loader.add_value(
                "cookTime",
                extracted(
                    node,
                    '//span[@class="cookTime"]/span[@class="value-title"]/@title'))
            loader.add_value(
                "recipeYield", extracted(node, '//h3[@class="yield"]/text()'))

            ingredient_lines = [
                "%s %s" % (trimmed(part, "text()[1]"),
                           trimmed(part, "a/text()"))
                for part in node.select('//p[@class="ingredient"]')
            ]
            loader.add_value("ingredients", ingredient_lines)

            loaded.append(loader.load_item())

        return loaded
    def parse_item(self, response):
        """Turn every element with id ``recipe`` into a loaded recipe item.

        Args:
            response: the page response handed to this callback.

        Returns:
            A list of the items produced by ``RecipeItemLoader.load_item()``.
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select('//*[@id="recipe"]')

        # NOTE(review): these paths start with "//"; per the Scrapy selector
        # docs they are resolved against the whole document, not just the
        # recipe node they are called on -- confirm this is intended.
        title_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
        summary_xpath = '//*[@class="entry"]/p//text()'
        photo_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
        servings_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
        row_xpath = '//*[@class="ingredients"]/tr'

        results = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', node.select(title_xpath).extract())
            loader.add_value('image', node.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description', node.select(summary_xpath).extract())
            loader.add_value('recipeYield', node.select(servings_xpath).extract())

            # One ingredient per table row: the row's TD cells are extracted
            # as markup and joined with spaces. The HTML will be stripped in
            # the pipeline.
            ingredients = [
                " ".join(row.select('td').extract()).strip()
                for row in node.select(row_xpath)
            ]
            loader.add_value('ingredients', ingredients)

            results.append(loader.load_item())

        return results
    def parse_item(self, response):
        """Load a recipe item from each ``html`` root element of the page.

        All field paths are relative to the matched root element.

        Args:
            response: the page response handed to this callback.

        Returns:
            A list of the items produced by ``RecipeItemLoader.load_item()``.
        """

        def pull(scope, xpath):
            # Run an XPath against the given scope and return the extracted
            # values as a list.
            return scope.select(xpath).extract()

        roots = HtmlXPathSelector(response).select('//html')

        title_xpath = './/p[@id="title"]/text()'
        summary_xpath = "descendant-or-self::p[@class and contains(concat(' ', normalize-space(@class), ' '), ' summary ')]/text()"
        photo_xpath = './/img[@class="photo"]/@src'
        prep_xpath = './/span[@class="preptime"]/text()'
        cook_xpath = './/span[@class="cooktime"]/text()'
        servings_xpath = './/p[@id="ingr_header"]/span[@class="single_recipe_text"]/text()'
        ingredient_xpath = './/li[@class="ingredient"]/text()'

        items = []
        for root in roots:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', pull(root, title_xpath))
            loader.add_value('image', pull(root, photo_xpath))
            loader.add_value('url', response.url)
            loader.add_value('description', pull(root, summary_xpath))
            loader.add_value('prepTime', pull(root, prep_xpath))
            loader.add_value('cookTime', pull(root, cook_xpath))
            loader.add_value('recipeYield', pull(root, servings_xpath))

            # Extract each matched ingredient text node individually.
            ingredient_texts = [
                text_node.extract()
                for text_node in root.select(ingredient_xpath)
            ]
            loader.add_value('ingredients', ingredient_texts)

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load recipe items from ``div`` elements typed as schema.org Recipe.

        Args:
            response: the page response handed to this callback.

        Returns:
            A list of the items produced by ``RecipeItemLoader.load_item()``,
            one per matched ``div``.
        """
        page = HtmlXPathSelector(response)
        recipe_divs = page.select('//div[@itemtype="http://schema.org/Recipe"]')

        # NOTE(review): these paths start with "//"; per the Scrapy selector
        # docs they are resolved against the whole document rather than the
        # recipe div. The canonical <link> lookup presumably relies on that --
        # confirm before making any of them relative.
        xpaths = {
            'image': '(//div[@class="entry"]//img/@src)[1]',
            'name': '//div[@itemprop="name"]/text()',
            'url': '//h2[@class="title"]/a/@href | //link[@rel="canonical"]/@href',
            'recipeYield': '//span[@itemprop="servingSize"]/text()',
            'totalTime': '//span[@itemprop="totalTime"]/@content',
        }
        ingredient_xpath = '//li[@itemprop="ingredients"]/text()'

        items = []
        for div in recipe_divs:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field in ('image', 'name', 'url'):
                loader.add_value(field, div.select(xpaths[field]).extract())

            # Trim every ingredient text node and keep only the non-empty ones.
            trimmed = (
                text_node.extract().strip()
                for text_node in div.select(ingredient_xpath)
            )
            loader.add_value('ingredients', [text for text in trimmed if text])

            for field in ('recipeYield', 'totalTime'):
                loader.add_value(field, div.select(xpaths[field]).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load recipe items from ``blockquote`` elements marked up as hrecipe.

        Only blockquotes whose class attribute is exactly ``recipe hrecipe``
        are matched.

        Args:
            response: the page response handed to this callback.

        Returns:
            A list of the items produced by ``RecipeItemLoader.load_item()``.
        """
        page = HtmlXPathSelector(response)
        blockquotes = page.select('//blockquote[@class="recipe hrecipe"]')

        collected = []
        for quote in blockquotes:
            loader = RecipeItemLoader(item=RecipeItem())
            add = loader.add_value
            select = quote.select

            # NOTE(review): the field paths start with "//"; per the Scrapy
            # selector docs they are resolved against the whole document, not
            # just this blockquote -- confirm this is intended.
            add('source', self.source)
            add('name', select('//*[@class="fn"]/text()').extract())
            add('image', select('//img[@class="photo"]/@src').extract())
            add('url', response.url)
            add('description', select('//*[@class="summary"]/p/text()').extract())
            add('prepTime', select('//*[@class="preptime"]/text()').extract())
            add('cookTime', select('//*[@class="cooktime"]/text()').extract())
            add('recipeYield', select('//*[@class="yield"]/text()').extract())
            add('ingredients', select('//*[@class="ingredient"]/p/text()').extract())

            collected.append(loader.load_item())

        return collected