Example #1
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # parent item received via meta; output it after extracting additional info
        item = response.meta['parent']

        # add department name, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract number of items if available
        prod_count_holder = hxs.select(
            "//span[@id='productCount']/text()").extract()
        if prod_count_holder:
            item['nr_products'] = int(prod_count_holder[0].strip())
        # extract description if available
        desc_holder = hxs.select("//div[@id='catalogCopyBlock']")
        if desc_holder:
            item['description_title'] = desc_holder.select(
                "h2/text()").extract()[0]
            description_texts = desc_holder.select("p/text()").extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and reduce(
                    lambda x, y: x or y,
                [line.strip() for line in description_texts]):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                item['description_text'] = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])
            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        yield item

        chapters = hxs.select("//li[@class='nav_cat_item_bold']")

        for chapter in chapters:

            #TODO: still includes some special categories (like "Coming Soon" in men)
            # exclude "Brands" chapter
            chapter_name = chapter.select("span/text()").extract()
            if not chapter_name or "brands" in chapter_name[0]:
                continue

            subcats = chapter.select("ul/li/a")
            for subcat in subcats:
                item = CategoryItem()
                text = subcat.select('text()').extract()[0]
                # if it starts with "Shop all", ignore it
                if re.match("Shop [aA]ll.*", text):
                    continue
                else:
                    item['text'] = text
                # remove unnecessary suffix from URL
                url = subcat.select('@href').extract()[0]
                m = re.match("(.*\?id=[0-9]+)&?.*", url)
                if m:
                    item['url'] = m.group(1)
                else:
                    item['url'] = url
                item['level'] = int(response.meta['level']) - 1
                item['parent_text'] = response.meta['parent']['text']
                item['parent_url'] = response.url

                #yield item

                yield Request(item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : item['level'], \
                    'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
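The reduce(lambda x, y: x or y, ...) call above is just a verbose way of asking whether at least one extracted string is non-whitespace. A minimal standalone sketch of that description-joining step (the helper name is mine, not part of the spider):

import re

def join_description_texts(description_texts):
    """Collapse whitespace, drop empty strings, and join the remaining texts."""
    # equivalent to the reduce(lambda x, y: x or y, ...) check in the example above
    if not any(line.strip() for line in description_texts):
        return None
    return " ".join(
        re.sub(r"\s+", " ", text.strip())
        for text in description_texts
        if text.strip()
    )

# join_description_texts(["  Hello\n", "   ", "world  "]) -> "Hello world"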
Example #2
class WalmartCaSpider(BaseSpider):
    name = "walmartca"
    allowed_domains = ["walmart.ca"]
    start_urls = [
        "http://www.walmart.ca/en",
    ]

    def __init__(self, outfile=None):
        self.root_url = "http://www.walmart.ca"
        self.outfile = outfile

        # set flag indicating that, for this spider, the number of products for each category should be computed
        self.compute_nrproducts = True

        # level that is considered to contain departments
        self.DEPARTMENT_LEVEL = 1

        # keep crawled items represented by (url, parent_url, department_url) tuples
        # to eliminate duplicates
        # (including department_url ensures that if an entire department is found as a subcategory of another, for example, both (and their complete category trees) will be crawled)
        self.crawled = []

        # last used category id, used for autoincrementing ids identifying categories
        self.id_count = 0

        # hardcoded values for special categories' item counts. Currently used for 'Value of the day', which typically has a fixed number of products and no place on the page to extract it from
        self.special_itemcount = {'value of the day': 2}

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")

        parent_links = hxs.select(
            "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]"
        )

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})

    # parse category page and extract description and number of products
    def parseCategory(self, response):

        # URLs like health.walmart.com don't have body_as_unicode and generate an exception
        try:
            hxs = HtmlXPathSelector(response)
        except AttributeError, e:
            self.log("Could not get response from " + response.url +
                     "; original exception: " + str(e) + "\n",
                     level=log.WARNING)
            return

        item = response.meta['item']

        # Add department text, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # assign unique id
        item['catid'] = self.id_count
        self.id_count += 1

        # Extract subcategories breakdown if any ("classification" field)
        classification_criteria = hxs.select(
            "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
        classification_dictionary = {}
        for criterion in classification_criteria:
            criterion_name = criterion.select(
                ".//text()[normalize-space()!='']").extract()[0].strip()
            # extract subcategories by this criterion:
            # find first subcategories list element following this criterion name, ignore if subcategory text starts with "See " ("See fewer", "See more")
            subcategories = criterion.select(
                "following-sibling::div[contains(@class,'accordionContainer')][1]/ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]"
            )
            # then filter by regex, keeping only those whose text contains at least one letter (for example, customer-rating subcategories have no name, only a picture with the number of stars; we don't want them)
            subcategories = filter(
                lambda x: x.select("text()").re(".*[A-Za-z]+.*"),
                subcategories)

            # if we found these, create the classification dictionary
            if criterion_name and subcategories:
                subcategories_list = []
                for subcategory in subcategories:
                    subcategory_name = subcategory.select(
                        "@title").extract()[0]
                    # replace &nbsp with space, trim
                    subcategory_name = subcategory_name.replace("&nbsp",
                                                                " ").strip()
                    # extract product count
                    subcategory_prodcount = subcategory.select(
                        "span[@class='count']/text()").extract()
                    # if there is a count field, parse it; otherwise extract the count from the subcategory name
                    if subcategory_prodcount:
                        m = re.match("\(([0-9]+)\)",
                                     subcategory_prodcount[0].strip())
                        # eliminate the parentheses surrounding the number
                        if m:
                            subcategory_prodcount = m.group(1)
                        else:
                            subcategory_prodcount = subcategory_prodcount[
                                0].strip()
                    else:
                        # if there is no product count in separate element, try to extract it from subcategory name
                        subcategory_name = subcategory.select(
                            ".//text()[normalize-space()!='']").extract(
                            )[0].replace("&nbsp", " ").replace(u"\xa0",
                                                               " ").strip()
                        m = re.match("(.*)\(([0-9]+)\)", subcategory_name)
                        if m:
                            subcategory_prodcount = m.group(2)
                            subcategory_name = m.group(1).strip()

                    if subcategory_name and subcategory_prodcount:
                        subcategory_item = {
                            "name": subcategory_name,
                            "nr_products": int(subcategory_prodcount)
                        }
                        subcategories_list.append(subcategory_item)

                classification_dictionary[criterion_name] = subcategories_list

        if classification_dictionary:
            item['classification'] = classification_dictionary

        ##########################################################################################
        #
        # Extract description title, text, wordcount, and keyword density (if any)

        ###########################################
        #TODO:

        # first search for the description id they usually use,
        # second one is used more rarely and also with some false positives so needs to be checked for text length as well
        # try to find div with detailedPageDescriptionCopyBlock id; move on only if not found
        description_holder = hxs.select(
            "//div[@id='detailedPageDescriptionCopyBlock']")

        # flag to tell if we found it with basic rule
        found = True

        if not description_holder:
            found = False
            description_holder = hxs.select(
                "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
                + str(DESC_LEN) + "]/parent::*/parent::*")

        # if none was found, try to find an element with much text (> DESC_LEN (200) characters)
        # this is going to be a paragraph in the description; look for its parent (containing the entire description)
        if not description_holder:
            #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
            #TODO: !!does this mean string length for one paragraph is > DESC_LEN, or string length for entire text content?
            # I think it means entire text content. We're ok
            description_holder = hxs.select("//p/text()[string-length() > " +
                                            str(DESC_LEN) +
                                            "]/parent::*/parent::*")

        # select element among these with most text
        if description_holder:
            desc_winner = description_holder[0]
            max_text = 0
            for desc_candidate in description_holder:
                # consider only text that is under a <p> tag and that has more than DESC_PAR_LEN (30) characters - then it's likely a description paragraph
                description_texts = desc_candidate.select(
                    ".//p//text()[string-length()>" + str(DESC_PAR_LEN) +
                    "]").extract()
                text_len = len(" ".join(description_texts))
                if text_len > max_text:
                    max_text = text_len
                    desc_winner = desc_candidate
                # if text length is the same, assume one of them is parent of the other
                #  and select the one with greater depth (fewer children)
                elif text_len == max_text and text_len != 0:
                    children_old = float(
                        desc_winner.select("count(*)").extract()[0])
                    children_new = float(
                        desc_candidate.select("count(*)").extract()[0])
                    if children_new < children_old:
                        desc_winner = desc_candidate

            description_holder = desc_winner

        # try to find description title in <b> tag in the holder;
        # if it's not found, try to find it in the first <p> of the description
        # if found there, exclude it from the description body
        if description_holder:
            #TODO:
            # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            # to fix Money Center problem. but maybe it's not always inside p?
            description_title = description_holder.select(
                ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            ).extract()
            if description_title:
                # this will implicitly get the first occurrence of either a <b> element or an <h1> element,
                # which is likely to be the title (the title usually comes first)
                item['description_title'] = description_title[0].strip()

            description_texts = description_holder.select(
                "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
                | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]"
            ).extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and reduce(
                    lambda x, y: x or y,
                [line.strip() for line in description_texts]):
                description_text = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                # if it's larger than 4096 characters and not found with the main rule it's probably not a description; it also causes problems for the PHP script. Ignore it
                if len(description_text) < 4096 or found:

                    # replace all whitespace with one space, strip, and remove empty texts; then join them
                    item['description_text'] = description_text

                    # replace line breaks with space
                    item['description_text'] = re.sub("\n+", " ",
                                                      item['description_text'])

            if 'description_text' in item:
                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                # sometimes here there is no description title because of malformed html
                # if we can find description text but not description title, title is probably malformed - get first text in div instead
                if 'description_title' not in item:
                    desc_texts = description_holder.select(
                        "./text()").extract()
                    desc_texts = [text for text in desc_texts if text.strip()]
                    if desc_texts:
                        item['description_title'] = desc_texts[0].strip()

                if 'description_title' in item:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        #
        ##################################################################################

        # Extract product count

        # find if there is a wc field on the page
        wc_field = hxs.select(
            "//div[@class='mrl mod-toggleItemCount']/span/text() |\
            //div[@class='SPRecordCount']/text()").extract()
        if wc_field:
            m1 = re.match("([0-9]+) Results", wc_field[0])
            if m1:
                item['nr_products'] = int(m1.group(1))
            m2 = re.match(
                "\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_field[0])
            if m2:
                item['nr_products'] = int(m2.group(1))

        # set item count for special items (hardcoded in special_itemcount)
        if item['text'].lower() in self.special_itemcount:
            item['nr_products'] = self.special_itemcount[item['text'].lower()]

        # Extract subcategories if no product count found
        if 'nr_products' in item:
            yield item

        else:
            # look for links to subcategory pages in menu
            subcategories_links = hxs.select(
                "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']"
            )

            if not subcategories_links:
                # # if we haven't found them, try to find subcategories in menu on the left under a "Shop by Category" header
                #     subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

                # if we haven't found them, try to find subcategories in menu on the left - get almost anything
                subcategories_links = hxs.select(
                    "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                    and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                    and not(contains(text(), 'Featured Brands'))]\
                    /following-sibling::node()//a")

            # if we found them, create new category for each and parse it from the beginning

            #TODO
            ########################################
            # Exceptions - doesn't find anything for:
            #   http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
            #
            #
            ########################################

            if subcategories_links:

                # new categories are subcategories of current one - calculate and store their level
                parent_item = item
                level = parent_item['level'] - 1

                #print "URL ", response.url, " CALLING PARSEPAGE"
                for subcategory in subcategories_links:

                    # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                    # if level < -3 assume we've been there and skip

                    if level < -3:
                        continue

                    item = CategoryItem()
                    item['url'] = Utils.add_domain(
                        subcategory.select("@href").extract()[0],
                        self.root_url)
                    text = subcategory.select("text()").extract()

                    if text:
                        item['text'] = text[0].strip()
                    else:
                        # usually means it's something else than what we need
                        #TODO: check
                        continue
                        #print "no text for subcategory ", item, response.url

                    # # take care of unicode
                    # item['text'] = item['text'].encode("utf-8", errors=ignore)

                    item['level'] = level

                    item['parent_text'] = parent_item['text']
                    item['parent_url'] = parent_item['url']
                    item['parent_catid'] = parent_item['catid']

                    if 'parent_text' in parent_item:
                        item['grandparent_text'] = parent_item['parent_text']
                    if 'parent_url' in parent_item:
                        item['grandparent_url'] = parent_item['parent_url']

                    # if parent's parents are missing, level must be at least 0
                    if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                        assert level >= 0

                    # send subcategory items to be parsed again
                    # if not already crawled
                    if (item['url'], item['parent_url'],
                            response.meta['department_url']
                        ) not in self.crawled:
                        yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                            'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                        self.crawled.append((item['url'], item['parent_url'],
                                             response.meta['department_url']))

                # return current item
                # idea for sending parent and collecting nr products. send all of these subcats as a list in meta, pass it on, when list becomes empty, yield the parent
                yield parent_item
                #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

            # if we can't find either products on the page or subcategory links
            else:
                #print "URL", response.url, " NO SUBCATs"
                #item['nr_products'] = 0
                yield item
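A standalone sketch of the product-count parsing done near the end of parseCategory above; the helper name and the standalone form are mine, but the two regexes are the ones used in the example:

import re

def parse_product_count(wc_text):
    """Return the product count as an int, or None if neither known pattern matches."""
    m = re.match(r"([0-9]+) Results", wc_text)
    if m:
        return int(m.group(1))
    m = re.match(r"\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_text)
    if m:
        return int(m.group(1))
    return None

# parse_product_count("125 Results") -> 125
# parse_product_count("Items 1-24 of 312 total") -> 312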
Example #3
    def parseCategory(self, response):

        # if we are getting blocked by captcha, solve and redirect back here
        # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it
        if self.has_captcha(
                response.body) and ('retry_count' not in response.meta
                                    or response.meta['retry_count'] > 0):
            yield self.solve_captcha_and_redirect(
                response, self.parseCategory
            )  # meta of response will contain number of retries left if set
            return

        hxs = HtmlXPathSelector(response)

        # extract additional info for received parent and return it
        item = response.meta['item']

        # extract product count if available and not already extracted (in extract_itemcount_and_subcategories, from the menu on the left, without crawling the actual url)
        if 'nr_products' not in item:
            prod_count_holder = hxs.select(
                "//h2[@class='resultCount']/span/text()").extract()
            if prod_count_holder:
                prod_count = prod_count_holder[0]
                # extract number

                # for paged results: Showing ... out of ... Results
                m = re.match(".*\s*of\s+([0-9,]+)\s+Results\s*", prod_count)

                # for one page results: Showing ... Result(s)
                if not m:
                    m = re.match(".*\s+([0-9,]+)\s+Results?\s*", prod_count)

                if m:
                    item['nr_products'] = int(re.sub(",", "", m.group(1)))

        # extract description if available
        # only extracts descriptions that contain an h2. is that good?
        desc_holders = hxs.select(
            "//div[@class='unified_widget rcmBody'][descendant::h2][last()]")
        # select the one among these with the most text
        #TODO: another idea: check if the holder has an h2 item
        if desc_holders:
            maxsize = 0
            max_desc_holder = desc_holders[0]
            for desc_holder in desc_holders:
                size = len(" ".join(desc_holder.select(".//text()").extract()))

                if size > maxsize:
                    maxsize = size
                    max_desc_holder = desc_holder
            desc_holder = max_desc_holder
            desc_title = desc_holder.select("h2/text()").extract()
            if desc_title:
                item['description_title'] = desc_title[0].strip()

            description_texts = desc_holder.select(
                ".//text()[not(ancestor::h2)]").extract()

            # if the list is not empty and contains at least one non-whitespace item
            # if there is a description title or the description body is large enough
            size_threshold = 50
            if (description_texts
                    and reduce(lambda x, y: x or y,
                               [line.strip()
                                for line in description_texts])):  # and \
                #(desc_title or len(" ".join(description_texts.select(".//text()").extract()) > size_threshold)):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                item['description_text'] = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                if desc_title:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        # if item is found among EXTRA_TOPLEVEL_CATEGORIES_URLS, and no product count was found, add info from that url
        extra_category = self.find_matching_key(
            item['text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS)

        # crawl lower level categories
        if item['level'] > self.LEVEL_BARRIER:
            if extra_category:

                # collect number of products from this alternate URL
                # this will also extract subcategories and their count
                yield Request(
                    self.EXTRA_TOPLEVEL_CATEGORIES_URLS[extra_category],
                    callback=self.extractSubcategories,
                    meta={'item': item})

            else:
                # extract subcategories and their count for category even if not in extra_...
                yield Request(item['url'],
                              callback=self.extractSubcategories,
                              meta={'item': item})
        else:
            yield item
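The two result-count regexes used in this parseCategory, pulled out into a standalone sketch (the function name and example strings are mine):

import re

def parse_result_count(text):
    """Handle both "Showing 1-24 of 3,982 Results" and "Showing 12 Results" forms."""
    m = re.match(r".*\s*of\s+([0-9,]+)\s+Results\s*", text)
    if not m:
        m = re.match(r".*\s+([0-9,]+)\s+Results?\s*", text)
    return int(m.group(1).replace(",", "")) if m else None

# parse_result_count("Showing 1-24 of 3,982 Results") -> 3982
# parse_result_count("Showing 12 Results") -> 12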
Example #4
 def build_url(self, url):
     url = Utils.add_domain(url, self.BASE_URL)
     url = Utils.clean_url(url, ['#'])
     return url
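Utils.add_domain and Utils.clean_url are project helpers not shown here. A rough sketch of what build_url presumably does, assuming add_domain resolves relative URLs against BASE_URL and clean_url(url, ['#']) drops the fragment; the sketch function and base URL are mine:

import urlparse  # Python 2, matching the rest of the examples

def build_url_sketch(url, base_url):
    absolute = urlparse.urljoin(base_url, url)  # assumed behaviour of Utils.add_domain
    return absolute.split('#', 1)[0]            # assumed behaviour of Utils.clean_url(url, ['#'])

# build_url_sketch("/en/category#top", "http://www.walmart.ca") -> "http://www.walmart.ca/en/category"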
Example #5
    def parse_product_amazon(self, response):

        hxs = HtmlXPathSelector(response)

        origin_product_id = response.meta['origin_product_id']
        current_query = response.meta['query']
        origin_url = self.results[origin_product_id]['origin_product'][
            'origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        for field in self.results[origin_product_id]['origin_product'].keys():
            item[field] = self.results[origin_product_id]['origin_product'][
                field]

        # all product items from all queries
        items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
            self.results[origin_product_id]['search_requests']), [])
        # all product urls from all queries
        product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
            self.results[origin_product_id]['search_requests']), [])
        product_urls = set(product_urls)

        #TODO: to test this
        #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
        product_name_node = hxs.select(
            '//h1[@id="title"]/span[@id="productTitle"]/text()').extract()
        product_name = None
        if not product_name_node:
            product_name_node = hxs.select(
                '//h1[@id="aiv-content-title"]//text()').extract()
        if not product_name_node:
            product_name_node = hxs.select(
                '//div[@id="title_feature_div"]/h1//text()').extract()

        if product_name_node:
            product_name = product_name_node[0].strip()
        else:
            # needs special treatment
            product_name_node = hxs.select(
                '//h1[@class="parseasinTitle " or @class="parseasinTitle"]/span[@id="btAsinTitle"]//text()'
            ).extract()
            if product_name_node:
                product_name = " ".join(product_name_node).strip()

        if not product_name:

            # log this error:
            # if the number of retries was not exhausted, it might just be a captcha page, not an insurmountable error
            if 'captcha_retries' in response.meta and response.meta[
                    'captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:

                self.log("Error: No product name: " + str(response.url) +
                         " for walmart product " + origin_url,
                         level=log.WARNING)
            else:
                # if it comes from a solved captcha page, then it's an error if it's still not found
                self.log("Error: No product name: " + str(response.url) +
                         " for walmart product " + origin_url,
                         level=log.ERROR)

                # try this: don't remove captcha_retries from meta, may cause infinite loops, works
                # if response.meta['captcha_retries'] > self.MAX_CAPTCHA_RETRIES:
                # del response.meta['captcha_retries']
            # if we have reached maximum number of retries, do nothing (item just won't be added to the "items" list)

            # if we haven't reached maximum retries, try again
            if 'captcha_retries' not in response.meta \
                or 'captcha_retries' in response.meta and response.meta['captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:

                # assume there is a captcha to crack
                # check if there is a form on the page - that means it's probably the captcha form
                forms = hxs.select("//form")
                if forms:

                    # solve captcha
                    captcha_text = None
                    image = hxs.select(".//img/@src").extract()
                    if image:
                        captcha_text = self.CB.solve_captcha(image[0])

                    # value to use if there was an exception
                    if not captcha_text:
                        captcha_text = ''

                    # create a FormRequest to this same URL, with everything needed in meta
                    # items, cookies and search_urls not changed from previous response so no need to set them again

                    # redo the entire request (no items will be lost)
                    meta = response.meta
                    # flag indicating how many times we already retried to solve captcha
                    if 'captcha_retries' in meta:
                        meta['captcha_retries'] += 1
                    else:
                        meta['captcha_retries'] = 1
                    return [
                        FormRequest.from_response(
                            response,
                            callback=self.parse_product_amazon,
                            formdata={'field-keywords': captcha_text},
                            meta=meta)
                    ]

        else:
            item['product_name'] = product_name

            # extract product model number
            model_number_holder = hxs.select(
                """//tr[@class='item-model-number']/td[@class='value']/text() |
             //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text() |
             //span/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/span[2]/text()"""
            ).extract()
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            upc_node = hxs.select(
                "//li/b/text()[normalize-space()='UPC:']/parent::node()/parent::node()/text()"
            ).extract()
            if upc_node:
                upc = upc_node[0].strip().split()
                item['product_upc'] = upc

            manufacturer_code_node = hxs.select(
                "//li/b/text()[normalize-space()='Manufacturer reference:']/parent::node()/parent::node()/text()"
            ).extract()
            if manufacturer_code_node:
                manufacturer_code = manufacturer_code_node[0].strip()
                item['manufacturer_code'] = manufacturer_code

            try:
                # for lowest level category:
                # TODO: test the xpath for the second type of page (see second type of xpath for top-level category)
                # bestsellers_rank = hxs.select("//tr[@id='SalesRank']/td[@class='value']/ul/li/span/text()" + \
                # "| //li[@id='SalesRank']/ul/li/span/text()").re("#[0-9,]+")[0]

                # for top-level category:
                bestsellers_rank = hxs.select(
                    "//tr[@id='SalesRank']/td[@class='value']/text()" +
                    " | //li[@id='SalesRank']/text()").re("#[0-9,]+")[0]
                item['bestsellers_rank'] = int(
                    re.sub(",", "", "".join(bestsellers_rank[1:])))
            except Exception, e:
                if self.output == 6 or self.bestsellers_link:
                    self.log("Didn't find product rank: " + str(e) + " " +
                             response.url + "\n",
                             level=log.INFO)

            asin_node = hxs.select(
                "//li/b/text()[normalize-space()='ASIN:']/parent::node()/parent::node()/text()"
            ).extract()
            if asin_node:
                item['product_asin'] = asin_node[0].strip()

            brand_holder = hxs.select(
                "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
            else:
                pass
                #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

            # extract price
            #! extracting list price and not discount price when discounts available?
            price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
                "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

            # if we can't find it like above try other things:
            if not price_holder:
                # prefer new prices to used ones
                # TODO: doesn't work for amazon.co.uk (pounds), but isn't needed very often
                price_holder = hxs.select(
                    "//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]"
                ).extract()
            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (e.g. 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("(\$|\xa3)([0-9]+\.?[0-9]*)",
                             product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(2))
                    currency = m.group(1)
                    if currency != "$":
                        item[
                            'product_target_price'] = Utils.convert_to_dollars(
                                item['product_target_price'], currency)
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

            try:
                item['product_category_tree'] = \
                    filter(None, map(lambda c: c.strip(), hxs.select("//ul[li[@class='a-breadcrumb-divider']]/li/span[@class='a-list-item']/a/text()").extract()))
            except:
                pass

            try:
                item['product_keywords'] = hxs.select(
                    "//meta[@name='keywords']/@content").extract()[0]
            except:
                pass

            try:
                product_image = hxs.select(
                    "//img[@id='landingImage']/@src").extract()[0]
                item['product_image_url'] = product_image
                item['product_image_encoded'] = ProcessText.encode_image(
                    product_image)
            except:
                pass

            # add result to items
            self.results[origin_product_id]['search_requests'][current_query][
                'product_items'].append(item)
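A standalone sketch of the price extraction done above, under the same assumption as the original regex (prices are in dollars or pounds, '\xa3'); the helper name and example value are mine:

import re

def parse_price(raw_price):
    """Return (currency_symbol, amount) or None if the text doesn't look like a price."""
    cleaned = re.sub(",", "", raw_price.strip())  # drop thousands separators, e.g. 2,000
    m = re.match("(\$|\xa3)([0-9]+\.?[0-9]*)", cleaned)
    if m:
        return m.group(1), float(m.group(2))
    return None

# parse_price("$1,299.99") -> ("$", 1299.99)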
Example #6
    def parseResults_samsung(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        # add product URLs to be parsed to this list
        if 'search_results' not in response.meta:
            product_urls = set()
        else:
            product_urls = response.meta['search_results']

        #TODO: implement support for multiple results pages?

        # if this selector finds any results, it means we are already on a product page
        results = hxs.select("//ul[@class='product-info']")

        if results:
            product_urls.add(response.url)
            # it also means it's an exact match, so stop search here
            response.meta['pending_requests'] = []
            response.meta['threshold'] = 0.2
            # # also temporarily lower threshold
            # self.threshold = 0.2

        else:

            # try to see if this is a results page then

            # Content seems to be generated with javascript - open page with selenium, extract its content then return it back here
            # try to see if the page contains what we need, or we need to try it with selenium
            results = hxs.select(
                "//input[contains(@id,'detailpageurl')]/@value")
            if not results:
                print 'NO RESULTS: ', response.url

                #results = []

                # COMMENTED FOR TESTING
                # use selenium
                request = self.get_samsung_results(response.url)
                # get body of request
                request_body = request.body
                resp_for_scrapy = TextResponse('none', 200, {}, request_body,
                                               [], None)

                hxs = HtmlXPathSelector(resp_for_scrapy)
                #print "PAGE_SOURCE: ", page_source
                results = hxs.select(
                    "//input[contains(@id,'detailpageurl')]/@value")
            else:
                print 'WE ALREADY HAD RESULTS! '
                print 'RESULTS: ', results

            for result in results:
                product_url = Utils.add_domain(result.extract().strip(),
                                               "http://www.samsung.com")
                product_urls.add(product_url)

        if product_urls and ('pending_requests' not in response.meta
                             or not response.meta['pending_requests']):
            request = Request(product_urls.pop(),
                              callback=self.parse_product_samsung,
                              meta=response.meta)
            request.meta['items'] = items

            # this will be the new product_urls list with the first item popped
            request.meta['search_results'] = product_urls

            return request

        # if there were no results, the request will never get back to reduceResults
        else:

            # # we are finished and should close the driver
            # if self.driver:
            #     self.driver.close()

            response.meta['items'] = items
            response.meta['parsed'] = True
            response.meta['search_results'] = product_urls
            # only send the response we have as an argument, no need to make a new request
            return self.reduceResults(response)
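The Selenium fallback in parseResults_samsung wraps the externally rendered HTML in a TextResponse so the usual XPath selectors can be reused. A condensed sketch of that pattern (the helper is mine, and the constructor is called with keyword arguments rather than the positional form used above):

from scrapy.http import TextResponse
from scrapy.selector import HtmlXPathSelector

def selector_from_raw_html(url, raw_html):
    """Wrap raw HTML (e.g. Selenium's page source, as a bytestring) so selectors can run over it."""
    fake_response = TextResponse(url=url, status=200, body=raw_html)
    return HtmlXPathSelector(fake_response)

# hxs = selector_from_raw_html(response.url, request_body)
# results = hxs.select("//input[contains(@id,'detailpageurl')]/@value")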
Example #7
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        # extract number of products if available
        nrproducts_holder = hxs.select(
            "//div[@class='resultsfilterBottom']/div[@class='itemsShowresult']/strong[2]/text()"
        ).extract()
        if nrproducts_holder:
            item['nr_products'] = int(nrproducts_holder[0])

        # extract description if available
        description_holders = hxs.select("//div[@class='textBlock']")
        # if the list is not empty and contains at least one non-whitespace item
        if description_holders:
            description_texts = description_holders.select(
                ".//text()[not(ancestor::h2)]").extract()

            # replace all whitespace with one space, strip, and remove empty texts; then join them
            desc_text = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])
            if desc_text:
                item['description_text'] = desc_text

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)
            else:
                item['description_wc'] = 0

            description_title = description_holders.select(
                ".//h2/text()").extract()
            if description_title:
                item['description_title'] = description_title[0].strip()

                if desc_text:

                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        self.parsed_urls.append(item['url'])

        yield item

        # extract subcategories
        product_links = hxs.select(
            "//div[@class='resultsWrap listView']//h3[@class='itemName']/a/@href"
        ).extract()
        # only extract subcategories if product links not found on page
        if not product_links:

            parent = item

            # search for a link to "See All Products"
            seeall = hxs.select(
                "//span[text()='See All Products']/parent::node()/@href"
            ).extract()
            if seeall:
                # pass the page with subcategories menu to a method to parse it
                #print 'parsing seeall: from ', response.url, ' to ', Utils.add_domain(seeall[0], "http://www.tigerdirect.com")
                yield Request(url = Utils.add_domain(seeall[0], "http://www.tigerdirect.com"), callback = self.parseSubcats, \
                    meta = {'parent' : parent,\
                     'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'],\
                     'department_id' : response.meta['department_id']})
            else:
                # pass the current page (with subcategories menu on it) to a method to parse it
                #print 'parsing for subcategories ', response.url
                yield Request(url = response.url, callback = self.parseSubcats, meta = {'parent' : parent,\
                    'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'],\
                    'department_id' : response.meta['department_id']})
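The same meta-threading pattern appears in most of these parseCategory callbacks: department fields found at the top level are copied into every child Request so deeper callbacks can stamp them onto their items. A condensed sketch of that pattern (the helper function itself is mine):

from scrapy.http import Request

def follow_subcategory(url, parent_item, response, callback):
    """Build a Request that carries the parent item and the department fields forward."""
    return Request(url, callback=callback, meta={
        'parent': parent_item,
        'department_text': response.meta['department_text'],
        'department_url': response.meta['department_url'],
        'department_id': response.meta['department_id'],
    })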
Example #8
    def parse(self, response):

        if self.product_name:

            # can only use this option if self.target_site has been initialized (usually true for spiders for retailer sites, not true for manufacturers' sites)
            if not self.target_site:
                self.log(
                    "You can't use the product_name option without setting the target site to search on\n",
                    level=log.ERROR)
                raise CloseSpider(
                    "\nYou can't use the product_name option without setting the target site to search on\n"
                )

            search_query = self.build_search_query(self.product_name)
            search_pages = self.build_search_pages(search_query)

            request = Request(search_pages[self.target_site],
                              callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request.cookies = self.amazon_cookies
                request.headers['Cookies'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
                ## print "SET AMAZON COOKIES"

            request.meta['origin_name'] = self.product_name
            request.meta['query'] = search_query

            # just use empty product model and url, for compatibility, also pending_requests
            request.meta['origin_model'] = ''
            request.meta['origin_url'] = ''
            request.meta['pending_requests'] = []

            yield request

        # if we have product URLs, pass them to parseURL to extract product names (which will pass them to parseResults)
        product_urls = []
        # if we have a single product URL, create a list of URLs containing it
        if self.product_url:
            product_urls.append(self.product_url)

        # if we have a file with a list of URLs, create a list with URLs found there
        if self.product_urls_file:
            f = open(self.product_urls_file, "r")
            for line in f:
                product_urls.append(line.strip())
            f.close()

        for product_url in product_urls:
            # extract site domain

            # m = re.match("http://www1?\.([^\.]+)\.com.*", product_url)
            # origin_site = ""
            # if m:
            # 	origin_site = m.group(1)
            # else:
            # 	sys.stderr.write('Can\'t extract domain from URL.\n')
            origin_site = Utils.extract_domain(product_url)

            request = Request(product_url, callback=self.parseURL)
            request.meta['origin_site'] = origin_site
            if origin_site == 'staples':
                zipcode = "12345"
                request.cookies = {"zipcode": zipcode}
                request.meta['dont_redirect'] = True
            yield request

        # if we have a file with Walmart ids, create a list of the ids there
        if self.walmart_ids_file:
            walmart_ids = []
            f = open(self.walmart_ids_file, "r")
            for line in f:
                if "," in line:
                    id_string = line.strip().split(",")[0]
                else:
                    id_string = line.strip()
                if re.match("[0-9]+", id_string):
                    walmart_ids.append(id_string)
            f.close()

            self.by_id = True

            for walmart_id in walmart_ids:
                # create Walmart URLs based on these IDs
                walmart_url = Utils.add_domain(walmart_id,
                                               "http://www.walmart.com/ip/")
                request = Request(walmart_url, callback=self.parseURL)
                #request.meta['origin_site'] = 'walmart'
                yield request
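A standalone sketch of the Walmart-ID file handling in parse() above (the function is mine; the original keeps the ids in a list and builds the URLs with Utils.add_domain):

import re

def load_walmart_urls(path, base="http://www.walmart.com/ip/"):
    """Read one id per line (optionally followed by a comma and extra fields) and build product URLs."""
    urls = []
    with open(path) as f:
        for line in f:
            id_string = line.strip().split(",")[0]
            if re.match("[0-9]+", id_string):  # same prefix check as the original
                urls.append(base + id_string)
    return urls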
Example #9
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']

        # Add department text, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # assign unique id
        item['catid'] = self.id_count
        self.id_count += 1

        # Extract subcategories breakdown if any ("classification" field)
        classification_criteria = hxs.select(
            "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
        classification_dictionary = {}
        for criterion in classification_criteria:
            criterion_name = criterion.select(
                ".//text()[normalize-space()!='']").extract()[0].strip()
            # extract subcategories by this criterion:
            # find first subcategories list element following this criterion name, ignore if subcategory text starts with "See " ("See fewer", "See more")
            subcategories = criterion.select(
                "following-sibling::div[contains(@class,'accordionContainer')][1]/ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]"
            )
            # then filter by regex, keeping only those whose text contains at least one letter (for example, customer-rating subcategories have no name, only a picture with the number of stars; we don't want them)
            subcategories = filter(
                lambda x: x.select("text()").re(".*[A-Za-z]+.*"),
                subcategories)

            # if we found these, create the classification dictionary
            if criterion_name and subcategories:
                subcategories_list = []
                for subcategory in subcategories:
                    subcategory_name = subcategory.select(
                        "@title").extract()[0]
                    # replace &nbsp with space, trim
                    subcategory_name = subcategory_name.replace("&nbsp",
                                                                " ").strip()
                    # extract product count
                    subcategory_prodcount = subcategory.select(
                        "span[@class='count']/text()").extract()
                    # if there is a count field, parse it; otherwise extract the count from the subcategory name
                    if subcategory_prodcount:
                        m = re.match("\(([0-9]+)\)",
                                     subcategory_prodcount[0].strip())
                        # eliminate the parentheses surrounding the number
                        if m:
                            subcategory_prodcount = m.group(1)
                        else:
                            subcategory_prodcount = subcategory_prodcount[
                                0].strip()
                    else:
                        # if there is no product count in separate element, try to extract it from subcategory name
                        subcategory_name = subcategory.select(
                            ".//text()[normalize-space()!='']").extract(
                            )[0].replace("&nbsp", " ").replace(u"\xa0",
                                                               " ").strip()
                        m = re.match("(.*)\(([0-9]+)\)", subcategory_name)
                        if m:
                            subcategory_prodcount = m.group(2)
                            subcategory_name = m.group(1).strip()

                    if subcategory_name and subcategory_prodcount:
                        subcategory_item = {
                            "name": subcategory_name,
                            "nr_products": int(subcategory_prodcount)
                        }
                        subcategories_list.append(subcategory_item)

                classification_dictionary[criterion_name] = subcategories_list

        if classification_dictionary:
            item['classification'] = classification_dictionary

        ##########################################################################################
        #
        # Extract description title, text, wordcount, and keyword density (if any)

        ###########################################
        #TODO:
        # Exceptions:
        #   http://www.walmart.com/cp/5431?povid=cat1078944-env506746-moduleA030213-lLinkLHNRelatedCategories2Pharmacy - finds wrong title (also wrong description holder - too high level)
        #   http://www.walmart.com/cp/1102793?povid=cat1094926-env999999-moduleA030713-lLinkLHNLearnmoreAbouttheprogram - finds description, actually no description, CustomPOV... with large text inside, hard to fix
        #   http://brands.walmart.com/fishing/essential-rods-and-reels/ - finds description, actually no description. Just an element with much text
        #   http://brands.walmart.com/fishing/get-salty-with-your-bass-skills/ - finds description, actually no description. Just an element with much text
        #   http://instoresnow.walmart.com/article.aspx?Center=Pets&id=104225 - finds description, actually no description. Just an element with much text
        #   http://brands.walmart.com/fishing/turn-a-kid-on-to-flyfishing/ - finds description, actually no description. Just an element with much text
        #   http://www.walmart.com/cp/1094926?povid=cat121828-env999999-moduleA030713-lLinkGNAV1_Campaign_EmpoweringWomenTogether - finds description, actually no description. Just an element with much text
        #   http://www.walmart.com/ip/Straight-Talk-Samsung-Galaxy-S-III/23573710?povid=cat1105910-env542259-moduleA092613-lLinkLHNWhatsNewSamsungSIIIStraightTalk - finds description, actually no description. Just an element with much text
        #   http://www.walmart.com/cp/Bakery/120764 - finds description, actually no description. Just an element with much text, also title problem
        #   http://www.walmart.com/cp/1078665 - not a description, also imperfect title extraction
        #   http://www.walmart.com/cp/1101244?povid=cat1100706-env999999-module122012-LHN_HealthyLivingTips - wrong title extraction, extracts too much as a description holder
        #   http://www.walmart.com/cp/flexible-spending-account/555326 - finds description though no description, just large text (also bad title extraction)

        # Idea for excluding elements with much text that are false positives: check if element is composed of many sibling paragraphs or so
        ###########################################

        # first search for the description id they usually use,
        # second one is used more rarely and also with some false positives so needs to be checked for text length as well
        # try to find div with detailedPageDescriptionCopyBlock id; move on only if not found
        description_holder = hxs.select(
            "//div[@id='detailedPageDescriptionCopyBlock']")

        # flag to tell if we found it with basic rule
        found = True

        if not description_holder:
            found = False
            description_holder = hxs.select(
                "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
                + str(DESC_LEN) + "]/parent::*/parent::*")

        # if none was found, try to find an element with a lot of text (> DESC_LEN (200) characters)
        # this is going to be a paragraph in the description; look for its parent (containing the entire description)
        if not description_holder:
            #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
            #TODO: !!does this mean the string length of one paragraph is > DESC_LEN, or the string length of the entire text content?
            # I think it means the entire text content. We're ok
            description_holder = hxs.select("//p/text()[string-length() > " +
                                            str(DESC_LEN) +
                                            "]/parent::*/parent::*")

        # select element among these with most text
        if description_holder:
            desc_winner = description_holder[0]
            max_text = 0
            for desc_candidate in description_holder:
                # consider only text that is under a <p> tag and that has more than DESC_PAR_LEN (30) characters - then it's likely a description paragraph
                description_texts = desc_candidate.select(
                    ".//p//text()[string-length()>" + str(DESC_PAR_LEN) +
                    "]").extract()
                text_len = len(" ".join(description_texts))
                if text_len > max_text:
                    max_text = text_len
                    desc_winner = desc_candidate
                # if text length is the same, assume one of them is parent of the other
                #  and select the one with greater depth (fewer children)
                elif text_len == max_text and text_len != 0:
                    children_old = float(
                        desc_winner.select("count(*)").extract()[0])
                    children_new = float(
                        desc_candidate.select("count(*)").extract()[0])
                    if children_new < children_old:
                        desc_winner = desc_candidate

            description_holder = desc_winner

        # try to find description title in <b> tag in the holder;
        # if it's not found, try to find it in the first <p> of the description
        # if found there, exclude it from the description body
        if description_holder:
            #TODO:
            # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            # to fix Money Center problem. but maybe it's not always inside p?
            description_title = description_holder.select(
                ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            ).extract()
            if description_title:
                # this will implicitly get the first occurrence of either a <b> element or an <h1> element,
                # which is likely to be the title (the title usually comes first)
                item['description_title'] = description_title[0].strip()

            description_texts = description_holder.select(
                "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
                | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]"
            ).extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and reduce(
                    lambda x, y: x or y,
                [line.strip() for line in description_texts]):
                description_text = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                # if it's longer than 4096 characters and was not found with the main rule, it's probably not a description; it causes problems for the PHP script as well. Ignore it
                if len(description_text) < 4096 or found:

                    # replace all whitespace with one space, strip, and remove empty texts; then join them
                    item['description_text'] = description_text

                    # replace line breaks with space
                    item['description_text'] = re.sub("\n+", " ",
                                                      item['description_text'])

            if 'description_text' in item:
                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                # sometimes here there is no description title because of malformed html
                # if we can find description text but not description title, title is probably malformed - get first text in div instead
                if 'description_title' not in item:
                    desc_texts = description_holder.select(
                        "./text()").extract()
                    desc_texts = [text for text in desc_texts if text.strip()]
                    if desc_texts:
                        item['description_title'] = desc_texts[0].strip()

                if 'description_title' in item:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        #
        ##################################################################################

        # Extract product count

        # check if there is a product count field on the page
        wc_field = hxs.select(
            "//div[@class='mrl mod-toggleItemCount']/span/text() |\
            //div[@class='SPRecordCount']/text()").extract()
        if wc_field:
            m1 = re.match("([0-9]+) Results", wc_field[0])
            if m1:
                item['nr_products'] = int(m1.group(1))
            m2 = re.match(
                "\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_field[0])
            if m2:
                item['nr_products'] = int(m2.group(1))
            yield item

        else:
            # look for links to subcategory pages in menu
            subcategories_links = hxs.select(
                "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']"
            )

            if not subcategories_links:
                # # if we haven't found them, try to find subcategories in menu on the left under a "Shop by Category" header
                #     subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

                # if we haven't found them, try to find subcategories in menu on the left - get almost anything
                subcategories_links = hxs.select(
                    "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                    and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                    and not(contains(text(), 'Featured Brands'))]\
                    /following-sibling::node()//a")

            # if we found them, create new category for each and parse it from the beginning

            #TODO
            ########################################
            # Exceptions - doesn't find anything for:
            #   http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
            #
            #
            ########################################

            if subcategories_links:

                # new categories are subcategories of current one - calculate and store their level
                parent_item = item
                level = parent_item['level'] - 1

                #print "URL ", response.url, " CALLING PARSEPAGE"
                for subcategory in subcategories_links:

                    # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                    # if level < -3 assume we've been there and skip

                    if level < -3:
                        continue

                    item = CategoryItem()
                    item['url'] = Utils.add_domain(
                        subcategory.select("@href").extract()[0],
                        self.root_url)
                    text = subcategory.select("text()").extract()

                    if text:
                        item['text'] = text[0].strip()
                    else:
                        # usually means it's something other than what we need
                        #TODO: check
                        continue
                        #print "no text for subcategory ", item, response.url

                    # # take care of unicode
                    # item['text'] = item['text'].encode("utf-8", errors=ignore)

                    item['level'] = level

                    item['parent_text'] = parent_item['text']
                    item['parent_url'] = parent_item['url']
                    item['parent_catid'] = parent_item['catid']

                    if 'parent_text' in parent_item:
                        item['grandparent_text'] = parent_item['parent_text']
                    if 'parent_url' in parent_item:
                        item['grandparent_url'] = parent_item['parent_url']

                    # if parent's parents are missing, level must be at least 0
                    if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                        assert level >= 0

                    # send subcategory items to be parsed again
                    # if not already crawled
                    if (item['url'], item['parent_url'],
                            response.meta['department_url']
                        ) not in self.crawled:
                        yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                            'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                        self.crawled.append((item['url'], item['parent_url'],
                                             response.meta['department_url']))

                # return current item
                # idea for sending parent and collecting nr products. send all of these subcats as a list in meta, pass it on, when list becomes empty, yield the parent
                yield parent_item
                #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

            # if we can't find either products on the page or subcategory links
            else:
                #print "URL", response.url, " NO SUBCATs"
                #item['nr_products'] = 0
                yield item
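
The desc_winner loop above boils down to one heuristic: among several candidate holders, keep the one whose <p> descendants carry the most text, ignoring short paragraphs. Below is a minimal standalone sketch of that heuristic using lxml instead of Scrapy's HtmlXPathSelector; the markup, element ids and the DESC_PAR_LEN value are illustrative assumptions, not taken from the spider.

# Sketch only: pick the element whose <p> descendants hold the most text,
# mirroring the desc_winner selection above.
import lxml.html

DESC_PAR_LEN = 30  # assumed threshold, same spirit as the spider's constant

html = """<html><body>
<div id="a"><p>Short.</p></div>
<div id="b"><p>A much longer paragraph that clearly looks like a real
category description rather than stray navigation text.</p></div>
</body></html>"""

doc = lxml.html.fromstring(html)
winner, max_text = None, 0
for candidate in doc.xpath("//div"):
    # keep only paragraph text longer than the threshold
    texts = [t for t in candidate.xpath(".//p//text()") if len(t) > DESC_PAR_LEN]
    text_len = len(" ".join(texts))
    if text_len > max_text:
        winner, max_text = candidate, text_len

print(winner.get("id"))  # prints: b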
Example #10
0
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        parent_links = hxs.select(
            "//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']"
        )

        # for link in links:
        #     item = CategoryItem()

        #     # search for the category's parent
        #     parents = []

        #     # select the preceding siblings that are a category title (have a child that is an a tag with a certain class)
        #     parents = link.select('parent::node()').select('preceding-sibling::node()').select('child::a[@class=\'NavXLBold\']')

        #     # if we found such siblings, get the last one to be the parent
        #     if parents:
        #         item['parent_text'] = parents[-1].select('text()').extract()[0]
        #         item['parent_url'] = parents[-1].select('@href').extract()[0]

        #         item['parent_url'] = Utils.add_domain(item['parent_url'], self.root_url)

        #     item['text'] = link.select('text()').extract()[0]
        #     item['url'] = link.select('@href').extract()[0]

        #     # add domain if relative URL
        #     item['url'] = Utils.add_domain(item['url'], self.root_url)

        #     item['level'] = 0

        # to avoid duplicates, only extract highest level categories in this function (so don't return if level 0)
        #yield item

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        #TO remove:
        # # artificial category - parent to all departments (root of entire sitemap tree). used to get total walmart product count
        # sitemap_root = CategoryItem()
        # sitemap_root['url'] = "http://www.walmart.com"
        # sitemap_root['text'] = "Walmart"
        # sitemap_root['department_id'] = 0
        # sitemap_root['level'] = 2
        # sitemap_root['catid'] = 0
        # self.id_count += 1
        # yield sitemap_root

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})
Example #11
0
    def parseResults(self, response):


        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get last request's items and add to them the results

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        # add product URLs to be parsed to this list
        if 'search_results' not in response.meta:
            product_urls = set()
        else:
            product_urls = response.meta['search_results']


        # TODO: check this xpath and extractions
        results = hxs.select("//div[@class='tileinfo']/a")

        for result in results:

            product_url = result.select("@href").extract()[0]
            product_url = Utils.add_domain(product_url, "http://www.maplin.co.uk")
            product_urls.add(product_url)

 
        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request further to parse product pages only if we gathered all the product URLs from all the queries 
        # (there are no more pending requests)
        # otherwise send them back to parseResults and wait for the next query, save all product URLs in search_results
        # this way we avoid duplicates
        if product_urls and ('pending_requests' not in response.meta or not response.meta['pending_requests']):
            request = Request(product_urls.pop(), callback = self.parse_product_maplin, meta = response.meta)
            request.meta['items'] = items

            # this will be the new product_urls list with the first item popped
            request.meta['search_results'] = product_urls

            return request

        # if there were no results, the request will never get back to reduceResults
        # so send it from here so it can parse the next queries
        # add to the response the URLs of the products to crawl we have so far, items (handles case when it was not created yet)
        # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
        else:
            response.meta['items'] = items
            response.meta['parsed'] = True
            response.meta['search_results'] = product_urls
            # only send the response we have as an argument, no need to make a new request

            # print "RETURNING TO REDUCE RESULTS", response.meta['origin_url']
            return self.reduceResults(response)
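
The pending_requests check above is the piece that keeps the two phases apart: product pages are only requested once every search query has reported back, otherwise control returns to reduceResults for the next query. A rough, hypothetical simulation of that gate in plain Python (the function and field names below are made up for illustration):

# Hypothetical sketch of the gate used above, outside Scrapy.
def next_step(product_urls, pending_requests):
    # only start fetching product pages when no query requests are pending
    if product_urls and not pending_requests:
        url = product_urls.pop()                  # first product page to fetch
        return ("parse_product", url, product_urls)
    # otherwise hand control back to the query loop
    return ("reduce_results", None, product_urls)

urls = {"http://www.example.com/p/1", "http://www.example.com/p/2"}
print(next_step(urls, pending_requests=["one query still running"]))  # back to reduceResults
print(next_step(urls, pending_requests=[]))                           # start product pages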
Example #12
0
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        # print "PARSE AMAZON FOR", response.meta['origin_url'], "RESULTS PAGE", response.url

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        # add product URLs to be parsed to this list
        if 'search_results' not in response.meta:
            product_urls = set()
        else:
            product_urls = response.meta['search_results']

        # get search results for received results page and add them to product_urls to be parsed
        results = hxs.select("//h3[@class='newaps']/a")
        for result in results:
            product_url = result.select("@href").extract()[0]

            # remove the part after "/ref" containing details about the search query
            m = re.match("(.*)/ref=(.*)", product_url)
            if m:
                product_url = m.group(1)

            product_url = Utils.add_domain(product_url,
                                           "http://www.amazon.com")

            product_urls.add(product_url)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request further to parse product pages only if we gathered all the product URLs from all the queries
        # (there are no more pending requests)
        # otherwise send them back to parseResults and wait for the next query, save all product URLs in search_results
        # this way we avoid duplicates
        if product_urls and ('pending_requests' not in response.meta
                             or not response.meta['pending_requests']):
            request = Request(product_urls.pop(),
                              callback=self.parse_product_amazon,
                              meta=response.meta)
            if self.cookies_file:
                request.cookies = self.amazon_cookies
                request.headers['Cookies'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
            request.meta['items'] = items

            # this will be the new product_urls list with the first item popped
            request.meta['search_results'] = product_urls

            return request

        # if there were no results, the request will never get back to reduceResults
        # so send it from here so it can parse the next queries
        # add to the response the URLs of the products to crawl we have so far, items (handles case when it was not created yet)
        # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
        else:
            response.meta['items'] = items
            response.meta['parsed'] = True
            response.meta['search_results'] = product_urls
            # only send the response we have as an argument, no need to make a new request

            # print "RETURNING TO REDUCE RESULTS", response.meta['origin_url']
            return self.reduceResults(response)
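
The "/ref=" tail that Amazon appends to result links encodes the search context, so the same product reached from two different queries would otherwise produce two distinct URLs and defeat the set-based deduplication. A quick standalone check of the trimming regex used above (the sample URL is invented):

import re

# strip the search-specific "/ref=..." suffix so identical products deduplicate
url = "http://www.amazon.com/Some-Product/dp/B000000000/ref=sr_1_1?ie=UTF8"
m = re.match("(.*)/ref=(.*)", url)
if m:
    url = m.group(1)
print(url)  # http://www.amazon.com/Some-Product/dp/B000000000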
Example #13
0
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select("//h3[@class='productTitle']/a")
        for result in results:
            item = SearchItem()
            product_url = result.select("@href").extract()[0]
            # extract all text in <a> (contains product name inside <strong>, and size(ml) directly in text())

            # node containing the full product name if the displayed one is abbreviated. use this one if it exists, and the displayed one if it doesn't
            product_name_node = result.select("strong/abbr/@title")
            product_name = product_name_node.extract(
            )[0] if product_name_node else result.select(
                "strong/text()").extract()[0]
            # assert name is not abbreviated
            assert '...' not in product_name
            # add product quantity
            product_quantity_node = result.select(
                "text()[normalize-space()!='']")
            product_quantity = product_quantity_node.extract()[0].strip(
            ) if product_quantity_node else ""
            product_name_full = product_name + " " + product_quantity

            #print "ITEM", product_name

            # skip this result if there is no product name or URL
            if product_name and product_url:
                # clean url
                item['product_url'] = Utils.add_domain(
                    Utils.clean_url(product_url), self.base_url)

                item['product_name'] = product_name_full
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            #TODO: extract: price, brand?

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
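
The abbreviation handling above (prefer the full name stored in the <abbr> title attribute over the truncated display text) is easy to verify in isolation. A hedged sketch with lxml and invented markup:

import lxml.html

# invented result markup: the visible name is abbreviated, the full name
# lives in the abbr element's title attribute
html = """<html><body>
<h3 class="productTitle"><a href="/p/1">
  <strong><abbr title="Very Long Product Name 500ml">Very Long Prod...</abbr></strong>
  500ml
</a></h3>
</body></html>"""

doc = lxml.html.fromstring(html)
for link in doc.xpath("//h3[@class='productTitle']/a"):
    full = link.xpath("strong/abbr/@title")     # unabbreviated name, if present
    shown = link.xpath("strong//text()")        # displayed (possibly truncated) name
    name = full[0] if full else shown[0]
    print(name)  # Very Long Product Name 500ml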
Example #14
0
    def parsePage(self, response):

        #print "IN PARSEPAGE"
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']

        if 'parent_item' in response.meta:
            parent_item = response.meta['parent_item']
            item['parent_text'] = parent_item['text']
            item['parent_url'] = parent_item['url']
            if 'parent_text' in parent_item:
                item['grandparent_text'] = parent_item['parent_text']
                item['grandparent_url'] = parent_item['parent_url']
            if 'nr_products' not in parent_item:
                parent_nr_products = 0
            else:
                parent_nr_products = parent_item['nr_products']

        # initialize product URL list
        if 'products' not in response.meta:
            products = []
        else:
            products = response.meta['products']

        # # if this is the first page, initialize number of products
        # if 'nr_products' not in item:
        #     old_nr_products = 0
        # else:
        #     old_nr_products = item['nr_products']

        # find number of products on this page
        product_links = hxs.select(
            "//a[@class='prodLink ListItemLink']/@href").extract()

        # gather all products in this (sub)category
        products += product_links

        #this_nr_products = len(product_links)

        #item['nr_products'] = old_nr_products + this_nr_products
        # if 'parent_item' in response.meta:
        #     parent_item['nr_products'] = parent_nr_products + item['nr_products']
        # find URL to next page, parse it as well
        next_page = hxs.select(
            "//a[@class='link-pageNum' and text()=' Next ']/@href").extract()
        if next_page:
            page_url = Utils.add_domain(next_page[0], self.root_url)
            request = Request(url=page_url,
                              callback=self.parsePage,
                              meta={
                                  'item': item,
                                  'products': products
                              })
            if 'parent_item' in response.meta:
                request.meta['parent_item'] = parent_item
            yield request

        # if there is no next page, return the current item with its final product count
        else:

            item['nr_products'] = len(set(products))
            yield item
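
Because parsePage above keeps appending to the same products list carried in meta and only reports len(set(products)) on the last page, a product listed on more than one page is counted once. A toy illustration with invented page contents:

# toy illustration: accumulate links across "pages", then count unique ones
pages = [
    ["/ip/100", "/ip/101", "/ip/102"],
    ["/ip/102", "/ip/103"],          # /ip/102 appears again on page 2
]

products = []
for page_links in pages:
    products += page_links           # same accumulation as meta['products']

print(len(set(products)))            # 4 -- duplicates across pages count once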
Example #15
0
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # extract additional info for received parent and return it
        item = response.meta['item']

        # extract product count if available and not already extracted (in extract_itemcount_and_subcategories, from the menu on the left, without crawling the actual url)
        if 'nr_products' not in item:
            prod_count_holder = hxs.select("//h2[@class='resultCount']/span/text()").extract()
            if prod_count_holder:
                prod_count = prod_count_holder[0]
                # extract number
                m = re.match(".*\s*of\s*([0-9,]+)\s*Results\s*", prod_count)
                if m:
                    item['nr_products'] = int(re.sub(",","",m.group(1)))

        # extract description if available
        # only extracts descriptions that contain an h2. is that good?
        desc_holders = hxs.select("//div[@class='unified_widget rcmBody'][descendant::h2][last()]")
        # select the one among these with the most text
        #TODO: another idea: check if the holder has a h2 item
        if desc_holders:
            maxsize = 0
            max_desc_holder = desc_holders[0]
            for desc_holder in desc_holders:
                size = len(" ".join(desc_holder.select(".//text()").extract()))

                if size > maxsize:
                    maxsize = size
                    max_desc_holder = desc_holder
            desc_holder = max_desc_holder
            desc_title = desc_holder.select("h2/text()").extract()
            if desc_title:
                item['description_title'] = desc_title[0].strip()
            
            description_texts = desc_holder.select(".//text()[not(ancestor::h2)]").extract()

            # if the list is not empty and contains at least one non-whitespace item
            # if there is a description title or the description body is large enough
            size_threshold = 50
            if (description_texts and reduce(lambda x,y: x or y, [line.strip() for line in description_texts])):# and \
            #(desc_title or len(" ".join(description_texts.select(".//text()").extract()) > size_threshold)):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                item['description_text'] = " ".join([re.sub("\s+"," ", description_text.strip()) for description_text in description_texts if description_text.strip()])

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                if desc_title:
                    (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(item['description_title'], item['description_text'])
            
            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0


        # if item is found among extra_toplevel_categories_urls, and no product count was found, add info from that url
        extra_category = self.find_matching_key(item['text'], self.extra_toplevel_categories_urls)

        #yield item

        # crawl level 0 categories (only for their product count and subcategories - no descriptions...)

        if 'nr_products' not in item or item['level'] > self.LEVEL_BARRIER:
            if extra_category:
            
                # collect number of products from this alternate URL
                # this will also extract subcategories and their count
                yield Request(self.extra_toplevel_categories_urls[extra_category], callback = self.extract_nrprods_and_subcats, meta = {'item' : item})

            else:
                # extract subcategories and their count for category even if not in extra_...
                yield Request(item['url'], callback = self.extract_nrprods_and_subcats, meta = {'item' : item})
        else:
            yield item
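
The product count above rides on a single regex over Amazon's resultCount header text. A quick standalone check (the header string is invented, but follows the "... of N Results" shape the code expects):

import re

prod_count = "Showing 1 - 24 of 3,456 Results"
m = re.match(".*\s*of\s*([0-9,]+)\s*Results\s*", prod_count)
if m:
    print(int(re.sub(",", "", m.group(1))))  # 3456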
Example #16
0
    def parse(self, response):

        items = []

        # extract site domain
        site = Utils.extract_domain(response.url)
        if not site:
            return items

        # handle staples televisions
        if site == 'staples':

            ############################################
            #
            # # Use selenium - not necessary anymore

            # # zipcode = "12345"

            # # hxs = HtmlXPathSelector(response)
            # # return Request(self.cat_page, callback = self.parsePage_staples, cookies = {"zipcode" : zipcode}, meta = {"dont_redirect" : False})
            # # use selenium to complete the zipcode form and get the first results page
            # driver = webdriver.Firefox()
            # driver.get(response.url)

            # # set a hardcoded value for zipcode
            # zipcode = "12345"
            # textbox = driver.find_element_by_name("zipCode")

            # if textbox.is_displayed():
            # 	textbox.send_keys(zipcode)

            # 	button = driver.find_element_by_id("submitLink")
            # 	button.click()

            # 	cookie = {"zipcode": zipcode}
            # 	driver.add_cookie(cookie)

            # 	time.sleep(5)

            # # convert html to "nice format"
            # text_html = driver.page_source.encode('utf-8')
            # #print "TEXT_HTML", text_html
            # html_str = str(text_html)

            # # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
            # resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)
            # #resp_for_scrapy = TextResponse(html_str)

            # # pass first page to parsePage function to extract products
            # items += self.parsePage_staples(resp_for_scrapy)

            # # use selenium to get next page, while there is a next page
            # next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")
            # while (next_page):
            # 	next_page.click()
            # 	time.sleep(5)

            # 	# convert html to "nice format"
            # 	text_html = driver.page_source.encode('utf-8')
            # 	#print "TEXT_HTML", text_html
            # 	html_str = str(text_html)

            # 	# this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
            # 	resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)
            # 	#resp_for_scrapy = TextResponse(html_str)

            # 	# pass first page to parsePage function to extract products
            # 	items += self.parsePage_staples(resp_for_scrapy)

            # 	hxs = HtmlXPathSelector(resp_for_scrapy)
            # 	next = hxs.select("//li[@class='pageNext']/a")
            # 	next_page = None
            # 	if next:
            # 		next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")

            # 	#TODO: this doesn't work
            # 	# try:
            # 	# 	next_page = driver.find_element_by_xpath("//li[@class='pageNext']/a")
            # 	# 	break
            # 	# except NoSuchElementException:
            # 	# 	# if there are no more pages exit the loop
            # 	# 	driver.close()
            # 	# 	return items

            # driver.close()

            # return items
            #
            ##############################################

            zipcode = "12345"
            request = Request(response.url, callback = self.parsePage_staples, cookies = {"zipcode" : zipcode}, \
             headers = {"Cookie" : "zipcode=" + zipcode}, meta = {"dont_redirect" : True, "dont_merge_cookies" : True})
            return request

        # handle bloomingdales sneakers
        if site == 'bloomingdales':
            driver = webdriver.Firefox()
            driver.get(response.url)

            # use selenium to select USD currency
            link = driver.find_element_by_xpath(
                "//li[@id='bl_nav_account_flag']//a")
            link.click()
            time.sleep(5)
            button = driver.find_element_by_id("iShip_shipToUS")
            button.click()
            time.sleep(10)

            # convert html to "nice format"
            text_html = driver.page_source.encode('utf-8')
            html_str = str(text_html)

            # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
            resp_for_scrapy = TextResponse('none', 200, {}, html_str, [], None)

            # parse first page with parsePage_bloomingdales function
            items += self.parsePage_bloomingdales(resp_for_scrapy)
            hxs = HtmlXPathSelector(resp_for_scrapy)

            # while there is a next page get it and pass it to parsePage_bloomingdales
            next_page_url = hxs.select("//li[@class='nextArrow']//a")

            while next_page_url:

                # use selenium to click on next page arrow and retrieve the resulted page if any
                next = driver.find_element_by_xpath(
                    "//li[@class='nextArrow']//a")
                next.click()

                time.sleep(5)

                # convert html to "nice format"
                text_html = driver.page_source.encode('utf-8')
                html_str = str(text_html)

                # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
                resp_for_scrapy = TextResponse('none', 200, {}, html_str, [],
                                               None)

                # pass the page to parsePage function to extract products
                items += self.parsePage_bloomingdales(resp_for_scrapy)

                hxs = HtmlXPathSelector(resp_for_scrapy)
                next_page_url = hxs.select("//li[@class='nextArrow']//a")

            driver.close()

            return items

        # works for both product list pages and higher level pages with links in the left side menu to the product links page
        if site == 'walmart':
            hxs = HtmlXPathSelector(response)

            # try to see if it's not a product page but branches into further subcategories, select "See all..." page URL
            #! this has a space after the div class, maybe in other pages it doesn't
            seeall = hxs.select(
                "//div[@class='CustomSecondaryNav ']//li[last()]/a/@href"
            ).extract()
            if seeall:
                root_url = "http://www.walmart.com"
                page_url = root_url + seeall[0]
                # send the page to parsePage and extract product URLs
                request = Request(page_url, callback=self.parsePage_walmart)
                return request
            # if you can't find the link to the product list page, try to parse this as the product list page
            else:
                return Request(response.url, callback=self.parsePage_walmart)

        # works for both product list pages and higher level pages with links in the left side menu to the product links page
        if site == 'amazon':
            hxs = HtmlXPathSelector(response)
            # select first see more list ("All Televisions")
            seeall = hxs.select("//p[@class='seeMore'][1]/a/@href").extract()
            root_url = "http://www.amazon.com"

            # if we can find see all link, follow it and pass it to parsePage to extract product URLs
            if seeall:
                page_url = root_url + seeall[0]
                return Request(page_url, callback=self.parsePage_amazon)

            # otherwise, try to parse current page as product list page
            else:
                return Request(response.url, callback=self.parsePage_amazon)

        # works for both product list pages and higher level pages with links in the left side menu to the product links page
        if site == 'bestbuy':
            hxs = HtmlXPathSelector(response)

            # try to see if it's not a product page but branches into further subcategories, select "See all..." page URL
            seeall_list = hxs.select("//ul[@class='search']")
            if seeall_list:
                seeall = seeall_list[0].select("li[1]/a/@href").extract()
                if seeall:
                    root_url = "http://www.bestbuy.com"
                    page_url = root_url + seeall[0]

                    # send the page to parsePage and extract product URLs
                    return Request(page_url, callback=self.parsePage_bestbuy)

                else:
                    return Request(response.url,
                                   callback=self.parsePage_bestbuy)

            # if you can't find the link to the product list page, try to parse this as the product list page
            else:
                return Request(response.url, callback=self.parsePage_bestbuy)

        if site == 'nordstrom':
            hxs = HtmlXPathSelector(response)

            return Request(response.url, callback=self.parsePage_nordstrom)

        if site == 'macys':

            hxs = HtmlXPathSelector(response)

            m = re.match("http://www1.macys.com/shop(.*)\?id=([0-9]+).*",
                         self.cat_page)
            cat_id = 0
            if m:
                cat_id = int(m.group(2))
            productids_request = "http://www1.macys.com/catalog/category/facetedmeta?edge=hybrid&categoryId=%d&pageIndex=1&sortBy=ORIGINAL&productsPerPage=40&" % cat_id
            return Request(productids_request,
                           callback=self.parse_macys,
                           headers={"Cookie": "shippingCountry=US"},
                           meta={
                               'dont_merge_cookies': True,
                               "cat_id": cat_id,
                               "page_nr": 1
                           })

        if site == 'williams-sonoma':

            return Request(url=self.cat_page, callback=self.parsePage_sonoma)

        #TODO: is the list of product numbers ok for all pages? got it from the laptops category request, seems to work for others as well even though it's not the same
        if site == 'overstock':
            # # get category, and if it's laptops treat it specially using the hardcoded url
            # m = re.match("http://www.overstock.com/[^/]+/([^/]+)/.*", self.cat_page)
            # if m and m.group(1) == "Laptops":
            return Request(url = self.cat_page + "&index=1&count=25&products=7516115,6519070,7516111,7646312,7382330,7626684,8086492,8233094,7646360,8135172,6691004,8022278&infinite=true", callback = self.parsePage_overstock, \
             headers = {"Referer": self.cat_page + "&page=2", "X-Requested-With": "XMLHttpRequest"}, \
             meta = {"index" : 1})
            # else:
            # 	return Request(url = self.cat_page, callback = self.parsePage_overstock)

        if site == 'newegg':
            return Request(url=self.cat_page,
                           callback=self.parsePage_newegg,
                           meta={'page': 1})

        if site == 'tigerdirect':
            # add as meta the page number and the base URL to which to append the page number if necessary
            return Request(url = self.cat_page, callback = self.parsePage_tigerdirect, \
             meta = {'page' : 1, 'base_url' : self.cat_page})
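
The dispatch above keys on a short site token ('staples', 'walmart', ...) that Utils.extract_domain presumably derives from the response URL. A hedged stand-in using only the standard library; the exact behaviour of Utils.extract_domain is an assumption here:

# Rough stand-in for Utils.extract_domain: pull the second-level domain
# out of a URL, e.g. "http://www.walmart.com/cp/Bakery/120764" -> "walmart".
from urlparse import urlparse   # Python 2; urllib.parse.urlparse on Python 3

def extract_site(url):
    netloc = urlparse(url).netloc            # e.g. "www.walmart.com"
    parts = netloc.split(".")
    return parts[-2] if len(parts) >= 2 else netloc

print(extract_site("http://www.walmart.com/cp/Bakery/120764"))  # walmart
print(extract_site("http://www.bestbuy.com/"))                  # bestbuy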
Example #17
0
    def extract_nrprods_and_subcats(self, response):

        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        # extract nr_products if not already extracted. necessary for extra_categories
        if 'nr_products' not in item:
            prod_count_holder = hxs.select("//h2[@class='resultCount']/span/text()").extract()
            if prod_count_holder:
                #print "DIDN'T HAVE PRODUCT COUNT", response.url
                prod_count = prod_count_holder[0]
                # extract number
                m = re.match(".*\s*of\s*([0-9,]+)\s*Results\s*", prod_count)
                if m:
                    item['nr_products'] = int(re.sub(",","",m.group(1)))

        yield item

        parent_item = item

        # extract subcategories, if level is above barrier
        # currently extracting subcategories for categories on any level, for level 2 this may cause duplicates (we already extract level 1)
        # extract subcategories from first menu on the left, assume this is the subcategories menu
        #TODO: test or make more robust

        if item['level'] > self.LEVEL_BARRIER:
            subcategories = hxs.select("//h2[1]/following-sibling::ul[1]/li/a")
            for subcategory in subcategories:
                # if we have a subcategory URL and product count with the expected format extract it, otherwise move on
                if not subcategory.select("span[@class='refinementLink']"):
                    continue
                subcategory_url = Utils.add_domain(subcategory.select("@href").extract()[0], "http://www.amazon.com")
                subcategory_text = subcategory.select("span[@class='refinementLink']//text()").extract()[0].strip()
                # extract product count, clean it of commas and parentheses
                subcategory_prodcount_holder = subcategory.select("span[@class='narrowValue']/text()").extract()
                if not subcategory_prodcount_holder:
                    continue
                subcategory_prodcount = subcategory_prodcount_holder[0].replace("&nbsp;"," ").strip()

                m = re.match("\(([0-9,]+)\)", subcategory_prodcount)
                if m:
                    subcategory_prodcount = m.group(1).replace(",","")
                

                item = CategoryItem()
                item['url'] = subcategory_url
                item['text'] = subcategory_text

                item['parent_text'] = parent_item['text']
                item['parent_url'] = parent_item['url']

                # considering departments to be level 2 categories (top level) - so every category must have a department text
                assert 'department_text' in parent_item
                if 'department_text' in parent_item:
                    item['department_text'] = parent_item['department_text']
                    #item['department_url'] = parent_item['department_url']
                    item['department_id'] = parent_item['department_id']

                # only level 2 categories in extra_categories have department_url
                if 'department_url' in parent_item:
                    item['department_url'] = parent_item['department_url']
                else:
                    assert not self.find_matching_key(item['department_text'], self.extra_toplevel_categories_urls)
                    if self.find_matching_key(item['department_text'], self.extra_toplevel_categories_urls):
                        print "DEPARTMENT_TEXT", item['department_text'], "--"+str(self.find_matching_key(item['department_text'], self.extra_toplevel_categories_urls))+"--"

                # else:
                #     # the parent must be a level 2 category - so this will be considered department
                #     assert parent_item['level'] == 2
                #     item['department_text'] = item['text']
                #     #item['department_url'] = item['url']
                #     item['department_id'] = self.department_count
                #     self.department_count += 1

                item['level'] = parent_item['level'] - 1

                item['nr_products'] = subcategory_prodcount

                # # no description extracted
                # item['description_wc'] = 0


                # send to parseCategory to extract description as well
                yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})
Example #18
0
#!/usr/bin/python

# Get ids from a CSV file containing one on each line, and generate Walmart product URLs based on them

import sys
import re
from spiders_utils import Utils

base_url = "http://www.walmart.com/ip/"
with open(sys.argv[1]) as idsfile:
    for line in idsfile:
        # if there are other fields ignore them (get the first one)
        if "," in line:
            id_string = line.strip().split(",")[0]
        else:
            id_string = line.strip()
        # if it's not a number ignore it (could be a header line)
        if re.match("[0-9]+", id_string):
            # generate URL and output it
            url = Utils.add_domain(id_string, base_url)
            print url
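
For clarity, here is roughly what the loop above does to a couple of made-up CSV lines, under the assumption that Utils.add_domain simply prefixes the base URL to the id; the file contents and ids are illustrative only.

import re

# illustration only (not the script itself)
base_url = "http://www.walmart.com/ip/"
lines = ["item_id,name\n", "12345678,Some Product\n", "23456789\n"]

for line in lines:
    id_string = line.strip().split(",")[0] if "," in line else line.strip()
    if re.match("[0-9]+", id_string):        # the header line is skipped here
        print(base_url + id_string)
# -> http://www.walmart.com/ip/12345678
# -> http://www.walmart.com/ip/23456789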
Example #19
0
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # get parent item from response, extract additional info and return it
        item = response.meta['parent']

        # add department name, url and id for item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract product count if available
        nr_items_holder = hxs.select(
            "//div[@id='showing']/strong[position()=2]/text()").extract()
        if nr_items_holder:
            item['nr_products'] = int(str(nr_items_holder[0]))

        # extract description if available
        # these are descriptions for  services pages
        desc_title_holder = hxs.select(
            "//div[@id='searchstate']/a[position()=2]/text()").extract()
        if desc_title_holder:
            item['description_title'] = desc_title_holder[0].strip()
        desc_content_holder = hxs.select(
            "//div[@class='content']/h3/text()").extract()
        if desc_content_holder:
            item['description_text'] = desc_content_holder[0].strip()
            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)
            (item['keyword_count'],
             item['keyword_density']) = Utils.phrases_freq(
                 item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        yield item

        # extract its subcategories
        #subcats_holders = hxs.select("//div[@class='narrowcontent']/ul[@class='search']")
        subcats_holders = hxs.select(
            "//div[@class='narrowcontent']/ul[contains(@class,'search')]")
        if subcats_holders:
            subcats_holder = subcats_holders[0]
            # these are subcategories if they are preceded by the title "Shop ..."
            title = subcats_holder.select(
                "parent::node()/preceding-sibling::node()//text()").extract(
                )[0]
            if str(title).startswith("Shop"):
                subcats = subcats_holder.select(".//li/a")
                for subcat in subcats:
                    item = CategoryItem()
                    item['text'] = subcat.select("text()").extract()[0].strip()
                    item['url'] = Utils.add_domain(
                        subcat.select("@href").extract()[0],
                        "http://www.bestbuy.com")
                    parent = response.meta['parent']
                    item['level'] = int(response.meta['level']) - 1
                    # if parent was special, this category is special too
                    if 'special' in parent:
                        item['special'] = 1
                    item['parent_text'] = parent['text']
                    item['parent_url'] = parent['url']

                    request = Request(url = item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : item['level'], \
                        'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                    yield request
Example #20
0
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        # extract number of products if available
        #TODO check
        count_holder = hxs.select(
            "//div[@class='recordCount']/span[@id='RecordCount_1']/text()")
        if count_holder:
            item['nr_products'] = int(count_holder.extract()[0])

        #TODO
        # try to change URL "Category" to "SubCategory", see if you find the product count there

        # extract description if available
        description_holders = hxs.select("//div[@id='bcaShopWindowSEO']")
        # if the list is not empty and contains at least one non-whitespace item
        if description_holders:
            description_texts = description_holders.select(
                ".//text()[not(ancestor::h2)]").extract()

            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

            description_title = description_holders.select(
                ".//h2/text()").extract()
            if description_title:
                item['description_title'] = description_title[0].strip()

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        yield item

        parent = item

        #TODO
        # extract and parse subcategories
        subcats = hxs.select("//dl[@class='categoryList primaryNav']/dd/a")
        for subcat in subcats:
            item = CategoryItem()

            item['text'] = subcat.select("text()").extract()[0].strip()

            #TODO: check out some huge URLs
            item['url'] = self.clean_url(subcat.select("@href").extract()[0])

            item['parent_text'] = parent['text']
            item['parent_url'] = parent['url']
            item['level'] = parent['level'] - 1
            item['department_text'] = response.meta['department_text']
            item['department_url'] = response.meta['department_url']
            item['department_id'] = response.meta['department_id']

            yield Request(url = item['url'], callback = self.parseCategory, meta = {"item" : item, \
                "department_text" : response.meta['department_text'], "department_url" : response.meta['department_url'], "department_id" : response.meta['department_id']})
Example #21
0
    def extract_product_data(self, response, item):
        hxs = HtmlXPathSelector(response)

        try:
            item['product_name'] = hxs.xpath(
                "//h1[starts-with(@class,'title')]//text()").extract(
                )[0].strip()
        except:
            try:
                item['product_name'] = hxs.xpath(
                    "//div[@class='pdp_title']//text()[normalize-space()!='']"
                ).extract()[0].strip()
            except:
                try:
                    item['product_name'] = hxs.xpath(
                        "//h1//text()").extract()[0].strip()
                except:
                    # out of stock products return 404s with this text, not the actual product page
                    out_of_stock = hxs.xpath(
                        "//strong[contains(text(),'out of stock')]").extract()
                    if not out_of_stock:
                        self.log("Error: No product name: " +
                                 str(response.url) + " from product: " +
                                 item['origin_url'],
                                 level=log.ERROR)
                    # ignore products with no name
                    return None

        price_node = hxs.select("//meta[@itemprop='price']/@content").extract()

        if price_node:

            try:
                price_currency = price_node[0][0]
                price_amount = "".join(price_node[0][1:])

                price_amount = re.sub(",", "", price_amount)

                m1 = re.match("[0-9]+\.?[0-9]*", price_amount)
                m2 = re.match("(\xa3)|(\$)", price_currency)
                if not m1 or not m2:
                    self.log("Didn't match product price: " + price_amount +
                             price_currency + " " + response.url + "\n",
                             level=log.WARNING)
                else:
                    price = Utils.convert_to_dollars(float(price_amount),
                                                     price_currency)
                    item['product_target_price'] = price
            except Exception:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

        try:
            product_model_node = hxs.select(
                "//div[@class='prod_description1']//li[contains(text(), 'Style')]/text()"
            ).re("[sS]tyle +[nN]o\.? +[a-zA-Z0-9]+")
            item['product_model'] = re.match(
                "[sS]tyle +[nN]o\.? +([a-zA-Z0-9]+)",
                product_model_node[0]).group(1)
        except Exception:
            pass

        try:
            item['product_brand'] = hxs.select(
                "//meta[@itemprop='brand']/@content").extract()[0]
        except Exception:
            pass

        try:
            js_body = hxs.select(
                "//script[contains(text(),'Upc')]/text()").extract()[0]
            item['product_upc'] = re.match('.*"skuUpcCode":"([0-9a-zA-Z]+)".*',
                                           js_body,
                                           re.DOTALL | re.MULTILINE).group(1)
        except Exception:
            pass

        return item
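
The price block above assumes the itemprop='price' content string is a currency symbol followed by the amount (e.g. "$1,299.99"). A standalone sketch of that split; the GBP-to-USD rate stands in for Utils.convert_to_dollars and is an assumed value:

import re

def parse_price(content):
    price_currency = content[0]            # first character is the currency symbol
    price_amount = re.sub(",", "", content[1:])
    if not re.match("[0-9]+\.?[0-9]*", price_amount):
        return None
    amount = float(price_amount)
    if price_currency == u"\xa3":          # pound sign, as matched by the spider
        amount = amount * 1.6              # assumed conversion rate, illustration only
    return amount

print(parse_price("$1,299.99"))            # 1299.99
print(parse_price(u"\xa349.99"))           # ~79.98 with the assumed rate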
Example #22
0
    def parseDept(self, response):

        # for "copy & print" there's an exception, we don't need zipcode

        # # use selenium to complete the zipcode form and get the first results page
        # driver = webdriver.Firefox()
        # driver.get(response.url)

        # # set a hardcoded value for zipcode
        # zipcode = "12345"

        # textbox = driver.find_element_by_name("zipCode")
        # textbox.send_keys(zipcode)

        # button = driver.find_element_by_id("submitLink")
        # button.click()

        # cookie = {"zipcode": zipcode}
        # driver.add_cookie(cookie)

        # time.sleep(5)

        # # convert html to "nice format"
        # text_html = driver.page_source.encode('utf-8')
        # #print "TEXT_HTML", text_html
        # html_str = str(text_html)

        # # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
        # resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)

        # hxs = HtmlXPathSelector(resp_for_scrapy)

        #TODO: doesn't extract Televisions, for example

        hxs = HtmlXPathSelector(response)
        categories = hxs.select("//h2/a")

        root_url = "http://www.staples.com"

        # from parent's page:
        item = response.meta['parent']

        # add department name, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract number of items, if any
        nritems_holder = hxs.select(
            "//div[@class='perpage']/span[@class='note']/text()").extract()
        if nritems_holder:
            m = re.findall("[0-9]+\s*items", nritems_holder[0])
            if m:
                item['nr_products'] = int("".join(re.findall("[0-9]+", m[0])))
            # else:
            # 	print "NOT MATCH ", nritems_holder[0]

        # extract description, if any
        description_texts = hxs.select(
            "//h2[@class='seo short']//text() | //h2[@class='seo short long']//text()"
        ).extract()
        if description_texts and reduce(
                lambda x, y: x or y,
            [line.strip() for line in description_texts]):
            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            if item['description_text']:
                item['description_title'] = item['text']

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])

            else:
                # if no description is found
                #print 'desc_holder but no desc_text ', response.URL
                item['description_wc'] = 0
        else:
            item['description_wc'] = 0

        # yield item the request came from (parent)
        yield item

        # extract subcategories
        for category in categories:
            # there are pages that don't have categories
            item = CategoryItem()
            text = category.select("text()").extract()
            if text:
                item['text'] = text[0]
            url = category.select("@href").extract()
            if url:
                item['url'] = root_url + url[0]
            item['level'] = int(response.meta['level'] - 1)
            if 'text' in response.meta['parent']:
                item['parent_text'] = response.meta['parent']['text']
            else:
                print 'no text in parent ', response.meta['parent']
            item['parent_url'] = response.url

            # yield the item after passing it through a request and collecting additional info
            #yield item

            # extract subcategories if any
            zipcode = "12345"
            request = Request(item['url'], callback = self.parseDept, cookies = {"zipcode" : zipcode}, \
             headers = {"Cookie" : "zipcode=" + zipcode}, meta = {"dont_redirect" : True, "dont_merge_cookies" : True, \
             "parent": item, "level": item['level'], \
             "department_text" : response.meta["department_text"], "department_url" : response.meta["department_url"], "department_id" : response.meta["department_id"]})
            yield request
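The subcategory requests in parseDept pass the hard-coded zipcode both through Scrapy's cookie handling and as a raw Cookie header, while disabling redirects and cookie merging. A sketch of that request pattern pulled into a standalone helper (the helper name and signature are illustrative, not part of the spider):

from scrapy.http import Request

def make_zipcode_request(url, callback, meta):
    # hard-coded zipcode, as in parseDept above
    zipcode = "12345"
    # pass the zipcode both through Scrapy's cookie handling and as a raw
    # Cookie header, and keep the downloader from merging session cookies
    request_meta = dict(meta)
    request_meta.update({"dont_redirect": True, "dont_merge_cookies": True})
    return Request(url,
                   callback=callback,
                   cookies={"zipcode": zipcode},
                   headers={"Cookie": "zipcode=" + zipcode},
                   meta=request_meta)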
Example No. 23
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        # extract departments
        departments = hxs.select("//h2")
        department_id = 0
        for department in departments:
            department_item = CategoryItem()
            department_text = department.select("text()").extract()[0]

            department_item['department_text'] = department_text

            # #TODO: add department_url, from sherwin-williams.com ...? get department list from there and match with departments from here by seeing if names match

            department_item['department_id'] = department_id

            department_item['text'] = department_text

            department_item['level'] = 1

            # get categories in department
            categories = department.select("following-sibling::ul[1]/li")

            # extract department url from one of its category URLs (it's not available directly)
            category_ex = categories[0]
            category_ex_url = Utils.add_domain(
                category_ex.select("a/@href").extract()[0], self.base_url)
            # extract first part of url
            m = re.match("(http://www.sherwin\-williams\.com/[^/]+)/.*",
                         category_ex_url)
            department_url = m.group(1)
            department_item['department_url'] = department_url
            department_item['url'] = department_url

            for category in categories:
                item = CategoryItem()
                #TODO: special if 'Services'? or Specifications, or Ads...
                category_text = category.select("a/text()").extract()[0]
                category_url = Utils.add_domain(
                    category.select("a/@href").extract()[0], self.base_url)
                item['text'] = category_text
                item['url'] = category_url

                # if it's not a 'products' category, mark it and all its subcategories as special

                if category_text != 'Products':
                    item['special'] = 1
                    special = True
                else:
                    special = False

                item['department_id'] = department_id
                item['department_text'] = department_text
                item['department_url'] = department_url

                item['parent_text'] = department_text
                item['parent_url'] = department_url

                item['level'] = 0

                #TODO: do we need description_wc here as well?

                yield Request(item['url'],
                              callback=self.parseCategory,
                              meta={'item': item})

                # get subcategories in category
                subcategories = category.select("ul/li")
                for subcategory in subcategories:
                    item = CategoryItem()

                    item['text'] = subcategory.select("a/text()").extract()[0]
                    item['url'] = Utils.add_domain(
                        subcategory.select("a/@href").extract()[0],
                        self.base_url)

                    item['department_id'] = department_id
                    item['department_text'] = department_text
                    item['department_url'] = department_url

                    item['parent_text'] = category_text
                    item['parent_url'] = category_url

                    item['level'] = -1

                    # if parent is special, category is special
                    if special:
                        item['special'] = 1

                    yield Request(item['url'],
                                  callback=self.parseSubcategory,
                                  meta={'item': item})

            department_id += 1

            # return department
            yield department_item
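Utils.add_domain, used above to absolutize the category hrefs, is not part of this listing. A plausible minimal version, assuming it simply joins a relative path onto the spider's base_url:

import urlparse  # urllib.parse on Python 3

# Hypothetical sketch of Utils.add_domain; the project's real helper is not
# shown in this listing and may behave differently.
def add_domain(url, base_url):
    # leave absolute URLs untouched, otherwise join the path onto the domain
    if url.startswith("http://") or url.startswith("https://"):
        return url
    return urlparse.urljoin(base_url, url)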
Example No. 24
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//ul[@class='products']//div[@class='product ']//h3//a")
        for result in results:
            item = SearchItem()

            product_url = result.select("@href").extract()[0] if result.select(
                "@href") else None
            product_name = result.select(
                "@title").extract()[0] if result.select("@title") else None

            # assert name is not abbreviated
            # empirically, this only seems to produce false positives, so removed
            # assert '...' not in product_name

            # skip this result if the product name or URL is missing
            if product_name and product_url:
                # clean url
                item['product_url'] = Utils.add_domain(product_url,
                                                       self.base_url)

                item['product_name'] = product_name
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            #TODO: extract: price, brand?

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # add field 'parsed' to indicate that the call was received from this method (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
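reduceResults, which this method returns to, is not shown in the listing. The flow is: matched results accumulate in response.meta['items'], and the coordinator either fires the next pending product-page request back at parseResults or emits the final set. A hypothetical sketch, assuming the outstanding requests are tracked in response.meta['pending_requests']:

    def reduceResults(self, response):
        # hypothetical sketch of the coordinating callback referenced above;
        # the spider's real reduceResults is not shown here and may differ
        items = response.meta.get('items', set())
        pending = response.meta.get('pending_requests', [])

        if pending:
            # send the next product-page request back to parseResults
            # (each pending request is assumed to have callback=self.parseResults),
            # carrying the accumulated items and the remaining queue along
            request = pending.pop(0)
            request.meta['items'] = items
            request.meta['pending_requests'] = pending
            return request

        # nothing left to crawl: emit the accumulated search results
        return list(items)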
Example No. 25
    def parseCategory(self, response):

        #TODO: add extraction of additional category info
        sel = Selector(response)

        #TODO: a lot of redirects. maybe for item, set 'url' to the one to which it was redirected? (response.url)
        item = response.meta['item']

        # Description extraction needs to be done first because it can be found in regular /c/ pages that are first passed to this method.
        # For other info (item count, subcategories), the spider will redirect to different page if necessary (where description won't be available)
        # extract description
        description_texts = sel.xpath(
            "//div[@class='subpart']/p//text()").extract()

        # second try at finding descriptions
        if not description_texts:
            description_texts = sel.xpath(
                "//div[@id='SEO_TEXT']//text()").extract()

        # replace all whitespace with one space, strip, and remove empty texts; then join them
        if description_texts:
            item['description_text'] = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

        else:
            item['description_wc'] = 0

        # try to extract item count; if alternative extraction is needed,
        # this item's parsing will be redirected through a different method and returned here

        # extract item count
        nr_products_node = sel.xpath("//ul[@class='results']//strong/text()")
        if nr_products_node:
            # nr of products is in the second of these nodes
            nr_products = nr_products_node.extract()[1].strip()
            item['nr_products'] = int(nr_products)

        # alternative item count: try on same page, but with /sb/ instead of /c/ in url
        if not nr_products_node:
            m = re.match("http://www\.target\.com/c/(.*)", response.url)
            if m:
                new_url = "http://www.target.com/sb/" + m.group(1)

                # retry to this same method but with new url
                #TODO: will miss descriptions. leave it to the end of the method then. but I want subcats from that one too?
                #OR extract it in secondary method and send it back to original url
                yield Request(new_url,
                              callback=self.parseCategory,
                              meta={'item': item})

            else:
                # URL matched neither the /c/ pattern nor an /sb/ page
                if "/sb/" not in response.url:
                    print "DOES NOT MATCH", response.url

        # alternative item count extraction 2 (dynamically generated content)
        if not nr_products_node:

            # extract dynamically loaded data by making an additional request (made by the page to load the data)
            # extract url and parameters from form data
            form = sel.xpath("//form[@name='dynamicAjaxFrm1']")
            if form:
                form_action = form.xpath("@action").extract()[0]
                form_inputs = form.xpath("input")
                # build string of parameters from input names and values
                param_dict = {
                    form_input.xpath("@name").extract()[0]:
                    form_input.xpath("@value").extract()[0]
                    for form_input in form_inputs
                }
                param_string = urllib.urlencode(param_dict)
                # build url to make request to
                new_url = "http://www.target.com" + form_action + "&" + param_string

                # if this url was found, redirect request to new method to extract item count as well, that method will yield the item
                # only redirect to this method if we weren't already redirected from it - to avoid redirect loop
                if 'redirected' not in response.meta or not response.meta[
                        'redirected']:
                    yield Request(new_url,
                                  callback=self.parseCategoryDyncontent,
                                  meta={'item': item})
                    return

        #TODO: add description title as category name if no title available?
        # then also add the keyword/density count

        yield item

        if 'parent_url' in item:
            self.crawled_urls.append((item['url'], item['parent_url']))

        # extract subcategories (if we haven't reached level barrier)
        if item['level'] <= self.LEVEL_BARRIER:
            return

        parent_item = item

        # "shop categories" menu
        #subcategories = sel.xpath("//h3[text() = 'shop categories']/following-sibling::ul/li/a")
        #TODO: replace the not startswith with != ?
        subcategories_menu = sel.xpath(
            "//h3[starts-with(text(), 'shop ') and not(starts-with(text(), 'shop by')) \
            and not(starts-with(text(), 'shop for')) and not(starts-with(text(), 'shop favorite')) and not(contains(text(), ' size'))]"
        )
        subcategories = subcategories_menu.xpath("following-sibling::ul/li/a")

        for subcategory in subcategories:
            subcategory_item = CategoryItem()

            subcategory_item['text'] = subcategory.xpath(
                "text()").extract()[0].strip()
            subcategory_item['url'] = self.build_url(
                subcategory.xpath("@href").extract()[0])

            # filter duplicates
            if (subcategory_item['url'],
                    parent_item['url']) in self.crawled_urls:
                # print subcategory_item['url']
                # print parent_item['url']
                continue

            # assign next available category id
            self.catid += 1
            subcategory_item['catid'] = self.catid

            subcategory_item['level'] = parent_item['level'] - 1

            subcategory_item['parent_url'] = parent_item['url']
            subcategory_item['parent_text'] = parent_item['text']
            subcategory_item['parent_catid'] = parent_item['catid']

            subcategory_item['department_text'] = parent_item[
                'department_text']
            subcategory_item['department_url'] = parent_item['department_url']
            subcategory_item['department_id'] = parent_item['department_id']

            # send this subcategory to be further parsed
            yield Request(subcategory_item['url'],
                          callback=self.parseCategory,
                          meta={'item': subcategory_item})
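parseCategoryDyncontent, the callback used for the dynamically loaded item counts, is not included in this listing. A hypothetical sketch, assuming the AJAX response reuses the same results-counter markup and that the item is handed back to parseCategory with the 'redirected' flag set so the dynamic-content branch is not entered again:

    def parseCategoryDyncontent(self, response):
        # hypothetical sketch; the spider's real parseCategoryDyncontent is
        # not shown in this listing and may differ
        sel = Selector(response)
        item = response.meta['item']

        # assume the AJAX fragment uses the same results counter markup
        nr_products_node = sel.xpath(
            "//ul[@class='results']//strong/text()").extract()
        if len(nr_products_node) > 1:
            item['nr_products'] = int(nr_products_node[1].strip())

        # hand the item back to parseCategory with the 'redirected' flag set,
        # so the dynamic-content branch is not entered a second time
        yield Request(item['url'],
                      callback=self.parseCategory,
                      meta={'item': item, 'redirected': True})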