Example #1
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # currently selecting bottom level categories, and their parents and parents of parents ("grandparents") in their fields
        links = hxs.select("//div[@id='sitemap']//li[@class='bullet3']//a")
        parent_links = hxs.select("//div[@id='sitemap']//li[@class='bullet2']//a")
        grandparent_links = hxs.select("//div[@id='sitemap']//li[@class='bullet1']//a")
        items = []

        #TODO: mark special categories (if appropriate for any)

        for link in links:

            # extract immediate parent of this link (first preceding sibling (of the parent node) with class='bullet2')
            parent = link.select("parent::node()/preceding-sibling::*[@class='bullet2'][1]/a")
            # extract grandparent of this link (first preceding sibling of the parent's parent node with class='bullet1')
            grandparent = parent.select("parent::node()/preceding-sibling::*[@class='bullet1'][1]/a")

            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            item['parent_text'] = parent.select('text()').extract()[0]
            item['parent_url'] = parent.select('@href').extract()[0]

            item['grandparent_text'] = grandparent.select('text()').extract()[0]
            item['grandparent_url'] = grandparent.select('@href').extract()[0]

            # this will be considered lower than the main level, because these categories are very detailed
            item['level'] = -1

            items.append(item)

        for link in parent_links:

            # extract immediate parent of this link (first preceding sibling (of the parent node) with class='bullet1')
            parent = link.select("parent::node()/preceding-sibling::*[@class='bullet1'][1]/a")

            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            item['parent_text'] = parent.select('text()').extract()[0]
            item['parent_url'] = parent.select('@href').extract()[0]

            # this will be considered the main level of the nested list (it's comparable with the main level of the other sitemaps)
            item['level'] = 0

            items.append(item)

        for link in grandparent_links:

            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            item['level'] = 1

            items.append(item)

        return items
Example #2
    def parseSubcats(self, response):
        hxs = HtmlXPathSelector(response)

        parent = response.meta['parent']

        # extract subcategories
        subcats_links = hxs.select(
            "//div[@class='sideNav']/div[@class='innerWrap'][1]//ul/li/a")
        for subcat_link in subcats_links:
            item = CategoryItem()

            item['url'] = Utils.add_domain(
                subcat_link.select("@href").extract()[0],
                "http://www.tigerdirect.com")
            item['text'] = subcat_link.select("text()").extract()[0]

            item['parent_text'] = parent['text']
            item['parent_url'] = parent['url']
            item['level'] = parent['level'] - 1

            item['department_text'] = response.meta['department_text']
            item['department_url'] = response.meta['department_url']
            item['department_id'] = response.meta['department_id']

            #print 'passing to parse category ', item

            # there are some loops in their categories tree, so we need to check this to avoid infinite loops in crawling
            if item['url'] not in self.parsed_urls:
                yield Request(url = item['url'], callback = self.parseCategory,\
                 meta = {'item' : item,\
                 'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'],\
                  'department_id' : response.meta['department_id']})
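
All of these examples populate the same CategoryItem, whose definition is not shown anywhere on this page. The following is a minimal reconstruction inferred from the fields the examples assign; the project's actual item class may define more.

from scrapy.item import Item, Field

class CategoryItem(Item):
    # Reconstructed from the assignments in the examples on this page.
    text = Field()
    url = Field()
    catid = Field()
    level = Field()              # relative depth; departments are usually 1
    special = Field()            # set to 1 for non-product categories
    parent_text = Field()
    parent_url = Field()
    parent_catid = Field()
    grandparent_text = Field()
    grandparent_url = Field()
    department_text = Field()
    department_url = Field()
    department_id = Field()
    nr_products = Field()
    classification = Field()
    description_title = Field()
    description_text = Field()
    description_wc = Field()
    keyword_count = Field()
    keyword_density = Field()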
Example #3
 def _parse_category(self, response):
     category = response.meta['category']
     parent = response.meta.get('parent', {})
     category['catid'] = self._get_catid()
     category['url'] = response.url
     category['parent_text'] = parent.get('text')
     category['parent_url'] = parent.get('url')
     category['parent_catid'] = parent.get('catid')
     category['grandparent_text'] = parent.get('parent_text')
     category['grandparent_url'] = parent.get('parent_url')
     category['level'] = parent.get('level', 0) + 1
     category['department_text'] = response.meta['department']['text']
     category['department_url'] = response.meta['department']['url']
     category['department_id'] = response.meta['department']['catid']
     #category['description_text'] = self._description_text.first(response)
     description_text = first(response.xpath(self._xpath_description_text).extract())
     keywords = first(response.xpath(self._xpath_keywords).extract())
     if description_text:
         category['description_text'] = description_text
         category['description_wc'] = len(Utils.normalize_text(description_text))
         if keywords:
             (category['keyword_count'], category['keyword_density']) = Utils.phrases_freq(keywords, description_text)
     if category.get('nr_products') is None:
         nr_products = re_find('\d+', first(response.css(self._css_product_numbers_text).extract()))
         category['nr_products'] = int(nr_products) if nr_products is not None else None
     subcategory_links = LinkExtractor(restrict_xpaths=self._xpath_category_links)
     for link in subcategory_links.extract_links(response):
         text, nr_products = re.search('(.+?) \((\d+)\) *', link.text).groups()
         nr_products = int(nr_products)
         child = CategoryItem(text=text, nr_products=nr_products)
         meta = {'category': child, 'department': response.meta['department'], 'parent': category}
         yield Request(link.url, callback=self._parse_category, meta=meta)
     yield category
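
Example #3 relies on two small helpers, first and re_find, whose definitions are not shown. Plausible implementations, assuming the obvious semantics:

import re

def first(sequence, default=None):
    # Return the first element of a (possibly empty) sequence.
    for element in sequence:
        return element
    return default

def re_find(pattern, text, default=None):
    # Return the first substring of `text` matching `pattern`, or `default`.
    if text is None:
        return default
    match = re.search(pattern, text)
    return match.group() if match else default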
Example #4
    def parse(self, response):

        hxs = HtmlXPathSelector(response)
        #TODO: add departments with no subcategories!!

        # get urls of pages for each category
        urls = hxs.select(
            "//div[@class='siteMapSubCell']//ul/li/a/@href").extract()

        # add departments to items
        departments = hxs.select("//div[@class='siteMapSubCell']//h4/a")
        items = []
        for department in departments:
            item = CategoryItem()
            item['text'] = department.select('text()').extract()[0]
            item['url'] = department.select('@href').extract()[0]
            item['level'] = 1
            items.append(item)

        # parse each page in urls list with parsePage
        # build urls by adding the prefix of the main page url
        first = True
        root_url = "http://www.sears.com/shc/s"
        for url in urls:
            request = Request(root_url + "/" + url, callback=self.parsePage)
            # send these only once (the first time)
            if first:
                request.meta['items'] = items
                first = False
            else:
                request.meta['items'] = []
            yield request
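
Several examples call Utils.add_domain to absolutize relative URLs (Example #4 instead concatenates root_url by hand). A minimal sketch of what that helper presumably does; the real Utils module may differ:

def add_domain(url, domain):
    # Prefix `domain` only when `url` is relative.
    if url.startswith("http://") or url.startswith("https://"):
        return url
    return domain.rstrip("/") + "/" + url.lstrip("/")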
Example #5
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select("//div[@id='sitemap_header']/a")
        root_url = "http://www1.macys.com"

        department_id = 0

        for link in links:
            item = CategoryItem()

            text = link.select('text()').extract()[0]
            item['text'] = text
            # remove unnecessary suffix from URL
            url = link.select('@href').extract()[0]
            m = re.match("(.*\?id=[0-9]+)&?.*", url)
            if m:
                item['url'] = m.group(1)
            else:
                item['url'] = url
            item['level'] = 1

            # only yield this item after parsing its page and extracting additional info
            #yield item

            department_id += 1

            # create request to extract subcategories for this category
            yield Request(item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : 1, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})
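
The regex in Example #5 keeps everything up to the id query parameter and drops the rest. A roughly equivalent stdlib-only sketch, shown as a hypothetical alternative rather than the original code:

try:
    from urllib.parse import urlparse, parse_qs, urlencode, urlunparse  # Python 3
except ImportError:
    from urlparse import urlparse, parse_qs, urlunparse                 # Python 2
    from urllib import urlencode

def strip_after_id(url):
    # Rebuild the URL keeping only its `id` query parameter.
    parts = urlparse(url)
    params = parse_qs(parts.query)
    query = urlencode({"id": params["id"][0]}) if "id" in params else parts.query
    return urlunparse((parts.scheme, parts.netloc, parts.path, "", query, ""))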
Example #6
    def parseList(self, response):
        hxs = HtmlXPathSelector(response)

        items = []
        # add all department names
        departments = hxs.select("//div[@id='showallprods']/ul/li/a")

        root_url = "http://www.staples.com"

        department_id = 0

        for department in departments:
            item = CategoryItem()

            item['text'] = department.select("text()").extract()[0]
            item['url'] = root_url + department.select("@href").extract()[0]
            item['level'] = 1

            #yield item

            # # parse each department page for its categories, pass the department item too so that it's added to the list in parseDept
            # yield Request(item['url'], callback = self.parseDept, meta = {"department": item})

            department_id += 1

            zipcode = "12345"
            request = Request(item['url'], callback = self.parseDept, cookies = {"zipcode" : zipcode}, \
                headers = {"Cookie" : "zipcode=" + zipcode}, meta = {"dont_redirect" : True, "dont_merge_cookies" : True, \
                "parent": item, "level": 1, \
                "department_text" : item["text"], "department_url" : item["url"], "department_id" : department_id})
            yield request
Example #7
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select(
            "//div[@class='sr_siteMap_container']/div[position()>2 and position()<5]//a"
        )
        root_url = "http://www1.bloomingdales.com"

        #TODO: add registry as special category?

        department_id = 0

        for link in links:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            url = link.select('@href').extract()[0]
            # if it's a relative URL complete it with the domain
            if not url.startswith("http"):
                url = root_url + url

            item['url'] = url
            item['level'] = 1

            department_id += 1

            item['department_text'] = item['text']
            item['department_url'] = item['url']
            item['department_id'] = department_id

            #yield item

            # create request to extract subcategories for this category
            yield Request(item['url'], callback = self.parseCategory, meta = {'parent' : item, \
                "department_text" : item['text'], "department_url" : item['url'], "department_id" : department_id, \
                "dont_merge_cookies" : True}, \
                cookies = {"shippingCountry" : "US"}, headers = {"Cookie" : "shippingCountry=" + "US"})
Example #8
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        category_links = hxs.select("//div[@class='itmNav']//a")

        # unique ids for departments
        department_id = 0

        for category_link in category_links:

            item = CategoryItem()
            category_name = category_link.select("text()").extract()
            if category_name:
                item['text'] = category_name[0]
            else:
                sys.stderr.write("Error: no name for category in element " + category_link.extract())
                continue
            item['url'] = self.clean_url(category_link.select("@href").extract()[0])
            # mark as department
            item['level'] = 1

            department_id += 1

            # mark it as its own department, will be passed on to its subcategories
            item['department_text'] = item['text']
            item['department_url'] = item['url']
            item['department_id'] = department_id

            #items.append(item)
            yield Request(url = item['url'], callback = self.parseCategory, meta = {"item" : item, \
                'department_text' : item['department_text'], 'department_url' : item['department_url'], 'department_id' : item['department_id']})
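
Examples #8 and #9 call self.clean_url, which is not shown here. One plausible implementation, assuming it merely strips query strings and fragments; the project's version (a spider method) may do more:

def clean_url(url):
    # Hypothetical: drop the fragment and query string.
    return url.split("#", 1)[0].split("?", 1)[0]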
Example #9
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        # extract number of products if available
        #TODO check
        count_holder = hxs.select("//div[@class='recordCount']/span[@id='RecordCount_1']/text()")
        if count_holder:
            item['nr_products'] = int(count_holder.extract()[0])

        #TODO
        # try to change URL "Category" to "SubCategory", see if you find the product count there

        # extract description if available
        description_holders = hxs.select("//div[@id='bcaShopWindowSEO']")
        # if the list is not empty and contains at least one non-whitespace item
        if description_holders:
            description_texts = description_holders.select(".//text()[not(ancestor::h2)]").extract()

            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([re.sub("\s+"," ", description_text.strip()) for description_text in description_texts if description_text.strip()])

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

            description_title = description_holders.select(".//h2/text()").extract()
            if description_title:
                item['description_title'] = description_title[0].strip()

                (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        yield item

        parent = item

        #TODO
        # extract and parse subcategories
        subcats = hxs.select("//dl[@class='categoryList primaryNav']/dd/a")
        for subcat in subcats:
            item = CategoryItem()
            
            item['text'] = subcat.select("text()").extract()[0].strip()

            #TODO: check out some huge URLs
            item['url'] = self.clean_url(subcat.select("@href").extract()[0])

            item['parent_text'] = parent['text']
            item['parent_url'] = parent['url']
            item['level'] = parent['level'] - 1
            item['department_text'] = response.meta['department_text']
            item['department_url'] = response.meta['department_url']
            item['department_id'] = response.meta['department_id']

            yield Request(url = item['url'], callback = self.parseCategory, meta = {"item" : item, \
                "department_text" : response.meta['department_text'], "department_url" : response.meta['department_url'], "department_id" : response.meta['department_id']})
Example #10
    def parseSubcategory(self, response):
        hxs = HtmlXPathSelector(response)

        subcategory = response.meta['item']

        # yield this subcategory
        yield subcategory

        # if subcategory was special, we'll mark all subsubcategories as special
        if 'special' in subcategory:
            special = True
        else:
            special = False

        # get its subcategories
        subsubcategories = hxs.select(
            "//div[@class='product-category-expanded']//h3[@class='title']")

        for subsubcategory in subsubcategories:
            item = CategoryItem()
            item['text'] = subsubcategory.select("a/text()").extract()[0]
            item['url'] = Utils.add_domain(
                subsubcategory.select("a/@href").extract()[0], self.base_url)

            if special:
                item['special'] = 1

            item['parent_text'] = subcategory['text']
            item['parent_url'] = subcategory['url']
            item['department_text'] = subcategory['department_text']
            item['department_url'] = subcategory['department_url']
            item['department_id'] = subcategory['department_id']

            item['level'] = subcategory['level'] - 1

            description_text_holder = subsubcategory.select(
                "following-sibling::p[@class='description'][1]/text()"
            ).extract()
            if description_text_holder:
                item['description_text'] = description_text_holder[0]
                item['description_title'] = item['text']
                description_tokenized = Utils.normalize_text(
                    item['description_text'])
                item['description_wc'] = len(description_tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

            # parse subcategory page to get product count, or further subsubcategory
            yield Request(item['url'],
                          callback=self.parseSubcategoryPage,
                          meta={'item': item})
Example #11
 def _parse_category(self, response):
     category = response.meta['category']
     self._populate_category(response)
     classifications = self._scrape_classifications(response)
     categories = classifications.pop("Shop By Category", [])
     for url, text, nr_products in categories:
         new_category = CategoryItem(text=text, nr_products=nr_products)
         yield Request(url, self._parse_category, meta={"category": new_category, "parent": category})
     if category.get('nr_products') is None:
         category['nr_products'] = sum((item[2] for item in categories))
     category['classification'] = {key: [{'name': itm[1], 'nr_products': itm[2]} for itm in value]
                                    for key,value in classifications.iteritems()}
     yield category
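
For readability, this is the return shape _scrape_classifications must have in Example #11, inferred from how it is consumed: a dict mapping a section heading to (url, text, nr_products) tuples. Illustrative values only:

classifications = {
    "Shop By Category": [
        ("http://example.com/cat/tvs", "TVs", 120),     # (url, text, nr_products)
        ("http://example.com/cat/audio", "Audio", 85),
    ],
    "Shop By Brand": [
        ("http://example.com/brand/acme", "Acme", 40),
    ],
}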
Example #12
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # extract all bottom level categories
        links = hxs.select("//div[@class='links']//a")
        # extract parent categories
        parent_links = hxs.select("//div[@class='header_no_icon']")
        items = []

        for link in links:

            # extract parent category
            parent = link.select(
                "parent::node()/parent::node()/parent::node()/div/div[position()=1]/a"
            )

            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            parent_text = parent.select('h2/text()').extract()
            parent_url = parent.select('@href').extract()
            if parent_text:
                item['parent_text'] = parent_text[0]
            if parent_url:
                item['parent_url'] = parent_url[0]

            item['level'] = 0

            items.append(item)

        for parent in parent_links:
            item = CategoryItem()
            item['text'] = parent.select('a/h2/text()').extract()[0]
            item['url'] = parent.select('a/@href').extract()[0]
            item['level'] = 1

            items.append(item)

        return items
Example #13
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select("//table//tr[1]/td//a[ancestor::h4]")

        department_id = 0

        for link in links:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]
            item['level'] = 1

            department_id += 1
            item['department_text'] = item['text']
            item['department_url'] = item['url']
            item['department_id'] = department_id

            yield Request(url = item['url'], callback = self.parseCategory, meta = {'item' : item,\
             'department_url' : item['department_url'], 'department_text' : item['department_text'], 'department_id' : department_id})
Example #14
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)
        parent = response.meta['parent']

        # extract product count if any
        product_count_holder = hxs.select(
            "//span[@class='productCount'][1]/text()").extract()
        if product_count_holder:
            parent['nr_products'] = int(product_count_holder[0])

        # extract description if any
        # just assume no description (haven't found any page with descriptions for bloomingdales)
        parent['description_wc'] = 0

        # yield parent item (if it hasn't been output before)
        if 'parent_url' not in parent or (parent['url'], parent['parent_url'],
                                          parent['level']) not in self.crawled:
            if 'parent_url' in parent:
                self.crawled.append(
                    (parent['url'], parent['parent_url'], parent['level']))
            yield parent

        # extract subcategories
        subcats = hxs.select("//div[@class='gn_left_nav2_standard']//a")
        for subcat in subcats:
            item = CategoryItem()
            item['text'] = subcat.select('text()').extract()[0]
            item['url'] = subcat.select('@href').extract()[0]
            item['level'] = parent['level'] - 1
            item['parent_text'] = response.meta['parent']['text']
            item['parent_url'] = response.url
            item['department_text'] = response.meta['department_text']
            item['department_url'] = response.meta['department_url']
            item['department_id'] = response.meta['department_id']

            # create request to extract subcategories for this category
            yield Request(item['url'], callback = self.parseCategory, meta = {'parent' : item, \
                "department_text" : item['department_text'], "department_url" : item['department_url'], "department_id" : item['department_id'], \
                "dont_merge_cookies" : True}, \
                cookies = {"shippingCountry" : "US"}, headers = {"Cookie" : "shippingCountry=" + "US"})
Example #15
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")

        parent_links = hxs.select(
            "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]"
        )

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})
Example #16
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links_level1 = hxs.select("//div[@id='siteDirectory']//table//a")
        titles_level1 = hxs.select("//div//table//h2")

        # add level 1 categories to items

        # first one is a special category ("Unlimited Instant Videos"), add it separately
        special_item = CategoryItem()
        special_item['text'] = titles_level1[0].select('text()').extract()[0]
        special_item['level'] = 2
        special_item['special'] = 1
        special_item['department_text'] = special_item['text']
        special_item['department_id'] = self.department_count
        self.departments_ids[special_item['text']] = special_item['department_id']
        self.department_count += 1


        #items.append(special_item)
        yield special_item

        # the rest of the titles are not special
        for title in titles_level1[1:]:
            item = CategoryItem()
            item['text'] = title.select('text()').extract()[0]
            item['level'] = 2
            item['department_text'] = item['text']
            item['department_id'] = self.department_count
            self.departments_ids[item['text']] = item['department_id']
            self.department_count += 1

            # if item is found among extra_toplevel_categories_urls, add info from that url
            extra_category = self.find_matching_key(item['text'], self.extra_toplevel_categories_urls)
            if extra_category:
                item['url'] = self.extra_toplevel_categories_urls[extra_category]
                item['department_url'] = item['url']
                self.department_urls[item['text']] = item['url']

                # collect number of products from this alternate URL
                yield Request(item['url'], callback = self.extract_nrprods_and_subcats, meta = {'item' : item})

            else:
                yield item

        # add level 1 categories to items
        for link in links_level1:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            root_url = "http://www.amazon.com"
            item['url'] = root_url + link.select('@href').extract()[0]
            item['level'] = 1

            parent = link.select("parent::node()/parent::node()/preceding-sibling::node()")
            parent_text = parent.select('text()').extract()

            # category should have a parent (its department) and that parent should have been extracted earlier (above) and put in the ids dictionary, necessary for getting the department id
            assert parent_text
            assert parent_text[0] in self.departments_ids
            if parent_text:
                item['parent_text'] = parent_text[0]
                item['department_text'] = item['parent_text']
                item['department_id'] = self.departments_ids[item['department_text']]

                # get department url from department_urls, will be availble only for extra_categories
                if item['department_text'] in self.department_urls:
                    assert self.find_matching_key(item['department_text'], self.extra_toplevel_categories_urls)
                    item['department_url'] = self.department_urls[item['department_text']]

                # if its parent is the special category, mark this one as special too
                if (item['parent_text'] == special_item['text']):
                    item['special'] = 1
                    special = True
                else:
                    special = False

            # department_id = self.department_count
            # self.department_count += 1

            # item['department_text'] = item['text']
            # item['department_url'] = item['url']
            # item['department_id'] = department_id

            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})
Example #17
class WalmartCaSpider(BaseSpider):
    name = "walmartca"
    allowed_domains = ["walmart.ca"]
    start_urls = [
        "http://www.walmart.ca/en",
    ]

    def __init__(self, outfile=None):
        self.root_url = "http://www.walmart.ca"
        self.outfile = outfile

        # set flag that indicates that for this spider, nr of products for each category should be computed
        self.compute_nrproducts = True

        # level that is considered to contain departments
        self.DEPARTMENT_LEVEL = 1

        # keep crawled items represented by (url, parent_url, department_url) triples
        # to eliminate duplicates
        # (adding department_url makes sure that if one entire department is found as a subcategory of another for ex, both (and their complete category trees) will be crawled)
        self.crawled = []

        # last used category id, used for autoincrementing ids identifying categories
        self.id_count = 0

        # hardcoded item counts for special categories. Currently used for 'Value of the day', which typically has a fixed number of products and nowhere on the page to extract it from
        self.special_itemcount = {'value of the day': 2}

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")

        parent_links = hxs.select(
            "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]"
        )

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})

    # parse category page and extract description and number of products
    def parseCategory(self, response):

        # URLs like health.walmart.com don't have body_as_unicode and generate an exception
        try:
            hxs = HtmlXPathSelector(response)
        except AttributeError as e:
            self.log("Could not get response from " + response.url +
                     "; original exception: " + str(e) + "\n",
                     level=log.WARNING)
            return

        item = response.meta['item']

        # Add department text, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # assign unique id
        item['catid'] = self.id_count
        self.id_count += 1

        # Extract subcategories breakdown if any ("classification" field)
        classification_criteria = hxs.select(
            "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
        classification_dictionary = {}
        for criterion in classification_criteria:
            criterion_name = criterion.select(
                ".//text()[normalize-space()!='']").extract()[0].strip()
            # extract subcategories by this criterion:
            # find first subcategories list element following this criterion name, ignore if subcategory text starts with "See " ("See fewer", "See more")
            subcategories = criterion.select(
                "following-sibling::div[contains(@class,'accordionContainer')][1]/ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]"
            )
            # then filter by regex: keep only the ones whose text contains at least one letter (e.g. customer rating subcats have no name, only a picture with the nr of stars; we don't want them)
            subcategories = filter(
                lambda x: x.select("text()").re(".*[A-Za-z]+.*"),
                subcategories)

            # if we found these, create the classification dictionary
            if criterion_name and subcategories:
                subcategories_list = []
                for subcategory in subcategories:
                    subcategory_name = subcategory.select(
                        "@title").extract()[0]
                    # replace &nbsp with space, trim
                    subcategory_name = subcategory_name.replace("&nbsp",
                                                                " ").strip()
                    # extract product count
                    subcategory_prodcount = subcategory.select(
                        "span[@class='count']/text()").extract()
                    # if there is no count field, extract prodcount from subcategory name
                    if subcategory_prodcount:
                        m = re.match("\(([0-9]+)\)",
                                     subcategory_prodcount[0].strip())
                        # eliminate parentheses surrounding the number and convert to int
                        if m:
                            subcategory_prodcount = m.group(1)
                        else:
                            subcategory_prodcount = subcategory_prodcount[
                                0].strip()
                    else:
                        # if there is no product count in separate element, try to extract it from subcategory name
                        subcategory_name = subcategory.select(
                            ".//text()[normalize-space()!='']").extract(
                            )[0].replace("&nbsp", " ").replace(u"\xa0",
                                                               " ").strip()
                        m = re.match("(.*)\(([0-9]+)\)", subcategory_name)
                        if m:
                            subcategory_prodcount = m.group(2)
                            subcategory_name = m.group(1).strip()

                    if subcategory_name and subcategory_prodcount:
                        subcategory_item = {
                            "name": subcategory_name,
                            "nr_products": int(subcategory_prodcount)
                        }
                        subcategories_list.append(subcategory_item)

                classification_dictionary[criterion_name] = subcategories_list

        if classification_dictionary:
            item['classification'] = classification_dictionary

        ##########################################################################################
        #
        # Extract description title, text, wordcount, and keyword density (if any)

        ###########################################
        #TODO:

        # first search for the description id they usually use,
        # second one is used more rarely and also with some false positives so needs to be checked for text length as well
        # try to find div with detailedPageDescriptionCopyBlock id; move on only if not found
        description_holder = hxs.select(
            "//div[@id='detailedPageDescriptionCopyBlock']")

        # flag to tell if we found it with basic rule
        found = True

        if not description_holder:
            found = False
            description_holder = hxs.select(
                "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
                + str(DESC_LEN) + "]/parent::*/parent::*")

        # if none was found, try to find an element with a lot of text (> DESC_LEN (200) characters)
        # this is going to be a paragraph in the description; look for its parent (containing the entire description)
        if not description_holder:
            #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
            #TODO: !!does this mean string length for one paragraph is > DESC_LEN, or string length for the entire text content?
            # I think it means entire text content. We're ok
            description_holder = hxs.select("//p/text()[string-length() > " +
                                            str(DESC_LEN) +
                                            "]/parent::*/parent::*")

        # select element among these with most text
        if description_holder:
            desc_winner = description_holder[0]
            max_text = 0
            for desc_candidate in description_holder:
                # consider only text that is under a <p> tag and that has more than DESC_PAR_LEN (30) characters - then it's likely a description paragraph
                description_texts = desc_candidate.select(
                    ".//p//text()[string-length()>" + str(DESC_PAR_LEN) +
                    "]").extract()
                text_len = len(" ".join(description_texts))
                if text_len > max_text:
                    max_text = text_len
                    desc_winner = desc_candidate
                # if text length is the same, assume one of them is parent of the other
                #  and select the one with greater depth (fewer children)
                elif text_len == max_text and text_len != 0:
                    children_old = float(
                        desc_winner.select("count(*)").extract()[0])
                    children_new = float(
                        desc_candidate.select("count(*)").extract()[0])
                    if children_new < children_old:
                        desc_winner = desc_candidate

            description_holder = desc_winner

        # try to find description title in <b> tag in the holder;
        # if it's not found, try to find it in the first <p> of the description
        # if found there, exclude it from the description body
        if description_holder:
            #TODO:
            # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            # to fix Money Center problem. but maybe it's not always inside p?
            description_title = description_holder.select(
                ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            ).extract()
            if description_title:
                # this will implicitly get the first occurrence of a <b>, <h1>, <h3>, or <strong> element,
                # which is likely to be the title (the title usually comes first)
                item['description_title'] = description_title[0].strip()

            description_texts = description_holder.select(
                "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
                | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]"
            ).extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and any(line.strip() for line in description_texts):
                description_text = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                # if it's larger than 4096 characters and wasn't found with the main rule, it's probably not a description; it also causes problems for the PHP script. Ignore it
                if len(description_text) < 4096 or found:

                    # replace all whitespace with one space, strip, and remove empty texts; then join them
                    item['description_text'] = description_text

                    # replace line breaks with space
                    item['description_text'] = re.sub("\n+", " ",
                                                      item['description_text'])

            if 'description_text' in item:
                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                # sometimes here there is no description title because of malformed html
                # if we can find description text but not description title, title is probably malformed - get first text in div instead
                if 'description_title' not in item:
                    desc_texts = description_holder.select(
                        "./text()").extract()
                    desc_texts = [text for text in desc_texts if text.strip()]
                    if desc_texts:
                        item['description_title'] = desc_texts[0].strip()

                if 'description_title' in item:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        #
        ##################################################################################

        # Extract product count

        # find if there is a wc field on the page
        wc_field = hxs.select(
            "//div[@class='mrl mod-toggleItemCount']/span/text() |\
            //div[@class='SPRecordCount']/text()").extract()
        if wc_field:
            m1 = re.match("([0-9]+) Results", wc_field[0])
            if m1:
                item['nr_products'] = int(m1.group(1))
            m2 = re.match(
                "\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_field[0])
            if m2:
                item['nr_products'] = int(m2.group(1))

        # set item count for special items (hardcoded in special_itemcount)
        if item['text'].lower() in self.special_itemcount:
            item['nr_products'] = self.special_itemcount[item['text'].lower()]

        # Extract subcategories if no product count found
        if 'nr_products' in item:
            yield item

        else:
            # look for links to subcategory pages in menu
            subcategories_links = hxs.select(
                "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']"
            )

            if not subcategories_links:
                # # if we haven't found them, try to find subcategories in menu on the left under a "Shop by Category" header
                #     subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

                # if we haven't found them, try to find subcategories in menu on the left - get almost anything
                subcategories_links = hxs.select(
                    "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                    and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                    and not(contains(text(), 'Featured Brands'))]\
                    /following-sibling::node()//a")

            # if we found them, create new category for each and parse it from the beginning

            #TODO
            ########################################
            # Exceptions - doesn't find anything for:
            #   http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
            #
            #
            ########################################

            if subcategories_links:

                # new categories are subcategories of current one - calculate and store their level
                parent_item = item
                level = parent_item['level'] - 1

                #print "URL ", response.url, " CALLING PARSEPAGE"
                for subcategory in subcategories_links:

                    # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                    # if level < -3 assume we've been there and skip

                    if level < -3:
                        continue

                    item = CategoryItem()
                    item['url'] = Utils.add_domain(
                        subcategory.select("@href").extract()[0],
                        self.root_url)
                    text = subcategory.select("text()").extract()

                    if text:
                        item['text'] = text[0].strip()
                    else:
                        # usually means it's something else than what we need
                        #TODO: check
                        continue
                        #print "no text for subcategory ", item, response.url

                    # # take care of unicode
                    # item['text'] = item['text'].encode("utf-8", errors=ignore)

                    item['level'] = level

                    item['parent_text'] = parent_item['text']
                    item['parent_url'] = parent_item['url']
                    item['parent_catid'] = parent_item['catid']

                    if 'parent_text' in parent_item:
                        item['grandparent_text'] = parent_item['parent_text']
                    if 'parent_url' in parent_item:
                        item['grandparent_url'] = parent_item['parent_url']

                    # if parent's parents are missing, level must be at least 0
                    if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                        assert level >= 0

                    # send subcategory items to be parsed again
                    # if not already crawled
                    if (item['url'], item['parent_url'],
                            response.meta['department_url']
                        ) not in self.crawled:
                        yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                            'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                        self.crawled.append((item['url'], item['parent_url'],
                                             response.meta['department_url']))

                # return current item
                # idea for sending parent and collecting nr products. send all of these subcats as a list in meta, pass it on, when list becomes empty, yield the parent
                yield parent_item
                #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

            # if we can't find either products on the page or subcategory links
            else:
                #print "URL", response.url, " NO SUBCATs"
                #item['nr_products'] = 0
                yield item
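
Example #17 references two module-level thresholds; their values are only stated in its comments, so for completeness (definitions assumed):

DESC_LEN = 200     # minimum characters for a text node to look like a description paragraph
DESC_PAR_LEN = 30  # minimum characters for a <p> to count toward the "most text" pick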
Example #18
 def parse(self, response):
     for url, text in self._scrape_department_links(response):
         category = CategoryItem(text=text)
         yield Request(url, callback=self._parse_category, meta={"category": category})
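
Example #18 delegates everything to _scrape_department_links, which is not shown. Its contract, inferred from the loop, is to yield (url, text) pairs; a hypothetical stand-in:

def scrape_department_links(response):
    # Hypothetical XPath; the real selector depends on the target site.
    for link in response.xpath("//nav//a"):
        hrefs = link.xpath("@href").extract()
        texts = link.xpath("normalize-space(text())").extract()
        if hrefs and texts and texts[0]:
            yield response.urljoin(hrefs[0]), texts[0]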
Example #19
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        # extract departments
        departments = hxs.select("//h2")
        department_id = 0
        for department in departments:
            department_item = CategoryItem()
            department_text = department.select("text()").extract()[0]

            department_item['department_text'] = department_text

            # #TODO: add department_url, from sherwin-williams.com ...? get department list from there and match with departments from here by seeing if names match

            department_item['department_id'] = department_id

            department_item['text'] = department_text

            department_item['level'] = 1

            # get categories in department
            categories = department.select("following-sibling::ul[1]/li")

            # extract department url from one of its categories urls (it's not available directly)
            category_ex = categories[0]
            category_ex_url = Utils.add_domain(
                category_ex.select("a/@href").extract()[0], self.base_url)
            # extract first part of url
            m = re.match("(http://www.sherwin\-williams\.com/[^/]+)/.*",
                         category_ex_url)
            department_url = m.group(1)
            department_item['department_url'] = department_url
            department_item['url'] = department_url

            for category in categories:
                item = CategoryItem()
                #TODO: special if 'Services'? or Specifications, or Ads...
                category_text = category.select("a/text()").extract()[0]
                category_url = Utils.add_domain(
                    category.select("a/@href").extract()[0], self.base_url)
                item['text'] = category_text
                item['url'] = category_url

                # if it's not a 'products' category, mark it and all its subcategories as special

                if category_text != 'Products':
                    item['special'] = 1
                    special = True
                else:
                    special = False

                item['department_id'] = department_id
                item['department_text'] = department_text
                item['department_url'] = department_url

                item['parent_text'] = department_text
                item['parent_url'] = department_url

                item['level'] = 0

                #TODO: do we need description_wc here as well?

                yield Request(item['url'],
                              callback=self.parseCategory,
                              meta={'item': item})

                # get subcategories in category
                subcategories = category.select("ul/li")
                for subcategory in subcategories:
                    item = CategoryItem()

                    item['text'] = subcategory.select("a/text()").extract()[0]
                    item['url'] = Utils.add_domain(
                        subcategory.select("a/@href").extract()[0],
                        self.base_url)

                    item['department_id'] = department_id
                    item['department_text'] = department_text
                    item['department_url'] = department_url

                    item['parent_text'] = category_text
                    item['parent_url'] = category_url

                    item['level'] = -1

                    # if parent is special, category is special
                    if special:
                        item['special'] = 1

                    yield Request(item['url'],
                                  callback=self.parseSubcategory,
                                  meta={'item': item})

            department_id += 1

            # return department
            yield department_item
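
Example #19 recovers the department URL from a category URL with a regex. A stdlib sketch with the same intent (keep the scheme, host, and first path segment), offered as an alternative rather than the original:

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

def department_url_from_category(category_url):
    # Mirrors the regex "(http://www.sherwin\-williams\.com/[^/]+)/.*" above.
    parts = urlparse(category_url)
    first_segment = parts.path.lstrip("/").split("/", 1)[0]
    return "%s://%s/%s" % (parts.scheme, parts.netloc, first_segment)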
Example #20
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # get parent item from response, extract additional info and return it
        item = response.meta['parent']

        # add department name, url and id for item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract product count if available
        nr_items_holder = hxs.select(
            "//div[@id='showing']/strong[position()=2]/text()").extract()
        if nr_items_holder:
            item['nr_products'] = int(str(nr_items_holder[0]))

        # extract description if available
        # these are descriptions for  services pages
        desc_title_holder = hxs.select(
            "//div[@id='searchstate']/a[position()=2]/text()").extract()
        if desc_title_holder:
            item['description_title'] = desc_title_holder[0].strip()
        desc_content_holder = hxs.select(
            "//div[@class='content']/h3/text()").extract()
        if desc_content_holder:
            item['description_text'] = desc_content_holder[0].strip()
            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)
            (item['keyword_count'],
             item['keyword_density']) = Utils.phrases_freq(
                 item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        yield item

        # extract its subcategories
        #subcats_holders = hxs.select("//div[@class='narrowcontent']/ul[@class='search']")
        subcats_holders = hxs.select(
            "//div[@class='narrowcontent']/ul[contains(@class,'search')]")
        if subcats_holders:
            subcats_holder = subcats_holders[0]
            # these are subcategories if they are preceded by the title "Shop ..."
            title = subcats_holder.select(
                "parent::node()/preceding-sibling::node()//text()").extract(
                )[0]
            if str(title).startswith("Shop"):
                subcats = subcats_holder.select(".//li/a")
                for subcat in subcats:
                    item = CategoryItem()
                    item['text'] = subcat.select("text()").extract()[0].strip()
                    item['url'] = Utils.add_domain(
                        subcat.select("@href").extract()[0],
                        "http://www.bestbuy.com")
                    parent = response.meta['parent']
                    item['level'] = int(response.meta['level']) - 1
                    # if parent was special, this category is special too
                    if 'special' in parent:
                        item['special'] = 1
                    item['parent_text'] = parent['text']
                    item['parent_url'] = parent['url']

                    request = Request(url = item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : item['level'], \
                        'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                    yield request
Example #21
    def parse(self, response):
        # currently not extracting parents that are non-links (smaller parent categories like "resources" and "shops")
        hxs = HtmlXPathSelector(response)

        # # select all categories (bottom level)
        # product_links = hxs.select("//div[@id='container']/div/header//nav/ul[@id='nav']//li/a")

        # select all parent categories
        #parent_links = hxs.select("//div[@id='container']/div/header//nav/ul[@id='nav']//h4/a")
        parent_links = hxs.select(
            "//div[@id='container']/div[@id='header']//nav/ul[@id='nav-touch']//h4/a"
        )

        #TODO: add extraction of level 3 categories (broadest: products, services,...)

        # items = []

        #############################################
        # Extract all categories from sitemap instead of menus on category landing pages (lower level as well)

        # for link in product_links:

        #     # retrieve parent category for this link
        #     parent = link.select("parent::node()/parent::node()/preceding-sibling::node()/a")
        #     item = CategoryItem()

        #     item['text'] = link.select('text()').extract()[0]
        #     item['url'] = link.select('@href').extract()[0]

        #     parent_text = parent.select('text()').extract()
        #     parent_url = parent.select('@href').extract()
        #     if parent_text:
        #         item['parent_text'] = parent_text[0]
        #     if parent_url:
        #         item['parent_url'] = parent_url[0]

        #     # mark it as special if a certain condition is checked
        #     if (link.select("parent::node()/parent::*[@class='nav-res']")):
        #         item['special'] = 1
        #     #TODO: add its direct parent if it's special (not among the categories). ex: shops, resources...

        #     # get grandparent of the category, mark item as special if grandparent is special
        #     grandparent = parent.select("parent::node()/parent::node()/parent::node()/parent::node()")
        #     if not grandparent.select('@class') or grandparent.select('@class').extract()[0] != 'nav-pro':
        #         item['special'] = 1

        #     grandparent_text = grandparent.select('a/text()').extract()
        #     grandparent_url = grandparent.select('a/@href').extract()
        #     if grandparent_text:
        #         item['grandparent_text'] = grandparent_text[0]
        #     if grandparent_url:
        #         item['grandparent_url'] = grandparent_url[0]

        #     item['level'] = 0

        #     items.append(item)
        ###############################################################

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            item['level'] = 1
            parent = link.select(
                "parent::node()/parent::node()/parent::node()/parent::node()")

            # mark item as special if its parent is special
            if not parent.select('@class').extract() or parent.select(
                    '@class').extract()[0] != "nav-pro":
                item['special'] = 1

            parent_text = parent.select('a/text()').extract()
            # they don't actually have a url, only a #
            #parent_url = parent.select('a/@href').extract()
            if parent_text:
                item['parent_text'] = parent_text[0]

            # if parent_url:
            #     item['parent_url'] = parent_url[0]

            #items.append(item)
            department_id += 1
            request = Request(item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : 1, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})
            yield request
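
The parse method above threads the department context (text, url, id) through every downstream request via Scrapy's meta dict. A condensed, hypothetical spider showing just that hand-off, written against the current Scrapy API rather than the deprecated HtmlXPathSelector one (all names here are illustrative, not from the original code):

import scrapy

class DepartmentSpider(scrapy.Spider):
    name = "department_example"
    start_urls = ["http://example.com/sitemap"]

    def parse(self, response):
        department_id = 0
        for link in response.xpath("//ul[@id='nav']//a"):
            department_id += 1
            # anything placed in meta resurfaces as response.meta downstream
            yield scrapy.Request(
                response.urljoin(link.xpath("@href").get()),
                callback=self.parse_category,
                meta={"department_text": link.xpath("text()").get(),
                      "department_id": department_id,
                      "level": 1})

    def parse_category(self, response):
        # the department context traveled with the request
        yield {"url": response.url,
               "department_text": response.meta["department_text"],
               "department_id": response.meta["department_id"],
               "level": response.meta["level"]}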
Example No. 22
    def parseDept(self, response):

        # for "copy & print" there's an exception: we don't need a zipcode

        # # use selenium to complete the zipcode form and get the first results page
        # driver = webdriver.Firefox()
        # driver.get(response.url)

        # # set a hardcoded value for zipcode
        # zipcode = "12345"

        # textbox = driver.find_element_by_name("zipCode")
        # textbox.send_keys(zipcode)

        # button = driver.find_element_by_id("submitLink")
        # button.click()

        # cookie = {"zipcode": zipcode}
        # driver.add_cookie(cookie)

        # time.sleep(5)

        # # convert html to "nice format"
        # text_html = driver.page_source.encode('utf-8')
        # #print "TEXT_HTML", text_html
        # html_str = str(text_html)

        # # this is a hack that instantiates a "TextResponse" object (taken from the Scrapy module)
        # resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)

        # hxs = HtmlXPathSelector(resp_for_scrapy)

        #TODO: doesn't extract Televisions, for example

        hxs = HtmlXPathSelector(response)
        categories = hxs.select("//h2/a")

        root_url = "http://www.staples.com"

        # from parent's page:
        item = response.meta['parent']

        # add department name, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract number of items, if any
        nritems_holder = hxs.select(
            "//div[@class='perpage']/span[@class='note']/text()").extract()
        if nritems_holder:
            m = re.findall("[0-9]+\s*items", nritems_holder[0])
            if m:
                item['nr_products'] = int("".join(re.findall("[0-9]+", m[0])))
            # else:
            #     print "NOT MATCH ", nritems_holder[0]

        # extract description, if any
        description_texts = hxs.select(
            "//h2[@class='seo short']//text() | //h2[@class='seo short long']//text()"
        ).extract()
        if description_texts and any(
                line.strip() for line in description_texts):
            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            if item['description_text']:
                item['description_title'] = item['text']

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])

            else:
                # if no description is found
                #print 'desc_holder but no desc_text ', response.url
                item['description_wc'] = 0
        else:
            item['description_wc'] = 0

        # yield the item the request came from (the parent)
        yield item

        # extract subcategories
        for category in categories:
            # there are pages that don't have categories
            item = CategoryItem()
            text = category.select("text()").extract()
            if text:
                item['text'] = text[0]
            url = category.select("@href").extract()
            if url:
                item['url'] = root_url + url[0]
            item['level'] = int(response.meta['level']) - 1
            if 'text' in response.meta['parent']:
                item['parent_text'] = response.meta['parent']['text']
            else:
                print 'no text in parent ', response.meta['parent']
            item['parent_url'] = response.url

            # yield the item after passing it through a request and collecting additional info
            #yield item

            # extract subcategories if any
            zipcode = "12345"
            request = Request(item['url'], callback = self.parseDept, cookies = {"zipcode" : zipcode}, \
                headers = {"Cookie" : "zipcode=" + zipcode}, meta = {"dont_redirect" : True, "dont_merge_cookies" : True, \
                "parent": item, "level": item['level'], \
                "department_text" : response.meta["department_text"], "department_url" : response.meta["department_url"], "department_id" : response.meta["department_id"]})
            yield request
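
The nr_products extraction above pulls the count out of a per-page note such as "24 items"; a small standalone run of that regex pipeline (the sample string is made up):

import re

note = "Showing 24 items per page"  # hypothetical note text

m = re.findall(r"[0-9]+\s*items", note)
if m:
    # pull the digits back out of the matched fragment
    nr_products = int("".join(re.findall(r"[0-9]+", m[0])))
    print(nr_products)  # 24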
Example No. 23
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # output received parent element after extracting additional info
        item = response.meta['parent']

        # add department name, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract number of items if available
        prod_count_holder = hxs.select(
            "//span[@id='productCount']/text()").extract()
        if prod_count_holder:
            item['nr_products'] = int(prod_count_holder[0].strip())
        # extract description if available
        desc_holder = hxs.select("//div[@id='catalogCopyBlock']")
        if desc_holder:
            item['description_title'] = desc_holder.select(
                "h2/text()").extract()[0]
            description_texts = desc_holder.select("p/text()").extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and any(
                    line.strip() for line in description_texts):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                item['description_text'] = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])
            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        yield item

        chapters = hxs.select("//li[@class='nav_cat_item_bold']")

        for chapter in chapters:

            #TODO: still includes some special categories (like "Coming Soon" in men)
            # exclude "Brands" chapter
            chapter_name = chapter.select("span/text()").extract()
            if not chapter_name or "brands" in chapter_name[0].lower():
                continue

            subcats = chapter.select("ul/li/a")
            for subcat in subcats:
                item = CategoryItem()
                text = subcat.select('text()').extract()[0]
                # if it starts with "Shop all", ignore it
                if re.match("Shop [aA]ll.*", text):
                    continue
                else:
                    item['text'] = text
                # remove unnecessary suffix from URL
                url = subcat.select('@href').extract()[0]
                m = re.match("(.*\?id=[0-9]+)&?.*", url)
                if m:
                    item['url'] = m.group(1)
                else:
                    item['url'] = url
                item['level'] = int(response.meta['level']) - 1
                item['parent_text'] = response.meta['parent']['text']
                item['parent_url'] = response.url

                #yield item

                yield Request(item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : item['level'], \
                    'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
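
The regex in parseCategory keeps everything up to the ?id=... parameter and discards any trailing query arguments; a quick worked example on a made-up URL:

import re

url = "http://example.com/shop/handbags?id=12345&cm_sp=navigation"  # made up

m = re.match(r"(.*\?id=[0-9]+)&?.*", url)
clean_url = m.group(1) if m else url
print(clean_url)  # http://example.com/shop/handbags?id=12345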
Example No. 24
    def parse(self, response):

        hxs = HtmlXPathSelector(response)

        # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it
        if self.has_captcha(
                response.body) and ('retry_count' not in response.meta
                                    or response.meta['retry_count'] > 0):
            yield self.solve_captcha_and_redirect(
                response, self.parse
            )  # meta of response will contain number of retries left if set
            return

        links_level1 = hxs.select("//div[@id='siteDirectory']//table//a")
        titles_level1 = hxs.select("//div//table//h2")

        # add level 2 categories (department titles) to items

        # first one is a special category ("Unlimited Instant Videos"), add it separately
        special_item = CategoryItem()
        special_item['text'] = titles_level1[0].select('text()').extract()[0]
        special_item['level'] = 2
        special_item['special'] = 1
        special_item['department_text'] = special_item['text']
        special_item['department_id'] = self.department_count
        self.department_count += 1

        special_item['catid'] = self.catid
        self.catid += 1

        self.departments_ids[
            special_item['text']] = special_item['department_id']
        self.departments_cat_ids[special_item['text']] = special_item['catid']

        #yield special_item

        # if test category is set, and this is not it, ignore
        if not self.test_category or special_item['text'] == self.test_category:
            yield special_item

        # the rest of the titles are not special
        for title in titles_level1[1:]:
            item = CategoryItem()
            item['text'] = title.select('text()').extract()[0]
            item['level'] = 2
            item['department_text'] = item['text']
            item['department_id'] = self.department_count
            self.department_count += 1

            item['catid'] = self.catid
            self.catid += 1

            self.departments_ids[item['text']] = item['department_id']
            self.departments_cat_ids[item['text']] = item['catid']

            # if item is found among EXTRA_TOPLEVEL_CATEGORIES_URLS, add info from that url
            extra_category = self.find_matching_key(
                item['text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS)
            if extra_category:
                item['url'] = self.EXTRA_TOPLEVEL_CATEGORIES_URLS[
                    extra_category]
                item['department_url'] = item['url']
                self.department_urls[item['text']] = item['url']

                # if self.test_category is set, only send request if this is the test category
                if self.test_category and item['text'] != self.test_category:
                    continue

                # parse this category further
                yield Request(item['url'],
                              callback=self.parseCategory,
                              meta={'item': item})

            else:
                # if test category is set and this is not it, ignore
                if self.test_category and item['text'] != self.test_category:
                    continue

                yield item

        # add level 1 categories to items
        for link in links_level1:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            root_url = "http://www.amazon.com"
            item['url'] = root_url + link.select('@href').extract()[0]
            item['level'] = 1

            parent = link.select(
                "parent::node()/parent::node()/preceding-sibling::node()")
            parent_text = parent.select('text()').extract()

            # category should have a parent (its department) and that parent should have been extracted earlier (above) and put in the ids dictionary, necessary for getting the department id
            assert parent_text
            assert parent_text[0] in self.departments_ids
            if parent_text:
                item['parent_text'] = parent_text[0]
                item['department_text'] = item['parent_text']
                item['department_id'] = self.departments_ids[
                    item['department_text']]
                item['parent_catid'] = self.departments_cat_ids[
                    item['department_text']]
                item['catid'] = self.catid
                self.catid += 1

                # get department url from department_urls, will be available only for extra_categories
                if item['department_text'] in self.department_urls:
                    assert self.find_matching_key(
                        item['department_text'],
                        self.EXTRA_TOPLEVEL_CATEGORIES_URLS)
                    item['department_url'] = self.department_urls[
                        item['department_text']]
                    item['parent_url'] = item['department_url']

                    #TODO: leave this or not?
                    # Don't crawl subcategories of departments twice. If this is a department with url (extra_category), then we will crawl its subcategories. So ignore them here
                    #continue

                # if its parent is the special category, mark this one as special too
                if (item['parent_text'] == special_item['text']):
                    item['special'] = 1
                    special = True
                else:
                    special = False

            # department_id = self.department_count
            # self.department_count += 1

            # item['department_text'] = item['text']
            # item['department_url'] = item['url']
            # item['department_id'] = department_id

            # if self.test_category is set, only send request if this is the test category
            if self.test_category and item['text'] != self.test_category:
                continue

            yield Request(item['url'],
                          callback=self.parseCategory,
                          meta={'item': item})
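
find_matching_key is another helper whose body isn't included; judging from the call sites, it returns the key of EXTRA_TOPLEVEL_CATEGORIES_URLS that loosely matches a category name, or None. A plausible reconstruction under that assumption (the real method may match differently):

def find_matching_key(self, text, dictionary):
    # hypothetical reconstruction: case-insensitive substring match
    # between the category name and the dictionary's keys
    text = text.lower().strip()
    for key in dictionary:
        if key.lower() in text or text in key.lower():
            return key
    return None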
Example No. 25
    def extractSubcategories(self, response):

        # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it
        if self.has_captcha(
                response.body) and ('retry_count' not in response.meta
                                    or response.meta['retry_count'] > 0):
            yield self.solve_captcha_and_redirect(
                response, self.extractSubcategories
            )  # meta of response will contain number of retries left if set
            return

        hxs = HtmlXPathSelector(response)

        # return the received item, then extract its subcategories
        parent_item = response.meta['item']

        yield parent_item

        # extract subcategories, if level is above barrier
        # extract subcategories from first menu on the left, assume this is the subcategories menu

        if parent_item['level'] > self.LEVEL_BARRIER:

            # check if it should be treated as a special category (exceptions to usual page structure); then extract the subcategories with the appropriate method
            if self.isSpecialCategoryMenu(parent_item):
                subcategories = self.extractSubcategoriesFromMenuSpecial(
                    hxs, parent_item)

                # if no subcategories were found, try with the regular extraction as well (ex http://www.amazon.com/clothing-accessories-men-women-kids/b/ref=sd_allcat_apr/179-7724806-1781144?ie=UTF8&node=1036592)
                if not subcategories:
                    subcategories = self.extractSubcategoriesFromMenu(hxs)

            else:
                subcategories = self.extractSubcategoriesFromMenu(hxs)

            for (subcategory_text, subcategory_url,
                 subcategory_prodcount) in subcategories:

                item = CategoryItem()
                item['url'] = subcategory_url
                item['text'] = subcategory_text
                item['catid'] = self.catid
                self.catid += 1

                if subcategory_prodcount:
                    item['nr_products'] = int(subcategory_prodcount)

                item['parent_text'] = parent_item['text']
                item['parent_url'] = parent_item['url']
                item['parent_catid'] = parent_item['catid']

                # considering departments to be level 2 categories (top level) - so every category must have a department text
                assert 'department_text' in parent_item
                if 'department_text' in parent_item:
                    item['department_text'] = parent_item['department_text']
                    #item['department_url'] = parent_item['department_url']
                    item['department_id'] = parent_item['department_id']

                # only level 2 categories in extra_categories have department_url
                if 'department_url' in parent_item:
                    item['department_url'] = parent_item['department_url']
                else:
                    assert not self.find_matching_key(
                        item['department_text'],
                        self.EXTRA_TOPLEVEL_CATEGORIES_URLS)

                # else:
                #     # the parent must be a level 2 category - so this will be considered department
                #     assert parent_item['level'] == 2
                #     item['department_text'] = item['text']
                #     #item['department_url'] = item['url']
                #     item['department_id'] = self.department_count
                #     self.department_count += 1

                item['level'] = parent_item['level'] - 1

                # # no description extracted
                # item['description_wc'] = 0

                # send to parseCategory to extract description as well
                yield Request(item['url'],
                              callback=self.parseCategory,
                              meta={'item': item})
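
has_captcha and solve_captcha_and_redirect are likewise project-internal; the call sites only fix their contract: detect a captcha page, and re-issue the request with a decremented retry budget in meta. A rough sketch of that contract, entirely an assumption (a real implementation would also actually submit a captcha solution):

from scrapy.http import Request

def has_captcha(self, body):
    # assumption: the captcha interstitial contains this phrase
    return "Type the characters you see in this image" in body

def solve_captcha_and_redirect(self, response, callback):
    # re-request the same page, carrying one fewer retry in meta
    retries_left = response.meta.get("retry_count", 3) - 1
    return Request(response.url, callback=callback, dont_filter=True,
                   meta=dict(response.meta, retry_count=retries_left))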
Example No. 26
    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)
        # selecting main level categories
        links = hxs.select("//div[@id='sitemapLinks']/ul/li/ul/li/a")

        # selecting low level categories
        low_links = hxs.select("//div[@id='sitemapLinks']/ul/li/ul/li/ul/li/a")

        # selecting lower level categories
        lower_links = hxs.select("//div[@id='sitemapLinks']/ul/li/ul/li/ul/li/ul/li/a")

        # selecting departments
        departments = hxs.select("//div[@id='sitemapLinks']/ul/li/a")

        items = []

        root_url = "http://www.toysrus.com"

        # add categories
        for link in links:

            # extract immediate parent
            parent = link.select("parent::node()/parent::node()/parent::node()/a")

            item = CategoryItem()

            # add the current page url as a field
            item['page_url'] = response.url

            item['text'] = link.select('text()').extract()[0]
            # build this into an absolute url by removing ".." prefix and adding domain
            item['url'] = root_url + link.select('@href').extract()[0][2:]

            item['parent_text'] = parent.select('text()').extract()[0]
            item['parent_url'] = root_url + parent.select('@href').extract()[0][2:]

            # this is the main level of categories
            item['level'] = 0

            items.append(item)


        # add subcategories
        for link in low_links:

            # extract immediate parent
            parent = link.select("parent::node()/parent::node()/parent::node()/a")

            item = CategoryItem()

            # add the current page url as a field
            item['page_url'] = response.url

            item['text'] = link.select('text()').extract()[0]
            # build this into an absolute url by removing ".." prefix and adding domain
            item['url'] = root_url + link.select('@href').extract()[0][2:]

            item['parent_text'] = parent.select('text()').extract()[0]
            item['parent_url'] = root_url + parent.select('@href').extract()[0][2:]

            # extract grandparent
            grandparent = parent.select("parent::node()/parent::node()/parent::node()/a")
            item['grandparent_text'] = grandparent.select('text()').extract()[0]
            item['grandparent_url'] = root_url + grandparent.select('@href').extract()[0][2:]

            # these are subcategories
            item['level'] = -1

            items.append(item)

        # add subsubcategories
        for link in lower_links:

            # extract immediate parent
            parent = link.select("parent::node()/parent::node()/parent::node()/a")

            item = CategoryItem()

            # add the current page url as a field
            item['page_url'] = response.url

            item['text'] = link.select('text()').extract()[0]
            # build this into an absolute url by removing ".." prefix and adding domain
            item['url'] = root_url + link.select('@href').extract()[0][2:]

            item['parent_text'] = parent.select('text()').extract()[0]
            item['parent_url'] = root_url + parent.select('@href').extract()[0][2:]

            # extract grandparent
            grandparent = parent.select("parent::node()/parent::node()/parent::node()/a")
            item['grandparent_text'] = grandparent.select('text()').extract()[0]
            item['grandparent_url'] = root_url + grandparent.select('@href').extract()[0][2:]

            # these are subsubcategories
            item['level'] = -2

            items.append(item)

        # add departments
        for link in departments:

            item = CategoryItem()

            # add the current page url as a field
            item['page_url'] = response.url

            item['text'] = link.select('text()').extract()[0]
            # build this into an absolute url by removing ".." prefix and adding domain
            item['url'] = root_url + link.select('@href').extract()[0][2:]

            # these are departments
            item['level'] = 1

            # if it starts with "Save " ("Save 50% on ...") or "Buy " or contains "...% off" or starts with a date (number/number), 
            # mark it as special
            #TODO: there are still some "Save up to..." items
            special = re.match("(Save .*)|(Buy .*)|(.*[0-9]+\% off.*)|(.*[0-9]+/[0-9]+.*)", item['text'], re.UNICODE)
            if special:
                item['special'] = 1
            items.append(item)


        return items
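
Sitemap hrefs on this site evidently begin with a ".." prefix, which parsePage strips with [2:] before prepending the domain; a one-line illustration on a hypothetical href:

root_url = "http://www.toysrus.com"
href = "../category/index.jsp?categoryId=2255956"  # hypothetical sitemap href

absolute = root_url + href[2:]
print(absolute)  # http://www.toysrus.com/category/index.jsp?categoryId=2255956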
Example No. 27
    def parsePage(self, response):
        # currently selects only lowest level links, and their parents inside their fields
        hxs = HtmlXPathSelector(response)

        #TODO: add special categories if any

        # select lowest level categories
        links = hxs.select("//div[@class='siteMapSubCat']/ul/li/a")
        # select parent categories
        parent_links = hxs.select("//div[@class='siteMapSubCat']/h4/a")

        # extract page name by getting text in url after "=" symbol
        # example url: smv_10153_12605?vName=Appliances

        page_name = response.url.split("=")[1]

        # get partial list from previous function (containing departments)
        items = response.meta['items']
        root_url = "http://www.sears.com"

        for link in links:
            item = CategoryItem()
            item['page_text'] = page_name
            item['page_url'] = response.url
            # add the page as the grandparent category
            item['grandparent_text'] = page_name
            item['grandparent_url'] = response.url

            # extract parent category element
            parent = link.select(
                "./parent::node()/parent::node()/preceding-sibling::node()[2]/a"
            )
            parent_text = parent.select('text()').extract()
            parent_url = parent.select('@href').extract()
            if parent_text and parent_url:
                item['parent_text'] = parent_text[0]
                item['parent_url'] = root_url + parent_url[0]
            # if we can't find a parent here, go to the first of the previous columns that has a parent
            else:
                parent = link.select(
                    "./parent::node()/parent::node()/parent::node()/preceding-sibling::node()[2]/h4[last()]"
                )
                parent_text = parent.select("a/text()").extract()
                parent_url = parent.select("a/@href").extract()
                index = 3
                while not parent_text:
                    parent = link.select(
                        "./parent::node()/parent::node()/parent::node()/preceding-sibling::node()[%d]/h4[last()]"
                        % index)
                    index += 1
                    parent_text = parent.select("a/text()").extract()
                    parent_url = parent.select("a/@href").extract()
                item['parent_text'] = parent_text[0]
                item['parent_url'] = root_url + parent_url[0]

            item['text'] = link.select('text()').extract()[0]
            item['url'] = root_url + link.select('@href').extract()[0]

            # these are subcategories
            item['level'] = -1
            items.append(item)

        for link in parent_links:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = root_url + link.select('@href').extract()[0]

            item['page_text'] = page_name
            item['page_url'] = response.url
            # add the page as the parent category
            item['parent_text'] = page_name
            item['parent_url'] = response.url
            # this is considered to be the main category level
            item['level'] = 0

            items.append(item)

        # # add the page name as a department
        # item = CategoryItem()
        # item['text'] = page_name
        # item['url'] = response.url
        # item['page_text'] = page_name
        # item['page_url'] = response.url
        # item['level'] = 1

        # items.append(item)

        return items
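
The page name above is recovered by splitting the URL on "=", per the example in the comment; a one-liner showing the effect (the full URL is illustrative):

url = "http://www.sears.com/shc/s/smv_10153_12605?vName=Appliances"
page_name = url.split("=")[1]
print(page_name)  # Appliances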
Example No. 28
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        # select category links on all 3 levels
        links = hxs.select("//div[@class='categories']/ul/li/ul/li/ul/li/a")
        parent_links = hxs.select("//div[@class='categories']/ul/li/ul/li/a")
        grandparent_links = hxs.select("//div[@class='categories']/ul/li/a")

        # select special section "browse by brand"
        special_links = hxs.select("//div[@class='brands']/ul/li/ul/li/a")
        special_parent_links = hxs.select("//div[@class='brands']/ul/li/a")
        items = []

        for link in links:
            # extracting parents
            parent = link.select('parent::node()/parent::node()/parent::node()/a')
            # extracting grandparents
            grandparent = parent.select('parent::node()/parent::node()/parent::node()/a')
            
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            item['parent_text'] = parent.select('text()').extract()[0]
            item['parent_url'] = parent.select('@href').extract()[0]

            grandparent_text = grandparent.select('text()').extract()
            grandparent_url = grandparent.select('@href').extract()
            if grandparent_text:
                item['grandparent_text'] = grandparent_text[0]
            if grandparent_url:
                item['grandparent_url'] = grandparent_url[0]

            # this is considered more detailed than the main category level (compared to other sitemaps)
            item['level'] = -1

            items.append(item)

        for link in parent_links:
            # extracting parents
            parent = link.select('parent::node()/parent::node()/parent::node()/a')
            
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            parent_text = parent.select('text()').extract()
            parent_url = parent.select('@href').extract()
            if (parent_text):
                item['parent_text'] = parent_text[0]
            if (parent_url):
                item['parent_url'] = parent_url[0]

            # this is considered the main category level
            item['level'] = 0

            items.append(item)

        # last 2 categories need to be marked as special
        for link in grandparent_links[:-2]:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            item['level'] = 1

            items.append(item)

        for link in grandparent_links[-2:]:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            item['level'] = 1
            item['special'] = 1

            items.append(item)


        for link in special_links:
            # extracting parents
            parent = link.select('parent::node()/parent::node()/parent::node()/a')

            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            parent_text = parent.select('text()').extract()
            parent_url = parent.select('@href').extract()
            if (parent_text):
                item['parent_text'] = parent_text[0]
            if (parent_url):
                item['parent_url'] = parent_url[0]

            # this is considered the main category level
            item['level'] = 0
            item['special'] = 1

            items.append(item)

        for link in special_parent_links:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            item['level'] = 1
            item['special'] = 1

            items.append(item)

        return items
Example No. 29
    def extract_nrprods_and_subcats(self, response):

        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        # extract nr_products if not already extracted. necessary for extra_categories
        if 'nr_products' not in item:
            prod_count_holder = hxs.select("//h2[@class='resultCount']/span/text()").extract()
            if prod_count_holder:
                #print "DIDN'T HAVE PRODUCT COUNT", response.url
                prod_count = prod_count_holder[0]
                # extract number
                m = re.match(".*\s*of\s*([0-9,]+)\s*Results\s*", prod_count)
                if m:
                    item['nr_products'] = int(re.sub(",","",m.group(1)))

        yield item

        parent_item = item

        # extract subcategories, if level is above barrier
        # currently extracting subcategories for categories on any level, for level 2 this may cause duplicates (we already extract level 1)
        # extract subcategories from first menu on the left, assume this is the subcategories menu
        #TODO: test or make more robust

        if item['level'] > self.LEVEL_BARRIER:
            subcategories = hxs.select("//h2[1]/following-sibling::ul[1]/li/a")
            for subcategory in subcategories:
                # if we have a subcategory URL and a product count in the expected format, extract them; otherwise move on
                if not subcategory.select("span[@class='refinementLink']"):
                    continue
                subcategory_url = Utils.add_domain(subcategory.select("@href").extract()[0], "http://www.amazon.com")
                subcategory_text = subcategory.select("span[@class='refinementLink']//text()").extract()[0].strip()
                # extract product count, cleaning it of commas and parentheses
                subcategory_prodcount_holder = subcategory.select("span[@class='narrowValue']/text()").extract()
                if not subcategory_prodcount_holder:
                    continue
                subcategory_prodcount = subcategory_prodcount_holder[0].replace("&nbsp;", " ").strip()

                m = re.match("\(([0-9,]+)\)", subcategory_prodcount)
                if m:
                    subcategory_prodcount = m.group(1).replace(",","")
                

                item = CategoryItem()
                item['url'] = subcategory_url
                item['text'] = subcategory_text

                item['parent_text'] = parent_item['text']
                item['parent_url'] = parent_item['url']

                # considering departments to be level 2 categories (top level) - so every category must have a department text
                assert 'department_text' in parent_item
                if 'department_text' in parent_item:
                    item['department_text'] = parent_item['department_text']
                    #item['department_url'] = parent_item['department_url']
                    item['department_id'] = parent_item['department_id']

                # only level 2 categories in extra_categories have department_url
                if 'department_url' in parent_item:
                    item['department_url'] = parent_item['department_url']
                else:
                    matching_key = self.find_matching_key(item['department_text'], self.extra_toplevel_categories_urls)
                    if matching_key:
                        print "DEPARTMENT_TEXT", item['department_text'], "--" + str(matching_key) + "--"
                    assert not matching_key

                # else:
                #     # the parent must be a level 2 category - so this will be considered department
                #     assert parent_item['level'] == 2
                #     item['department_text'] = item['text']
                #     #item['department_url'] = item['url']
                #     item['department_id'] = self.department_count
                #     self.department_count += 1

                item['level'] = parent_item['level'] - 1

                item['nr_products'] = subcategory_prodcount

                # # no description extracted
                # item['description_wc'] = 0


                # send to parseCategory to extract description as well
                yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})
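
The product count next to each refinement link arrives as a parenthesized, comma-separated figure; a standalone check of the cleanup performed above (the sample value is illustrative):

import re

subcategory_prodcount = "(1,234)"  # illustrative raw count

m = re.match(r"\(([0-9,]+)\)", subcategory_prodcount)
if m:
    subcategory_prodcount = m.group(1).replace(",", "")
print(int(subcategory_prodcount))  # 1234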
Example No. 30
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        parent_links = hxs.select(
            "//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']"
        )

        # for link in links:
        #     item = CategoryItem()

        #     # search for the category's parent
        #     parents = []

        #     # select the preceding siblings that are a category title (have a child that is an a tag with a certain class)
        #     parents = link.select('parent::node()').select('preceding-sibling::node()').select('child::a[@class=\'NavXLBold\']')

        #     # if we found such siblings, get the last one to be the parent
        #     if parents:
        #         item['parent_text'] = parents[-1].select('text()').extract()[0]
        #         item['parent_url'] = parents[-1].select('@href').extract()[0]

        #         item['parent_url'] = Utils.add_domain(item['parent_url'], self.root_url)

        #     item['text'] = link.select('text()').extract()[0]
        #     item['url'] = link.select('@href').extract()[0]

        #     # add domain if relative URL
        #     item['url'] = Utils.add_domain(item['url'], self.root_url)

        #     item['level'] = 0

        # to avoid duplicates, only extract highest level categories in this function (so don't return if level 0)
        #yield item

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        #TO remove:
        # # artificial category - parent to all departments (root of entire sitemap tree). used to get total walmart product count
        # sitemap_root = CategoryItem()
        # sitemap_root['url'] = "http://www.walmart.com"
        # sitemap_root['text'] = "Walmart"
        # sitemap_root['department_id'] = 0
        # sitemap_root['level'] = 2
        # sitemap_root['catid'] = 0
        # self.id_count += 1
        # yield sitemap_root

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})