Example #1
    def extractSubcategoriesFromMenu(self, hxs):

        # extract subcategories for regular page structure
        subcategories = hxs.select(
            "//h2[text()='Department']/following-sibling::ul[1]/li/a")
        # only try "Shop by Department" if there is no "Department", otherwise might cause problems when both are present. e.g (http://www.amazon.com/Watches-Mens-Womens-Kids-Accessories/b/ref=sd_allcat_watches/187-9021585-5419616?ie=UTF8&node=377110011)
        if not subcategories:
            subcategories = hxs.select(
                "(//h2 | //h3)[text()='Shop by Department']/following-sibling::ul[1]/li/a"
            )

        for subcategory in subcategories:
            # if we have a subcategory URL and a product count in the expected format, extract them; otherwise move on

            # there is an exception to this refinement-link rule: in that case, extract the info directly from the subcategory node, but only if len(text) > 1 (otherwise we would catch all the little arrows for parent categories)
            if not subcategory.select("span[@class='refinementLink']"):
                if len(subcategory.select(".//text()").extract()
                       [0].strip()) > 1:  # so it's not that little arrow thing
                    subcategory_text_holder = subcategory.select(
                        "text()[normalize-space()!='']").extract()
                    if subcategory_text_holder:
                        subcategory_text = subcategory_text_holder[0].strip()
                    else:
                        continue
                    subcategory_url_holder = subcategory.select(
                        "@href").extract()
                    if subcategory_url_holder:
                        subcategory_url = Utils.add_domain(
                            subcategory_url_holder[0], "http://www.amazon.com")
                    else:
                        continue
                    subcategory_prodcount_holder = None
                else:
                    continue

            else:

                subcategory_url = Utils.add_domain(
                    subcategory.select("@href").extract()[0],
                    "http://www.amazon.com")
                subcategory_text = subcategory.select(
                    "span[@class='refinementLink']//text()").extract(
                    )[0].strip()
                # extract product count, clean it of commas and parentheses
                subcategory_prodcount_holder = subcategory.select(
                    "span[@class='narrowValue']/text()").extract()

            # if there's also product count available in the menu, extract it
            if subcategory_prodcount_holder:
                subcategory_prodcount = subcategory_prodcount_holder[
                    0].replace(";nbsp&", " ").strip()

                m = re.match("\(([0-9,]+)\)", subcategory_prodcount)
                if m:
                    subcategory_prodcount = m.group(1).replace(",", "")
            else:
                subcategory_prodcount = None

            yield (subcategory_text, subcategory_url, subcategory_prodcount)
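
Every snippet on this page joins relative hrefs to a site root via Utils.add_domain. The helper itself is never shown; the following is a minimal sketch of what it is assumed to do (prepend the domain to relative URLs and pass absolute ones through), not the actual Utils implementation.

# Hypothetical sketch of Utils.add_domain as used throughout these examples:
# prepend the site root to relative hrefs, leave absolute URLs untouched.
def add_domain(url, domain):
    if url.startswith("http://") or url.startswith("https://"):
        return url
    if not url.startswith("/"):
        url = "/" + url
    return domain.rstrip("/") + url
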
Example #2
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        subcats_links = hxs.select(
            "//h2[contains(text(),'categories')]/following-sibling::ul[1]/li/a"
        )
        for subcat_link in subcats_links:
            # extract name
            subcat_name = subcat_link.select(
                "span/text()").extract()[0].strip()
            # extract url, add domain
            subcat_url = Utils.add_domain(
                subcat_link.select("@href").extract()[0], self.base_url)

            # send subcategories to be further parsed
            # if brand filter is set, send to parseSubcategory for brands to be extracted etc
            if self.brands:
                yield Request(url=subcat_url,
                              callback=self.parseSubcategory,
                              meta={'category': subcat_name})
            # if brand filter is not set, send directly to extract products
            else:
                yield Request(url=subcat_url,
                              callback=self.parseBrandPage,
                              meta={'category': subcat_name})
Example #3
    def parse_resultsPage(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        result = hxs.select(
            "//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink'][position()<2]/@href"
        ).extract()
        if result:
            item['walmart_full_url'] = Utils.add_domain(
                result[0], "http://www.walmart.com")

            # id should be somewhere in the full URL as well
            if self.valid_result(item['walmart_full_url'], item['walmart_id']):
                return item
            else:
                # search again, but select result that contains id
                #OBS: not optimal, should do the selection here
                return Request(response.url,
                               callback=self.parse_resultsPage2,
                               meta={"item": item})
        else:
            # try to find result by using the product name instead

            # get product name from product page, then search by it
            return Request(item['walmart_short_url'],
                           callback=self.getProductName,
                           meta={"item": item})
    def parseSubcats(self, response):
        hxs = HtmlXPathSelector(response)

        parent = response.meta['parent']

        # extract subcategories
        subcats_links = hxs.select(
            "//div[@class='sideNav']/div[@class='innerWrap'][1]//ul/li/a")
        for subcat_link in subcats_links:
            item = CategoryItem()

            item['url'] = Utils.add_domain(
                subcat_link.select("@href").extract()[0],
                "http://www.tigerdirect.com")
            item['text'] = subcat_link.select("text()").extract()[0]

            item['parent_text'] = parent['text']
            item['parent_url'] = parent['url']
            item['level'] = parent['level'] - 1

            item['department_text'] = response.meta['department_text']
            item['department_id'] = response.meta['department_id']

            #print 'passing to parse category ', item

            # there are some loops in their categories tree, so we need to check this to avoid infinite loops in crawling
            if item['url'] not in self.parsed_urls:
                yield Request(url = item['url'], callback = self.parseCategory,\
                 meta = {'item' : item,\
                 'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'],\
                  'department_id' : response.meta['department_id']})
Example #5
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        origin_product_id = response.meta['origin_product_id']
        current_query = response.meta['query']

        # all product items from all queries
        items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
            self.results[origin_product_id]['search_requests']), [])
        # all product urls from all queries
        product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
            self.results[origin_product_id]['search_requests']), [])
        product_urls = set(product_urls)

        # get search results for received results page and add them to product_urls to be parsed
        # Note: xpath below ignores Sponsored links (which is good)
        results = hxs.select("//div[@class='a-row a-spacing-small']/a")
        for result in results:
            product_url = result.select("@href").extract()[0]

            # remove the part after "/ref" containing details about the search query
            m = re.match("(.*)/ref=(.*)", product_url)
            if m:
                product_url = m.group(1)

            product_url = Utils.add_domain(product_url, self.domain)

            self.results[origin_product_id]['search_requests'][current_query][
                'search_results'].append(product_url)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request further to parse product pages only if we gathered all the product URLs from all the queries
        # (there are no more pending requests)
        # otherwise send them back to parseResults and wait for the next query, save all product URLs in search_results
        # this way we avoid duplicates
        if product_urls and ('pending_requests' not in response.meta
                             or not response.meta['pending_requests']):
            next_product_url = product_urls.pop()
            request = Request(next_product_url,
                              callback=self.parse_product_amazon,
                              meta=response.meta)
            # remove the urls you've just consumed
            self.remove_result_from_queue(origin_product_id, next_product_url)

            return request

        # if there were no results, the request will never get back to reduceResults
        # so send it from here so it can parse the next queries
        # add to the response the URLs of the products to crawl we have so far, items (handles case when it was not created yet)
        # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
        else:
            response.meta['parsed'] = True
            # only send the response we have as an argument, no need to make a new request
            return self.reduceResults(response)
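
remove_result_from_queue is not shown in this example. Based on the comments above ("remove the urls you've just consumed", avoiding duplicates across queries), a plausible sketch follows; the layout of the results dictionary is taken from the code above, but the helper body is an assumption.

# Hypothetical sketch of remove_result_from_queue: drop a consumed product URL
# from every query's 'search_results' list so it is never requested twice.
# 'results' stands for the spider's self.results dictionary.
def remove_result_from_queue(results, origin_product_id, product_url):
    for query in results[origin_product_id]['search_requests']:
        search_results = results[origin_product_id]['search_requests'][query]['search_results']
        while product_url in search_results:
            search_results.remove(product_url)
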
    def parseSubcategory(self, response):
        hxs = HtmlXPathSelector(response)

        subcategory = response.meta['item']

        # yield this subcategory
        yield subcategory

        # if subcategory was special, we'll mark all subsubcategories as special
        if 'special' in subcategory:
            special = True
        else:
            special = False

        # get its subcategories
        subsubcategories = hxs.select(
            "//div[@class='product-category-expanded']//h3[@class='title']")

        for subsubcategory in subsubcategories:
            item = CategoryItem()
            item['text'] = subsubcategory.select("a/text()").extract()[0]
            item['url'] = Utils.add_domain(
                subsubcategory.select("a/@href").extract()[0], self.base_url)

            if special:
                item['special'] = 1

            item['parent_text'] = subcategory['text']
            item['parent_url'] = subcategory['url']
            item['department_text'] = subcategory['department_text']
            item['department_url'] = subcategory['department_url']
            item['department_id'] = subcategory['department_id']

            item['level'] = subcategory['level'] - 1

            description_text_holder = subsubcategory.select(
                "following-sibling::p[@class='description'][1]/text()"
            ).extract()
            if description_text_holder:
                item['description_text'] = description_text_holder[0]
                item['description_title'] = item['text']
                description_tokenized = Utils.normalize_text(
                    item['description_text'])
                item['description_wc'] = len(description_tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

            # parse subcategory page to get product count, or further subsubcategory
            yield Request(item['url'],
                          callback=self.parseSubcategoryPage,
                          meta={'item': item})
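
The description handling above relies on Utils.normalize_text to produce a token list whose length becomes description_wc (and on Utils.phrases_freq for the keyword statistics). Neither helper is shown; a minimal sketch of the tokenizer, assuming it simply lower-cases the text and splits it into word tokens, would be:

import re

# Hypothetical sketch of Utils.normalize_text as used above: lower-case the
# description and split it into word tokens so len() gives a word count.
def normalize_text(text):
    return re.findall("[a-z0-9]+", text.lower())
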
Example #7
    def extractSubcategoriesSports(self, hxs):
        subcategories = hxs.select(
            "//h3[text()='Shop by Sport']/following-sibling::ul[1]/li/a")

        for subcategory in subcategories:
            subcategory_name = subcategory.select("text()").extract()[0]
            subcategory_url = Utils.add_domain(
                subcategory.select("@href").extract()[0],
                "http://www.amazon.com")

            yield (subcategory_name, subcategory_url, None)
    def parsePage_tigerdirect(self, response):
        hxs = HtmlXPathSelector(response)
        #print "IN PARSEPAGE ", response.url

        # without the "resultsWrap" div, these are found on pages we don't want as well
        product_links = hxs.select(
            "//div[@class='resultsWrap listView']//h3[@class='itemName']/a/@href"
        ).extract()
        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = Utils.add_domain(
                product_link, "http://www.tigerdirect.com")
            # remove CatId from URL (generates duplicates)
            m = re.match("(.*)&CatId=[0-9]+", item['product_url'])
            if m:
                item['product_url'] = m.group(1)
            yield item

        # parse next pages (if results spread on more than 1 page)
        #TODO: not sure if all of them are extracted
        next_page = hxs.select("//a[@title='Next page']")
        if next_page:
            #print "next page : ", response.url, " + ", next_page
            page_nr = response.meta['page'] + 1
            # base_url = response.meta['base_url']
            # # remove trailing "&" character at the end of the URL
            # m = re.match("(.*)&", base_url)
            # if m:
            # 	base_url = m.group(1)
            # yield Request(url = base_url + "&page=%d"%page_nr, callback = self.parsePage_tigerdirect,\
            # 	 meta = {'page' : page_nr, 'base_url' : response.meta['base_url']})
            next_page_url = Utils.add_domain(
                next_page.select("@href").extract()[0],
                "http://www.tigerdirect.com")
            yield Request(url = next_page_url, callback = self.parsePage_tigerdirect,\
             meta = {'page' : page_nr})

        # if you can't find product links, you should search for links to the subcategories pages and parse them for product links
        if not product_links:
            yield Request(url=response.url,
                          callback=self.parseSubcats_tigerdirect)
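
The CatId-stripping regex in parsePage_tigerdirect canonicalizes product URLs so pagination does not yield duplicates. A quick illustration with a made-up URL (path and parameters are invented for the example):

import re

# Illustration only; the URL below is invented.
url = "http://www.tigerdirect.com/applications/SearchTools/item-details.asp?EdpNo=123&CatId=456"
m = re.match("(.*)&CatId=[0-9]+", url)
if m:
    url = m.group(1)
# url is now ".../item-details.asp?EdpNo=123"
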
Example #9
    def extractSubcategoriesAccessories(self, hxs):
        subcategories = hxs.select("//a[contains(text(),'Shop All')]")
        for subcategory in subcategories:
            # extract words after "Shop All" - that is the subcategory name
            subcategory_text_full = subcategory.select("text()").extract()[0]
            m = re.match("Shop All (.*)", subcategory_text_full)
            subcategory_name = m.group(1).strip()
            subcategory_url = Utils.add_domain(
                subcategory.select("@href").extract()[0],
                "http://www.amazon.com")

            yield (subcategory_name, subcategory_url, None)
    def parseSubcats_tigerdirect(self, response):
        hxs = HtmlXPathSelector(response)

        # search for a link to "See All Products"
        seeall = hxs.select(
            "//span[text()='See All Products']/parent::node()/@href").extract(
            )
        if seeall:
            # pass the new page to this same method to be handled by the next branch of the if statement
            yield Request(url=Utils.add_domain(seeall[0],
                                               "http://www.tigerdirect.com"),
                          callback=self.parseSubcats_tigerdirect)
        else:
            # extract subcategories
            subcats_links = hxs.select(
                "//div[@class='sideNav']/div[@class='innerWrap'][1]//ul/li/a")
            for subcat_link in subcats_links:
                subcat_url = Utils.add_domain(
                    subcat_link.select("@href").extract()[0],
                    "http://www.tigerdirect.com")
                yield Request(url = subcat_url, callback = self.parsePage_tigerdirect,\
                 meta = {'page' : 1, 'base_url' : subcat_url})
Example #11
    def extract_result_products(self, response):

        hxs = HtmlXPathSelector(response)

        items = []
        results = hxs.select(
            "//div[@class='list-item-info']/div[@class='sku-title']/h4/a")

        for result in results:
            item = SearchItem()
            #item['origin_site'] = site
            product_name_holder = result.select("text()").extract()
            if product_name_holder:
                item['product_name'] = product_name_holder[0].strip()
            else:
                self.log("Error: No product name: " + str(response.url) +
                         " from product: " + origin_url,
                         level=log.ERROR)

            item['product_url'] = Utils.clean_url(
                Utils.add_domain(
                    result.select("@href").extract()[0],
                    "http://www.bestbuy.com"))

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_name' in response.meta:
                item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            model_holder = result.select(
                "../../../div[@class='sku-model']/ul/li[@class='model-number']/span[@id='model-value']/text()"
            ).extract()
            if model_holder:
                item['product_model'] = model_holder[0]

            price_holder = result.select(
                "../../../../div[@class='list-item-price']//div[@class='price-block']//div[@class='medium-item-price']/text()[normalize-space()]"
            ).extract()
            if price_holder:
                price = price_holder[0].strip()
                price = re.sub(",", "", price)
                price = float(price)
                item['product_target_price'] = price

            items.append(item)

        return items
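
The price cleanup above assumes the extracted text is purely numeric once commas are removed; if the page ever includes a currency symbol, float() would raise. A slightly more defensive variant (an assumption, not part of the original spider) could strip everything except digits and the decimal point:

import re

# Hedged variant of the price cleanup above: also drop currency symbols and
# whitespace before converting, e.g. "$1,299.99" -> 1299.99.
def parse_price(price_text):
    cleaned = re.sub("[^0-9.]", "", price_text)
    return float(cleaned) if cleaned else None
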
Example #12
    def parseResults(self, response):

        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get last request's items and add to them the results

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']"
        )
        for result in results:
            item = SearchItem()
            #item['origin_site'] = site

            #TODO: usually the manufacturer is in bold, so maybe use that
            product_name = " ".join(result.select(".//text()").extract())
            # append text that is in <span> if any
            span_text = result.select("./span/text()")

            #TODO: use span text differently, as it is more important/relevant (bold) ?
            for text in span_text:
                product_name += " " + text.extract()
            item['product_name'] = product_name
            rel_url = result.select("@href").extract()[0]

            root_url = "http://www.walmart.com"
            item['product_url'] = Utils.add_domain(rel_url, root_url)

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_id' in response.meta:
                item['origin_id'] = response.meta['origin_id']
                assert self.by_id
            else:
                assert not self.by_id

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)
    def parseBrand(self, response):
        hxs = HtmlXPathSelector(response)

        # category of items on current page
        category = response.meta['category']

        # set parameters in meta specifying current product count and total product count for this brand
        # to be used for deciding on stop criteria on pagination
        if 'total_product_count' in response.meta:
            product_count = response.meta['total_product_count']
            cur_product_count = response.meta['current_product_count']
        else:
            # extract number of products for this brand
            product_count = int(
                hxs.select("//h2[@id='productCount']//text()").re("[0-9]+")[0])
            cur_product_count = 0

        # extract products from this page
        product_links = hxs.select(
            "//h3[@class='productTitle']/a/@href").extract()
        # add domain
        product_urls = map(lambda x: Utils.add_domain(x, self.base_url),
                           product_links)

        for product_url in product_urls:
            item = ProductItem()
            # remove parameters in url
            item['product_url'] = Utils.clean_url(product_url)
            item['category'] = category

            yield item

        # add nr of extracted products to current product count
        cur_product_count += len(product_urls)

        # get next page if any
        next_page = self.build_next_page_url(response.url,
                                             product_count,
                                             cur_product_count,
                                             first=('total_product_count'
                                                    not in response.meta))

        if next_page:
            yield Request(url=next_page,
                          callback=self.parseBrand,
                          meta={
                              'total_product_count': product_count,
                              'current_product_count': cur_product_count,
                              'category': category
                          })
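
build_next_page_url drives the pagination here but is not part of the snippet. Below is a hypothetical sketch consistent with how it is called: return None once all products have been seen, otherwise produce the next page URL via an offset query parameter (the parameter name 'offset' is an assumption).

import re

# Hypothetical sketch of build_next_page_url as called above: stop when the
# running count reaches the total, otherwise set an offset query parameter.
def build_next_page_url(url, product_count, cur_product_count, first=False):
    # 'first' is accepted only for signature compatibility with the call above
    if cur_product_count >= product_count:
        return None
    # drop any previous offset before appending the new one
    base = re.sub("[?&]offset=[0-9]+", "", url)
    separator = "?" if "?" not in base else "&"
    return base + separator + "offset=" + str(cur_product_count)
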
Example #14
    def parseSubcategory(self, response):
        hxs = HtmlXPathSelector(response)

        #print "SUBCATEGORY:", response.url

        # extract link to page containing brands (look for link to 'more')
        brands_menu_page = hxs.select(
            "//h4[contains(text(),'Brand')]/following-sibling::ul[1]/li[@class='more']/a/@data-overlay-url"
        ).extract()

        if brands_menu_page:
            # send request for brands pages to be extracted
            yield Request(url=Utils.add_domain(brands_menu_page[0],
                                               self.base_url),
                          callback=self.parseBrandsMenu,
                          meta={'category': response.meta['category']})
        else:

            # if no 'more' link, extract brand pages directly from this page (it means they are all here)
            brands_pages = hxs.select(
                "//h4[contains(text(),'Brand')]/following-sibling::ul[1]/li/a")

            for brand_page in brands_pages:
                brand_name = brand_page.select(
                    "span[@class='facet-str-name']/text()").extract()[0]
                brand_url = Utils.add_domain(
                    brand_page.select("@href").extract()[0], self.base_url)

                # filter brands if it applies
                if self.brands and not self.name_matches_brands(brand_name):
                    self.log("Omitting brand " + brand_name, level=log.INFO)
                    continue

                # send request for brands page to be parsed and its products extracted
                yield Request(url=brand_url,
                              callback=self.parseBrandPage,
                              meta={'category': response.meta['category']})
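
name_matches_brands implements the brand filter used by several of these spiders; it is not shown here. A minimal sketch, assuming a case-insensitive match of the extracted brand name against the configured self.brands list:

# Hypothetical sketch of the name_matches_brands filter: case-insensitive
# comparison of the extracted brand name against the brands list
# (self.brands in the spider).
def name_matches_brands(brand_name, brands):
    return any(b.lower() in brand_name.lower() for b in brands)
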
    def parse_resultsPage2(self, response):
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']
        results = hxs.select("//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']/@href").extract()
        for result in results:
            # if the result URL contains the id, this is the correct result
            if self.valid_result(item['walmart_id'], result):
                product_url = Utils.add_domain(result, "http://www.walmart.com")
                item['walmart_full_url'] = product_url
                return item


        # no results matching the condition were found
        self.log("No results for short_url (didn't find any URLs containing id) " + item['walmart_short_url'] + "\n", level=log.ERROR)
    def parseResults(self, response):

        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get last request's items and add to them the results

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//div[@class='hproduct']/div[@class='info-main']/h3/a")

        for result in results:
            item = SearchItem()
            #item['origin_site'] = site
            item['product_name'] = result.select("text()").extract()[0].strip()
            item['product_url'] = Utils.clean_url(
                Utils.add_domain(
                    result.select("@href").extract()[0],
                    "http://www.bestbuy.com"))

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_id' in response.meta:
                item['origin_id'] = response.meta['origin_id']
            # 	assert self.by_id
            # else:
            # 	assert not self.by_id

            model_holder = result.select(
                "parent::node()/parent::node()//strong[@itemprop='model']/text()"
            ).extract()
            if model_holder:
                item['product_model'] = model_holder[0]

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)
    def extract_results(self, response):
        hxs = HtmlXPathSelector(response)

        # TODO: check this xpath and extractions
        results = hxs.select("//h4[@class='tile-heading']/a")
        product_urls = set()

        # try xpath for old page version
        if not results:
            results = hxs.select("//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']")

        for result in results:
            product_url = result.select("@href").extract()[0]
            product_url = Utils.add_domain(product_url, "http://www.walmart.com")
            product_urls.add(product_url)

        return list(product_urls)
Example #18
    def parseBrandPage(self, response):
        hxs = HtmlXPathSelector(response)

        # category of items on this page
        category = response.meta['category']

        # extract item count
        if 'item_count' in response.meta:
            total_item_count = response.meta['item_count']
        else:
            total_item_count = int(
                hxs.select("//p[@id='filtered-products-count']").re("[0-9]+")
                [0])

        # extract product holder. not extracting <a> element directly because each product holder has many a elements (all just as good, but we only want one)
        product_holders = hxs.select("//div[@class='product ']")
        for product_holder in product_holders:
            # extract first link in product holder
            product_link = product_holder.select(".//a/@href").extract()[0]
            product_url = Utils.add_domain(product_link, self.base_url)

            item = ProductItem()
            item['product_url'] = product_url
            item['category'] = category

            yield item

        # crawl next pages if any left
        if 'offset' not in response.meta:
            offset = 0
        else:
            offset = response.meta['offset']

        next_page = self.build_next_page_url(response.url, total_item_count,
                                             offset)

        # if there are more products to crawl, send new request
        if next_page:
            yield Request(url=next_page,
                          callback=self.parseBrandPage,
                          meta={
                              'offset': offset + 1,
                              'total_item_count': total_item_count,
                              'category': category
                          })
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        brands_links = hxs.select("//li[contains(@class,'brandsSel')]/a")
        for brand_link in brands_links:
            brand_name = brand_link.select(
                "text()[normalize-space()]").extract()[0].strip()
            brand_url = Utils.add_domain(
                brand_link.select("@href").extract()[0], self.base_url)

            # filter brand if brand filter set
            if self.brands and not self.name_matches_brands(brand_name):
                self.log("Omitting brand " + brand_name, level=log.INFO)
                continue

            # crawl brand page if it passed filter
            yield Request(url=brand_url,
                          callback=self.parseBrand,
                          meta={'category': response.meta['category']})
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        categories_links = hxs.select("//div[@class='nav baseLevel']/ul/li/a")
        for category_link in categories_links:
            category_name = category_link.select("text()").extract()[0]
            category_url = Utils.add_domain(
                category_link.select("@href").extract()[0], self.base_url)

            # if brand filter is set, send to parseCategory to extract brands pages from menu
            if self.brands:
                yield Request(url=category_url,
                              callback=self.parseCategory,
                              meta={'category': category_name})
            # if we're extracting all brands, send it directly to extract products from it
            else:
                yield Request(url=category_url,
                              callback=self.parseBrand,
                              meta={'category': category_name})
Example #21
    def parseBrandsMenu(self, response):
        hxs = HtmlXPathSelector(response)

        # extract links to brands pages
        brands_links = hxs.select("//ul/li/a")
        for brand_link in brands_links:
            brand_name = brand_link.select(
                "@data-facet-option-value").extract()[0]

            # filter brands if it applies
            if self.brands and not self.name_matches_brands(brand_name):
                self.log("Omitting brand " + brand_name, level=log.INFO)
                continue

            # build brand url
            try:

                # extract brand id
                brand_id = brand_link.select(
                    "@data-facet-option-id").extract()[0]
                # extract base url for brand page
                brand_base_url = Utils.add_domain(
                    hxs.select("//form/@action").extract()[0], self.base_url)
                # extract relative url parameters for brand page
                brand_relative_url_params = hxs.select(
                    "//input/@value").extract()[0]
                # extract catId parameter
                cat_id_param = re.findall("catId=[0-9]+(?=&|$)",
                                          brand_relative_url_params)[0]
                # build brand page
                brand_page_url = brand_base_url + "?" + cat_id_param + "+" + str(
                    brand_id)

                #print brand_page_url

                yield Request(url=brand_page_url,
                              callback=self.parseBrandPage,
                              meta={'category': response.meta['category']})

            except Exception, e:
                self.log("Couldn't extract brand page from menu: " + str(e),
                         level=log.ERROR)
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")

        parent_links = hxs.select(
            "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]"
        )

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})
Example #23
    def parse(self, response):

        if self.product_name:

            # can only use this option if self.target_site has been initialized (usually true for spiders for retailer sites, not true for manufacturers' sites)
            if not self.target_site:
                self.log(
                    "You can't use the product_name option without setting the target site to search on\n",
                    level=log.ERROR)
                raise CloseSpider(
                    "\nYou can't use the product_name option without setting the target site to search on\n"
                )

            search_query = self.build_search_query(self.product_name)
            search_pages = self.build_search_pages(search_query)

            request = Request(search_pages[self.target_site],
                              callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request.cookies = self.amazon_cookies
                request.headers['Cookies'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
                ## print "SET AMAZON COOKIES"

            request.meta['origin_name'] = self.product_name
            request.meta['query'] = search_query

            # just use empty product model and url, for compatibility, also pending_requests
            request.meta['origin_model'] = ''
            request.meta['origin_url'] = ''
            request.meta['pending_requests'] = []

            yield request

        # if we have product URLs, pass them to parseURL to extract product names (which will pass them to parseResults)
        product_urls = []
        # if we have a single product URL, create a list of URLs containing it
        if self.product_url:
            product_urls.append(self.product_url)

        # if we have a file with a list of URLs, create a list with URLs found there
        if self.product_urls_file:
            f = open(self.product_urls_file, "r")
            for line in f:
                product_urls.append(line.strip())
            f.close()

        for product_url in product_urls:
            # extract site domain

            # m = re.match("http://www1?\.([^\.]+)\.com.*", product_url)
            # origin_site = ""
            # if m:
            # 	origin_site = m.group(1)
            # else:
            # 	sys.stderr.write('Can\'t extract domain from URL.\n')
            origin_site = Utils.extract_domain(product_url)

            request = Request(product_url, callback=self.parseURL)
            request.meta['origin_site'] = origin_site
            if origin_site == 'staples':
                zipcode = "12345"
                request.cookies = {"zipcode": zipcode}
                request.meta['dont_redirect'] = True
            yield request

        # if we have a file with Walmart ids, create a list of the ids there
        if self.walmart_ids_file:
            walmart_ids = []
            f = open(self.walmart_ids_file, "r")
            for line in f:
                if "," in line:
                    id_string = line.strip().split(",")[0]
                else:
                    id_string = line.strip()
                if re.match("[0-9]+", id_string):
                    walmart_ids.append(id_string)
            f.close()

            self.by_id = True

            for walmart_id in walmart_ids:
                # create Walmart URLs based on these IDs
                walmart_url = Utils.add_domain(walmart_id,
                                               "http://www.walmart.com/ip/")
                request = Request(walmart_url, callback=self.parseURL)
                #request.meta['origin_site'] = 'walmart'
                yield request
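
build_search_query and build_search_pages are referenced at the top of parse() but not defined in this example. A hedged sketch of what they might look like (the site keys and search URL formats below are assumptions for illustration only):

import urllib

# Hypothetical sketches of the two search helpers used above.
def build_search_query(product_name):
    # URL-encode the product name so it can be embedded in a search URL
    return urllib.quote_plus(product_name)

def build_search_pages(search_query):
    # map a target-site key to its search results URL (formats are assumptions)
    return {
        'walmart': "http://www.walmart.com/search/?query=" + search_query,
        'amazon': "http://www.amazon.com/s/?field-keywords=" + search_query,
    }
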
Example #24
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//ul[@class='products']//div[@class='product ']//h3//a")
        for result in results:
            item = SearchItem()

            product_url = result.select("@href").extract()[0] if result.select(
                "@href") else None
            product_name = result.select(
                "@title").extract()[0] if result.select("@title") else None

            # assert name is not abbreviated
            # empirically, this only seems to produce false positives, so removed
            # assert '...' not in product_name

            # skip this result if there is no product name or URL
            if product_name and product_url:
                # clean url
                item['product_url'] = Utils.add_domain(product_url,
                                                       self.base_url)

                item['product_name'] = product_name
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            #TODO: extract: price, brand?

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
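
ProcessText.extract_model_from_name is used above to pull a model number out of the product title. The real implementation is not shown; a simple hedged heuristic (return the first token that mixes letters and digits) could look like this:

import re

# Hypothetical heuristic for extract_model_from_name: return the first token
# that contains both letters and digits, which is often the model number.
def extract_model_from_name(product_name):
    for token in product_name.split():
        if re.search("[0-9]", token) and re.search("[A-Za-z]", token):
            return token.strip("(),")
    return None
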
Example #25
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # get parent item from response, extract additional info and return it
        item = response.meta['parent']

        # add department name, url and id for item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract product count if available
        nr_items_holder = hxs.select(
            "//div[@id='showing']/strong[position()=2]/text()").extract()
        if nr_items_holder:
            item['nr_products'] = int(str(nr_items_holder[0]))

        # extract description if available
        # these are descriptions for services pages
        desc_title_holder = hxs.select(
            "//div[@id='searchstate']/a[position()=2]/text()").extract()
        if desc_title_holder:
            item['description_title'] = desc_title_holder[0].strip()
        desc_content_holder = hxs.select(
            "//div[@class='content']/h3/text()").extract()
        if desc_content_holder:
            item['description_text'] = desc_content_holder[0].strip()
            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)
            (item['keyword_count'],
             item['keyword_density']) = Utils.phrases_freq(
                 item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        yield item

        # extract its subcategories
        #subcats_holders = hxs.select("//div[@class='narrowcontent']/ul[@class='search']")
        subcats_holders = hxs.select(
            "//div[@class='narrowcontent']/ul[contains(@class,'search')]")
        if subcats_holders:
            subcats_holder = subcats_holders[0]
            # these are subcategories if they are preceded by the title "Shop ..."
            title = subcats_holder.select(
                "parent::node()/preceding-sibling::node()//text()").extract(
                )[0]
            if str(title).startswith("Shop"):
                subcats = subcats_holder.select(".//li/a")
                for subcat in subcats:
                    item = CategoryItem()
                    item['text'] = subcat.select("text()").extract()[0].strip()
                    item['url'] = Utils.add_domain(
                        subcat.select("@href").extract()[0],
                        "http://www.bestbuy.com")
                    parent = response.meta['parent']
                    item['level'] = int(response.meta['level']) - 1
                    # if parent was special, this category is special too
                    if 'special' in parent:
                        item['special'] = 1
                    item['parent_text'] = parent['text']
                    item['parent_url'] = parent['url']

                    request = Request(url = item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : item['level'], \
                        'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                    yield request
    def parsePage(self, response):

        #print "IN PARSEPAGE"
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']

        if 'parent_item' in response.meta:
            parent_item = response.meta['parent_item']
            item['parent_text'] = parent_item['text']
            item['parent_url'] = parent_item['url']
            if 'parent_text' in parent_item:
                item['grandparent_text'] = parent_item['parent_text']
                item['grandparent_url'] = parent_item['parent_url']
            if 'nr_products' not in parent_item:
                parent_nr_products = 0
            else:
                parent_nr_products = parent_item['nr_products']

        # initialize product URL list
        if 'products' not in response.meta:
            products = []
        else:
            products = response.meta['products']

        # # if this is the first page, initialize number of products
        # if 'nr_products' not in item:
        #     old_nr_products = 0
        # else:
        #     old_nr_products = item['nr_products']

        # find number of products on this page
        product_links = hxs.select(
            "//a[@class='prodLink ListItemLink']/@href").extract()

        # gather all products in this (sub)category
        products += product_links

        #this_nr_products = len(product_links)

        #item['nr_products'] = old_nr_products + this_nr_products
        # if 'parent_item' in response.meta:
        #     parent_item['nr_products'] = parent_nr_products + item['nr_products']
        # find URL to next page, parse it as well
        next_page = hxs.select(
            "//a[@class='link-pageNum' and text()=' Next ']/@href").extract()
        if next_page:
            page_url = Utils.add_domain(next_page[0], self.root_url)
            request = Request(url=page_url,
                              callback=self.parsePage,
                              meta={
                                  'item': item,
                                  'products': products
                              })
            if 'parent_item' in response.meta:
                request.meta['parent_item'] = parent_item
            yield request

        # if no next page, return current results; and return parent category page
        else:

            item['nr_products'] = len(set(products))
            yield item
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        # extract departments
        departments = hxs.select("//h2")
        department_id = 0
        for department in departments:
            department_item = CategoryItem()
            department_text = department.select("text()").extract()[0]

            department_item['department_text'] = department_text

            # #TODO: add department_url, from sherwin-williams.com ...? get department list from there and match with departments from here by seeing if names match

            department_item['department_id'] = department_id

            department_item['text'] = department_text

            department_item['level'] = 1

            # get categories in department
            categories = department.select("following-sibling::ul[1]/li")

            # extract department url from one of its categories urls (it's not available directly)
            category_ex = categories[0]
            category_ex_url = Utils.add_domain(
                category_ex.select("a/@href").extract()[0], self.base_url)
            # extract first part of url
            m = re.match("(http://www.sherwin\-williams\.com/[^/]+)/.*",
                         category_ex_url)
            department_url = m.group(1)
            department_item['department_url'] = department_url
            department_item['url'] = department_url

            for category in categories:
                item = CategoryItem()
                #TODO: special if 'Services'? or Specifications, or Ads...
                category_text = category.select("a/text()").extract()[0]
                category_url = Utils.add_domain(
                    category.select("a/@href").extract()[0], self.base_url)
                item['text'] = category_text
                item['url'] = category_url

                # if it's not a 'products' category, mark it and all its subcategories as special

                if category_text != 'Products':
                    item['special'] = 1
                    special = True
                else:
                    special = False

                item['department_id'] = department_id
                item['department_text'] = department_text
                item['department_url'] = department_url

                item['parent_text'] = department_text
                item['parent_url'] = department_url

                item['level'] = 0

                #TODO: do we need description_wc here as well?

                yield Request(item['url'],
                              callback=self.parseCategory,
                              meta={'item': item})

                # get subcategories in category
                subcategories = category.select("ul/li")
                for subcategory in subcategories:
                    item = CategoryItem()

                    item['text'] = subcategory.select("a/text()").extract()[0]
                    item['url'] = Utils.add_domain(
                        subcategory.select("a/@href").extract()[0],
                        self.base_url)

                    item['department_id'] = department_id
                    item['department_text'] = department_text
                    item['department_url'] = department_url

                    item['parent_text'] = category_text
                    item['parent_url'] = category_url

                    item['level'] = -1

                    # if parent is special, category is special
                    if special:
                        item['special'] = 1

                    yield Request(item['url'],
                                  callback=self.parseSubcategory,
                                  meta={'item': item})

            department_id += 1

            # return department
            yield department_item
class WalmartCaSpider(BaseSpider):
    name = "walmartca"
    allowed_domains = ["walmart.ca"]
    start_urls = [
        "http://www.walmart.ca/en",
    ]

    def __init__(self, outfile=None):
        self.root_url = "http://www.walmart.ca"
        self.outfile = outfile

        # set flag that indicates that for this spider, nr of products for each category should be computed
        self.compute_nrproducts = True

        # level that is considered to contain departments
        self.DEPARTMENT_LEVEL = 1

        # keep crawled items represented by (url, parent_url, department_url) tuples
        # to eliminate duplicates
        # (adding department_url makes sure that if one entire department is found as a subcategory of another, for example, both (and their complete category trees) will be crawled)
        self.crawled = []

        # last used category id, used for autoincrementing ids identifying categories
        self.id_count = 0

        # hardcoded values for special categories' item counts. Currently used for 'Value of the day', which typically has a fixed number of products and no place on the page to extract it from
        self.special_itemcount = {'value of the day': 2}

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")

        parent_links = hxs.select(
            "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]"
        )

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})

    # parse category page and extract description and number of products
    def parseCategory(self, response):

        # URLs like health.walmart.com don't have body_as_unicode and generate an exception
        try:
            hxs = HtmlXPathSelector(response)
        except AttributeError, e:
            self.log("Could not get response from " + response.url +
                     "; original exception: " + str(e) + "\n",
                     level=log.WARNING)
            return

        item = response.meta['item']

        # Add department text, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # assign unique id
        item['catid'] = self.id_count
        self.id_count += 1

        # Extract subcategories breakdown if any ("classification" field)
        classification_criteria = hxs.select(
            "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
        classification_dictionary = {}
        for criterion in classification_criteria:
            criterion_name = criterion.select(
                ".//text()[normalize-space()!='']").extract()[0].strip()
            # extract subcategories by this criterion:
            # find first subcategories list element following this criterion name, ignore if subcategory text starts with "See " ("See fewer", "See more")
            subcategories = criterion.select(
                "following-sibling::div[contains(@class,'accordionContainer')][1]/ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]"
            )
            # then filter by regex, keeping only the ones whose text contains at least one letter (for example, customer rating subcategories have no name, only a picture with the number of stars, and we don't want them)
            subcategories = filter(
                lambda x: x.select("text()").re(".*[A-Za-z]+.*"),
                subcategories)

            # if we found these, create the classification dictionary
            if criterion_name and subcategories:
                subcategories_list = []
                for subcategory in subcategories:
                    subcategory_name = subcategory.select(
                        "@title").extract()[0]
                    # replace &nbsp with space, trim
                    subcategory_name = subcategory_name.replace("&nbsp",
                                                                " ").strip()
                    # extract product count
                    subcategory_prodcount = subcategory.select(
                        "span[@class='count']/text()").extract()
                    # if there is a count field, clean it up; otherwise extract the count from the subcategory name
                    if subcategory_prodcount:
                        m = re.match("\(([0-9]+)\)",
                                     subcategory_prodcount[0].strip())
                        # eliminate parentheses surrounding the number
                        if m:
                            subcategory_prodcount = m.group(1)
                        else:
                            subcategory_prodcount = subcategory_prodcount[
                                0].strip()
                    else:
                        # if there is no product count in separate element, try to extract it from subcategory name
                        subcategory_name = subcategory.select(
                            ".//text()[normalize-space()!='']").extract(
                            )[0].replace("&nbsp", " ").replace(u"\xa0",
                                                               " ").strip()
                        m = re.match("(.*)\(([0-9]+)\)", subcategory_name)
                        if m:
                            subcategory_prodcount = m.group(2)
                            subcategory_name = m.group(1).strip()

                    if subcategory_name and subcategory_prodcount:
                        subcategory_item = {
                            "name": subcategory_name,
                            "nr_products": int(subcategory_prodcount)
                        }
                        subcategories_list.append(subcategory_item)

                classification_dictionary[criterion_name] = subcategories_list

        if classification_dictionary:
            item['classification'] = classification_dictionary
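
        # Illustrative sketch only (not taken from a live page): the 'classification'
        # field built above ends up shaped roughly like this, with criterion names as
        # keys; the actual criteria, names and counts depend on the scraped page:
        #
        #   item['classification'] = {
        #       "Brand": [
        #           {"name": "Some Brand", "nr_products": 120},
        #           {"name": "Another Brand", "nr_products": 87},
        #       ],
        #   }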

        ##########################################################################################
        #
        # Extract description title, text, wordcount, and keyword density (if any)

        ###########################################
        #TODO:

        # first search for the description id they usually use (detailedPageDescriptionCopyBlock);
        # the second selector is used more rarely and also yields some false positives, so it needs to be checked for text length as well
        # only fall through to the second one if the first div is not found
        description_holder = hxs.select(
            "//div[@id='detailedPageDescriptionCopyBlock']")

        # flag to tell if we found it with basic rule
        found = True

        if not description_holder:
            found = False
            description_holder = hxs.select(
                "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
                + str(DESC_LEN) + "]/parent::*/parent::*")

        # if none was found, try to find an element with a lot of text (> DESC_LEN (200) characters)
        # this is going to be a paragraph in the description; look for its parent (containing the entire description)
        if not description_holder:
            #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
            #TODO: !!does this mean string length for one paragraph is > DESC_LEN, or string length for the entire text content?
            # I think it means entire text content. We're ok
            description_holder = hxs.select("//p/text()[string-length() > " +
                                            str(DESC_LEN) +
                                            "]/parent::*/parent::*")

        # select element among these with most text
        if description_holder:
            desc_winner = description_holder[0]
            max_text = 0
            for desc_candidate in description_holder:
                # consider only text that is under a <p> tag and that has more than DESC_PAR_LEN (30) characters - then it's likely a description paragraph
                description_texts = desc_candidate.select(
                    ".//p//text()[string-length()>" + str(DESC_PAR_LEN) +
                    "]").extract()
                text_len = len(" ".join(description_texts))
                if text_len > max_text:
                    max_text = text_len
                    desc_winner = desc_candidate
                # if text length is the same, assume one of them is parent of the other
                #  and select the one with greater depth (fewer children)
                elif text_len == max_text and text_len != 0:
                    children_old = float(
                        desc_winner.select("count(*)").extract()[0])
                    children_new = float(
                        desc_candidate.select("count(*)").extract()[0])
                    if children_new < children_old:
                        desc_winner = desc_candidate

            description_holder = desc_winner

        # try to find the description title in a <b> tag in the holder;
        # if it's not found there, try to find it in the first <p> of the description
        # if found there, exclude it from the description body
        if description_holder:
            #TODO:
            # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            # to fix Money Center problem. but maybe it's not always inside p?
            description_title = description_holder.select(
                ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            ).extract()
            if description_title:
                # this will implicitly get the first occurrence of any of these elements (<b>, <h1>, <h3>, <strong>),
                # which is likely to be the title (the title usually comes first)
                item['description_title'] = description_title[0].strip()

            description_texts = description_holder.select(
                "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
                | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]"
            ).extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and reduce(
                    lambda x, y: x or y,
                [line.strip() for line in description_texts]):
                description_text = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                # if it's longer than 4096 characters and was not found with the main rule, it's probably not a description; it also causes problems for the PHP script. Ignore it
                if len(description_text) < 4096 or found:

                    # replace all whitespace with one space, strip, and remove empty texts; then join them
                    item['description_text'] = description_text

                    # replace line breaks with space
                    item['description_text'] = re.sub("\n+", " ",
                                                      item['description_text'])

            if 'description_text' in item:
                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                # sometimes there is no description title here because of malformed html
                # if we can find description text but not a description title, the title is probably malformed - get the first text in the div instead
                if 'description_title' not in item:
                    desc_texts = description_holder.select(
                        "./text()").extract()
                    desc_texts = [text for text in desc_texts if text.strip()]
                    if desc_texts:
                        item['description_title'] = desc_texts[0].strip()

                if 'description_title' in item:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        #
        ##################################################################################

        # Extract product count

        # check whether there is a results-count ('wc') field on the page
        wc_field = hxs.select(
            "//div[@class='mrl mod-toggleItemCount']/span/text() |\
            //div[@class='SPRecordCount']/text()").extract()
        if wc_field:
            m1 = re.match("([0-9]+) Results", wc_field[0])
            if m1:
                item['nr_products'] = int(m1.group(1))
            m2 = re.match(
                "\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_field[0])
            if m2:
                item['nr_products'] = int(m2.group(1))
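
            # Illustrative inputs for the two count formats handled above (assumed,
            # inferred from the regexes rather than taken from a live page):
            #   "567 Results"              -> matched by m1
            #   "Items 1-20 of 567 total"  -> matched by m2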

        # set item count for special items (hardcoded in special_itemcount)
        if item['text'].lower() in self.special_itemcount:
            item['nr_products'] = self.special_itemcount[item['text'].lower()]

        # Extract subcategories if no product count found
        if 'nr_products' in item:
            yield item

        else:
            # look for links to subcategory pages in menu
            subcategories_links = hxs.select(
                "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']"
            )

            if not subcategories_links:
                # # if we haven't found them, try to find subcategories in menu on the left under a "Shop by Category" header
                #     subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

                # if we haven't found them, try to find subcategories in menu on the left - get almost anything
                subcategories_links = hxs.select(
                    "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                    and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                    and not(contains(text(), 'Featured Brands'))]\
                    /following-sibling::node()//a")

            # if we found them, create new category for each and parse it from the beginning

            #TODO
            ########################################
            # Exceptions - doesn't find anything for:
            #   http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
            #
            #
            ########################################

            if subcategories_links:

                # new categories are subcategories of current one - calculate and store their level
                parent_item = item
                level = parent_item['level'] - 1

                #print "URL ", response.url, " CALLING PARSEPAGE"
                for subcategory in subcategories_links:

                    # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                    # if level < -3 assume we've been there and skip

                    if level < -3:
                        continue

                    item = CategoryItem()
                    item['url'] = Utils.add_domain(
                        subcategory.select("@href").extract()[0],
                        self.root_url)
                    text = subcategory.select("text()").extract()

                    if text:
                        item['text'] = text[0].strip()
                    else:
                        # usually means it's something other than what we need
                        #TODO: check
                        continue
                        #print "no text for subcategory ", item, response.url

                    # # take care of unicode
                    # item['text'] = item['text'].encode("utf-8", errors=ignore)

                    item['level'] = level

                    item['parent_text'] = parent_item['text']
                    item['parent_url'] = parent_item['url']
                    item['parent_catid'] = parent_item['catid']

                    if 'parent_text' in parent_item:
                        item['grandparent_text'] = parent_item['parent_text']
                    if 'parent_url' in parent_item:
                        item['grandparent_url'] = parent_item['parent_url']

                    # if parent's parents are missing, level must be at least 0
                    if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                        assert level >= 0

                    # send subcategory items to be parsed again
                    # if not already crawled
                    if (item['url'], item['parent_url'],
                            response.meta['department_url']
                        ) not in self.crawled:
                        yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                            'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                        self.crawled.append((item['url'], item['parent_url'],
                                             response.meta['department_url']))

                # return current item
                # idea for sending the parent and collecting nr_products: send all of these subcats as a list in meta, pass it on, and when the list becomes empty, yield the parent
                yield parent_item
                #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

            # if we can't find either products on the page or subcategory links
            else:
                #print "URL", response.url, " NO SUBCATs"
                #item['nr_products'] = 0
                yield item
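
    # A minimal sketch (assumed, not part of the original spider) of how parseCategory
    # expects to be scheduled: the caller must pass the current CategoryItem plus the
    # three department_* keys in meta, since the callback reads all of them up front.
    #
    #   yield Request(category_url, callback=self.parseCategory, meta={
    #       'item': item,
    #       'department_text': department_text,
    #       'department_url': department_url,
    #       'department_id': department_id,
    #   })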
Exemple #29
0
    def parseResults_samsung(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        # add product URLs to be parsed to this list
        if 'search_results' not in response.meta:
            product_urls = set()
        else:
            product_urls = response.meta['search_results']

        #TODO: implement support for multiple results pages?

        # if this selector finds any results, it means we are already on a product page
        results = hxs.select("//ul[@class='product-info']")

        if results:
            product_urls.add(response.url)
            # it also means it's an exact match, so stop search here
            response.meta['pending_requests'] = []
            response.meta['threshold'] = 0.2
            # # also temporarily lower threshold
            # self.threshold = 0.2

        else:

            # otherwise, try to see if this is a results page

            # Content seems to be generated with javascript - open the page with selenium, extract its content, then return control back here
            # first check whether the page already contains what we need, or whether we need to retry it with selenium
            results = hxs.select(
                "//input[contains(@id,'detailpageurl')]/@value")
            if not results:
                print 'NO RESULTS: ', response.url

                #results = []

                # COMMENTED FOR TESTING
                # use selenium
                request = self.get_samsung_results(response.url)
                # get body of request
                request_body = request.body
                resp_for_scrapy = TextResponse('none', 200, {}, request_body,
                                               [], None)

                hxs = HtmlXPathSelector(resp_for_scrapy)
                #print "PAGE_SOURCE: ", page_source
                results = hxs.select(
                    "//input[contains(@id,'detailpageurl')]/@value")
            else:
                print 'WE ALREADY HAD RESULTS! '
                print 'RESULTS: ', results

            for result in results:
                product_url = Utils.add_domain(result.extract().strip(),
                                               "http://www.samsung.com")
                product_urls.add(product_url)
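                # (Utils.add_domain is assumed here to prefix the domain when the
                # extracted value is a relative path, e.g. "/us/tvs/some-model"
                # would presumably become "http://www.samsung.com/us/tvs/some-model")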

        if product_urls and ('pending_requests' not in response.meta
                             or not response.meta['pending_requests']):
            request = Request(product_urls.pop(),
                              callback=self.parse_product_samsung,
                              meta=response.meta)
            request.meta['items'] = items

            # this will be the new product_urls list with the first item popped
            request.meta['search_results'] = product_urls

            return request

        # if there were no results, the request will never get back to reduceResults
        else:

            # # we are finished and should close the driver
            # if self.driver:
            #     self.driver.close()

            response.meta['items'] = items
            response.meta['parsed'] = True
            response.meta['search_results'] = product_urls
            # only send the response we have as an argument, no need to make a new request
            return self.reduceResults(response)
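
    # Note (added for clarity, not in the original): when the chain terminates here,
    # reduceResults receives the current response with meta carrying the accumulated
    # 'items', 'parsed' = True, and whatever is left in 'search_results'; otherwise
    # the chain continues through parse_product_samsung, one popped product URL per
    # request, until the set of pending product URLs is empty.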
Exemple #30
0
#!/usr/bin/python

# Get ids from a CSV file containing one id on each line, and generate Walmart product URLs based on them

import sys
import re
from spiders_utils import Utils

base_url = "http://www.walmart.com/ip/"
with open(sys.argv[1]) as idsfile:
    for line in idsfile:
        # if there are other fields ignore them (get the first one)
        if "," in line:
            id_string = line.strip().split(",")[0]
        else:
            id_string = line.strip()
        # if it's not a number ignore it (could be a header line)
        if re.match("[0-9]+", id_string):
            # generate URL and output it
            url = Utils.add_domain(id_string, base_url)
            print url
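
# Example usage (assumed filenames - the script name is not given in the source):
#
#   python generate_walmart_urls.py ids.csv > product_urls.txt
#
# where ids.csv contains one Walmart product id per line (optionally followed by other
# comma-separated fields), and each output line would presumably look like
# http://www.walmart.com/ip/12345678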