Example #1
    def parse(self, response):
        category = response.meta.get('category')
        if category is None:
            breadcrumbs = response.css('#breadcrumb ul li::text').extract()
            if breadcrumbs:
                category = breadcrumbs[-1].strip(u'>\xa0')

        subcategories = self._scrape_subcategory_links(response)
        if subcategories:
            for link in subcategories:
                category_text = re.search(r'(.+?) \(\d+\)', link.text)
                category_text = (category_text.group(1)
                                 if category_text else link.text)
                yield Request(link.url, meta={'category': category_text})
        elif response.css('#breadcrumb'):
            for link in self._scrape_product_links(response):
                yield ProductItem(product_url=link.url, category=category)
            next_link = response.xpath('//a[@rel="next"]/@href')
            if next_link:
                yield Request(next_link.extract()[0],
                              meta={'category': category})
        else:
            for link in self._scrape_department_links(response):
                yield Request(link.url, meta={'category': link.text.strip()})
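
The pattern above threads the category name through Request.meta so that deeper callbacks can tag items with it. A minimal self-contained sketch of that hand-off, using current Scrapy selector APIs; the spider name, start URL, and CSS selector are placeholders:

    import scrapy

    class MetaDemoSpider(scrapy.Spider):
        # hypothetical spider; name and start URL are placeholders
        name = 'meta_demo'
        start_urls = ['http://example.com/departments']

        def parse(self, response):
            # pass the link text down to the next callback via meta,
            # so product pages can later be tagged with their category
            for link in response.css('ul.departments a'):
                yield scrapy.Request(
                    response.urljoin(link.attrib['href']),
                    callback=self.parse_category,
                    meta={'category': link.css('::text').get('').strip()})

        def parse_category(self, response):
            # the value survives the scheduler round-trip in response.meta
            category = response.meta.get('category')
            self.logger.info('crawling %s as category %r', response.url, category)
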
    def parsePage_overstock(self, response):
        hxs = HtmlXPathSelector(response)

        product_links = hxs.select("//a[@class='pro-thumb']/@href")
        for product_link in product_links:
            item = ProductItem()
            url = product_link.extract()

            # remove irrelevant last part of url
            m = re.match("(.*product\.html)\?re.*", url)
            if m:
                url = m.group(1)
            item['product_url'] = url
            yield item

        # get next pages, stop when you find no more product urls
        # url = http://www.overstock.com/Electronics/Laptops/133/subcat.html?index=101&sort=Top+Sellers&TID=SORT:Top+Sellers&count=25&products=7516115,6519070,7516111,7646312,7382330,7626684,8086492,8233094,7646360,8135172,6691004,8022278&infinite=true
        if product_links:
            # (disabled) get the category, and if it's Laptops treat it specially using the hardcoded url:
            # m = re.match("http://www.overstock.com/[^/]+/([^/]+)/.*", self.cat_page)
            # if m and m.group(1) == "Laptops":

            # parse next pages as well
            index = int(response.meta['index']) + 25
            next_url = (self.cat_page + "&index=" + str(index) +
                        "&count=25&products=7516115,6519070,7516111,7646312,"
                        "7382330,7626684,8086492,8233094,7646360,8135172,"
                        "6691004,8022278&infinite=true")
            yield Request(url=next_url,
                          callback=self.parsePage_overstock,
                          headers={"Referer": self.cat_page + "&page=2",
                                   "X-Requested-With": "XMLHttpRequest"},
                          meta={"index": index})
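
Building the next-page URL by string concatenation, as above, works but silently appends a second index parameter if the base URL already carries one. A sketch of a safer rewrite using the Python 3 urllib.parse module (the surrounding code is Python 2, where the module is urlparse; the helper name is ours):

    from urllib.parse import urlencode, urlparse, parse_qsl, urlunparse

    def bump_index(url, step=25):
        # return `url` with its 'index' query parameter advanced by `step`;
        # existing parameters are preserved and re-encoded rather than
        # appended blindly
        parts = urlparse(url)
        params = dict(parse_qsl(parts.query))
        params['index'] = str(int(params.get('index', '0')) + step)
        return urlunparse(parts._replace(query=urlencode(params)))

    # bump_index('http://www.overstock.com/subcat.html?index=101&count=25')
    # -> 'http://www.overstock.com/subcat.html?index=126&count=25'
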
    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)

        try:
            category = hxs.select(
                "//a[@class='a-link-normal a-color-base a-text-bold a-text-normal']/text()"
            ).extract()[0]
        except Exception:
            category = None

        try:
            nr_results = hxs.select("//h2[@id='s-result-count']/text()").re(
                "[0-9,]+")[-1]
            print nr_results, "FOR", category
        except Exception:
            pass

        product_links = hxs.select(
            "//div[contains(@class,'a-row')]//a[contains(@class, 'a-link-normal s-access-detail-page  a-text-normal')]/@href"
        )
        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = product_link.extract()
            item['category'] = category
            yield item

        # select next page, if any, parse it too with this method
        root_url = "http://www.amazon.co.uk"
        next_page = hxs.select("//a[@title='Next Page']/@href").extract()

        if next_page:
            page_url = root_url + next_page[0]
            yield Request(url=page_url, callback=self.parsePage)
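
Prefixing the extracted href with a hard-coded root_url breaks as soon as the site emits an absolute link. A sketch of the same next-page idiom using response.urljoin(), which resolves relative and absolute hrefs alike (the spider skeleton is hypothetical):

    import scrapy

    class NextPageSpider(scrapy.Spider):
        name = 'next_page_demo'  # placeholder
        start_urls = ['http://example.com/search']  # placeholder

        def parse(self, response):
            # ... extract items here ...
            next_href = response.xpath("//a[@title='Next Page']/@href").get()
            if next_href:
                # urljoin resolves the href against response.url
                yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
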
Example #4
    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)

        if response.url in self.parsed_pages:
            return
        else:
            self.parsed_pages.append(response.url)

        product_links = hxs.select("//div[@class='itemText']/div[@class='wrapper']/a")

        # if no product links are found, try to crawl the subcategories in the left menu,
        # but only the ones under the first section of the menu.
        # do this by selecting all dd elements in the menu until a dt (another section title) is found
        if not product_links:
            # select first element in menu
            el = hxs.select("//dl[@class='categoryList primaryNav']//dd[1]")

            # while we still find another subcategory in the menu before the next title
            while el:
                # parse the link as a subcategory
                subcat_url = el.select("a/@href").extract()[0]
                # strip query parameters from the URL.
                # if this is not done, the next-page URLs constructed below will
                # always point back to the first page, causing an infinite loop
                m = re.match("([^\?]+)\?.*", subcat_url)
                if m:
                    subcat_url = m.group(1)
                yield Request(url=subcat_url, callback=self.parsePage, meta={'page': 1})

                # get next element in menu (that is not a title)
                el = el.select("following-sibling::*[1][self::dd]")

        else:

            for product_link in product_links:
                item = ProductItem()
                item['product_url'] = product_link.select("@href").extract()[0]
                yield item

            # crawl further pages - construct page URLs artificially by changing a parameter in the URL.
            # only do this if the page has a "next" link, so as not to get stuck in an infinite loop

            next_page = hxs.select("//li[@class='enabled']/a[@title='next']")
            if next_page:
                page = int(response.meta['page']) + 1
                next_url = ""
                if page == 2:
                    next_url = response.url + "/Page-2"
                else:
                    m = re.match("(http://www.newegg.com/.*Page-)[0-9]+", response.url)
                    if m:
                        next_url = m.group(1) + str(page)

                    else:
                        self.log("Error: unexpected url " + response.url + ", page " + str(page), level=log.WARNING)
                        return

                yield Request(url=next_url, callback=self.parsePage, meta={'page': page})
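
The menu walk above relies on the XPath step following-sibling::*[1][self::dd]: take the immediately following sibling, but only if it is another dd, so iteration stops at the next dt heading. A standalone demonstration with lxml on an inline snippet:

    from lxml import html

    doc = html.fromstring("""
    <dl>
      <dt>Section A</dt>
      <dd><a href="/a1">A1</a></dd>
      <dd><a href="/a2">A2</a></dd>
      <dt>Section B</dt>
      <dd><a href="/b1">B1</a></dd>
    </dl>
    """)

    # start at the first <dd>, then keep stepping while the next
    # sibling is still a <dd>
    el = doc.xpath("//dl/dd[1]")
    links = []
    while el:
        node = el[0]
        links.append(node.xpath("a/@href")[0])
        el = node.xpath("following-sibling::*[1][self::dd]")
    print(links)  # ['/a1', '/a2'] - Section B is never reached
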
    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        products = hxs.select("//div[@class='shortDescription']")
        for product in products:
            item = ProductItem()
            item['product_url'] = product.select("a/@href").extract()[0]
            items.append(item)
        return items

    def parsePage_macys(self, response):
        hxs = HtmlXPathSelector(response)
        root_url = "http://www1.macys.com"

        # extract product URLs
        product_links = hxs.select("//div[@class='shortDescription']/a/@href")
        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = root_url + product_link.extract()
            yield item

    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)
        products = hxs.select("//li[@class='product-cell ']/a")
        items = []

        for product in products:
            item = ProductItem()
            item['product_url'] = product.select("@href").extract()[0]
            items.append(item)

        return items
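
The selector above matches the literal class value 'product-cell ' including its trailing space, so it silently misses any element whose class attribute is written differently. A small lxml demonstration of the standard token-based alternative:

    from lxml import html

    doc = html.fromstring(
        '<ul><li class="product-cell featured"><a href="/p1">p1</a></li></ul>')

    # exact @class comparison is brittle: it must match the full
    # attribute value character for character
    exact = doc.xpath("//li[@class='product-cell ']/a/@href")

    # token match: pad with spaces and search for ' product-cell '
    token = doc.xpath("//li[contains(concat(' ', normalize-space(@class), ' '),"
                      " ' product-cell ')]/a/@href")

    print(exact)  # [] - misses the element
    print(token)  # ['/p1']
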
    def parseBsPage(self, response):
        hxs = HtmlXPathSelector(response)
        products = hxs.select("//div[@class='zg_itemImmersion']")

        for product in products:
            item = ProductItem()
            url = product.select(
                "div[@class='zg_itemWrapper']//div[@class='zg_title']/a/@href"
            ).extract()
            if url:
                item['product_url'] = url[0].strip()
                yield item
Example #9
    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)

        products = hxs.select("//div[@class='shortDescription']/a")

        items = []
        root_url = "http://www1.macys.com"
        for product in products:
            item = ProductItem()
            item['product_url'] = root_url + product.select("@href").extract()[0]
            items.append(item)

        return items

    def parseBrand(self, response):
        hxs = HtmlXPathSelector(response)

        # category of items on current page
        category = response.meta['category']

        # the current and total product counts for this brand are carried in meta;
        # they decide the stopping criterion for pagination
        if 'total_product_count' in response.meta:
            product_count = response.meta['total_product_count']
            cur_product_count = response.meta['current_product_count']
        else:
            # extract number of products for this brand
            product_count = int(
                hxs.select("//h2[@id='productCount']//text()").re("[0-9]+")[0])
            cur_product_count = 0

        # extract products from this page
        product_links = hxs.select(
            "//h3[@class='productTitle']/a/@href").extract()
        # add domain
        product_urls = map(lambda x: Utils.add_domain(x, self.base_url),
                           product_links)

        for product_url in product_urls:
            item = ProductItem()
            # remove parameters in url
            item['product_url'] = Utils.clean_url(product_url)
            item['category'] = category

            yield item

        # add nr of extracted products to current product count
        cur_product_count += len(product_urls)

        # get next page if any
        next_page = self.build_next_page_url(response.url,
                                             product_count,
                                             cur_product_count,
                                             first=('total_product_count'
                                                    not in response.meta))

        if next_page:
            yield Request(url=next_page,
                          callback=self.parseBrand,
                          meta={
                              'total_product_count': product_count,
                              'current_product_count': cur_product_count,
                              'category': category
                          })
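
The body of build_next_page_url is not shown in the source. One plausible shape, assuming the site paginates via an 'offset' query parameter and a page size of 24 (both assumptions; the real helper may differ):

    from urllib.parse import urlencode, urlparse, parse_qsl, urlunparse

    def build_next_page_url(url, product_count, cur_product_count,
                            first=False, page_size=24):
        # stop once the running count reaches the advertised total
        if cur_product_count >= product_count:
            return None
        parts = urlparse(url)
        params = dict(parse_qsl(parts.query))
        # on the first page there is no offset parameter yet
        offset = 0 if first else int(params.get('offset', '0'))
        params['offset'] = str(offset + page_size)
        return urlunparse(parts._replace(query=urlencode(params)))
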
Example #11
    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)
        root_url = "http://www.walmart.com"
        product_links = hxs.select("//a[@class='prodLink ListItemLink']/@href")

        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = root_url + product_link.extract()
            yield item

        # select next page, if any, parse it too with this method
        next_page = hxs.select("//a[@class='link-pageNum' and text()=' Next ']/@href").extract()
        if next_page:
            page_url = root_url + next_page[0]
            yield Request(url=page_url, callback=self.parsePage)

    def parse(self, response):
        boxes = self._scrape_product_boxes(response)
        if boxes is None:  # No products are shown here, go deeper into subcategories.
            for request in map(Request, self._scrape_subcategories(response)):
                yield request
        else:
            # Scrape product links.
            category_name = self._scrape_category_name(response)
            for url in map(self._scrape_product_link, boxes):
                yield ProductItem(product_url=url, category=category_name)

            # Go to the next page, if available.
            url = response.css('a.next.i-next::attr(href)')
            if url:
                yield Request(url.extract()[0])

    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)
        root_url = "http://shop.nordstrom.com"

        # extract product URLs
        product_links = hxs.select("//div/a[@class='title']/@href")
        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = root_url + product_link.extract()
            yield item

        # select next page, if any, parse it too with this method
        next_page = hxs.select(
            "//ul[@class='arrows']/li[@class='next']/a/@href").extract()
        if next_page:
            page_url = next_page[0]
            yield Request(url=page_url, callback=self.parsePage)

    def parsePage_bestbuy(self, response):
        hxs = HtmlXPathSelector(response)
        root_url = "http://www.bestbuy.com"

        # extract product URLs
        product_links = hxs.select("//div[@class='info-main']/h3/a/@href")
        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = root_url + product_link.extract()
            yield item

        # select next page, if any, parse it too with this method
        next_page = hxs.select(
            "//ul[@class='pagination']/li/a[@class='next']/@href").extract()
        if next_page:
            page_url = root_url + next_page[0]
            yield Request(url=page_url, callback=self.parsePage_bestbuy)
Example #15
    def parseBrandPage(self, response):
        hxs = HtmlXPathSelector(response)

        # category of items on this page
        category = response.meta['category']

        # extract item count
        if 'total_item_count' in response.meta:
            total_item_count = response.meta['total_item_count']
        else:
            total_item_count = int(
                hxs.select("//p[@id='filtered-products-count']").re("[0-9]+")[0])

        # extract product holders. we don't take the <a> elements directly because
        # each holder contains several equivalent links and we only want one per product
        product_holders = hxs.select("//div[@class='product ']")
        for product_holder in product_holders:
            # extract first link in product holder
            product_link = product_holder.select(".//a/@href").extract()[0]
            product_url = Utils.add_domain(product_link, self.base_url)

            item = ProductItem()
            item['product_url'] = product_url
            item['category'] = category

            yield item

        # crawl next pages if any left
        if 'offset' not in response.meta:
            offset = 0
        else:
            offset = response.meta['offset']

        next_page = self.build_next_page_url(response.url, total_item_count,
                                             offset)

        # if there are more products to crawl, send new request
        if next_page:
            yield Request(url=next_page,
                          callback=self.parseBrandPage,
                          meta={
                              'offset': offset + 1,
                              'total_item_count': total_item_count,
                              'category': category
                          })

    def parsePage_tigerdirect(self, response):
        hxs = HtmlXPathSelector(response)

        # without the "resultsWrap" div, these are found on pages we don't want as well
        product_links = hxs.select(
            "//div[@class='resultsWrap listView']//h3[@class='itemName']/a/@href"
        ).extract()
        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = Utils.add_domain(
                product_link, "http://www.tigerdirect.com")
            # remove CatId from URL (generates duplicates)
            m = re.match("(.*)&CatId=[0-9]+", item['product_url'])
            if m:
                item['product_url'] = m.group(1)
            yield item

        # parse next pages (if results spread on more than 1 page)
        #TODO: not sure if all of them are extracted
        next_page = hxs.select("//a[@title='Next page']")
        if next_page:
            page_nr = response.meta['page'] + 1
            # base_url = response.meta['base_url']
            # # remove trailing "&" character at the end of the URL
            # m = re.match("(.*)&", base_url)
            # if m:
            # 	base_url = m.group(1)
            # yield Request(url = base_url + "&page=%d"%page_nr, callback = self.parsePage_tigerdirect,\
            # 	 meta = {'page' : page_nr, 'base_url' : response.meta['base_url']})
            next_page_url = Utils.add_domain(
                next_page.select("@href").extract()[0],
                "http://www.tigerdirect.com")
            yield Request(url=next_page_url,
                          callback=self.parsePage_tigerdirect,
                          meta={'page': page_nr})

        # if no product links were found, look for links to subcategory pages
        # instead and parse those for product links.
        # dont_filter is needed because this exact URL was already fetched above
        if not product_links:
            yield Request(url=response.url,
                          callback=self.parseSubcats_tigerdirect,
                          dont_filter=True)
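
The regex above only strips CatId when it is the last parameter in the URL. An alternative sketch that removes the parameter wherever it appears, again using the Python 3 urllib.parse module (the helper name is ours):

    from urllib.parse import urlencode, urlparse, parse_qsl, urlunparse

    def drop_param(url, name):
        # remove one query parameter (e.g. CatId, which only encodes the
        # navigation path and therefore creates duplicate product URLs)
        parts = urlparse(url)
        params = [(k, v) for k, v in parse_qsl(parts.query) if k != name]
        return urlunparse(parts._replace(query=urlencode(params)))

    # drop_param('http://www.tigerdirect.com/item.asp?CatId=456&EdpNo=123', 'CatId')
    # -> 'http://www.tigerdirect.com/item.asp?EdpNo=123'
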
    def parseBrandPage(self, response):
        hxs = HtmlXPathSelector(response)

        # category of items on this page
        category = response.meta['category']

        # extract product urls
        # only select from t1 tab (also see build_url... on omitting t4)
        product_urls = hxs.select("//div[contains(@id,'t1')]//div[@class='pl_productName']/a/@href")
        for url in product_urls:
            item = ProductItem()
            product_url = url.extract().encode("utf-8")
            # percent-encode unusual characters in the product url - only in the
            # second-to-last path segment (the trailing slash makes the last
            # segment empty, so the product name is the one before it)
            product_url = '/'.join(product_url.split('/')[:-2]) + '/' + urllib.quote(product_url.split('/')[-2]) + '/'
            item['product_url'] = product_url
            item['category'] = category
            yield item


        # crawl next pages if any
        # find if there is a next page
        # select maximum page number on the page
        available_pages = map(int, hxs.select(
            "//div[contains(@id,'t1')]//div[@class='pagination']/ul/li/a/text()"
        ).re("[0-9]+"))
        if available_pages:
            max_page = max(available_pages)
        else:
            max_page = 0
        # extract 'next page' link
        # only select from t1 tab (also see build_url... on omitting t4)
        next_page_link = hxs.select("//div[contains(@id,'t1')]//li[@class='next']/a")
        if next_page_link:
            # extract the arguments of the js call for the next page and use them to build its url
            js_call_args = self.extract_boots_js_args(next_page_link.select("@href").extract()[0].encode("utf-8"))
            next_page = self.build_boots_param_url(response.url, *js_call_args, max_page=max_page)

            # if there is no next page, function will return None
            if next_page:
                yield Request(url=next_page, callback=self.parseBrandPage, meta={'category': category})
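
The split/quote one-liner above encodes only the product-name segment: the path ends with a slash, so the last split element is empty and the name is the segment before it. The same step written out, assuming that trailing-slash layout (Python 3 urllib.parse; the Python 2 code above uses urllib.quote):

    from urllib.parse import quote

    def quote_name_segment(url):
        # percent-encode the second-to-last path segment only,
        # leaving the rest of the URL untouched
        head, name, _empty = url.rsplit('/', 2)
        return head + '/' + quote(name) + '/'

    # quote_name_segment('http://example.com/brand/No7 Lift & Luminate/')
    # -> 'http://example.com/brand/No7%20Lift%20%26%20Luminate/'
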
Example #18
    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)

        products = hxs.select("//a[@class='url']")
        root_url = "http://www.staples.com"

        for product in products:
            item = ProductItem()
            item['product_url'] = root_url + product.select("@href").extract()[0]
            yield item

        nextPage = hxs.select("//li[@class='pageNext']/a/@href").extract()
        zipcode = "12345"
        if nextPage:
            # parse the next page too (first convert the url from unicode to str)
            yield Request(str(nextPage[0]),
                          callback=self.parsePage,
                          cookies={"zipcode": zipcode},
                          headers={"Cookie": "zipcode=" + zipcode},
                          meta={"dont_redirect": True, "dont_merge_cookies": True})
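
The dont_merge_cookies meta key disables Scrapy's cookie middleware for that request, which is why the zipcode cookie is also forced through a raw Cookie header; dont_redirect keeps the shop from bouncing the crawl to a location-chooser page. A condensed sketch of the idiom (the spider skeleton and selectors are placeholders):

    import scrapy

    class ZipcodeSpider(scrapy.Spider):
        name = 'zipcode_demo'  # placeholder
        start_urls = ['http://example.com/catalog']  # placeholder
        zipcode = '12345'

        def parse(self, response):
            next_href = response.xpath("//li[@class='pageNext']/a/@href").get()
            if next_href:
                yield scrapy.Request(
                    response.urljoin(next_href),
                    callback=self.parse,
                    # cookie middleware is off for this request, so send
                    # the cookie manually in the header
                    headers={'Cookie': 'zipcode=' + self.zipcode},
                    meta={'dont_redirect': True, 'dont_merge_cookies': True})
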
    def parsePage(self, response):
        hxs = HtmlXPathSelector(response)
        product_links = hxs.select("//h3[@class='newaps']/a/@href")
        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = product_link.extract()
            yield item

        # select next page, if any, parse it too with this method
        root_url = "http://www.amazon.com"
        next_page = hxs.select("//a[@title='Next Page']/@href").extract()
        if next_page:
            page_url = root_url + next_page[0]
            yield Request(url=page_url, callback=self.parsePage)

        # if no products were found, maybe this was a bestsellers page.
        # dont_filter is needed because this URL was already fetched by this callback
        if not product_links:
            yield Request(response.url, callback=self.parseBsPage, dont_filter=True)

            # get next pages as well
            page_urls = hxs.select(
                "//div[@id='zg_paginationWrapper']//a/@href").extract()
            for page_url in page_urls:
                yield Request(page_url, callback=self.parseBsPage)
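
Re-requesting response.url with a different callback, as above, is normally dropped by Scrapy's duplicate-request filter; dont_filter=True lets the second pass through. A minimal sketch of the fallback-dispatch pattern (spider skeleton hypothetical):

    import scrapy

    class FallbackSpider(scrapy.Spider):
        name = 'fallback_demo'  # placeholder
        start_urls = ['http://example.com/list']  # placeholder

        def parse(self, response):
            links = response.xpath("//h3[@class='newaps']/a/@href").getall()
            if not links:
                # same URL, different parser: bypass the dupe filter
                yield scrapy.Request(response.url,
                                     callback=self.parse_bestsellers,
                                     dont_filter=True)

        def parse_bestsellers(self, response):
            self.logger.info('parsing %s as a bestsellers page', response.url)
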