def parse(self, response):
    category = response.meta.get('category')
    if category is None:
        breadcrumbs = response.css('#breadcrumb ul li::text').extract()
        if breadcrumbs:
            category = breadcrumbs[-1].strip(u'>\xa0')

    subcategories = self._scrape_subcategory_links(response)
    if subcategories:
        for link in subcategories:
            # strip the trailing product count, e.g. "Laptops (42)" -> "Laptops"
            match = re.search(r'(.+?) \(\d+\)', link.text)
            category_text = match.group(1) if match else link.text
            yield Request(link.url, meta={'category': category_text})
    elif response.css('#breadcrumb'):
        for link in self._scrape_product_links(response):
            yield ProductItem(product_url=link.url, category=category)
        next_link = response.xpath('//a[@rel="next"]/@href')
        if next_link:
            yield Request(next_link.extract()[0], meta={'category': category})
    else:
        for link in self._scrape_department_links(response):
            yield Request(link.url, meta={'category': link.text.strip()})
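# The _scrape_subcategory_links / _scrape_product_links / _scrape_department_links
# helpers used above are not shown here. Judging by the link.url / link.text
# attributes, they most likely return scrapy Link objects produced by a link
# extractor. A minimal sketch of one of them, with a hypothetical
# restrict_xpaths value; the actual selectors are not from the original source.
from scrapy.linkextractors import LinkExtractor

def _scrape_subcategory_links(self, response):
    # returns a list of scrapy Link objects, each with .url and .text attributes
    return LinkExtractor(
        restrict_xpaths="//div[@id='leftNav']//ul/li"  # hypothetical menu selector
    ).extract_links(response)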
def parsePage_overstock(self, response):
    hxs = HtmlXPathSelector(response)

    product_links = hxs.select("//a[@class='pro-thumb']/@href")
    for product_link in product_links:
        item = ProductItem()
        url = product_link.extract()
        # remove the irrelevant trailing part of the url
        m = re.match(r"(.*product\.html)\?re.*", url)
        if m:
            url = m.group(1)
        item['product_url'] = url
        yield item

    # Get next pages, stopping when a page yields no more product urls.
    # Example of a next-page XHR url:
    # http://www.overstock.com/Electronics/Laptops/133/subcat.html?index=101&sort=Top+Sellers&TID=SORT:Top+Sellers&count=25&products=7516115,6519070,7516111,7646312,7382330,7626684,8086492,8233094,7646360,8135172,6691004,8022278&infinite=true
    if product_links:
        # parse the next pages as well, mimicking the site's infinite-scroll XHR
        index = int(response.meta['index']) + 25
        yield Request(
            url=self.cat_page + "&index=" + str(index)
                + "&count=25&products=7516115,6519070,7516111,7646312,7382330,7626684,8086492,8233094,7646360,8135172,6691004,8022278&infinite=true",
            callback=self.parsePage_overstock,
            headers={"Referer": self.cat_page + "&page=2",
                     "X-Requested-With": "XMLHttpRequest"},
            meta={"index": index})
def parsePage(self, response):
    hxs = HtmlXPathSelector(response)

    try:
        category = hxs.select(
            "//a[@class='a-link-normal a-color-base a-text-bold a-text-normal']/text()"
        ).extract()[0]
    except Exception:
        category = None

    try:
        nr_results = hxs.select("//h2[@id='s-result-count']/text()").re("[0-9,]+")[-1]
        print nr_results, "FOR", category
    except Exception:
        pass

    product_links = hxs.select(
        "//div[contains(@class,'a-row')]//a[contains(@class, 'a-link-normal s-access-detail-page a-text-normal')]/@href")
    for product_link in product_links:
        item = ProductItem()
        item['product_url'] = product_link.extract()
        item['category'] = category
        yield item

    # select the next page, if any, and parse it too with this method
    root_url = "http://www.amazon.co.uk"
    next_page = hxs.select("//a[@title='Next Page']/@href").extract()
    if next_page:
        page_url = root_url + next_page[0]
        yield Request(url=page_url, callback=self.parsePage)
def parsePage(self, response):
    hxs = HtmlXPathSelector(response)

    # avoid parsing the same page twice
    if response.url in self.parsed_pages:
        return
    self.parsed_pages.append(response.url)

    product_links = hxs.select("//div[@class='itemText']/div[@class='wrapper']/a")

    # If no product links are found, crawl the subcategories in the left menu,
    # but only the ones under the first part of the menu. Do this by selecting
    # all dd elements in the menu until a dt (another title) element is found.
    if not product_links:
        # select the first element in the menu
        el = hxs.select("//dl[@class='categoryList primaryNav']//dd[1]")
        # while we still find another subcategory in the menu before the next title
        while el:
            # parse the link as a subcategory
            subcat_url = el.select("a/@href").extract()[0]
            # Clean the URL of parameters. If this is not done, the next-page
            # URLs constructed below always point back to the first page,
            # causing an infinite loop.
            m = re.match(r"([^\?]+)\?.*", subcat_url)
            if m:
                subcat_url = m.group(1)
            yield Request(url=subcat_url, callback=self.parsePage, meta={'page': 1})
            # get the next element in the menu (that is not a title)
            el = el.select("following-sibling::*[1][self::dd]")
    else:
        for product_link in product_links:
            item = ProductItem()
            item['product_url'] = product_link.select("@href").extract()[0]
            yield item

        # Crawl further pages by artificially constructing page names from the
        # URL. Only do this if there is a "next" link on the page, so as not to
        # get stuck in an infinite loop.
        next_page = hxs.select("//li[@class='enabled']/a[@title='next']")
        if next_page:
            page = int(response.meta['page']) + 1
            if page == 2:
                next_url = response.url + "/Page-2"
            else:
                m = re.match(r"(http://www.newegg.com/.*Page-)[0-9]+", response.url)
                if m:
                    next_url = m.group(1) + str(page)
                else:
                    self.log("Error: not ok url " + response.url + " , page " + str(page),
                             level=log.WARNING)
                    return
            yield Request(url=next_url, callback=self.parsePage, meta={'page': page})
def parsePage(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    products = hxs.select("//div[@class='shortDescription']")
    for product in products:
        item = ProductItem()
        item['product_url'] = product.select("a/@href").extract()[0]
        items.append(item)
    return items
def parsePage_macys(self, response):
    hxs = HtmlXPathSelector(response)
    root_url = "http://www1.macys.com"

    # extract product URLs
    product_links = hxs.select("//div[@class='shortDescription']/a/@href")
    for product_link in product_links:
        item = ProductItem()
        item['product_url'] = root_url + product_link.extract()
        yield item
def parsePage(self, response): hxs = HtmlXPathSelector(response) products = hxs.select("//li[@class='product-cell ']/a") items = [] for product in products: item = ProductItem() item['product_url'] = product.select("@href").extract()[0] items.append(item) return items
def parseBsPage(self, response):
    hxs = HtmlXPathSelector(response)
    products = hxs.select("//div[@class='zg_itemImmersion']")
    for product in products:
        item = ProductItem()
        url = product.select(
            "div[@class='zg_itemWrapper']//div[@class='zg_title']/a/@href").extract()
        if url:
            item['product_url'] = url[0].strip()
            yield item
def parsePage(self, response): hxs = HtmlXPathSelector(response) products = hxs.select("//div[@class='shortDescription']/a") items = [] root_url = "http://www1.macys.com" for product in products: item = ProductItem() item['product_url'] = root_url + product.select("@href").extract()[0] items.append(item) return items
def parseBrand(self, response):
    hxs = HtmlXPathSelector(response)

    # category of items on the current page
    category = response.meta['category']

    # Set parameters specifying the current and total product counts for this
    # brand, used to decide the stop criterion for pagination.
    if 'total_product_count' in response.meta:
        product_count = response.meta['total_product_count']
        cur_product_count = response.meta['current_product_count']
    else:
        # extract the number of products for this brand
        product_count = int(
            hxs.select("//h2[@id='productCount']//text()").re("[0-9]+")[0])
        cur_product_count = 0

    # extract products from this page
    product_links = hxs.select("//h3[@class='productTitle']/a/@href").extract()
    # add domain
    product_urls = map(lambda x: Utils.add_domain(x, self.base_url), product_links)

    for product_url in product_urls:
        item = ProductItem()
        # remove parameters from the url
        item['product_url'] = Utils.clean_url(product_url)
        item['category'] = category
        yield item

    # add the number of extracted products to the current product count
    cur_product_count += len(product_urls)

    # get the next page, if any
    next_page = self.build_next_page_url(
        response.url, product_count, cur_product_count,
        first=('total_product_count' not in response.meta))
    if next_page:
        yield Request(url=next_page, callback=self.parseBrand,
                      meta={'total_product_count': product_count,
                            'current_product_count': cur_product_count,
                            'category': category})
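# build_next_page_url is referenced above but not shown in this file. A minimal
# sketch of what it might look like, assuming the site paginates with a numeric
# "page" query parameter; the parameter name is an assumption, not taken from
# the original source.
def build_next_page_url(self, url, product_count, cur_product_count, first=False):
    # stop once all of the brand's products have been collected
    if cur_product_count >= product_count:
        return None
    if first:
        # first pagination request: append the page parameter
        separator = '&' if '?' in url else '?'
        return url + separator + 'page=2'
    # subsequent requests: increment the existing page parameter
    m = re.match(r"(.*[\?&]page=)([0-9]+)(.*)", url)
    if m:
        return m.group(1) + str(int(m.group(2)) + 1) + m.group(3)
    return None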
def parsePage(self, response): hxs = HtmlXPathSelector(response) root_url = "http://www.walmart.com" product_links = hxs.select("//a[@class='prodLink ListItemLink']/@href") for product_link in product_links: item = ProductItem() item['product_url'] = root_url + product_link.extract() yield item # select next page, if any, parse it too with this method next_page = hxs.select("//a[@class='link-pageNum' and text()=' Next ']/@href").extract() if next_page: page_url = root_url + next_page[0] yield Request(url = page_url, callback = self.parsePage)
def parse(self, response):
    boxes = self._scrape_product_boxes(response)
    if boxes is None:
        # No products are shown here; go deeper into the subcategories.
        for request in map(Request, self._scrape_subcategories(response)):
            yield request
    else:
        # Scrape product links.
        category_name = self._scrape_category_name(response)
        for url in map(self._scrape_product_link, boxes):
            yield ProductItem(product_url=url, category=category_name)

        # Go to the next page, if available.
        url = response.css('a.next.i-next::attr(href)')
        if url:
            yield Request(url.extract()[0])
def parsePage(self, response): hxs = HtmlXPathSelector(response) root_url = "http://shop.nordstrom.com" # extract product URLs product_links = hxs.select("//div/a[@class='title']/@href") for product_link in product_links: item = ProductItem() item['product_url'] = root_url + product_link.extract() yield item # select next page, if any, parse it too with this method next_page = hxs.select( "//ul[@class='arrows']/li[@class='next']/a/@href").extract() if next_page: page_url = next_page[0] yield Request(url=page_url, callback=self.parsePage)
def parsePage_bestbuy(self, response):
    hxs = HtmlXPathSelector(response)
    root_url = "http://www.bestbuy.com"

    # extract product URLs
    product_links = hxs.select("//div[@class='info-main']/h3/a/@href")
    for product_link in product_links:
        item = ProductItem()
        item['product_url'] = root_url + product_link.extract()
        yield item

    # select the next page, if any, and parse it too with this method
    next_page = hxs.select("//ul[@class='pagination']/li/a[@class='next']/@href").extract()
    if next_page:
        page_url = root_url + next_page[0]
        yield Request(url=page_url, callback=self.parsePage_bestbuy)
def parseBrandPage(self, response):
    hxs = HtmlXPathSelector(response)

    # category of items on this page
    category = response.meta['category']

    # extract the total item count (only once, on the first page)
    if 'total_item_count' in response.meta:
        total_item_count = response.meta['total_item_count']
    else:
        total_item_count = int(
            hxs.select("//p[@id='filtered-products-count']").re("[0-9]+")[0])

    # Extract the product holders. We don't extract the <a> elements directly,
    # because each product holder contains several of them (all just as good,
    # but we only want one).
    product_holders = hxs.select("//div[@class='product ']")
    for product_holder in product_holders:
        # extract the first link in the product holder
        product_link = product_holder.select(".//a/@href").extract()[0]
        product_url = Utils.add_domain(product_link, self.base_url)

        item = ProductItem()
        item['product_url'] = product_url
        item['category'] = category
        yield item

    # crawl the next pages, if any are left
    if 'offset' not in response.meta:
        offset = 0
    else:
        offset = response.meta['offset']

    next_page = self.build_next_page_url(response.url, total_item_count, offset)
    # if there are more products to crawl, send a new request
    if next_page:
        yield Request(url=next_page, callback=self.parseBrandPage,
                      meta={'offset': offset + 1,
                            'total_item_count': total_item_count,
                            'category': category})
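# This spider's build_next_page_url takes a page offset rather than a running
# product count (compare with the variant sketched earlier). A minimal sketch,
# assuming the site paginates with an "offset" query parameter appended as the
# last parameter and a fixed page size; both the parameter name and page_size
# are assumptions, not taken from the original source.
def build_next_page_url(self, url, total_item_count, offset, page_size=24):
    next_offset = (offset + 1) * page_size
    # no next page once the offset passes the total number of items
    if next_offset >= total_item_count:
        return None
    # drop the offset parameter appended by a previous call, then add the new one
    base = re.sub(r"[\?&]offset=[0-9]+$", "", url)
    separator = '&' if '?' in base else '?'
    return base + separator + "offset=" + str(next_offset)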
def parsePage_tigerdirect(self, response):
    hxs = HtmlXPathSelector(response)

    # Without the "resultsWrap" div, these links are also found on pages we
    # don't want.
    product_links = hxs.select(
        "//div[@class='resultsWrap listView']//h3[@class='itemName']/a/@href").extract()
    for product_link in product_links:
        item = ProductItem()
        item['product_url'] = Utils.add_domain(product_link, "http://www.tigerdirect.com")
        # remove CatId from the URL (it generates duplicates)
        m = re.match(r"(.*)&CatId=[0-9]+", item['product_url'])
        if m:
            item['product_url'] = m.group(1)
        yield item

    # parse the next pages (if the results spread over more than one page)
    # TODO: not sure if all of them are extracted
    next_page = hxs.select("//a[@title='Next page']")
    if next_page:
        page_nr = response.meta['page'] + 1
        next_page_url = Utils.add_domain(
            next_page.select("@href").extract()[0], "http://www.tigerdirect.com")
        yield Request(url=next_page_url, callback=self.parsePage_tigerdirect,
                      meta={'page': page_nr})

    # If no product links were found, search for links to subcategory pages
    # instead and parse those for product links. dont_filter is needed because
    # this re-requests the URL that was just crawled.
    if not product_links:
        yield Request(url=response.url, callback=self.parseSubcats_tigerdirect,
                      dont_filter=True)
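# Utils.add_domain and Utils.clean_url are project helpers not shown in this
# file. A minimal sketch of what they might do, inferred from how they are
# called above; the exact implementations are assumptions.
import urlparse

class Utils:
    @staticmethod
    def add_domain(url, base_url):
        # prepend the domain to relative URLs; leave absolute ones untouched
        return urlparse.urljoin(base_url, url)

    @staticmethod
    def clean_url(url):
        # strip query parameters and fragments, keeping scheme, host and path
        parsed = urlparse.urlparse(url)
        return urlparse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', ''))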
def parseBrandPage(self, response):
    hxs = HtmlXPathSelector(response)

    # category of items on this page
    category = response.meta['category']

    # Extract product urls, only selecting from the t1 tab (also see
    # build_url... on omitting t4).
    product_urls = hxs.select("//div[contains(@id,'t1')]//div[@class='pl_productName']/a/@href")
    for url in product_urls:
        item = ProductItem()
        product_url = url.extract().encode("utf-8")
        # Percent-encode unusual characters in the product url, but only in the
        # part after the last "/" (that is, the one before the last, since the
        # last is followed by nothing).
        product_url = ('/'.join(product_url.split('/')[:-2]) + '/'
                       + urllib.quote(product_url.split('/')[-2]) + '/')
        item['product_url'] = product_url
        item['category'] = category
        yield item

    # Crawl the next pages, if any. To find whether there is a next page,
    # select the maximum page number available on the page.
    available_pages = map(int, hxs.select(
        "//div[contains(@id,'t1')]//div[@class='pagination']/ul/li/a/text()").re("[0-9]+"))
    if available_pages:
        max_page = max(available_pages)
    else:
        max_page = 0

    # extract the 'next page' link, again only from the t1 tab
    next_page_link = hxs.select("//div[contains(@id,'t1')]//li[@class='next']/a")
    if next_page_link:
        # extract the js call to the next page and use it to build the next page url
        js_call_string = self.extract_boots_js_args(
            next_page_link.select("@href").extract()[0].encode("utf-8"))
        # if there is no next page, build_boots_param_url returns None
        next_page = self.build_boots_param_url(response.url, *js_call_string, max_page=max_page)
        if next_page:
            yield Request(url=next_page, callback=self.parseBrandPage,
                          meta={'category': category})
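# extract_boots_js_args is referenced above but not shown. A minimal sketch,
# assuming the pagination link's href is a javascript: call whose quoted
# arguments carry the paging parameters, e.g. "javascript:gotoPage('2','120')";
# the actual call format on the site may differ.
def extract_boots_js_args(self, href):
    # pull every quoted argument out of the javascript call, in order,
    # so the result can be unpacked into build_boots_param_url
    return re.findall(r"'([^']*)'", href)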
def parsePage(self, response):
    hxs = HtmlXPathSelector(response)

    products = hxs.select("//a[@class='url']")
    root_url = "http://www.staples.com"
    for product in products:
        item = ProductItem()
        item['product_url'] = root_url + product.select("@href").extract()[0]
        yield item

    next_page = hxs.select("//li[@class='pageNext']/a/@href").extract()
    zipcode = "12345"
    if next_page:
        # Parse the next page (first converting the url from unicode to str).
        # The zipcode is passed both as a cookie and as a raw Cookie header,
        # with redirects and cookie merging disabled.
        yield Request(str(next_page[0]), callback=self.parsePage,
                      cookies={"zipcode": zipcode},
                      headers={"Cookie": "zipcode=" + zipcode},
                      meta={"dont_redirect": True, "dont_merge_cookies": True})
def parsePage(self, response): hxs = HtmlXPathSelector(response) product_links = hxs.select("//h3[@class='newaps']/a/@href") for product_link in product_links: item = ProductItem() item['product_url'] = product_link.extract() yield item # select next page, if any, parse it too with this method root_url = "http://www.amazon.com" next_page = hxs.select("//a[@title='Next Page']/@href").extract() if next_page: page_url = root_url + next_page[0] yield Request(url=page_url, callback=self.parsePage) # if no products were found, maybe this was a bestsellers page if not product_links: yield Request(response.url, callback=self.parseBsPage) # get next pages as well page_urls = hxs.select( "//div[@id='zg_paginationWrapper']//a/@href").extract() for page_url in page_urls: yield Request(page_url, callback=self.parseBsPage)