def parseDepartment(self, response):
    """Parse one Amazon bestsellers department page.

    Builds a ProductItem per product (name, url, prices, rank, department)
    and yields a Request per item that hands it to parseProduct.
    Products missing a name or url are skipped.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select("//div[@class='zg_itemImmersion']")
    for product in products:
        item = ProductItem()
        list_name = product.select(
            "div[@class='zg_itemWrapper']//div[@class='zg_title']/a/text()"
        ).extract()
        if list_name:
            item['list_name'] = list_name[0]
        else:
            # if there's no product name don't include this product in the list, move on to the next
            continue
        url = product.select(
            "div[@class='zg_itemWrapper']//div[@class='zg_title']/a/@href"
        ).extract()
        if url:
            item['url'] = url[0].strip()
        else:
            # if there's no product url don't include this product in the list, move on to the next
            # one of the products in Lawn & Garden is missing a name and url, also in Sports & Outdoors
            continue
        #TODO: this needs to be refined, many prices etc. extract all prices? new, used etc
        prices = product.select(
            "div[@class='zg_itemWrapper']//div[@class='zg_price']")
        price = prices.select("strong[@class='price']/text()").extract()
        listprice = prices.select(
            "span[@class='listprice']/text()").extract()
        # some of the items don't have a price
        if price:
            item['price'] = price[0]
        # some items don't have a "list price"
        if listprice:
            item['listprice'] = listprice[0]
        # extract rank; guarded because the node is occasionally absent
        # (the unguarded extract()[0] used to raise IndexError, same failure
        # mode the title TODO described)
        rank = product.select(
            ".//span[@class='zg_rankNumber']/text()").extract()
        if rank:
            # ignore last character of the string (it's .)
            item['rank'] = rank[0][:-1]
        item['department'] = response.meta['dept_name']
        # add url of bestsellers page this was found on
        item['bspage_url'] = response.url
        # pass the item to the parseProduct function to extract info from product page
        request = Request(item['url'], callback=self.parseProduct)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Entry point: map tab ids to department names, then yield one
    ProductItem (via a parseProduct Request) per product in each tab.

    Products without a url are skipped; name and price are optional.
    """
    hxs = HtmlXPathSelector(response)
    #TODO: !! select currency
    # extract tabs and their corresponding departments
    tabs = hxs.select("//ul[@id='tab-set']/li/a")
    departments = {}
    for tab in tabs:
        department_name = tab.select("text()").extract()[0]
        tab_id = tab.select("@href").extract()[0].replace("#", "")
        departments[tab_id] = department_name
    # for each department extract products from corresponding tab;
    # iterate items() rather than re-indexing the dict by key
    for tab_id, department in departments.items():
        # in compound output the Jewelry department is missing because it is a duplicate of Watches
        products = hxs.select(
            "//div[@id='%s']/div[@class='OProduct']" % tab_id)
        # enumerate replaces the hand-rolled rank counter
        for rank, product in enumerate(products, start=1):
            item = ProductItem()
            item['department'] = department
            item['rank'] = str(rank)
            product_link = product.select(".//div[@class='Oname']/a")
            product_name = product_link.select("text()").extract()
            product_url = product_link.select("@href").extract()
            if product_name:
                item['list_name'] = product_name[0].strip()
            if product_url:
                item['url'] = product_url[0]
            # if there's no url move on to next product
            else:
                continue
            #TODO: change price to USD
            price = product.select(
                ".//div[@class='Oprice']/span[@class='Ovalue']/span[@class='Ovalue']/text()"
            ).extract()
            if price:
                item['price'] = price[0]
            # pass the item to the parseProduct method
            request = Request(item['url'], callback=self.parseProduct)
            request.meta['item'] = item
            yield request
def parsePage(self, response):
    """Parse one listing page (40 products per page).

    Global rank is derived from response.meta['page'] and the in-page
    position. Each product is sent on to parseProduct via a Request.
    """
    #TODO: add department?
    hxs = HtmlXPathSelector(response)
    products = hxs.select("//li[@class='productbox']")
    products_per_page = 40
    page_nr = response.meta['page']
    # counter to keep track of product rank
    rank = 0
    for product in products:
        item = ProductItem()
        rank += 1
        product_link = product.select(".//a[@class='toplink']")
        url = product_link.select("@href").extract()
        if url:
            item['url'] = url[0]
        else:
            continue
        # guarded: a product without a data-sku attribute no longer
        # aborts the whole page with an IndexError
        sku = product.select("@data-sku").extract()
        if sku:
            item['SKU'] = sku[0]
        # compute global item rank using rank on current page and page number
        item['rank'] = str((page_nr - 1) * products_per_page + rank)
        product_name = product_link.select(
            "div[@class='prodname']/text()").extract()
        brand_name = product_link.select(
            "div[@class='prodname']/div[@class='prodbrandname emphasis']/text()"
        ).extract()
        if product_name:
            item['list_name'] = product_name[0].strip()
        if brand_name:
            item['brand'] = brand_name[0].strip()
        #TODO: also "Reg price", extract that as well?
        listprice = product.select(
            ".//div[@class='wasprice']/span/text()").extract()
        if listprice:
            item['listprice'] = listprice[0]
        price = product.select(
            ".//div[@class='price secondarytext midtitle']/text() | .//div[@class='price noticetext midtitle']/text()"
        ).extract()
        if price:
            item['price'] = price[0]
        # add date
        item['date'] = datetime.date.today().isoformat()
        # pass item to parseProduct method
        request = Request(item['url'], callback=self.parseProduct)
        request.meta['item'] = item
        yield request
def _scrape_product_links(self, response):
    """Yield (url, ProductItem) pairs for every bestseller entry on the page.

    Fills in list_name, price and listprice; price falls back to the first
    generic .price node, listprice falls back to the current price when no
    old-price node exists.
    """
    for entry in response.css('#cat_bestSellers .item'):
        item = ProductItem()
        url = entry.css('.product-name a::attr(href)').extract()[0]
        item['list_name'] = entry.css('.product-name a::text').extract()[0]
        # prefer the sale price; otherwise take the first plain .price node
        sale = entry.css('.special-price .price::text')
        if not sale:
            sale = entry.css('.price')[0].css('::text')
        # prefer the struck-through old price; otherwise try the
        # #old-price- node, and if that is absent reuse the current price
        was = entry.css('.old-price .price::text')
        try:
            if not was:
                was = entry.css('#old-price-')[0].css('::text')
        except IndexError:
            was = sale
        item['price'] = ''.join(sale.extract()).strip()
        item['listprice'] = ''.join(was.extract()).strip()
        yield url, item
def parsePage(self, response, department):
    """Parse a department bestsellers page.

    Emits one ProductItem per thumbnail (rank = position on the page,
    department taken from the argument) and forwards each to parseProduct.
    Entries without a url are skipped; the name is optional.
    """
    hxs = HtmlXPathSelector(response)
    thumbnails = hxs.select("//div[@class='productThumbnail showQuickView']")
    if not thumbnails:
        return
    for position, thumb in enumerate(thumbnails, start=1):
        item = ProductItem()
        item['rank'] = str(position)
        item['department'] = department
        # name and url come from the short-description link
        link = thumb.select("div[@class='shortDescription']/a")
        title = link.select("text()").extract()
        if title:
            item['list_name'] = title[0]
        href = link.select("@href").extract()
        # skip entries that have no product url
        if not href:
            continue
        item['url'] = href[0]
        #TODO: add net price?
        # price = product.select(".//div[@class='prices']//span[@class='priceBig']/text()").extract()
        # if price:
        #     item['price'] = price[0]
        # hand the item to parseProduct for the product-page fields
        req = Request(item['url'], callback=self.parseProduct)
        req.meta['item'] = item
        yield req
def parsePage(self, response):
    """Parse one BestBuy results page and follow pagination.

    Rank is global across pages: (page_nr - 1) * products_per_page plus the
    in-page position. Items with a url are forwarded to parseProduct;
    items without one are yielded as-is. Stops once rank exceeds
    self.max_products.
    """
    hxs = HtmlXPathSelector(response)
    # single root_url definition (the original assigned it twice)
    root_url = "http://www.bestbuy.com"
    products_per_page = 15
    max_products = self.max_products
    # find page number by adding 1 to the previous one
    if 'page_nr' not in response.meta:
        page_nr = 1
    else:
        page_nr = response.meta['page_nr'] + 1
    # find product rank using page number and number of items per page
    # (use the named constant instead of a magic 15)
    rank = (page_nr - 1) * products_per_page
    products = hxs.select("//div[@class='hproduct']")
    for product in products:
        item = ProductItem()
        rank += 1
        item['rank'] = str(rank)
        product_link = product.select("div[@class='info-main']/h3/a")
        item['list_name'] = product_link.select(
            "text()").extract()[0].strip()
        url = product_link.select("@href").extract()
        if url:
            item['url'] = root_url + url[0]
        item['department'] = response.meta['department']
        item['category'] = response.meta['category']
        item['bspage_url'] = response.url
        item['date'] = datetime.date.today().isoformat()
        item['SKU'] = product.select(
            ".//strong[@class='sku']/text()").extract()[0]
        #TODO: extract product model?
        saleprice = product.select(
            "div[@class='info-side']/div/h4[@class='price sale']/span/text()"
        ).extract()
        if saleprice:
            item['price'] = saleprice[0]
        # regular price
        regprice = product.select(
            "div[@class='info-side']/h4[@class='price regular']/span/text()"
        ).extract()
        if regprice:
            item['regprice'] = regprice[0]
        if rank > max_products:
            break
        if not url:
            # no product page to follow: emit the partial item directly
            yield item
        else:
            # send this product page to be parsed by parseProduct
            # ! duplicates are removed: products that are in more than one category will appear in only one of them
            #TODO: include duplicates if they are from different categories?
            yield Request(item['url'], callback=self.parseProduct,
                          meta={"item": item})
    # select next page, if any, parse it too with this method
    if rank < max_products:
        next_page = hxs.select(
            "//ul[@class='pagination']/li/a[@class='next']/@href").extract()
        if next_page:
            page_url = root_url + next_page[0]
            request = Request(url=page_url, callback=self.parsePage)
            request.meta['department'] = response.meta['department']
            request.meta['category'] = response.meta['category']
            request.meta['page_nr'] = page_nr
            yield request
def parsePage(self, response):
    """Parse a Toys R Us bestsellers page.

    Handles two layouts: the overall list (prodloop_cont) and the
    per-department list (topSellersView). Each product becomes a
    ProductItem forwarded to parseProduct; entries missing a name or url
    are skipped (previously an unguarded extract()[0] raised IndexError).
    """
    hxs = HtmlXPathSelector(response)
    # products in overall bestsellers list
    products = hxs.select("//div[@class='prodloop_cont']")
    # products in by-department bestsellers lists
    products2 = hxs.select("//div[@class='topSellersView']")
    # department name if any (for department-wise bestsellers pages)
    dept_name = ""
    #TODO: some items don't have the department field. check in nodepts_toysrus.txt
    department = hxs.select("//div[@id='breadCrumbs']/text()").extract()
    if department:
        # remove part before > and ignore first character from div content
        dept_name = department[0].split(">")[-1][1:].strip()
    # relative urls on the page need the site root prefixed
    root_url = "http://www.toysrus.com"
    # keep counter to set rank of product
    rank = 0
    for product in products:
        item = ProductItem()
        rank += 1
        item['rank'] = str(rank)
        # get product name in bestsellers list page; skip nameless entries
        # instead of crashing (consistent with the other parse methods)
        name = product.select("a[@class='prodtitle']/text()").extract()
        if name:
            item['list_name'] = name[0]
        else:
            continue
        # get relative url of product page and add its root prefix
        url = product.select("a[@class='prodtitle']/@href").extract()
        if url:
            item['url'] = root_url + url[0]
        # if there's no url move on to the next product
        else:
            continue
        # get price ("our price")
        price = product.select(
            "div[@class='prodPrice familyPrices']/span[@class='ourPrice2']/text()").extract()
        if price:
            item['price'] = price[0]
        # get list price
        listprice = product.select(
            "div[@class='prodPrice familyPrices']/span[@class='listPrice2']/text()").extract()
        if listprice:
            item['listprice'] = listprice[0]
        # send the item to be parsed by parseProduct
        request = Request(item['url'], callback=self.parseProduct)
        request.meta['item'] = item
        yield request
    for product in products2:
        item = ProductItem()
        # skip nameless entries rather than raising IndexError
        name = product.select(
            ".//li[@class='productTitle']/a/text()").extract()
        if name:
            item['list_name'] = name[0]
        else:
            continue
        url = product.select(
            ".//li[@class='productTitle']/a/@href").extract()
        if url:
            item['url'] = root_url + url[0]
        # if there's no url move on to the next product
        else:
            continue
        if dept_name:
            item['department'] = dept_name
        # eliminate final . from rank; guarded against a missing rank node
        rank_text = product.select(
            ".//div[@class='itemNumber']/text()").extract()
        if rank_text:
            item['rank'] = rank_text[0][:-1]
        # add bestsellers page product was found on as a field
        item['bspage_url'] = response.url
        # get price ("our price")
        price = product.select(
            ".//li[@class='prodPrice familyPrices']/span[@class='ourPrice2']/text()").extract()
        if price:
            item['price'] = price[0]
        # get list price
        listprice = product.select(
            ".//li[@class='prodPrice familyPrices']/span[@class='listPrice2']/text()").extract()
        if listprice:
            item['listprice'] = listprice[0]
        # send the item to be parsed by parseProduct
        request = Request(item['url'], callback=self.parseProduct)
        request.meta['item'] = item
        yield request
def parseDepartment(self, response):
    """Parse one Walmart department bestsellers page.

    Builds a ProductItem per product (rank, name, url, price, listprice)
    and forwards each to parseProduct. Department name comes from
    response.meta; products without a url are skipped.
    """
    # some of the products are duplicates across departments, they will only appear once on the final list
    hxs = HtmlXPathSelector(response)
    department = response.meta['department']
    #TODO: what if there is pagination? haven't encountered it so far
    products = hxs.select("//div[@class='prodInfo']")
    # counter to keep track of product's rank
    rank = 0
    for product in products:
        item = ProductItem()
        # if inspect option was activated, add info on the context of the product element on the page
        if self.inspect:
            item['prod_context'] = product.select(
                "ancestor::*[1]").extract()
        rank += 1
        item['rank'] = str(rank)
        product_link = product.select(
            "div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']")
        product_name = product_link.select("text()").extract()
        product_url = product_link.select("@href").extract()
        if product_name:
            item['list_name'] = product_name[0]
        if product_url:
            item['url'] = self.root_url + product_url[0]
        else:
            # if there's no url move on to the next product
            continue
        item['department'] = department
        #TODO: some of the products have the "From" prefix before the price, should I include that?
        price_div = product.select(
            ".//div[@class='camelPrice'] | .//span[@class='camelPrice']")
        price1 = price_div.select(
            "span[@class='bigPriceText2']/text()").extract()
        price2 = price_div.select(
            "span[@class='smallPriceText2']/text()").extract()
        if price1 and price2:
            item['price'] = price1[0] + price2[0]
        #TODO: include out of stock products? :
        else:
            price1 = price_div.select(
                "span[@class='bigPriceTextOutStock2']/text()").extract()
            price2 = price_div.select(
                "span[@class='smallPriceTextOutStock2']/text()").extract()
            if price1 and price2:
                item['price'] = price1[0] + price2[0]
        #TODO: are list prices always retrieved correctly?
        # BUG FIX: the XPath previously ended in "/text", which selects a
        # child *element* named 'text' (never present) — the text() node
        # test is required, so listprice was silently never extracted
        listprice = product.select(
            ".//div[@class='PriceMLtgry']/text()").extract()
        if listprice:
            item['listprice'] = listprice[0]
        item['bspage_url'] = response.url
        # pass the item to the parseProduct method
        request = Request(item['url'], callback=self.parseProduct)
        request.meta['item'] = item
        yield request