def extract_result_products(self, response):
    """Extract search-result products from a Best Buy results page.

    Returns a list of SearchItem objects, one per result row, populated
    with product name, URL, model and target price (when present), plus
    any origin_* matching fields passed along in response.meta.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    results = hxs.select(
        "//div[@class='list-item-info']/div[@class='sku-title']/h4/a")

    for result in results:
        item = SearchItem()
        #item['origin_site'] = site

        product_name_holder = result.select("text()").extract()
        if product_name_holder:
            item['product_name'] = product_name_holder[0].strip()
        else:
            # Fix: the original referenced an undefined local `origin_url`
            # here, raising NameError whenever a name was missing; read
            # the origin URL from response.meta instead.
            self.log("Error: No product name: " + str(response.url) +
                     " from product: " +
                     str(response.meta.get('origin_url', '')),
                     level=log.ERROR)

        item['product_url'] = Utils.clean_url(
            Utils.add_domain(
                result.select("@href").extract()[0],
                "http://www.bestbuy.com"))

        # pass through matching info from the originating request, if any
        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_name' in response.meta:
            item['origin_name'] = response.meta['origin_name']
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        model_holder = result.select(
            "../../../div[@class='sku-model']/ul/li[@class='model-number']/span[@id='model-value']/text()"
        ).extract()
        if model_holder:
            item['product_model'] = model_holder[0]

        price_holder = result.select(
            "../../../../div[@class='list-item-price']//div[@class='price-block']//div[@class='medium-item-price']/text()[normalize-space()]"
        ).extract()
        if price_holder:
            # assumes price text is a bare number with optional thousands
            # separators (no currency symbol) — TODO confirm against site
            price = price_holder[0].strip()
            price = re.sub(",", "", price)
            item['product_target_price'] = float(price)

        items.append(item)

    return items
def parseBrand(self, response):
    """Parse one page of a brand's product listing.

    Yields one ProductItem per product on the page, then (if the running
    product count is still below the brand's total) a Request for the
    next listing page, routed back to this callback.
    """
    hxs = HtmlXPathSelector(response)

    # category of items on current page
    category = response.meta['category']

    # current / total product counts for this brand drive the pagination
    # stop criterion; they are threaded through meta between pages
    if 'total_product_count' in response.meta:
        product_count = response.meta['total_product_count']
        cur_product_count = response.meta['current_product_count']
    else:
        # first page: extract total number of products for this brand
        product_count = int(
            hxs.select("//h2[@id='productCount']//text()").re("[0-9]+")[0])
        cur_product_count = 0

    # extract product links from this page
    product_links = hxs.select(
        "//h3[@class='productTitle']/a/@href").extract()
    # Fix: use a list comprehension instead of map() so that len() below
    # keeps working on Python 3, where map returns a one-shot iterator.
    product_urls = [Utils.add_domain(link, self.base_url)
                    for link in product_links]

    for product_url in product_urls:
        item = ProductItem()
        # remove parameters in url
        item['product_url'] = Utils.clean_url(product_url)
        item['category'] = category
        yield item

    # add nr of extracted products to current product count
    cur_product_count += len(product_urls)

    # get next page if any
    next_page = self.build_next_page_url(
        response.url, product_count, cur_product_count,
        first=('total_product_count' not in response.meta))
    if next_page:
        yield Request(url=next_page,
                      callback=self.parseBrand,
                      meta={'total_product_count': product_count,
                            'current_product_count': cur_product_count,
                            'category': category})
def parseResults(self, response):
    """Parse Best Buy search results ('hproduct' layout).

    Accumulates SearchItem objects into a set carried in response.meta
    across successive search requests, then hands control back to
    reduceResults for matching/aggregation.
    """
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    # read origin fields up front (also fails fast with KeyError when a
    # caller forgot to set them in meta)
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get last request's items
    # and add to them the results
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select(
        "//div[@class='hproduct']/div[@class='info-main']/h3/a")
    for result in results:
        item = SearchItem()
        #item['origin_site'] = site
        item['product_name'] = result.select("text()").extract()[0].strip()
        item['product_url'] = Utils.clean_url(
            Utils.add_domain(
                result.select("@href").extract()[0],
                "http://www.bestbuy.com"))

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_id' in response.meta:
            # Fix: the original assigned to `request.meta`, but no
            # `request` exists in this scope (NameError); store the
            # origin id on the item like the other origin_* fields.
            item['origin_id'] = response.meta['origin_id']
            # assert self.by_id
        # else:
        #     assert not self.by_id

        model_holder = result.select(
            "parent::node()/parent::node()//strong[@itemprop='model']/text()"
        ).extract()
        if model_holder:
            item['product_model'] = model_holder[0]

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = items
    return self.reduceResults(response)
def parseResults(self, response):
    """Parse search results ('productTitle' layout).

    Builds SearchItem objects (name + quantity, cleaned URL, extracted
    model) into a set carried in response.meta, then returns control to
    reduceResults, which either issues the next pending request or
    produces the final result.
    """
    hxs = HtmlXPathSelector(response)

    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select("//h3[@class='productTitle']/a")
    for result in results:
        item = SearchItem()

        product_url = result.select("@href").extract()[0]

        # <a> contains the product name inside <strong> and the
        # size/quantity directly in text(). An <abbr title> node holds
        # the full name when the displayed one is abbreviated — prefer
        # it when present.
        product_name_node = result.select("strong/abbr/@title")
        product_name = product_name_node.extract()[0] \
            if product_name_node \
            else result.select("strong/text()").extract()[0]

        # Fix: the original used `assert '...' not in product_name`,
        # which is stripped under -O and crashes the whole crawl on one
        # abbreviated name; log and skip the item instead.
        if '...' in product_name:
            self.log("Abbreviated product name, skipping: " +
                     str(response.url), level=log.ERROR)
            continue

        # add product quantity (e.g. size in ml)
        product_quantity_node = result.select(
            "text()[normalize-space()!='']")
        product_quantity = product_quantity_node.extract()[0].strip() \
            if product_quantity_node else ""

        product_name_full = product_name + " " + product_quantity

        # skip the result if there is no product name or URL
        if product_name and product_url:
            # clean url
            item['product_url'] = Utils.add_domain(
                Utils.clean_url(product_url), self.base_url)
            item['product_name'] = product_name_full
        else:
            self.log("No product name: " + str(response.url) +
                     " from product: " + response.meta['origin_url'],
                     level=log.ERROR)
            continue

        # add url, name and model of product to be matched (origin site)
        item['origin_url'] = response.meta['origin_url']
        item['origin_name'] = response.meta['origin_name']
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # extract product model from name
        product_model_extracted = ProcessText.extract_model_from_name(
            item['product_name'])
        if product_model_extracted:
            item['product_model'] = product_model_extracted

        #TODO: extract: price, brand?

        # add result to items
        items.add(item)

    # pass the accumulated items back through meta; reduceResults will
    # either send the next pending request here or return final results
    response.meta['items'] = items
    # field 'parsed' indicates the call came from this method (was not
    # the initial one)  #TODO: do we still need this?
    response.meta['parsed'] = True
    # only send the response we have as an argument, no new request
    return self.reduceResults(response)
def build_url(self, url):
    """Return an absolute, cleaned URL.

    Prepends self.BASE_URL when the link is relative, then strips any
    '#' fragment via Utils.clean_url.
    """
    return Utils.clean_url(Utils.add_domain(url, self.BASE_URL), ['#'])