def extract_product_data(self, response, item):
    """Fill in name and USD price for the product page in `response`.

    :param response: scrapy Response for a product page
    :param item: partially filled SearchItem (must contain 'origin_url')
    :return: the item, or None when the page has no product name
    """
    hxs = HtmlXPathSelector(response)

    # product name is mandatory; ignore products without one
    try:
        item['product_name'] = hxs.select(
            "//h2[@class='product_name product_title']/span[@itemprop='name']/text()"
        ).extract()[0]
    except Exception:
        self.log("Error: No product name: " + str(response.url) +
                 " from product: " + item['origin_url'], level=log.INFO)
        # ignore products with no name
        return

    # BUGFIX: previously the [0] below was unguarded, so a page with no
    # price node crashed the parse with IndexError; now we log and return
    # the item without a price, consistent with the sibling parsers.
    try:
        price = hxs.select(
            "//div[@class='price']/text() | //div[@class='pricestring']/text()"
        ).extract()[0].strip()
    except Exception:
        self.log("Didn't find product price: " + response.url + "\n",
                 level=log.INFO)
        return item

    # strip a leading "from " (price ranges such as "from $10.00")
    if price.startswith("from "):
        price = price[5:].strip()

    m = re.match("(\xa3|\$)([0-9]+\.?[0-9]*)", price)
    if not m:
        self.log("Didn't match product price: " + price + " " +
                 response.url + "\n", level=log.WARNING)
    else:
        price_amount = m.group(2)
        price_currency = m.group(1)
        # normalize to USD so prices are comparable across sites
        price_value = Utils.convert_to_dollars(float(price_amount),
                                               price_currency)
        item['product_target_price'] = price_value

    return item
def extract_result_products(self, response):
    """Build a SearchItem for every result box on a macys search page.

    Results missing a name or url are logged and skipped; price parsing
    failures are logged but the item is still returned without a price.
    """
    selector = HtmlXPathSelector(response)
    collected = []

    for box in selector.select("//div[@class='innerWrapper']"):
        item = SearchItem()

        name_nodes = box.select(
            ".//div[@class='shortDescription']/a/text()").extract()
        url_nodes = box.select(
            ".//div[@class='shortDescription']/a/@href").extract()

        # quit if there is no product name
        if not (name_nodes and url_nodes):
            self.log("No product name: " + str(response.url) +
                     " from product: " + response.meta['origin_url'],
                     level=log.ERROR)
            continue
        item['product_url'] = "http://www1.macys.com" + url_nodes[0]
        item['product_name'] = name_nodes[0].strip()

        # extract price
        #! extracting regular price and not discount price when discounts available?
        price_texts = box.select("div[@class='prices']/span/text()").extract()
        if price_texts:
            # remove commas separating orders of magnitude (ex 2,000)
            raw_price = price_texts[0].strip().replace(",", "")
            # if more than one match, it will get the first one
            match = re.match("([a-zA-Z\.\s]+)?(\xa3|\$)([0-9]+\.?[0-9]*)",
                             raw_price)
            if match:
                item['product_target_price'] = Utils.convert_to_dollars(
                    float(match.group(3)), match.group(2))
            else:
                self.log("Didn't match product price: " + raw_price + " " +
                         response.url + "\n", level=log.WARNING)
        else:
            self.log("Didn't find product price: " + response.url + "\n",
                     level=log.DEBUG)

        # extract product brand
        collected.append(item)

    return collected
def parse_product_maplin(self, response):
    """Parse a maplin product page reached from a search results list.

    Copies origin-product fields from response.meta into a new SearchItem,
    then extracts the product name and price (assumed GBP, converted to
    USD) from the page.

    NOTE(review): as visible here the populated item is neither appended
    to `items` nor returned at the end -- confirm whether the method
    continues elsewhere or the result is intentionally dropped.
    """
    hxs = HtmlXPathSelector(response)
    # results accumulated so far across the request chain
    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    #item['origin_site'] = site
    item['origin_url'] = origin_url
    item['origin_name'] = response.meta['origin_name']
    # optional origin fields: only present when the source product had them
    if 'origin_model' in response.meta:
        item['origin_model'] = response.meta['origin_model']
    if 'origin_upc' in response.meta:
        item['origin_upc'] = response.meta['origin_upc']
    if 'origin_brand' in response.meta:
        item['origin_brand'] = response.meta['origin_brand']

    # product name is mandatory; abort the parse when missing
    product_name_node = hxs.select("//h1[@itemprop='name']/text()").extract()
    if product_name_node:
        product_name = product_name_node[0].strip()
    else:
        self.log("Error: No product name: " + str(response.url) + " for source product " + origin_url, level=log.ERROR)
        # TODO: is this ok? I think so
        return

    item['product_name'] = product_name

    # extract product model number
    # TODO: no model?
    # TODO: no upc?
    # TODO: no brand?
    # TODO: add code extraction

    # extract price
    price_holder = hxs.select("//meta[@itemprop='price']/@content").extract()
    # if we can't find it like above try other things:
    if price_holder:
        product_target_price = price_holder[0].strip()
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",","",product_target_price)
        try:
            product_target_price = float(product_target_price)
            # convert to dollars (assume pounds)
            product_target_price = Utils.convert_to_dollars(product_target_price, u'\xa3')
            item['product_target_price'] = product_target_price
        except Exception, ex:
            # non-numeric price content; leave the item without a price
            self.log("Couldn't convert product price: " + response.url + "\n", level=log.WARNING)
def extract_product_data(self, response, item):
    """Populate name, USD price, model and brand on `item` from the page.

    Returns the item, or None when the page has no product name.
    """
    selector = HtmlXPathSelector(response)

    # product name is mandatory; skip the product entirely when absent
    try:
        item['product_name'] = selector.xpath(
            "//h1[@class='product-title']/text()").extract()[0]
    except:
        self.log("Error: No product name: " + str(response.url) +
                 " from product: " + item['origin_url'], level=log.INFO)
        # ignore products with no name
        return

    price_container = selector.select("//p[@class='price']")
    if price_container:
        try:
            amount_text = price_container.select(
                "span[@itemprop='price']/text()").extract()[0]
            currency_text = price_container.select(
                "span[@class='smaller']/text()").extract()[0]
            # drop thousands separators before matching
            amount_text = amount_text.replace(",", "")
            amount_ok = re.match("[0-9]+\.?[0-9]*", amount_text)
            currency_ok = re.match("(\xa3)|(\$)", currency_text)
            if amount_ok and currency_ok:
                item['product_target_price'] = Utils.convert_to_dollars(
                    float(amount_text), currency_text)
            else:
                self.log("Didn't match product price: " + amount_text +
                         currency_text + " " + response.url + "\n",
                         level=log.WARNING)
        except Exception:
            self.log("Didn't find product price: " + response.url + "\n",
                     level=log.INFO)

    # optional fields: silently skipped when the nodes are missing
    try:
        item['product_model'] = selector.select(
            "//strong[@itemprop='mpn']/text()").extract()[0]
    except Exception:
        pass
    try:
        item['product_brand'] = selector.select(
            "//div[@itemprop='manufacturer']/meta/@content").extract()[0]
    except Exception:
        pass

    return item
def extract_product_data(self, response, item):
    """Extract name, USD price, model and brand for the product page.

    :param response: scrapy Response for a product page
    :param item: partially filled SearchItem (must contain 'origin_url')
    :return: the item, or None when the page has no product name
    """
    hxs = HtmlXPathSelector(response)

    # product name is mandatory; ignore products without one
    try:
        item['product_name'] = hxs.xpath(
            "//h1[@itemprop='name']/text()").extract()[0]
    except:
        self.log("Error: No product name: " + str(response.url) +
                 " from product: " + item['origin_url'], level=log.INFO)
        # ignore products with no name
        return

    try:
        price = hxs.select(
            "//p[@class='price']/span[@itemprop='price']/text()").extract(
            )[0]
        # remove commas separating orders of magnitude (ex 2,000)
        price = re.sub(",", "", price)
        m = re.match("(\xa3|\$)([0-9]+\.?[0-9]*)", price)
        if not m:
            # BUGFIX: this branch previously referenced the undefined names
            # price_amount/price_currency, raising NameError that the
            # except below swallowed and mislogged as a missing price.
            self.log("Didn't match product price: " + price + " " +
                     response.url + "\n", level=log.WARNING)
        else:
            price_amount = m.group(2)
            price_currency = m.group(1)
            # normalize to USD so prices are comparable across sites
            price_value = Utils.convert_to_dollars(float(price_amount),
                                                   price_currency)
            item['product_target_price'] = price_value
    except Exception:
        self.log("Didn't find product price: " + response.url + "\n",
                 level=log.INFO)

    # optional fields from the "additional details" table
    try:
        item['product_model'] = hxs.select(
            "//div[@id='product_additional_details_container']"
            "//tr[starts-with(.//text()[normalize-space()], 'Model')]/td/text()"
        ).extract()[0].strip()
    except Exception:
        pass
    try:
        item['product_brand'] = hxs.select(
            "//div[@id='product_additional_details_container']"
            "//tr[starts-with(.//text()[normalize-space()], 'Brand')]/td/text()"
        ).extract()[0].strip()
    except Exception:
        pass

    return item
def extract_product_data(self, response, item):
    """Fill in name, brand, model and USD price for an ebay product page.

    All fields are best-effort; the item is always returned.
    """
    selector = HtmlXPathSelector(response)

    # extract product name
    name_nodes = selector.select("//h1[@id='itemTitle']/text()").extract()
    if name_nodes:
        item['product_name'] = name_nodes[0]
    else:
        self.log("Error: No product name: " + str(response.url),
                 level=log.INFO)

    # extract product brand from the attributes table
    brand_nodes = selector.select(
        "//td[@class='attrLabels'][contains(normalize-space(),'Brand')]"
        "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']"
    ).extract()
    if brand_nodes:
        item['product_brand'] = brand_nodes[0]

    # extract product model: prefer the "Model" row, fall back to "MPN"
    model_nodes = selector.select(
        "//td[@class='attrLabels'][contains(normalize-space(),'Model')]"
        "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']"
    ).extract()
    if not model_nodes:
        model_nodes = selector.select(
            "//td[@class='attrLabels'][contains(normalize-space(),'MPN')]"
            "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']"
        ).extract()
    if model_nodes:
        item['product_model'] = model_nodes[0]

    # TODO: upc?

    # regular or sale price; converted to dollars when not already USD
    price_selector = selector.select(
        "//span[@itemprop='price']/text() | //span[@id='mm-saleDscPrc']/text()"
    )
    try:
        (currency, price) = price_selector.re("(\$|\xa3)([0-9\.]+)")
        if currency != "$":
            price = Utils.convert_to_dollars(float(price), currency)
        item['product_target_price'] = float(price)
    except:
        self.log("No price: " + str(response.url), level=log.WARNING)

    return item
def parse_product_currys(self, response):
    """Parse a currys product page and chain on to the next pending result.

    Builds a SearchItem from the page (name, brand, GBP price converted to
    USD) and adds it to the accumulated `items` set, then either issues a
    Request for the next pending product url or, when none remain, marks
    the response as 'parsed' and hands everything back to reduceResults.
    """
    hxs = HtmlXPathSelector(response)
    # results accumulated so far across the request chain (a set: see items.add below)
    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    #item['origin_site'] = site
    item['origin_url'] = origin_url
    item['origin_name'] = response.meta['origin_name']
    # optional origin fields: only present when the source product had them
    if 'origin_model' in response.meta:
        item['origin_model'] = response.meta['origin_model']
    if 'origin_upc' in response.meta:
        item['origin_upc'] = response.meta['origin_upc']
    if 'origin_brand' in response.meta:
        item['origin_brand'] = response.meta['origin_brand']

    # product name is mandatory; abort when missing
    product_name_node = hxs.select("//span[@itemprop='name']/text()").extract()
    if product_name_node:
        product_name = product_name_node[0].strip()
    else:
        self.log("Error: No product name: " + str(response.url) + " for source product " + origin_url, level=log.ERROR)
        # TODO: is this ok? I think so
        # NOTE(review): returning here ends the chain, so the remaining
        # search_results in meta are never parsed -- confirm this is intended
        return

    item['product_name'] = product_name

    # extract product model number
    # TODO: no model?
    # TODO: no upc?

    # NOTE(review): this is the same xpath used for the product name above,
    # so 'product_brand' receives the name text rather than a brand -- confirm
    brand_holder = hxs.select("//span[@itemprop='name']/text()").extract()
    if brand_holder:
        item['product_brand'] = brand_holder[0]

    # extract price
    price_holder = hxs.select("//span[@class='currentPrice']/ins/text()").extract()
    # if we can't find it like above try other things:
    if price_holder:
        product_target_price = price_holder[0].strip()
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",","",product_target_price)
        # pounds only ("\xa3"); converted to dollars below
        m = re.match("(\xa3)([0-9]+\.?[0-9]*)", product_target_price)
        if m:
            item['product_target_price'] = float(m.group(2))
            currency = m.group(1)
            item['product_target_price'] = Utils.convert_to_dollars(item['product_target_price'], currency)
        else:
            self.log("Didn't match product price: " + product_target_price + " " + response.url + "\n", level=log.WARNING)
    else:
        self.log("Didn't find product price: " + response.url + "\n", level=log.INFO)

    # add result to items
    items.add(item)

    product_urls = response.meta['search_results']

    # try to send request to parse next product, try until url for next product url is valid (response not 404)
    # this is needed because if next product url is not valid, this request will not be sent
    # and all info about this match (stored in request meta) would be lost

    # find first valid next product url
    next_product_url = None
    if product_urls:
        next_product_url = product_urls.pop()

    # if a next product url was found, send new request back to parse_product_url
    if next_product_url:
        request = Request(next_product_url, callback = self.parse_product_currys, meta = response.meta)
        request.meta['items'] = items
        # eliminate next product from pending list (this will be the new list with the first item popped)
        request.meta['search_results'] = product_urls
        return request

    # if no next valid product url was found
    else:
        # we are done; send the response back to reduceResults (no need for a new request)
        # pass the newly collected items along as meta
        # also set the 'parsed' flag to indicate that parsing of all products was
        # completed and they can be further used
        # (i.e. that the call was made from this method and not the initial one,
        # so it has to move on to the next request)
        response.meta['parsed'] = True
        response.meta['items'] = items
        return self.reduceResults(response)
def parse_product_amazon(self, response):
    """Parse an amazon product page into a SearchItem.

    Extracts name, model, UPC, manufacturer code, bestsellers rank, ASIN,
    brand, price (normalized to USD), category tree, keywords and image,
    and appends the item to
    self.results[origin_product_id]['search_requests'][query]['product_items'].

    When no product name is found, the page may be a captcha interstitial:
    the captcha is solved via self.CB and the request is re-submitted as a
    FormRequest, up to self.MAX_CAPTCHA_RETRIES times.
    """
    hxs = HtmlXPathSelector(response)
    origin_product_id = response.meta['origin_product_id']
    current_query = response.meta['query']
    origin_url = self.results[origin_product_id]['origin_product'][
        'origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    # copy every field of the origin product into the new item
    for field in self.results[origin_product_id]['origin_product'].keys():
        item[field] = self.results[origin_product_id]['origin_product'][
            field]

    # all product items from all queries
    # NOTE(review): `items` is not referenced again in this method -- confirm
    items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
        self.results[origin_product_id]['search_requests']), [])

    # all product urls from all queries
    # NOTE(review): `product_urls` is also unused below -- confirm
    product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
        self.results[origin_product_id]['search_requests']), [])
    product_urls = set(product_urls)

    #TODO: to test this
    #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())

    # try several page layouts for the product title, in order of preference
    product_name_node = hxs.select(
        '//h1[@id="title"]/span[@id="productTitle"]/text()').extract()
    product_name = None
    if not product_name_node:
        product_name_node = hxs.select(
            '//h1[@id="aiv-content-title"]//text()').extract()
    if not product_name_node:
        product_name_node = hxs.select(
            '//div[@id="title_feature_div"]/h1//text()').extract()
    if product_name_node:
        product_name = product_name_node[0].strip()
    else:
        # needs special treatment
        product_name_node = hxs.select(
            '//h1[@class="parseasinTitle " or @class="parseasinTitle"]/span[@id="btAsinTitle"]//text()'
        ).extract()
        if product_name_node:
            product_name = " ".join(product_name_node).strip()

    if not product_name:
        # log this error:
        # if number of retries were not exhausted, it might just be a captcha page, not an insurmountable error
        # NOTE(review): the log messages below say "walmart product" in this
        # amazon parser -- likely a copy/paste; confirm before changing the string
        if 'captcha_retries' in response.meta and response.meta[
                'captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:
            self.log("Error: No product name: " + str(response.url)
                     + " for walmart product " + origin_url,
                     level=log.WARNING)
        else:
            # if it comes from a solved captcha page, then it's an error if it's still not found
            self.log("Error: No product name: " + str(response.url)
                     + " for walmart product " + origin_url,
                     level=log.ERROR)

        # try this: don't remove captcha_retries from meta, may cause infinite loops, works
        # if response.meta['captcha_retries'] > self.MAX_CAPTCHA_RETRIES:
        #     del response.meta['captcha_retries']

        # if we have reached maximum number of retries, do nothing (item just won't be added to the "items" list)
        # if we haven't reached maximum retries, try again
        if 'captcha_retries' not in response.meta \
            or 'captcha_retries' in response.meta and response.meta['captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:
            # assume there is a captcha to crack
            # check if there is a form on the page - that means it's probably the captcha form
            forms = hxs.select("//form")
            if forms:
                # solve captcha
                captcha_text = None
                image = hxs.select(".//img/@src").extract()
                if image:
                    captcha_text = self.CB.solve_captcha(image[0])
                # value to use if there was an exception
                if not captcha_text:
                    captcha_text = ''

                # create a FormRequest to this same URL, with everything needed in meta
                # items, cookies and search_urls not changed from previous response so no need to set them again
                # redo the entire request (no items will be lost)
                meta = response.meta
                # flag indicating how many times we already retried to solve captcha
                if 'captcha_retries' in meta:
                    meta['captcha_retries'] += 1
                else:
                    meta['captcha_retries'] = 1
                return [
                    FormRequest.from_response(
                        response,
                        callback=self.parse_product_amazon,
                        formdata={'field-keywords': captcha_text},
                        meta=meta)
                ]
    else:
        item['product_name'] = product_name

        # extract product model number
        # NOTE(review): the third xpath alternative contains a line break
        # inside the literal 'Item model number:', so that alternative can
        # never match a normalized text node -- confirm and repair
        model_number_holder = hxs.select(
            """//tr[@class='item-model-number']/td[@class='value']/text() | //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text() | //span/text()[normalize-space()='Item model 
number:']/parent::node()/parent::node()/span[2]/text()"""
        ).extract()
        if model_number_holder:
            item['product_model'] = model_number_holder[0].strip()
        # if no product model explicitly on the page, try to extract it from name
        else:
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted
##                print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

        upc_node = hxs.select(
            "//li/b/text()[normalize-space()='UPC:']/parent::node()/parent::node()/text()"
        ).extract()
        if upc_node:
            # NOTE(review): stored as a list of whitespace-separated tokens,
            # not a single string -- confirm downstream consumers expect that
            upc = upc_node[0].strip().split()
            item['product_upc'] = upc

        manufacturer_code_node = hxs.select(
            "//li/b/text()[normalize-space()='Manufacturer reference:']/parent::node()/parent::node()/text()"
        ).extract()
        if manufacturer_code_node:
            manufacturer_code = manufacturer_code_node[0].strip()
            item['manufacturer_code'] = manufacturer_code

        try:
            # for lowest level category:
            # TODO: test the xpath for the second type of page (see second type of xpath for top-level category)
            # bestsellers_rank = hxs.select("//tr[@id='SalesRank']/td[@class='value']/ul/li/span/text()" + \
            # "| //li[@id='SalesRank']/ul/li/span/text()").re("#[0-9,]+")[0]

            # for top-level category:
            bestsellers_rank = hxs.select(
                "//tr[@id='SalesRank']/td[@class='value']/text()" +
                " | //li[@id='SalesRank']/text()").re("#[0-9,]+")[0]
            # drop the leading '#' and thousands commas before int()
            item['bestsellers_rank'] = int(
                re.sub(",", "", "".join(bestsellers_rank[1:])))
        except Exception, e:
            if self.output == 6 or self.bestsellers_link:
                self.log("Didn't find product rank: " + str(e) + " " +
                         response.url + "\n", level=log.INFO)

        asin_node = hxs.select(
            "//li/b/text()[normalize-space()='ASIN:']/parent::node()/parent::node()/text()"
        ).extract()
        if asin_node:
            item['product_asin'] = asin_node[0].strip()

        brand_holder = hxs.select(
            "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()"
        ).extract()
        if brand_holder:
            item['product_brand'] = brand_holder[0]
        else:
            pass
            #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

        # extract price
        #! extracting list price and not discount price when discounts available?
        price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
            "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

        # if we can't find it like above try other things:
        if not price_holder:
            # prefer new prices to used ones
            # TODO: doesn't work for amazon.co.uk (pounds), but isn't needed very often
            price_holder = hxs.select(
                "//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]"
            ).extract()
        if price_holder:
            product_target_price = price_holder[0].strip()
            # remove commas separating orders of magnitude (ex 2,000)
            product_target_price = re.sub(",", "", product_target_price)

            m = re.match("(\$|\xa3)([0-9]+\.?[0-9]*)", product_target_price)
            if m:
                item['product_target_price'] = float(m.group(2))
                currency = m.group(1)
                # normalize non-USD prices to dollars
                if currency != "$":
                    item[
                        'product_target_price'] = Utils.convert_to_dollars(
                            item['product_target_price'], currency)
            else:
                self.log("Didn't match product price: " +
                         product_target_price + " " + response.url + "\n",
                         level=log.WARNING)
        else:
            self.log("Didn't find product price: " + response.url + "\n",
                     level=log.INFO)

        try:
            item['product_category_tree'] = \
                filter(None, map(lambda c: c.strip(), hxs.select("//ul[li[@class='a-breadcrumb-divider']]/li/span[@class='a-list-item']/a/text()").extract()))
        except:
            pass

        try:
            item['product_keywords'] = hxs.select(
                "//meta[@name='keywords']/@content").extract()[0]
        except:
            pass

        try:
            product_image = hxs.select(
                "//img[@id='landingImage']/@src").extract()[0]
            item['product_image_url'] = product_image
            item['product_image_encoded'] = ProcessText.encode_image(
                product_image)
        except:
            pass

        # add result to items
        self.results[origin_product_id]['search_requests'][current_query][
            'product_items'].append(item)
def extract_product_data(self, response, item):
    """Extract name, USD price, model, brand and UPC for the product page.

    Tries three successively looser xpaths for the product name; pages
    with no name at all are ignored (returns None). All other fields are
    best-effort: failures are silently skipped and the item is returned.
    """
    hxs = HtmlXPathSelector(response)

    # product name: try the title header, then the pdp title div, then any h1
    try:
        item['product_name'] = hxs.xpath(
            "//h1[starts-with(@class,'title')]//text()").extract(
            )[0].strip()
    except:
        try:
            item['product_name'] = hxs.xpath(
                "//div[@class='pdp_title']//text()[normalize-space()!='']"
            ).extract()[0].strip()
        except:
            try:
                item['product_name'] = hxs.xpath(
                    "//h1//text()").extract()[0].strip()
            except:
                # out of stock products return 404s with this text, not the actual product page
                # (only used to suppress the error log; the product is
                # skipped either way)
                out_of_stock = hxs.xpath(
                    "//strong[contains(text(),'out of stock')]").extract()
                if not out_of_stock:
                    self.log("Error: No product name: " + str(response.url)
                             + " from product: " + item['origin_url'],
                             level=log.ERROR)
                # ignore products with no name
                return None

    # the price meta attribute holds currency symbol + amount in one string:
    # first char is the currency, the rest is the numeric amount
    price_node = hxs.select("//meta[@itemprop='price']/@content").extract()
    if price_node:
        try:
            price_currency = price_node[0][0]
            price_amount = "".join(price_node[0][1:])
            # remove commas separating orders of magnitude (ex 2,000)
            price_amount = re.sub(",", "", price_amount)
            m1 = re.match("[0-9]+\.?[0-9]*", price_amount)
            m2 = re.match("(\xa3)|(\$)", price_currency)
            if not m1 or not m2:
                self.log("Didn't match product price: " + price_amount +
                         price_currency + " " + response.url + "\n",
                         level=log.WARNING)
            else:
                # normalize to USD so prices are comparable across sites
                price = Utils.convert_to_dollars(float(price_amount),
                                                 price_currency)
                item['product_target_price'] = price
        except Exception:
            self.log("Didn't find product price: " + response.url + "\n",
                     level=log.INFO)

    # model: "Style No. XXXX" inside the product description bullet list
    try:
        product_model_node = hxs.select(
            "//div[@class='prod_description1']//li[contains(text(), 'Style')]/text()"
        ).re("[sS]tyle +[nN]o\.? +[a-zA-Z0-9]+")
        item['product_model'] = re.match(
            "[sS]tyle +[nN]o\.? +([a-zA-Z0-9]+)",
            product_model_node[0]).group(1)
    except Exception:
        pass

    try:
        item['product_brand'] = hxs.select(
            "//meta[@itemprop='brand']/@content").extract()[0]
    except Exception:
        pass

    # UPC is embedded in an inline script as "skuUpcCode":"..."
    try:
        js_body = hxs.select(
            "//script[contains(text(),'Upc')]/text()").extract()[0]
        item['product_upc'] = re.match('.*"skuUpcCode":"([0-9a-zA-Z]+)".*',
                                       js_body, re.DOTALL | re.MULTILINE).group(1)
    except Exception:
        pass

    return item