def parse_product(self, response):
    """Build a single ``Product`` from a product detail page.

    The name is the first ``<h1>`` text inside the ``product-overview`` div.
    The price is the ``<strong>`` text of the price span, followed by the
    span's own first text node when one exists.  When the name or the price
    cannot be found an error is logged and nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    names = selector.select("//div[@class='product-overview']/h1[1]/text()").extract()
    if not names:
        logging.error("ERROR! NO NAME! %s" % page_url)
        return
    product_name = names[0]

    strong_parts = selector.select(
        "//div[@class='product-overview']/div/p/span[@class='price']/strong/text()"
    ).extract()
    if not strong_parts:
        logging.error("ERROR! NO PRICE! %s %s" % (page_url, product_name))
        return
    price_text = strong_parts[0]

    # Text placed directly in the span (outside <strong>) is appended as-is.
    trailing_parts = selector.select(
        "//div[@class='product-overview']/div/p/span[@class='price']/text()"
    ).extract()
    if trailing_parts:
        price_text = price_text + trailing_parts[0]

    loader = ProductLoader(item=Product(), response=response)
    for field, value in (
        ('identifier', product_name),
        ('name', product_name),
        ('url', page_url),
        ('price', price_text),
    ):
        loader.add_value(field, value)
    yield loader.load_item()
def parse_search(self, response):
    """Yield a ``Product`` for every result row of a search page.

    Rows without a title are skipped silently.  Rows that have a title but
    lack a link or a price are logged and skipped.
    """
    selector = HtmlXPathSelector(response)
    rows = selector.select(
        "//div[@id='content']/div[@id='searchcontent']/div[@class='hasResults']"
        "/form/div[@id='switchview']/ol/li"
    )
    for row in rows:
        titles = row.select("ul/li[@class='producttitle']/h4/a[1]/text()").extract()
        if not titles:
            continue
        title = titles[0]

        links = row.select("ul/li[@class='producttitle']/h4/a[1]/@href").extract()
        if not links:
            logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, title))
            continue

        prices = row.select(
            "ul/li[contains(@class, 'pricing')]/ul/li[contains(@class, 'price')]/text()"
        ).extract()
        if not prices:
            logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, title))
            continue

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (
            ('identifier', title),
            ('name', title),
            ('url', links[0]),
            ('price', prices[0]),
        ):
            loader.add_value(field, value)
        yield loader.load_item()
def parse_product(self, response):
    """Yield one ``Product`` per purchasable variant of a product page.

    A page either shows a single price (``td.ProductPrice``, overridden by a
    pound amount found in any ``<span>`` when present) or a ``<select>`` of
    options whose text carries both the variant name and its price.

    Non-HTML responses are ignored.  A page without an ``<h1>`` is logged and
    skipped instead of raising ``IndexError``; an option whose text does not
    contain a price is logged and skipped instead of raising
    ``AttributeError`` and losing the rest of the page.
    """
    import logging  # local import: the file's import block is not visible here

    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)

    headings = hxs.select('//h1/text()').extract()
    if not headings:
        logging.error("ERROR! NO NAME! %s" % response.url)
        return
    name = headings[0]

    multiple_prices = hxs.select('//select[@class="smalltextblk"]/option/text()').extract()
    single_special_price = hxs.select('//span/text()').re('\xa3(.*[0-9]+)')
    single_price = hxs.select('//td[@class="ProductPrice"]/text()').re('\xa3(.*[0-9])')

    products_data = []
    if single_price and not multiple_prices:
        # A special price, when present, wins over the regular one.
        price = single_special_price[0] if single_special_price else single_price[0]
        products_data.append((name, price))
    else:
        # The first <option> is dropped (presumably a placeholder entry).
        for option_text in multiple_prices[1:]:
            match = re.match('(.*)\xa3(.*\.[0-9]+)', option_text)
            if match is None:
                logging.error("ERROR! NO PRICE! %s %s" % (response.url, option_text))
                continue
            option_name, option_price = match.groups()
            products_data.append((name + ' ' + option_name, option_price))

    for item_name, item_price in products_data:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', item_name)
        loader.add_value('price', item_price)
        loader.add_value('sku', '')
        yield loader.load_item()
def parse_table_options_type2_single_product_page(self, response):
    """Yield one ``Product`` per row of the ``product_body`` options table.

    The page-level name is combined with each row's own label
    (``"<name> <label>"``).  Rows lacking a label or a price are logged and
    skipped.  When a row's price text matches ``prices_range_regex`` the page
    is re-requested with ``parse_product_list`` as callback instead of
    producing an item for that row.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    headings = selector.select(
        "//div[@id='mainContent']/center/table/tr[1]/td[1]"
        "/p[2][not(@class)][*[local-name()='strong']]/strong[1]//text()"
    ).extract()
    if not headings:
        logging.error("ERROR!! NO NAME!! %s" % (response.url, ))
        return
    base_name = headings[0]

    rows = selector.select(
        "//div[@id='mainContent']/center/table//table[@class='product_body']/tr[position()>1]"
    )
    for row in rows:
        labels = row.select("td[1]//text()").extract()
        if not labels:
            logging.error("ERROR!! NO NAME!! %s" % (response.url, ))
            continue
        label = labels[0]

        price_texts = row.select('td[3]//text()').extract()
        if not price_texts:
            logging.error("ERROR!! NO PRICE!! %s %s" % (base_name, response.url))
            continue
        price_text = price_texts[0]

        if re.search(prices_range_regex, price_text):
            yield Request(page_url, callback=self.parse_product_list)
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', page_url)
        loader.add_value('name', "%s %s" % (base_name, label))
        loader.add_value('price', price_text)
        loader.add_value('sku', '')
        yield loader.load_item()
def parse(self, response): base_url = get_base_url(response) hxs = HtmlXPathSelector(response) items = hxs.select("//div[@class='navArea']/div[@class='navAreaPagging fr']/span[@class='paggingBtnNext']/a/@href").extract() for item in items: yield Request(urljoin_rfc(base_url,item), callback=self.parse) content = hxs.select("//div[@class='mainProducts']") products = content.select(".//a") for product_ in products: name = product_.select(".//ul/li/span[@class='productName']/text()").extract() url = product_.select(".//@href").extract() price = product_.select(".//ul//li/ul/li[1]/span[@class='orange']/text()").re(r'\xa3(.*)') if not price: price = product_.select(".//ul/li/ul/li[1]/span[@class='gray']/text()").re(r'\xa3(.*)') if name: l = ProductLoader(item=Product(), response=response) l.add_value('name', name) l.add_value('url', url) l.add_value('price', price) l.load_item() yield l.load_item() """content = hxs.select("//div[@class='mainProducts']")
def parse_product(self, response):
    """Yield ``Product`` items for a page with one price or a list of options.

    When a single price is found (``//span/b`` text containing a pound
    amount) the page ``<h1>`` is used as the name.  Otherwise every
    ``<option>`` text is split into a ``(name, price)`` pair.

    Non-HTML responses are ignored.  Options whose text carries no price are
    skipped rather than raising ``AttributeError`` and aborting the page, and
    the ``<h1>`` is only required on the single-price path, which is the only
    place it is used.
    """
    import logging  # local import: the file's import block is not visible here

    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    headings = hxs.select('//h1/text()').extract()
    multiple_prices = hxs.select('//option/text()').extract()
    single_price = hxs.select('//span/b/text()').re('\xa3(.*)')

    products_data = []
    if single_price:
        if not headings:
            logging.error("ERROR! NO NAME! %s" % response.url)
            return
        products_data.append((headings[0], single_price[0]))
    else:
        for option_text in multiple_prices:
            option_text = re.sub('[\t\r\n]', '', option_text).strip()
            match = re.match('(.*[0-9,a-z,A-Z\)]).*\xa3(.*[0-9])', option_text)
            if match is None:
                # Not a "name ... price" option; nothing to scrape from it.
                continue
            products_data.append(match.groups())

    for item_name, item_price in products_data:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', item_name)
        loader.add_value('price', item_price)
        loader.add_value('sku', '')
        yield loader.load_item()
def parse_product(self, response):
    """Yield a ``Product`` built from the page's ``itemprop`` markup.

    The name is the first ``h1[itemprop=name]`` text and the price is the
    *last* ``span[itemprop=price]`` text.  The SKU is the second-to-last
    ``/``-separated segment of the response URL.  A missing name or price is
    reported on stdout and nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    # Earlier selector: //div[@id="package_showcase"]/div[@id="description"]/h1/text()
    names = selector.select('//h1[@itemprop="name"]/text()').extract()
    if not names:
        print("ERROR!! NO NAME!! %s" % page_url)
        return

    # Earlier selector: //div[@id="package_showcase"]/div[@id="pricing"]/strong[last()]/text()
    prices = selector.select('//span[@itemprop="price"]/text()').extract()
    if not prices:
        print("ERROR!! NO PRICE!! %s" % page_url)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', page_url)
    loader.add_value('name', names[0])
    loader.add_value('price', prices[-1])
    loader.add_value('sku', response.url.split('/')[-2])
    yield loader.load_item()
def parse_product(self, response):
    """Yield a ``Product`` for a detail page with a ``pageTitle`` heading.

    All text nodes of the heading's spans are joined with spaces and runs of
    whitespace are collapsed to a single space.  The price is the first
    ``currentPrice`` text inside the ``productDetail`` container.  A missing
    name or price is logged and nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    title_parts = selector.select("//h1[@class='pageTitle']/span/text()").extract()
    if not title_parts:
        logging.error("ERROR! NO NAME! %s" % page_url)
        return
    title = re.sub("[\s]+", " ", " ".join(title_parts))

    prices = selector.select(
        "//div[contains(@class, 'productDetail')]//span[contains(@class, 'currentPrice')]/text()"
    ).extract()
    if not prices:
        logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
        return

    loader = ProductLoader(item=Product(), response=response)
    for field, value in (
        ('identifier', title),
        ('name', title),
        ('url', page_url),
        ('price', prices[0]),
    ):
        loader.add_value(field, value)
    yield loader.load_item()
def parse_search(self, response):
    """Follow pagination links and yield a ``Product`` per search result.

    Every pagination ``href`` other than ``'#'`` is requested with this same
    callback.  Results without a title are skipped silently; results missing
    a link or a price are logged and skipped.
    """
    selector = HtmlXPathSelector(response)

    # Pagination.
    for href in selector.select("//ul[@class='pagination']//a/@href").extract():
        if href == '#':
            continue
        yield Request(href, callback=self.parse_search)

    # Products.
    cards = selector.select(
        "//article[contains(@class, 'product')]/div[contains(@class, 'desc')]"
    )
    for card in cards:
        titles = card.select(".//div/header[@class='productTitle']/a/text()").extract()
        if not titles:
            continue
        title = re.sub("[\s]+", " ", titles[0].strip())

        links = card.select(".//div/header[@class='productTitle']/a/@href").extract()
        if not links:
            logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, title))
            continue

        prices = card.select(".//div//span[@class='currentPrice']/ins/text()").extract()
        if not prices:
            logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, title))
            continue

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (
            ('identifier', title),
            ('name', title),
            ('url', links[0]),
            ('price', prices[0].strip()),
        ):
            loader.add_value(field, value)
        yield loader.load_item()
def parse_options(self, response):
    """Yield a ``Product`` whose SKU is looked up from ``skus.csv``.

    Name and price each have a primary and a fallback selector.  The
    manufacturer part number is read from the spec paragraph whose label is
    exactly ``'Mfg Part #:'`` (the last such paragraph wins) and matched
    against column 3 of ``skus.csv``; column 0 of the first matching row
    becomes the SKU.  Nothing is yielded unless both name and price exist.

    The CSV file is now closed deterministically, and spec paragraphs without
    a label are skipped with an explicit check instead of a bare ``except``.
    """
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//div[@id="skuinfo"]/h1[@itemprop="name"]/text()').extract()
    if not name:
        name = hxs.select('//div[@class="details"]/h1/text()').extract()

    price = "".join(
        hxs.select('//div[@class="club"]/span[@itemprop="Price"]/text()').re(r'([0-9\,\. ]+)')
    ).strip()
    if not price:
        price = "".join(
            hxs.select('//div[@class="details"]/div[@class="special"]/text()').re(r'([0-9\,\. ]+)')
        ).strip()

    model_no = None
    for spec in hxs.select('//div[@id="specs"]/div/p[@class="specs"]'):
        labels = spec.select('./span/text()').extract()
        if labels and labels[0] == 'Mfg Part #:':
            model_no = "".join(spec.select("./text()").extract()).strip()

    if not (name and price):
        return

    sku_ = ''
    if model_no:
        # ``with`` guarantees the handle is released even if a row is malformed.
        with open(os.path.join(HERE, 'skus.csv')) as csv_handle:
            for row in UnicodeReader(csv_handle):
                if row[3] == model_no:
                    sku_ = row[0]
                    break

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('name', name[0])
    product_loader.add_value('sku', sku_)
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)
    yield product_loader.load_item()
def parse_item(self, response):
    """Yield a ``Product`` for a detail page, or follow links from a listing.

    When the product-name row exists the page is treated as a single product:
    the custom price is preferred and the regular price row is the fallback
    (the price list is passed to the loader as extracted).  Otherwise each
    product-info table found is followed back into this callback.
    """
    base_url = get_base_url(response)
    selector = HtmlXPathSelector(response)

    names = selector.select("//tr[@id='ProductDetail11_trProductName']/td/text()").extract()
    if names:
        product_name = names[0].strip()
        prices = selector.select(
            "//tr[@id='ProductDetail11_trCustomPrice']/td/font/b/text()"
        ).extract()
        if not prices:
            prices = selector.select("//tr[@id='ProductDetail11_trPrice']/td/text()").extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', str(product_name))
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('price', prices)
        yield loader.load_item()
        return

    # No name row: the page may list several products.
    tables = selector.select("//table[@id='SearchTemplate13_DataGrid1']// \
        table[@id='SearchTemplate13_DataGrid1__ctl3_ProductInfoTable']")
    for table in tables:
        # NOTE(review): this XPath starts with "//", so it searches the whole
        # document rather than just ``table`` - confirm that is intended.
        links = table.select(
            "//tr[@id='SearchTemplate13_DataGrid1__ctl3_ProductNameRow']/td/a/@href"
        ).extract()
        if links:
            yield Request(urljoin_rfc(base_url, links[0]), callback=self.parse_item)
def parse_product(self, response):
    """Yield items (or follow-up requests) for each entry in the content box.

    For every ``item`` div: when no ``item-price`` block exists the entry's
    "more info" link is requested; otherwise a ``Product`` is built from the
    entry name, the extracted price block and the absolute link.

    Non-HTML responses are ignored.  Entries without a "more info" link or
    without a name are skipped individually.  Previously a missing link
    raised ``IndexError`` outside the ``try`` and discarded the remaining
    entries of the page.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    entries = hxs.select('//div[@class="content-box"]/div[contains(@class,"item")]')
    for entry in entries:
        links = entry.select('.//div[@class="moreinfo"]/a/@href').extract()
        if not links:
            continue
        url = urljoin_rfc(self.URLBASE, links[0])

        price = entry.select('.//div[@class="item-price"]').extract()
        if not price:
            # No price in the listing: follow the link instead.
            yield Request(url)
            continue

        names = entry.select('.//div[@class="item-name"]/a/text()').extract()
        if not names:
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', url)
        loader.add_value('name', names[0])
        loader.add_value('price', price)
        loader.add_value('sku', '')
        yield loader.load_item()
def parse_search(self, response):
    """Follow the top pagination links and yield a ``Product`` per result.

    Pagination hrefs are resolved against the page base URL and requested
    with this same callback.  Results without a name are skipped silently;
    results missing a link or a price are logged and skipped.
    """
    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # Pagination.
    for href in selector.select("//div[@class='pagination top']//a/@href").extract():
        yield Request(urljoin_rfc(base_url, href), callback=self.parse_search)

    # Products.
    for entry in selector.select("//div[@class='search-result']/form/ul/li"):
        titles = entry.select(
            "div[@class='prd-infos']/a/p[@class='prd-name']/strong/text()"
        ).extract()
        if not titles:
            continue
        title = titles[0]

        links = entry.select("div[@class='prd-infos']/a/@href").extract()
        if not links:
            logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, title))
            continue

        prices = entry.select(
            "div[@class='prd-actions']/p[@class='prd-amount']/strong/text()"
        ).extract()
        if not prices:
            logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, title))
            continue

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (
            ('identifier', title),
            ('name', title),
            ('url', links[0]),
            ('price', prices[0]),
        ):
            loader.add_value(field, value)
        yield loader.load_item()
def parse_products(self, response):
    """Yield a ``Product`` for each cell of the ``ProductDataList`` table.

    The name is the concatenated link text, the URL is the link ``href``
    resolved against the page base URL, and the price is the pound amount in
    the cell's ``ModelPrice`` block.

    A cell missing its name, link or price is logged and skipped.  The
    original used ``return`` here, which silently discarded every remaining
    cell of the page as soon as one was incomplete.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    cells = hxs.select(
        "//table[@id='ProductDataList']/tr/td[div[contains(@id, 'ModelLinkCell')]]"
    )
    for cell in cells:
        name_parts = cell.select(".//a[contains(@id, 'ModelLink')]//text()").extract()
        if not name_parts:
            logging.error("ERROR! NO NAME! %s" % response.url)
            continue
        name = "".join(name_parts)

        hrefs = cell.select(".//a[contains(@id, 'ModelLink')]/@href").extract()
        if not hrefs:
            logging.error("ERROR! NO URL! %s %s" % (name, response.url))
            continue
        url = urljoin_rfc(base_url, hrefs[0])

        prices = cell.select(
            "div[contains(@id, 'ModelPrice')]//td[@class='Label11']/text()"
        ).re(u'\xa3(.*)')
        if not prices:
            logging.error("ERROR! NO PRICE! %s %s" % (url, name))
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', name)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', prices[0])
        yield loader.load_item()
def parse_item(self, response):
    """Yield a ``Product`` from the ``product-shop`` block of a detail page.

    The price text has its ``.`` characters removed and its ``,`` turned into
    ``.`` before being handed to the loader.  A missing name or price is
    logged and nothing is yielded.
    """
    page_url = response.url
    selector = HtmlXPathSelector(response)

    names = selector.select(
        "//div[@class='product-shop']/div[@class='product-name']/h2/text()"
    ).extract()
    if not names:
        logging.error("NO NAME! %s" % page_url)
        return
    product_name = names[0]

    prices = selector.select(
        "//div[@class='product-shop']/div[@class='price-box']//span[@class='price']/text()"
    ).extract()
    if not prices:
        logging.error("NO PRICE! %s" % page_url)
        return
    price_text = prices[0].replace(".", "").replace(",", ".")

    # Delivery cost handling is disabled; kept for reference:
    # price_delivery = hxs.select("//div[@class='product-shop']//table[@id='product-attribute-specs-table']/tr/td[(preceding::th[text()='Spese Spedizione'])]/text()").extract()
    # if not price_delivery:
    #     logging.error("NO PRICE DELIVERY! %s" % url)
    #     return
    # price_delivery = price_delivery[0]
    # price = Decimal(price) + Decimal(price_delivery)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', str(product_name))
    loader.add_value('name', product_name)
    loader.add_value('url', page_url)
    loader.add_value('price', price_text)
    yield loader.load_item()
def parse(self, response):
    """Yield a ``Product`` for the page, tagged with the SKU from the request.

    The SKU is read from ``response.meta['sku']``.  Name and price are
    stripped of surrounding whitespace.  A missing name or price is logged
    together with the SKU and nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url
    sku = response.meta['sku']

    titles = selector.select("//h1[contains(@class, 'parseasinTitle')]/span/text()").extract()
    if not titles:
        logging.error('ERROR!! NO NAME!! %s "%s"' % (sku, page_url))
        return
    title = titles[0].strip()

    prices = selector.select(
        "//table[@class='product']//span[@id='actualPriceValue']/b/text()"
    ).extract()
    if not prices:
        logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' % (sku, title, page_url))
        return

    loader = ProductLoader(item=Product(), response=response, selector=selector)
    for field, value in (
        ('url', page_url),
        ('name', title),
        ('price', prices[0].strip()),
        ('sku', sku),
    ):
        loader.add_value(field, value)
    yield loader.load_item()
def parse_item(self, response):
    """Yield a ``Product`` whose price has the eco tax subtracted.

    The displayed price is converted with ``extract_price2uk`` and turned
    into a ``Decimal``.  When an eco-tax amount is shown it is reduced to
    ASCII, reported on stdout, converted the same way and subtracted.  The
    final price is passed to the loader as ``unicode``.
    """
    page_url = response.url
    selector = HtmlXPathSelector(response)

    names = selector.select(
        "//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()"
    ).extract()
    if not names:
        logging.error("NO NAME! %s" % page_url)
        return
    product_name = names[0]

    prices = selector.select(
        "//p[@class='price']/span[@class='our_price_display']/span/text()"
    ).extract()
    if not prices:
        logging.error("NO PRICE! %s" % page_url)
        return
    amount = Decimal(extract_price2uk(prices[0]))

    eco_taxes = selector.select("//p[@class='price-ecotax']/span/text()").extract()
    if eco_taxes:
        eco_ascii = eco_taxes[0].encode("ascii", "ignore")
        print("Found eco tax %s" % eco_ascii)
        amount -= Decimal(extract_price2uk(eco_ascii))

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value("identifier", str(product_name))
    loader.add_value("name", product_name)
    loader.add_value("url", page_url)
    loader.add_value("price", unicode(amount))
    yield loader.load_item()
def parse_product(self, response):
    """Yield a ``Product`` from the ``product-content`` block of a page.

    All text nodes under the header ``<h1>`` are joined with spaces to form
    the name; the price is the first text node of ``p#product-price``.  A
    missing name or price is logged and nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    title_parts = selector.select(
        "//div[@id='product-content']//div[@id='product-header']/h1//text()"
    ).extract()
    if not title_parts:
        logging.error("ERROR! NO NAME! %s" % page_url)
        return
    title = " ".join(title_parts)

    prices = selector.select(
        "//div[@id='product-content']//div[@id='productPrice']//p[@id='product-price']/text()"
    ).extract()
    if not prices:
        logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
        return

    loader = ProductLoader(item=Product(), response=response)
    for field, value in (
        ("identifier", title),
        ("name", title),
        ("url", page_url),
        ("price", prices[0]),
    ):
        loader.add_value(field, value)
    yield loader.load_item()
def parse_item(self, response):
    """Yield a ``Product`` priced as item price plus delivery price.

    The name is the last breadcrumb link.  Both amounts are read from the
    details list (first and second entry), have their commas removed and are
    added as ``Decimal`` values.  If the name, the price or the delivery
    price is missing the problem is logged and nothing is yielded.
    """
    page_url = response.url
    selector = HtmlXPathSelector(response)

    crumbs = selector.select("//div[@id='pageContentSub']/div[@class='moduleBox']/\
        div[@id='top_breadcrumb_link']/a[last()]/text()").extract()
    if not crumbs:
        logging.error("NO NAME! %s" % page_url)
        return
    product_name = crumbs[0]

    item_amounts = selector.select("//div[@id='pageContentSub']/form/div[@class='moduleBox']/\
        div[@class='content']/div[@class='details']/ul/li[1]/span[2]/text()").re(u'€ (.*)')
    if not item_amounts:
        logging.error("NO PRICE! %s" % page_url)
        return
    item_amount = item_amounts[0].replace(",", "")

    delivery_amounts = selector.select("//div[@id='pageContentSub']/form/div[@class='moduleBox']/\
        div[@class='content']/div[@class='details']/ul/li[2]/span[2]/text()").re(u'€ (.*)')
    if not delivery_amounts:
        logging.error("NO PRICE DELIVERY! %s" % page_url)
        return
    delivery_amount = delivery_amounts[0].replace(",", "")

    total = Decimal(item_amount) + Decimal(delivery_amount)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', str(product_name))
    loader.add_value('name', product_name)
    loader.add_value('url', page_url)
    loader.add_value('price', total)
    yield loader.load_item()
def parse_search(self, response):
    """Follow pagination links and yield a ``Product`` per grid entry.

    Pagination hrefs are requested as extracted, with this same callback.
    Entries without a name are skipped silently; entries missing a link or a
    price are logged and skipped.
    """
    selector = HtmlXPathSelector(response)

    # Pagination.
    for href in selector.select("//ul[@id='pagination']/li/a/@href").extract():
        yield Request(href, callback=self.parse_search)

    # Products.
    for entry in selector.select("//div[@class='column_one grid_list']/div"):
        titles = entry.select("div/div[@class='info']/div/h2/a/text()").extract()
        if not titles:
            continue
        title = titles[0]

        links = entry.select("div/div[@class='info']/div/h2/a/@href").extract()
        if not links:
            logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, title))
            continue

        prices = entry.select(
            "div/div[@class='pricebox']/p[@id='product-price']/text()"
        ).extract()
        if not prices:
            logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, title))
            continue

        loader = ProductLoader(item=Product(), response=response)
        for field, value in (
            ("identifier", title),
            ("name", title),
            ("url", links[0]),
            ("price", prices[0]),
        ):
            loader.add_value(field, value)
        yield loader.load_item()
def parse_product_list_columns(self, response):
    """Yield one ``Product`` per column of the ``product_body`` table.

    The number of columns is the ``<td>`` count of the table's third row.
    For column *n* the name comes from row 3 and the price from row 4.
    Columns lacking a name or price are logged and skipped.  When a price
    text matches ``prices_range_regex`` the page is re-requested with
    ``parse_product_list`` as callback instead of producing an item.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    raw_count = selector.select(
        "count(//table[@class='product_body']/tr[3]/td)"
    ).extract()[0]
    column_total = int(float(raw_count))

    for column in range(1, column_total + 1):
        names = selector.select(
            "//table[@class='product_body']/tr[3]/td[%d]/p//text()" % column
        ).extract()
        if not names:
            logging.error("ERROR!! NO NAME!! %s" % (response.url, ))
            continue
        product_name = names[0]

        prices = selector.select(
            "//table[@class='product_body']/tr[4]/td[%d]/p[1]/strong[last()]//text()" % column
        ).extract()
        if not prices:
            logging.error("ERROR!! NO PRICE!! %s %s" % (product_name, response.url))
            continue
        price_text = prices[0]

        if re.search(prices_range_regex, price_text):
            yield Request(page_url, callback=self.parse_product_list)
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', page_url)
        loader.add_value('name', product_name)
        loader.add_value('price', price_text)
        loader.add_value('sku', '')
        yield loader.load_item()
def parse_product(self, response):
    """Yield a ``Product`` from the ``handleBuy`` form of a detail page.

    The name is the first title span text and the price is the first
    ``priceLarge`` text in the actual-price row.  A missing name or price is
    logged and nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    titles = selector.select(
        "//form[@id='handleBuy']/div[@class='buying']/h1[@class='parseasinTitle']/span/text()"
    ).extract()
    if not titles:
        logging.error("ERROR! NO NAME! %s" % page_url)
        return
    title = titles[0]

    prices = selector.select(
        "//div[@id='priceBlock']//tr[@id='actualPriceRow']//b[@class='priceLarge']/text()"
    ).extract()
    if not prices:
        logging.error("ERROR! NO PRICE! %s %s" % (page_url, title))
        return

    loader = ProductLoader(item=Product(), response=response)
    for field, value in (
        ('identifier', title),
        ('name', title),
        ('url', page_url),
        ('price', prices[0]),
    ):
        loader.add_value(field, value)
    yield loader.load_item()
def parse_several_products_single_product_page(self, response):
    """Yield a ``Product`` per qualifying paragraph of the ``product_body`` table.

    A paragraph qualifies when it has no ``class`` attribute and contains a
    ``<strong>`` child.  The name is the first ``<strong>`` text; the price is
    the first match of the second ``<strong>`` or the last ``<b>``.
    Paragraphs lacking either are logged and skipped.  When a price text
    matches ``prices_range_regex`` the page is re-requested with
    ``parse_product_list`` as callback instead of producing an item.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    paragraphs = selector.select(
        "//table[@class='product_body']/tr/td[2]/p[not(@class)][*[local-name()='strong']]"
    )
    for paragraph in paragraphs:
        names = paragraph.select("strong[1]//text()").extract()
        if not names:
            logging.error("ERROR!! NO NAME!! %s" % (response.url, ))
            continue
        product_name = names[0]

        prices = paragraph.select('strong[2]/text() | b[last()]/text()').extract()
        if not prices:
            logging.error("ERROR!! NO PRICE!! %s %s" % (product_name, response.url))
            continue
        price_text = prices[0]

        if re.search(prices_range_regex, price_text):
            yield Request(page_url, callback=self.parse_product_list)
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', page_url)
        loader.add_value('name', product_name)
        loader.add_value('price', price_text)
        loader.add_value('sku', '')
        yield loader.load_item()
def parse_product(self, response):
    """Yield a ``Product`` per priced cell in the ``rightcol`` div.

    Candidate cells are ``<td>`` elements whose own text contains a pound
    sign, or that contain an ``<h1>``.  The price is the pound amount in the
    cell text, falling back to the ``_EKM_PRODUCTPRICE`` span; cells with no
    price are skipped.  The URL is the cell's link when it has one, otherwise
    the page URL.  Non-HTML responses are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    cells = selector.select(
        u'//div[@class="rightcol"]//td[contains(child::text(),"\xa3")] | //div[@class="rightcol"]//td[child::h1]'
    )
    for cell in cells:
        price = cell.select('./text()').re('\xa3(.*)')
        if not price:
            price = cell.select('.//span[@id="_EKM_PRODUCTPRICE"]/text()').extract()
        if not price:
            continue

        hrefs = cell.select('./a/@href').extract()
        if hrefs:
            url = urljoin_rfc(get_base_url(response), hrefs[0])
        else:
            url = response.url

        loader = ProductLoader(item=Product(), selector=cell)
        # The name may come from either the link text or the heading.
        loader.add_xpath('name', './a/text()')
        loader.add_xpath('name', './h1/text()')
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()
def parse_item(self, response):
    """Yield a ``Product`` priced as item price plus delivery price.

    Both amounts have ``.`` removed and ``,`` replaced by ``.`` before being
    added as ``Decimal`` values.  The delivery amount is read from text
    nodes of the ``product-shop`` div located between the price box and the
    ``add-to-holder`` div.  The identifier is the name with non-ASCII
    characters dropped.  Any missing piece is logged and nothing is yielded.
    """
    page_url = response.url
    selector = HtmlXPathSelector(response)

    names = selector.select("//h1[@class='product-name']/text()").extract()
    if not names:
        logging.error("NO NAME! %s" % page_url)
        return
    product_name = names[0]

    item_amounts = selector.select(
        "//div[@class='price-box']//span[@class='price']/text()"
    ).re(u'€ (.*)')
    if not item_amounts:
        logging.error("NO PRICE! %s" % page_url)
        return
    item_amount = item_amounts[0].replace(".", "").replace(",", ".")

    delivery_amounts = selector.select("//div[@class='product-shop']/\
        text()[(preceding::div[@class='price-box']) and (following::div[@class='add-to-holder'])]"
    ).re(u'€\xa0([\d,.]*)')
    if not delivery_amounts:
        logging.error("NO PRICE DELIVERY! %s" % page_url)
        return
    delivery_amount = delivery_amounts[0].replace(".", "").replace(",", ".")

    total = Decimal(item_amount) + Decimal(delivery_amount)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', product_name.encode("ascii", "ignore"))
    loader.add_value('name', product_name)
    loader.add_value('url', page_url)
    loader.add_value('price', total)
    yield loader.load_item()
def parse_product(self, response):
    """Yield a ``Product`` from the ``productDetail`` form of a page.

    The price is every text node of the ``pounds`` span joined together; when
    that span yields nothing the ``newPrice`` span is used instead.  A
    missing name or price is logged and nothing is yielded.
    """
    selector = HtmlXPathSelector(response)
    page_url = response.url

    names = selector.select("//div[@id='productDetail']/form/fieldset/h2/text()").extract()
    if not names:
        logging.error("ERROR! NO NAME! %s" % page_url)
        return
    product_name = names[0]

    price_parts = selector.select("//div[@id='productDetail']/form/fieldset/div[@class='price']/span[@class='productPrice']/\
        span[@class='pounds']/text()").extract()
    if not price_parts:
        price_parts = selector.select("//div[@id='productDetail']/form/fieldset/div[@class='price']/span[@class='productPrice']/\
        span[@class='newPrice']/text()").extract()
    if not price_parts:
        logging.error("ERROR! NO PRICE! %s %s" % (page_url, product_name))
        return

    loader = ProductLoader(item=Product(), response=response)
    for field, value in (
        ('identifier', product_name),
        ('name', product_name),
        ('url', page_url),
        ('price', "".join(price_parts)),
    ):
        loader.add_value(field, value)
    yield loader.load_item()
def parse_product(self, response):
    """Yield products whose on-page part number matches the requested one.

    Candidate products are the grandparents of every ``ProductDetail`` link.
    A candidate is emitted only when the text of its first ``<nobr>`` equals
    ``response.meta['mfrgid']`` (both stripped and lower-cased).  The SKU
    comes from ``response.meta['sku']``.

    Changes from the original: the URL is added to the loader once instead of
    twice; a candidate without a ``<nobr>`` or without a second ``<font>`` is
    skipped instead of raising; the part-number check runs before any other
    field is read; the unused XPath selector is no longer built.  Non-HTML
    responses are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return

    soup = BeautifulSoup(response.body)
    detail_links = soup.findAll('a', href=re.compile('ProductDetail'))
    # A set removes containers reached through more than one link.
    products = {link.parent.parent for link in detail_links}

    for product in products:
        part_cell = product.find('nobr')
        site_mfrgid = part_cell.text if part_cell is not None else None
        if not site_mfrgid:
            continue
        mfrgid = response.meta['mfrgid'].strip().lower()
        if site_mfrgid.strip().lower() != mfrgid:
            continue

        fonts = product.findAll('font')
        if len(fonts) < 2:
            # The name is read from the second <font>; without it there is no item.
            continue
        name = fonts[1].text
        price = product.find('nobr', text=re.compile('\$'))

        link = product.find('a', href=re.compile('ProductDetail'))
        if link:
            url = urljoin_rfc(get_base_url(response), link['href'])
        else:
            url = response.url

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        product_loader.add_value('url', url)
        product_loader.add_value('sku', response.meta['sku'])
        yield product_loader.load_item()
def parse(self, response):
    """Follow category links and yield a ``Product`` per product-group row.

    Every link inside ``dvWrapControl732`` is requested with this same
    callback.  Rows (regular and alternating) provide the name as the
    tag-stripped ``<font>`` markup and the price as the first text node, cut
    at the first comma.  Rows lacking a name or price are reported on stdout
    and skipped.  The item URL is always the page URL.
    """
    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for href in selector.select("//div[@id='dvWrapControl732']//a/@href").extract():
        yield Request(urljoin_rfc(base_url, href), callback=self.parse)

    rows = selector.select("//table[@class='ProductGroup']/tr[@class='ProductGroupItem'] |\
        //table[@class='ProductGroup']/tr[@class='ProductGroupAlternatingItem']")
    for row in rows:
        name_markup = row.select("td[@id='tdProductGroupDisplayDescription']/div/font | \
            td[@id='tdProductGroupDisplayAltDescription']/div/font").extract()
        if not name_markup:
            print("%s - ERROR! NO NAME!" % response.url)
            continue
        product_name = replace_tags(name_markup[0])

        price_texts = row.select("td[@id='tdProductGroupDisplayPricing']//text() | \
            td[@id='tdProductGroupDisplayAltPricing']//text()").extract()
        if not price_texts:
            print("%s - ERROR! NO PRICE!" % response.url)
            continue
        price_text = price_texts[0].split(',')[0]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', unicode(product_name).encode('ascii', 'ignore'))
        loader.add_value('name', product_name)
        loader.add_value('url', response.url)
        loader.add_value('price', price_text)
        yield loader.load_item()
def parse(self, response):
    """Crawl categories and pagination, and yield a ``Product`` per gallery tile.

    * When the first navigation heading reads "by category", every category
      link (up to and including the entry with class ``last``) is requested
      with ``self.pager_url_arguments`` appended.
    * Every link inside the ``pagingTools`` span is requested.
    * Every ``galleryProduct`` tile yields an item; tiles lacking a name, link
      or price are reported on stdout and skipped.

    Fixes: the pagination XPath tested ``@clas`` (typo), so it never matched
    and no pagination link was ever followed; it now tests ``@class``.
    Category entries without a link are skipped instead of raising
    ``IndexError``.
    """
    URL_BASE = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    # Categories.
    categories_title = hxs.select(
        '//div[@id="secondNav"]/div[@class="catList"]/dl/dt[1]/text()'
    ).extract()
    if categories_title and categories_title[0].strip().lower() == "by category":
        categories = hxs.select('//div[@id="secondNav"]/div[@class="catList"]/dl/dd')
        for link in categories:
            hrefs = link.select(".//a/@href").extract()
            if hrefs:
                yield Request(urljoin_rfc(URL_BASE, hrefs[0]) + self.pager_url_arguments)
            link_class = link.select("@class").extract()
            if link_class and link_class[0] == "last":
                break

    # Pagination.
    for page_href in hxs.select('//span[@class="pagingTools"]/a/@href').extract():
        yield Request(urljoin_rfc(URL_BASE, page_href))

    # Products.
    for product_el in hxs.select('//li[@class="galleryProduct"]'):
        name = product_el.select('div[@class="galleryContainer"]/a/span/text()').extract()
        if not name:
            print("ERROR!! NO NAME!! %s" % response.url)
            continue
        # NOTE(review): split() returns a list, so the loader receives every
        # piece around the suffix; kept as-is - confirm this is intended.
        name = name[0].split(" - Home Delivered")

        url = product_el.select('div[@class="galleryContainer"]/a/@href').extract()
        if not url:
            print("ERROR!! NO URL!! %s" % response.url)
            continue
        url = urljoin_rfc(URL_BASE, url[0])

        price = product_el.select(
            'div[@class="galleryContainer"]/div[@class="productInfo"]/\
            div[@class="productPriceBlock"]/p/span[@class="nowPrice"]/strong/text() |\
            div[@class="galleryContainer"]/div[@class="productInfo"]/\
            div[@class="productPriceBlock"]/p/span[@class="onlyPrice"]/text()'
        ).extract()
        if not price:
            print("ERROR!! NO PRICE!! %s" % response.url)
            continue

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', url)
        loader.add_value('name', name)
        loader.add_value('price', price[0])
        loader.add_value('sku', '')
        yield loader.load_item()
def parse_mattel_product(self, response):
    """Yield one ``Product`` per option listed in the page's ``skus`` script blob.

    The ``skus: {...}`` JavaScript object embedded in the page body is turned
    into JSON by swapping single quotes for double quotes and quoting the
    keys named in ``mattel_fields``; value lines whose key is not one of
    those fields are dropped.  Each option becomes an item whose identifier
    is ``<productId>_<option id>``; an empty option price falls back to the
    page-level promotion price; colour and size are appended to the name
    unless they read "one color" / "one size" / "one style".  Items priced
    above 35 get a shipping cost of 0.

    Pages without a product name are skipped silently.  A missing page
    price, item number or ``skus`` blob, and a blob that still is not valid
    JSON, are now logged and skipped instead of raising.  Options lacking a
    ``price``, ``color`` or ``size`` key no longer raise ``AttributeError``.
    """
    import logging  # local import: the file's import block is not visible here

    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    product_name = hxs.select('//div[@class="product-details"]/h2/text()').extract()
    if not product_name:
        return
    product_name = product_name[0]

    product_price = hxs.select(
        '//*[@id="product-information"]//span[@class="promotion-now"]/text()'
    ).extract()
    if not product_price:
        logging.error("ERROR! NO PRICE! %s" % response.url)
        return
    product_price = product_price[0]

    product_identifier = response.url.partition('productId=')[2]
    brand = 'Mattel'
    image_url = hxs.select('//*[@id="mainProductImage"]/@src').extract()
    category = response.meta.get('category')

    sku = hxs.select('//span[@class="item-number"]/text()').extract()
    if not sku:
        logging.error("ERROR! NO SKU! %s" % response.url)
        return
    sku = sku[0].replace('Item #: ', '')

    skus_match = re.search(r'skus: {\s+(.*)},\s+availableSizes', response.body,
                           re.DOTALL | re.IGNORECASE)
    if skus_match is None:
        logging.error("ERROR! NO OPTIONS! %s" % response.url)
        return
    raw_options = '{' + skus_match.groups()[0].strip() + '}'
    raw_options = raw_options.replace("'", '"')

    # Quote the known keys so the JavaScript object literal parses as JSON.
    json_lines = []
    for line in raw_options.split('\n'):
        if ': "' in line:
            for field in mattel_fields:
                if field + ':' in line:
                    json_lines.append(line.replace(field, '"' + field + '"'))
                    break
        else:
            json_lines.append(line)

    try:
        options = json.loads(''.join(json_lines))
    except ValueError:
        logging.error("ERROR! BAD OPTIONS DATA! %s" % response.url)
        return

    for option_id, option in options.items():
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('identifier', product_identifier + '_' + option_id)

        price = (option.get('price') or '').strip()
        if price == '':
            price = product_price
        price = extract_price(price)
        loader.add_value('price', price)

        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)

        name = product_name
        color = option.get('color')
        if color is not None and color.lower().strip() != 'one color':
            name += ', ' + color
        size = option.get('size')
        if size is not None and size.lower().strip() not in ['one size', 'one style']:
            name += ', ' + size
        loader.add_value('name', name)

        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('category', category)
        if price > 35:
            loader.add_value('shipping_cost', 0)
        yield loader.load_item()
def parse(self, response):
    """Build and return a single Product from a product detail page.

    Returns ``None`` (after logging) when the page exposes no product id.
    A missing name is logged but does not stop the item from being built.
    Promotion image alt texts, when present, are stored comma-joined in
    the item's metadata.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    product_ids = hxs.select(
        '//div[@id="col2"]//input[contains(@id,"product_minidetail_")]/@value'
    ).extract()
    if not product_ids:
        log.msg('### No product ID at ' + response.url, level=log.INFO)
        return
    # The same value serves as both identifier and sku.
    loader.add_value('identifier', product_ids[0])
    loader.add_value('sku', product_ids[0])

    titles = hxs.select(
        '//div[@id="col2"]//h1[@class="titre"]/text()').extract()
    if titles:
        loader.add_value('name', titles[0].strip())
    else:
        log.msg('### No name at ' + response.url, level=log.INFO)

    # price (stays 0 when the page shows none)
    price = 0
    price_texts = hxs.select(
        '//div[@id="col2"]//span[@class="prix"]/text()').extract()
    if price_texts:
        price = extract_price(price_texts[0].strip())
        loader.add_value('price', price)

    # stock: in stock only when the "Add to basket" label is present
    basket_label = hxs.select(
        '//div[@id="col2"]//span[text()="Add to basket"]')
    loader.add_value('stock', 1 if basket_label else 0)

    # image_url
    images = hxs.select(
        '//div[@id="col1"]//div[contains(@class,"product")]/img/@src'
    ).extract()
    if images:
        loader.add_value('image_url',
                         urljoin(response.url, images[0].strip()))

    # brand
    brands = hxs.select(
        '//div[@id="col2"]//td[@class="catName"]/a/text()').extract()
    if brands:
        loader.add_value('brand', brands[0].upper())

    # category: every breadcrumb entry is added
    for crumb in hxs.select('//div[@id="breadcrumb"]/h2/a/text()').extract():
        loader.add_value('category', crumb)

    # shipping_cost
    if price <= 26:
        loader.add_value('shipping_cost', 3.6)

    product = loader.load_item()

    metadata = YMeta()
    promo_alts = hxs.select(
        '//div[@id="col2"]//div[@class="promo"]/img/@alt').extract()
    if promo_alts:
        metadata['promotions'] = ','.join(
            alt.replace('picto-', '') for alt in promo_alts)
    if metadata:
        product['metadata'] = metadata
    return product
def parse_product(self, response):
    """Load a Product from a detail page and request its reviews page.

    Several page layouts are supported by trying a list of XPaths in
    order for name, category, sku, price, stock and image. The loaded
    item is not yielded directly: it is attached to the ``meta`` of a
    ``Request`` for the reviews URL, handled by ``self.parse_reviews``.

    Reads ``response.meta['brand']`` (required) and, as a category
    fallback, ``response.meta['category']``.
    """

    def first_extracted(xpaths):
        # Return the extracted values of the first XPath that matches.
        for xpath in xpaths:
            values = response.xpath(xpath).extract()
            if values:
                return values
        return []

    def first_selected(xpaths):
        # Return the selector list of the first XPath that matches.
        for xpath in xpaths:
            nodes = response.xpath(xpath)
            if nodes:
                return nodes
        return []

    loader = ProductLoader(item=Product(), response=response)

    product_id = response.xpath(
        '//form[@name="SelectProductForm"]/input[@name="product_id"]/@value'
    ).extract()
    # Fall back to the last URL path segment when the form is missing.
    identifier = product_id[0] if product_id else response.url.split('/')[-1]
    loader.add_value('identifier', identifier.split('?')[0])

    name = []
    for xpath in ('//h1[@itemprop="name"]//text()',
                  '//h1[contains(@class,"product-name")]//text()'):
        stripped = [text.strip() for text in response.xpath(xpath).extract()]
        name = [text for text in stripped if text]
        if name:
            break
    if name:
        loader.add_value('name', name[0].strip())

    loader.add_value('brand', response.meta['brand'])

    categories = first_extracted((
        '//div[@itemprop="breadcrumb"]//span[@itemprop="title"]/text()',
        '//div[@itemprop="breadcrumb"]//span[@itemprop="name"]/text()',
    ))
    if categories:
        loader.add_value('category', categories[0])
    elif 'category' in response.meta:
        loader.add_value('category', response.meta['category'])

    sku = first_extracted((
        '//table[@class="SpecTable"]//td[text()="Model No.:"]/following-sibling::td/text()',
        '//div[@class="specs-table"]//td[text()="Model No.:"]/following-sibling::td/text()',
        '//div[contains(@class, "specs-table")]//td[text()="Model No.:"]/following-sibling::td/text()',
    ))
    if sku:
        loader.add_value('sku', sku[0].strip())

    loader.add_value('url', response.url)

    price = first_extracted((
        '//div[@id="WM_PRICE"]//*[contains(@class,"camelPrice")]/span/text()',
        '//div[@class="onlinePriceMP"]//*[contains(@class,"camelPrice")]/span/text()',
        '//div[@itemprop="offers"]/div[contains(@class, "product-price")]//*[@itemprop="price"][1]//text()',
        '//div[@class="col5"]//div[contains(@class,"product-buying-table-row")][1]//div[contains(@class,"price-display")][1]//text()',
        '//*[@itemprop="price"]//text()',
    ))
    if not price:
        attr_price = response.xpath('//@data-product-price').extract_first()
        price = [attr_price] if attr_price else []
    # Price fragments are concatenated; no price at all is treated as 0.
    price = ''.join(price).strip() if price else '0'
    loader.add_value('price', extract_price(price))

    out_stock = first_selected((
        '//div[@id="OnlineStat" and @class="OutOfStock"]',
        '//p[@class="price-oos" and text()="Out of stock"]',
        '//div[@id="OnlineStat" and @class="OnlineNotSold"]',
    ))
    loader.add_value('stock', 0 if out_stock else 1)

    image = first_extracted((
        '//div[@class="LargeItemPhoto215"]//img/@src',
        '//div[contains(@class,"product-images")][1]//img/@src',
    ))
    if image:
        loader.add_value('image_url', image[0])

    product = loader.load_item()

    metadata = HamiltonMeta()
    metadata['brand'] = product['brand'].strip().lower()
    metadata['reviews'] = []
    product['metadata'] = metadata

    # The reviews URL is keyed by the last path segment up to its first dot.
    productid = response.url.split('/')[-1].split('.')[0]
    url = 'https://www.walmart.com/reviews/product/%s?page=1' % productid
    yield Request(url,
                  meta={
                      'product': product,
                      'page': 1,
                      'productid': productid
                  },
                  callback=self.parse_reviews)
def parse_product(self, response):
    """Expand colour/size options via form posts, then yield the Product.

    While a colour (then a size) dropdown still has its placeholder
    option selected, the page form is re-posted once per real option with
    this method as callback, and nothing else is yielded. Once every
    dropdown has a concrete selection (or none exists), a single Product
    is loaded and yielded.
    """
    hxs = HtmlXPathSelector(response)

    def option_request(overrides, meta):
        # Re-post the page form with every <input> value plus `overrides`.
        formdata = {}
        for field in hxs.select('//form[@id="frmMain"]//input'):
            field_name = ''.join(field.select('@name').extract())
            formdata[field_name] = ''.join(field.select('@value').extract())
        formdata.update(overrides)
        form_url = hxs.select('//form[@id="frmMain"]/@action').extract()[0]
        return formdata, FormRequest(form_url,
                                     dont_filter=True,
                                     method='POST',
                                     formdata=formdata,
                                     callback=self.parse_product,
                                     meta=meta)

    category = hxs.select('//div[@id="bCrumb"]/span/a/text()').extract()
    category = category[-1] if category else response.meta.get(
        'category', '')

    colours = hxs.select(
        '//select[@id="cphMain_ddlColour"]/option[@value!="0"]/@value'
    ).extract()
    no_option_selected = hxs.select(
        '//select[@id="cphMain_ddlColour"]/option[@value="0" and @selected]/@value'
    )
    if colours and no_option_selected:
        for colour in colours:
            _, request = option_request(
                {'ctl00$cphMain$ddlColour': colour},
                {'category': category, 'colour': colour})
            yield request
        return

    sizes = hxs.select(
        '//select[@id="cphMain_ddlSize"]/option[@value!="0"]/@value'
    ).extract()
    no_option_selected = hxs.select(
        '//select[@id="cphMain_ddlSize"]/option[@value="0" and @selected]')
    if sizes and no_option_selected:
        for size in sizes:
            overrides = {'ctl00$cphMain$ddlSize': size}
            # Keep the colour chosen by the previous post, if any.
            colour = response.meta.get('colour', None)
            if colour:
                overrides['ctl00$cphMain$ddlColour'] = colour
            meta = {'category': category}
            formdata, request = option_request(overrides, meta)
            meta['formdata'] = formdata
            yield request
        return

    loader = ProductLoader(item=Product(), selector=hxs)
    identifier = hxs.select('//div[@class="code"]/text()').extract()[0]
    loader.add_xpath('sku', '//div[@class="code"]/text()')
    loader.add_value('url', response.url)

    product_name = hxs.select(
        '//div[@class="title"]//h1/text()').extract()[0]
    colour = hxs.select(
        '//span[@id="cphMain_lblSelectedColour"]/b/text()').extract()
    if colour:
        product_name = product_name + ' - ' + colour[0].strip()

    loader.add_value('category', category)
    img = hxs.select('//img[@id="cphMain_imgThumb"]/@src').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))
    loader.add_xpath('brand', '//span[@class="brand"]/text()')
    loader.add_value('stock', '1')

    price = hxs.select('//span[@class="price"]/text()').extract()
    option_price = None
    if colours or sizes:
        colour = hxs.select(
            '//select[@id="cphMain_ddlColour"]/option[@selected and @value!="0"]'
        )
        if colour:
            colour_id = colour.select('@value').extract()[0]
            colour_desc = colour.select('text()').extract()[0]
            identifier = identifier + '-' + colour_id
            product_name = product_name + ' - ' + colour_desc.split(
                u' - \xa3')[0].strip()
            # The option label may carry its own price after a pound sign.
            option_price = re.search(r"\xa3(\d+.\d+)", colour_desc)
        size = hxs.select(
            '//select[@id="cphMain_ddlSize"]/option[@selected and @value!="0"]'
        )
        if size:
            size_id = size.select('@value').extract()[0]
            size_desc = size.select('text()').extract()[0].strip()
            identifier = identifier + '-' + size_id
            product_name = product_name + ' - ' + size_desc

    loader.add_value('identifier', identifier)
    loader.add_value('name', product_name.replace(' - Collect Only', ''))
    if option_price:
        loader.add_value('price', option_price.group(1))
    else:
        loader.add_value('price', price)

    # Shipping depends on the price, so it must be decided AFTER the price
    # is in the loader. It used to be checked before, when the loader held
    # no price yet, so the threshold never applied.
    # NOTE(review): assumes the loader's price output is numeric -- confirm.
    loaded_price = loader.get_output_value('price')
    if not loaded_price or loaded_price < 50.00:
        loader.add_value('shipping_cost', '4.95')
    else:
        loader.add_value('shipping_cost', '0')

    yield loader.load_item()
def parse_product(self, response):
    """Yield one Product read from the page's ``itemprop`` microdata.

    Prints an error and yields nothing when the name or the price is
    missing. The second-to-last URL path segment is used as both sku and
    identifier.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url

    names = hxs.select('//h1[@itemprop="name"]/text()').extract()
    if not names:
        print("ERROR!! NO NAME!! %s" % url)
        return

    prices = hxs.select('//span[@itemprop="price"]/text()').extract()
    if not prices:
        print("ERROR!! NO PRICE!! %s" % url)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', url)
    loader.add_value('name', names[0])
    # When several prices are listed, the last one is kept.
    loader.add_value('price', prices[-1])
    slug = response.url.split('/')[-2]
    loader.add_value('sku', slug)
    loader.add_value('identifier', slug)
    yield loader.load_item()
def parse_product(response):
    """Yield one Product built from a product detail page.

    The value of the ``sku`` form input is used as both identifier and
    sku. Raises ``IndexError`` if the sku input, the name heading or the
    price element is absent.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    images = hxs.select('//img[@itemprop="image"]/@data-frz-src').extract()
    product_identifier = hxs.select(
        '//input[@name="sku"]/@value').extract()[0]
    product_name = hxs.select(
        '//h1[@itemprop="name"]/text()').extract()[0].strip()
    price_text = hxs.select(
        '//div[@id="auto_show_prime_price"]/strong/span[contains(@class, "actualPrice")]/text()'
    ).extract()[0]
    price = extract_price(price_text)
    # The first breadcrumb link is dropped.
    category = hxs.select(
        '//ul[@class="b-breadcrumb"]//a/text()').extract()[1:]
    brands = hxs.select('//span[@itemprop="brand"]/text()').extract()
    brand = brands[0] if brands else ''

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    if images:
        product_loader.add_value('image_url', images[0])
    product_loader.add_value('sku', product_identifier)
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand)

    yield product_loader.load_item()
def parse_product(self, response):
    """Yield a LEGO Product with metadata, or record a missing price.

    Pages whose title starts with "LEGOLAND" are skipped, as are pages
    without a price element. When ``self.parse_price`` returns a falsy
    price, a message is appended to ``self.errors`` instead of yielding.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    title_parts = hxs.select(
        '//div[@id="primary_block"]/h1/descendant-or-self::text()'
    ).extract()
    name = "".join(part.strip() for part in title_parts)
    if name.startswith("LEGOLAND"):
        return

    crumbs = hxs.select('//div[@class="breadcrumb "]/a/text()').extract()
    category = crumbs[-1] if crumbs else ""

    pid = hxs.select('//input[@name="id_product"]/@value').extract()
    sku = hxs.select(
        '//label[@for="product_reference"]/following-sibling::span[1]/text()'
    ).extract()
    if not sku:
        # No reference shown: fall back to the product id list.
        sku = pid
    elif sku[0].endswith("-lego"):
        # Drop the 5-character "-lego" suffix.
        sku = sku[-1][0:-5]

    try:
        price = self.parse_price(
            hxs.select('//p[@class="our_price_display"]/strong/span/text()'
                       ).pop().extract())
    except IndexError:
        return

    stock = hxs.select('//p[@id="pQuantityAvailable"]/span[@class="yes"]')

    if not price:
        self.errors.append("No price set for url: '%s'" %
                           urljoin(base_url, response.url))
        return

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url',
                     '//div[@id="image-block"]/span/img/@src',
                     Compose(lambda v: urljoin(base_url, v[0])))
    loader.add_value('price', price)
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('identifier', pid)
    loader.add_value('brand', 'LEGO')
    if not stock:
        loader.add_value('stock', 0)
    yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Yield a LEGO Product with metadata from a mikifun.cz detail page.

    The price is read from the fourth paragraph of the detail column,
    falling back to the third. Stock is set to 0 unless the availability
    image's alt text is 'Skladem', and a shipping cost of 100 is added
    for prices up to 3000.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    name = hxs.select('//*[@id="content"]//h3/text()').extract()[0].strip()

    price_texts = hxs.select(
        '//*[@id="content"]/div[1]/div[2]/p[4]/text()').extract()
    if not price_texts:
        price_texts = hxs.select(
            '//*[@id="content"]/div[1]/div[2]/p[3]/text()').extract()
    # Strip the currency suffix and spaces; use a dot as decimal separator.
    cleaned = price_texts[0].strip().replace(u' K\u010d', '')
    cleaned = cleaned.replace(',', '.').replace(' ', '')
    price = extract_price(cleaned)

    sku = hxs.select(
        "//p[contains(text(),'Objednac') and contains(text(),'slo:')]/following::p[1]/text()"
    ).extract()[0]
    if sku.startswith('22'):
        sku = sku[2:]

    form_action = hxs.select(
        '//div[@class="detail-koupit"]/form/@action').extract()[0]
    identifier = form_action.partition('volba=')[2]

    availability = hxs.select(
        '//*[@id="content"]/div[1]/div[2]/div[1]/img/@alt').extract()[0]
    availability = availability.strip()
    category = hxs.select('//*[@id="content"]/h2/text()').extract()
    image_src = hxs.select(
        '//div[@id="content"]//a[@class="highslide"]/img/@src').extract()[0]
    image_url = 'http://www.mikifun.cz' + image_src

    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('image_url', image_url)
    loader.add_value('price', price)
    loader.add_value('sku', sku)
    loader.add_value('identifier', identifier)
    loader.add_value('brand', 'LEGO')
    if category:
        loader.add_value('category', category[0])
    if availability != 'Skladem':
        loader.add_value('stock', 0)
    if int(price) <= 3000:
        loader.add_value('shipping_cost', 100)
    yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Return the loaded Product only when the page's Mfg Part matches.

    Reads ``sku``, ``mfrgid`` and ``name`` from ``response.meta``.
    Returns ``None`` for non-HTML responses, pages without a price or a
    name, and pages whose manufacturer part number neither equals
    ``mfrgid`` nor appears as a space-separated token of the searched
    name.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    product_loader.add_xpath(
        'price',
        '//div[@class="club"]/span[@itemprop="Price"]/text()',
        re=r'.*\$(.*[0-9])')
    product_loader.add_value('url', response.url)
    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('identifier', response.meta['sku'].lower())

    if not product_loader.get_output_value('price'):
        return

    mfrgid = response.meta['mfrgid']
    if not product_loader.get_output_value('name'):
        return

    spec_texts = hxs.select(
        u'//p[@class="specs" and child::span[contains(text(),"Mfg Part")]]/text()'
    ).extract()
    # The part number is the second text node of the specs paragraph.
    # Strip it once so both comparisons below see the same value; it used
    # to be stripped for the equality test only, so surrounding whitespace
    # defeated the name-token test.
    site_mfrgid = spec_texts[1].strip() if len(spec_texts) >= 2 else None
    name_tokens = response.meta['name'].split(' ')
    if site_mfrgid and (mfrgid == site_mfrgid or site_mfrgid in name_tokens):
        return product_loader.load_item()
def parse_product(self, response):
    """Yield one Product, identified by the "rid" value in the page body.

    Yields nothing when no "rid" is found. The same value is stored as
    identifier and sku. Reads ``response.meta['categories']``.
    """
    base_url = get_base_url(response)
    images = response.xpath(
        '//div[@id="productImages"]//img[@itemprop="image"]/@src').extract()
    product_loader = ProductLoader(item=Product(), response=response)

    rids = re.findall('"rid":(.*)};', response.body)
    if not rids:
        return
    identifier = rids[0]

    product_loader.add_value('identifier', identifier)
    product_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    if images:
        # NOTE(review): presumably the src is protocol-relative -- confirm.
        product_loader.add_value('image_url', 'http:' + images[0])
    product_loader.add_value('sku', identifier)

    price = response.xpath('//span[@id="productPrice"]/span/text()').extract()
    if not price:
        price = response.xpath(
            '//meta[@property="og:price:amount"]/@content').extract()
    product_loader.add_value('price', price)

    product_loader.add_value('url', response.url)
    product_loader.add_value('category', response.meta['categories'])
    product_loader.add_xpath('brand', '//meta[@property="og:brand"]/@content')

    sold_out = response.xpath(
        '//div[@id="productOptions"]/span[@class="sold-out"]')
    if sold_out:
        product_loader.add_value('stock', 0)

    yield product_loader.load_item()
def parse_product_list(self, response):
    """Follow sub-category links, or yield one Product per option combo.

    When the page lists sub-categories, a ``Request`` back to this
    callback is yielded for each ``.htm``/``.html`` link. Otherwise the
    page is treated as a product: every ``<select>`` under ``eyOptions``
    is read as a group of ``label(price)`` options, and one Product is
    yielded for each ``(name, price)`` pair produced by
    ``multiply(opt_groups)``, with the option name appended to the
    product name and the option price added to the product price.
    """
    hxs = HtmlXPathSelector(response)
    cats = hxs.select(
        u'//div[@id="RightColumn"]/table/tr/td/center/div[@class="contentsName"]/a/@href'
    ).extract()
    if cats:
        for url in cats:
            if url.split('.')[-1].lower() not in ('htm', 'html'):
                # Contains links to PDFs as well
                continue
            url = urljoin_rfc(get_base_url(response), url)
            yield Request(url, callback=self.parse_product_list)
        return

    def fix_options(what, o):
        # `o` is an option label split on "(": [label] or [label, price].
        # Only a missing price part is expected here, so catch just that
        # instead of the former bare `except:`.
        try:
            return (what + ':' + o[0], o[1].replace(',', ''))
        except IndexError:
            return (what + ':' + o[0], '0')

    opt_groups = []
    for option in hxs.select(u'//div[@class="eyOptions"]//select'):
        what = option.select(u'./@name').extract()[0]
        opt_list = option.select(
            u'./option[@value!="PleaseSelect" and @value!="Please Select"]/text()'
        ).extract()
        opt_list = [o.replace(')', '').split('(') for o in opt_list]
        opt_groups.append([fix_options(what, o) for o in opt_list])

    for opt_name, opt_price in multiply(opt_groups):
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1/text()')
        # A sale price, in either of its two markups, wins over the
        # regular price.
        if hxs.select(u'//div[@class="bigSalePrice"]'):
            product_loader.add_xpath(
                'price', u'//div[@class="bigSalePrice"]/span/font/text()')
        elif hxs.select(u'//span[@class="bigSalePrice"]'):
            product_loader.add_xpath(
                'price', u'//span[@class="bigSalePrice"]/font/text()')
        else:
            product_loader.add_xpath(
                'price', u'//div[@class="itemRegPrice"]/span/font/text()')
        product_loader.add_xpath(
            'sku',
            u'normalize-space(substring-after(//div[@class="code"]/text(),":"))'
        )
        product_loader.add_xpath(
            'category', u'//div[@class="eyBreadcrumbs"]/a[2]/text()')
        product_loader.add_xpath('image_url',
                                 u'//img[@id="SwitchThisImage"]/@src')
        product_loader.add_value('shipping_cost', '')

        product = product_loader.load_item()
        product['name'] = (product['name'] + ' ' + opt_name).strip()
        product['price'] = product['price'] + Decimal(opt_price)
        yield product
def parse_product(self, response):
    """Yield Products for a page with custom options, variants, or neither.

    Three mutually exclusive outcomes:
    * custom-option ``<option>`` elements exist: one Product per option
      with a non-empty value, priced at base price plus option price;
    * a ``Product.Config`` script exists: one Product per configured
      product id, with labels and prices accumulated over its attributes;
    * otherwise: the single base Product.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_identifier = hxs.select(
        '//input[@name="product"]/@value')[0].extract()
    sku = ''
    product_name = hxs.select(
        '//div[@class="product-name"]/span/text()')[0].extract().strip()

    price_texts = response.xpath(
        '//p[@class="special-price"]/span[@class="price"]/text()').extract()
    if not price_texts:
        price_texts = response.xpath(
            '//span[@class="regular-price"]/span[@class="price"]/text()'
        ).extract()
    base_price = extract_price(price_texts[0]) if price_texts else 0

    images = hxs.select('//img[@id="image-main"]/@src').extract()
    image_url = urljoin_rfc(base_url, images[0]) if images else ''

    crumbs = hxs.select('//span[@typeof="v:Breadcrumb"]/a/text()').extract()
    category = crumbs[-1] if crumbs else ''

    brand = hxs.select(
        '//ul[@id="productDetailsList"]/li[contains(text(),"Manufactured")]/text()'
    ).re('Manufactured by: (.*)')
    options = hxs.select(
        '//select[@class=" required-entry product-custom-option"]/option')
    data_config = response.xpath('//script/text()').re(
        'new Product.Config\((.+)\);')

    if options:
        for option in options:
            values = option.select('./@value').extract()
            # Placeholder entries carry no value; skip them.
            if not values or values[0] == '':
                continue
            label = option.select('./text()').extract()[0]
            label = label.split(u'+\xa3')[0].strip()
            surcharge = extract_price(option.select('@price').extract()[0])

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('identifier',
                             product_identifier + "-" + values[0])
            loader.add_value('sku', product_identifier)
            loader.add_value('price', base_price + surcharge)
            loader.add_value('brand', '')
            loader.add_value('url', response.url)
            loader.add_value('name', product_name + " " + label)
            loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            if not loader.get_output_value('price'):
                loader.add_value('stock', 0)
            yield loader.load_item()
        return

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('identifier', product_identifier)
    loader.add_value('sku', product_identifier)
    loader.add_value('url', response.url)
    loader.add_value('name', product_name)
    loader.add_value('image_url', image_url)
    loader.add_value('brand', brand)
    loader.add_value('category', category)
    loader.add_value('price', base_price)
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)
    item = loader.load_item()

    if not data_config:
        yield item
        return

    attributes = json.loads(data_config[0])['attributes']
    # Map each product id to the label and surcharge accumulated over
    # every attribute option that lists it.
    variants = dict()
    for attribute in sorted(attributes):
        for option in attributes[attribute]['options']:
            for product in option['products']:
                if not variants.get(product):
                    variants[product] = dict()
                    variants[product]['label'] = option['label']
                    variants[product]['price'] = extract_price(
                        option['price'])
                else:
                    variants[product]['label'] += ' ' + option['label']
                    variants[product]['price'] += extract_price(
                        option['price'])

    for product in variants:
        variant = variants[product]
        loader = ProductLoader(item=Product(), response=response)
        # Start from the base item, then override the per-variant fields.
        loader.add_value(None, item)
        loader.add_value('name', variant['label'])
        loader.replace_value('identifier',
                             product_identifier + '-' + product)
        loader.replace_value('sku', product)
        loader.replace_value('price', base_price + variant['price'])
        yield loader.load_item()
def parse_product(self, response):
    """Yield the listing's Product, or one Product per item variation.

    Search-result pages (those with a ``ResultSetItems`` block) are
    delegated to ``self.parse``. Listings without a title yield nothing.
    When a variation map can be read from the page, one Product per
    variation is yielded with its own name, identifier and price;
    otherwise the single base Product is yielded.

    Raises ``AttributeError`` when no price can be found by any of the
    four lookups (two XPaths, then two body regexes).
    """
    hxs = HtmlXPathSelector(response)

    if hxs.select('//div[@id="ResultSetItems"]'):
        for x in self.parse(response):
            yield x
        return

    first_name = ' '.join(
        hxs.select('//*[@id="itemTitle"]/text()').extract()).strip()
    if not first_name:
        return

    identifier = response.url.split('?')[0].split('/')[-1]

    # The bare `except:` blocks that used to wrap these lookups are
    # replaced by explicit emptiness checks.
    crumbs = hxs.select('//*[@id="vi-VR-brumb-lnkLst"]//a/text()').extract()
    category = crumbs[-1] if crumbs else ''

    seller_id = ''.join(
        hxs.select('.//*[@class="si-content"]'
                   '//a/*[@class="mbg-nw"]/text()').extract())

    brands = hxs.select(
        '//*[@class="attrLabels" and contains(text(), "Brand")]'
        '/following-sibling::*/text()').extract()
    brand = brands[0].strip() if brands else ''

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('name', first_name)
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('category', category)
    product_loader.add_value('dealer', 'eBay - ' + seller_id)
    product_loader.add_value('brand', brand)
    product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
    product_loader.add_value('url', response.url)

    # Price: visible price, then sale price, then the buy-it-now and bid
    # prices embedded in the page body.
    price = None
    for xpath in ('//*[@id="prcIsum"]/text()',
                  '//*[@id="mm-saleDscPrc"]/text()'):
        texts = hxs.select(xpath).extract()
        if texts:
            price = texts[0].strip()
            break
    if price is None:
        found = re.search(r'"binPrice":".*([\d\.,]+)",', response.body)
        if found is None:
            found = re.search(r'"bidPrice":".*([\d\.,]+)",', response.body)
        price = found.groups()[0]
    product_loader.add_value('price', extract_price(price))

    # shipping cost (best effort: any failure leaves the field unset)
    shipping_texts = hxs.select(
        '//*[@id="shippingSection"]//td/div/text()').extract()
    if shipping_texts and shipping_texts[0]:
        shipping_cost = shipping_texts[0]
        try:
            if 'free' in shipping_cost.lower():
                product_loader.add_value('shipping_cost', 0)
            else:
                product_loader.add_value('shipping_cost',
                                         extract_price(shipping_cost))
        except Exception:
            pass

    product_ = product_loader.load_item()

    options_variations = []
    # NOTE(review): every double quote is removed from the body here, yet
    # the pattern below still expects quoted keys -- confirm this can match.
    sel = HtmlXPathSelector(text=response.body.replace('"', ''))
    var_maps = sel.select('//*/text()').re(r'("menuItemMap":{.*}.*),'
                                           '"unavailableVariationIds"')
    if var_maps:
        json_var_map = unicode(var_maps[0])
        variations = json.loads(
            '{' + re.sub(r',"unavailableVariationIds".*', '', json_var_map) +
            '}')
        menu_map = variations['menuItemMap']
        for key, variation in variations['itemVariationsMap'].items():
            if not variation['traitValuesMap']:
                continue
            # Resolve each trait's menu id to its display name.
            new_variation = {}
            for option, value in variation['traitValuesMap'].items():
                new_variation[option] = menu_map[str(value)]['displayName']
            options_variations.append({
                'price': variation['price'],
                'values': new_variation,
                'identifier': '%s:%s' % (identifier, key)
            })

    if not options_variations:
        yield product_
        return

    for model in options_variations:
        model_name = first_name + ' ' + ' '.join(
            opt_name.strip().lower() for opt_name in model['values'].values())
        new_product = Product(product_)
        new_product['name'] = model_name
        new_product['identifier'] = model['identifier']
        new_product['price'] = extract_price(model['price'])
        yield new_product
def parse_product(self, response):
    """Follow size/colour variant links and yield this page's Product.

    Every variant option value is requested with this method as
    callback. If the part-number paragraph is missing, the same URL is
    re-requested (up to 10 times, counted in ``response.meta['retry']``)
    and nothing else is yielded. Discontinued items are not yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    options = hxs.select(
        '//select[@id="variant-select-size"]/option[text()!="-- Please select --"]/@value'
    ).extract()
    options += hxs.select(
        '//select[@id="variant-select-colour"]/option[text()!="-- Please select --"]/@value'
    ).extract()
    for option in options:
        yield Request(urljoin_rfc(base_url, option),
                      callback=self.parse_product)

    try:
        sku = hxs.select(
            '//p[@id="brandAndPartNos"]/text()').extract()[-1].strip()
    except IndexError:
        # Only an empty result is expected here (was a bare `except:`).
        retry = int(response.meta.get('retry', 0))
        if retry < 10:
            new_meta = response.meta.copy()
            new_meta['retry'] = retry + 1
            yield Request(response.url,
                          meta=new_meta,
                          callback=self.parse_product,
                          dont_filter=True)
        return

    # A page with variants and no part number is only a variant chooser.
    if not (sku or not options):
        return

    # Does not include discontinued items
    if hxs.select(
            '//p[contains(@class, "stock")]/span[@class="discontinued"]'):
        return

    product_id = hxs.select('//input[@name="productId"]/@value').extract()[0]
    name = hxs.select('//h1[@class="skuHeading"]/strong/text()').extract()[0]
    ext_name = ' '.join(
        hxs.select('//h1[@class="skuHeading"]/text()').extract()).strip()
    category = hxs.select(
        '//div[@class="breadcrumb"]/nav/p/a/text()').extract()[-1]
    image_url = hxs.select('//img[@class="productImageLarge"]/@src').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url[0])
    brand = hxs.select('//img[@class="brandImageMedium"]/@alt').extract()
    brand = brand[0].replace(' logo', '') if brand else ''

    # Prefix the brand unless it, or any word of the name that is part of
    # the brand, already appears in the product name.
    product_name = name + ext_name
    brand_upper = brand.upper()
    brand_in_name = any(
        word.upper() in brand_upper
        for word in re.findall('([a-zA-Z]+)', product_name))
    if brand_upper not in product_name.upper() and not brand_in_name:
        product_name = brand + ' ' + product_name

    # The loader used to be constructed twice; once is enough.
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('category', category)
    product_loader.add_value('name', product_name)
    product_loader.add_value('url', response.url)
    product_loader.add_value('identifier', product_id)
    product_loader.add_value('brand', brand)
    product_loader.add_value('sku', sku)

    stock = hxs.select('//span[@class="inStock"]/strong/text()').extract()
    add_button = hxs.select('//input[contains(@class, "ajaxBuyButton")]')
    if stock:
        # The stock text is parsed with the same helper as prices.
        product_loader.add_value('stock', extract_price(stock[0]))
    elif add_button:
        product_loader.add_value('stock', 1)
    else:
        product_loader.add_value('stock', 0)

    price = extract_price(
        hxs.select('//strong[@id="price_"]/text()').extract()[0])
    if price < 50:
        product_loader.add_value('shipping_cost', 4.50)
    else:
        product_loader.add_value('shipping_cost', 0)
    product_loader.add_value('price', price)
    product_loader.add_value('image_url', image_url)
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield one Product (name, url, price) from a product page.

    The name doubles as the identifier. Logs an error and yields nothing
    when the name or the price cannot be found.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url

    names = hxs.select(
        "//div[@class='primary-content']//div[@id='product-summary']/h1/text()"
    ).extract()
    if not names:
        # Fall back to any <h1> on the page.
        names = hxs.select('//h1/text()').extract()
    if not names:
        logging.error("ERROR! NO NAME! %s" % url)
        return
    name = names[0]

    price_parts = hxs.select(
        "//div[@class='secondary-content']//ul[@class='pricing']/li[@class='current-price']/span/text()"
    ).extract()
    if not price_parts:
        logging.error("ERROR! NO PRICE! %s %s" % (url, name))
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', name)
    loader.add_value('name', name)
    loader.add_value('url', url)
    # The price is spread over several text nodes; concatenate them.
    loader.add_value('price', "".join(price_parts))
    yield loader.load_item()
def parse_products2(self, response):
    """Follow menu links once each and yield the products on this page.

    Links from the left pane and from the sub-level menu are requested
    with this method as callback unless already in
    ``self.visited_urls``. Products are read from three layouts: a
    "Tabular" table, a list of "prelement" divs, or a single box with a
    "prbasket" div.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    def normalise_price(text):
        # Keep the last space-separated token, drop "." and turn "," into
        # ".".
        return text.split(" ")[-1].replace(".", "").replace(",", ".")

    for menu_xpath in (
            '//table[@id="tblContent"]//td[@class="leftPane"]//a/@href',
            '//ul[@id="pMenuSublevelsl1"]//a/@href'):
        for href in hxs.select(menu_xpath).extract():
            url = urlparse.urljoin(base_url, href)
            if url not in self.visited_urls:
                yield Request(url, callback=self.parse_products2)
                self.visited_urls.add(url)

    boxes = hxs.select(
        '//div[@id="ShopContent"]//div[@class="plistAreaHeader"]/div')
    for product_box in boxes:
        tabular = product_box.select('.//table[@class="Tabular"]')
        if tabular:
            for row in tabular.select("./tbody/tr"):
                product_loader = ProductLoader(item=Product(), selector=row)
                product_loader.add_xpath('name', './td[2]/a/text()')
                product_loader.add_value(
                    'url',
                    urlparse.urljoin(
                        base_url,
                        row.select('./td[2]/a/@href').extract()[0]))
                product_loader.add_value(
                    'price',
                    normalise_price(
                        row.select('./td[4]/a/text()').extract()[0]))
                product = product_loader.load_item()
                if product['url']:
                    yield product
            continue

        elements = product_box.select('.//div[@class="prelement"]')
        if elements:
            for element in elements:
                product_loader = ProductLoader(item=Product(),
                                               selector=element)
                product_loader.add_xpath(
                    'name', './/div[@class="prmain"]/a[1]/text()')
                product_loader.add_value(
                    'url',
                    urlparse.urljoin(
                        base_url,
                        element.select('.//div[@class="prmain"]/a[1]/@href'
                                       ).extract()[0]))
                product_loader.add_value(
                    'price',
                    normalise_price(
                        element.select(
                            './/div[@class="prbasket"]/p[@class="prpri"]/text()'
                        ).extract()[0]))
                product = product_loader.load_item()
                if product['url']:
                    yield product
        elif product_box.select('.//div[@class="prbasket"]'):
            product_loader = ProductLoader(item=Product(),
                                           selector=product_box)
            product_loader.add_xpath('name', './a[1]/text()')
            product_loader.add_value(
                'url',
                urlparse.urljoin(
                    base_url,
                    product_box.select('./a[1]/@href').extract()[0]))
            product_loader.add_value(
                'price',
                normalise_price(
                    product_box.select('.//div[@class="prbasket"]/p/text()'
                                       ).extract()[0]))
            product = product_loader.load_item()
            if product['url']:
                yield product
def parse_search(self, response):
    """Follow pagination links and yield a Product per search result.

    Results without a name are skipped silently; results without a URL
    or a price are skipped with an error log. The pounds and pence
    spans are joined with a dot to form the price.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # parse pages
    for page in hxs.select("//div[@class='pagination-link']//a/@href").extract():
        yield Request(urljoin_rfc(base_url, page), callback=self.parse_search)

    # parse products
    title_link = ("div[@class='product-details']"
                  "/div[contains(@class, 'product-name')]/h3/a")
    current_price = ("div[@class='product-details']"
                     "/div[contains(@class, 'price-spacing')]"
                     "/p[@class='current-price']")
    for item in hxs.select("//li[contains(@class, 'product')]"):
        names = item.select(title_link + "/text()").extract()
        if not names:
            continue
        name = names[0]

        hrefs = item.select(title_link + "/@href").extract()
        if not hrefs:
            logging.error("ERROR! NO URL! URL: %s. NAME: %s" % (response.url, name))
            continue
        url = urljoin_rfc(base_url, hrefs[0])

        pounds = item.select(
            current_price + "/span[@class='pounds']/text()").extract()
        if not pounds:
            logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" % (response.url, name))
            continue
        price = pounds[0]
        pence = item.select(
            current_price + "/span[@class='pence']/text()").extract()
        if pence:
            price += "." + pence[0]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', name)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', price)
        yield loader.load_item()
def _parse_compound_product(self, response):
    """Yield one product per priced option of the ``item_number`` drop-down.

    Each option text is expected to look like ``"<option name> - <price>"``.
    The text is split on its LAST hyphen only, so option names that contain
    hyphens themselves no longer raise ``ValueError`` on unpacking.  An
    option with no hyphen at all is treated as a bare price with no name
    suffix.

    SKUs are read from the comma-separated "Item Numbers" text and matched
    to options by position; an option with no SKU at its position is
    yielded without one.
    """
    hxs = HtmlXPathSelector(response)
    main_name = hxs.select('//h1[@itemprop="Name"]//text()').extract()[0].strip()

    sku_text = hxs.select('//div[@id="details"]//b[contains(text(),' +
                          '"Item Numbers")]/following-sibling::text()').extract()
    skus = [sku.strip() for sku in sku_text[0].split(',')] if sku_text else []

    option_texts = hxs.select(
        '//select[@id="item_number"]/option[contains(text(), "$")]/text()').extract()
    for index, option in enumerate(option_texts):
        # rpartition returns ('', '', option) when there is no hyphen.
        option_name, _, price = option.rpartition('-')
        option_name = option_name.strip()
        full_name = main_name + ' ' + option_name if option_name else main_name

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('name', full_name)
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        if index < len(skus):
            loader.add_value('sku', skus[index])
        yield loader.load_item()
def parse_product(self, response):
    """Parse a product page into one item per variant, or a single item.

    When the page has a ``prod-multi-product-types`` container, one item is
    yielded for every ``product-type`` block inside it, named
    ``"<page name> <variant heading>"``.  Otherwise a single item is built
    from the page and yielded only if an identifier was extracted.
    The brand is always ``'Lego'``; stock defaults to 0 when no number is
    found in the availability text.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = hxs.select('normalize-space(//*[@itemprop="name"]/text())').extract()[0]
    brand = 'Lego'

    try:
        image_url = urljoin_rfc(base_url,
                                hxs.select('//div[@id="prod-media-player"]'
                                           '//img/@src').extract()[0].strip())
    except IndexError:
        image_url = ''

    # The SKU is page-level: try "Model name" first, then "Model Number".
    sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model name")]/following-sibling::dd/text()').extract()
    if not sku:
        sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model Number")]/following-sibling::dd/text()').extract()

    stock_xpath = ('//div[contains(@class, "mod-stock-availability")]'
                   '//p/strong/text()')

    def stock_level(node):
        # First run of digits in the availability text, or 0 when absent.
        digits = node.select(stock_xpath).re(r'\d+')
        return digits[0] if digits else 0

    variants_box = hxs.select('//div[@id="prod-multi-product-types"]')
    if variants_box:
        for variant in variants_box.select('.//div[@class="product-type"]'):
            opt_name = variant.select('.//h3/text()').extract()[0].strip()
            # NOTE(review): stock_xpath starts with '//', so even though it is
            # evaluated on the variant node it searches the whole document;
            # every variant presumably gets the same stock value.  Confirm
            # whether './/' was intended.
            stock = stock_level(variant)

            loader = ProductLoader(item=Product(), selector=variant)
            if sku:
                loader.add_value('sku', sku[0].strip())
            loader.add_xpath('identifier', './/div[contains(@class, "mod-product-code")]/p/text()')
            loader.add_value('name', '%s %s' % (name, opt_name))
            loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('url', response.url)
            loader.add_xpath('price', './/p[@class="price"]/strong/text()')
            loader.add_value('stock', stock)
            yield loader.load_item()
    else:
        # Try the known price locations in order; keep the first non-empty.
        price = ''
        for price_xpath in ('//ul/li/strong[@class="price"]/text()',
                            '//span[@class="now-price"]/text()',
                            '//div[@id="prod-price"]//strong/text()'):
            price = ''.join(hxs.select(price_xpath).extract()).strip()
            if price:
                break

        stock = stock_level(hxs)

        loader = ProductLoader(item=Product(), response=response)
        if sku:
            loader.add_value('sku', sku[0].strip())
        loader.add_xpath('identifier', '//div[@id="prod-product-code"]/p/text()')
        loader.add_value('name', name)
        loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        loader.add_value('stock', stock)

        item = loader.load_item()
        if item.get('identifier'):
            yield item
def parse(self, response):
    """Parse a tyre listing page: follow pagination and yield tyre products.

    For every ``schema.org/Product`` article the title is split around the
    brand into a tyre-size part and a model-name part.  Articles are skipped
    when the title cannot be split into exactly two parts, when they carry
    the "Winter" icon, when ``parse_pattern`` cannot read the size, or when
    ``is_product_correct`` rejects the loaded item.

    Changes from the previous version:
      * the brand is passed through ``re.escape`` before being used as the
        split pattern, so brands containing regex metacharacters are matched
        literally;
      * the fuel/grip/noise unpacking only tolerates ``ValueError`` (wrong
        number of values) instead of swallowing every exception.
    """
    pages = set(
        response.xpath(
            '//*[contains(@class, "pagination__item")]/a[not(contains(@class, "pagination__current"))]/@href'
        ).extract())
    for page_url in pages:
        yield Request(response.urljoin(page_url), meta=response.meta)

    products = response.xpath(
        '//article[@itemtype="http://schema.org/Product"]')
    for product_el in products:
        loader = ProductLoader(item=Product(), selector=product_el)

        brand = product_el.xpath(
            './/*[@itemprop="brand"]//*[@itemprop="name"]/text()'
        ).extract()[0].strip()
        if brand.upper() in brands_substitute:
            brand = brands_substitute[brand.upper()]

        full_name = product_el.xpath(
            './/*[contains(@class, "product__title") and @itemprop="name"]/text()'
        ).extract()[0]
        try:
            # Escape the brand: it is site text, not a regular expression.
            tyre_size, name = re.split(re.escape(brand), full_name, flags=re.I)
        except ValueError:
            # Brand missing from the title, or present more than once.
            self.log(
                "[[TESTING]] Can not split tyre '%s' with brand '%s'" %
                (full_name, brand))
            continue
        loader.add_value('name', name)

        winter_tyre = product_el.xpath(
            './/*[@class="product__info"]//*[@data-icon="S" and contains(text(), "Winter")]'
        )
        if winter_tyre:
            continue

        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        identifier = self.get_identifier(product_el)

        out_of_stock = product_el.xpath(
            './/*[@itemprop="availability" and contains(@content, "Out")]'
        )
        if out_of_stock:
            loader.add_value('stock', 0)
        loader.add_value('url', response.url)

        image_url = product_el.xpath(
            './/img[@itemprop="image"]/@src').extract()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url[0]))
        loader.add_value('identifier', identifier)

        price = product_el.xpath('@data-price').extract()[0]
        loader.add_value('price', price)

        res = parse_pattern(tyre_size)
        if not res:
            continue
        width, ratio, rim, load_rating, speed_rating = res

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = ratio
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['load_rating'] = load_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Fitted'
        metadata['alternative_speed_rating'] = ''

        xl = bool(
            product_el.xpath(
                './/*[@class="product__info"]//*[@data-icon="XL"]'))
        metadata['xl'] = 'Yes' if xl else 'No'

        # Run-flat is detected from the icon, an " RFT" marker in the model
        # name, or the shared is_run_flat() check on the full title.
        run_flat_found = is_run_flat(full_name)
        run_flat = bool(
            product_el.xpath(
                './/*[@class="product__info"]//*[@data-icon="RF"]'))
        if not run_flat:
            run_flat = ' RFT' in name
        metadata['run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'

        metadata['manufacturer_mark'] = self._get_manufacturer_code(full_name)
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        try:
            fuel, grip, noise = product_el.xpath(
                './/li[contains(@class, "product__meta-item--")]/text()'
            ).extract()
        except ValueError:
            # Not exactly three label values on the tile.
            fuel, grip, noise = ('', '', '')
        metadata['fuel'] = fuel
        metadata['grip'] = grip
        metadata['noise'] = noise

        product = loader.load_item()
        # The website is defaulting to 2 tyres with a discount of £10
        if product.get('price') and (not self.price_discount):
            product['price'] += Decimal('10')
        product['metadata'] = metadata

        if not is_product_correct(product):
            continue
        product['metadata']['mts_stock_code'] = self.find_mts_stock_code(product)
        yield product
def parse_product(response):
    """Yield one product per size/colour combination, or a single product.

    Size inputs are read from element ``100000000045`` and colour inputs
    from ``100000000046``.  With both present every (size, colour) pair is
    emitted; with only one kind present each of its values is emitted;
    with neither, the bare product is emitted.  Variant identifiers are the
    SKU followed by ``_<value id>`` per chosen variation, and variant names
    are the product name followed by the variation labels.
    """
    hxs = HtmlXPathSelector(response)

    product_name = ''.join(
        hxs.select('//h1[@itemprop="name"]/text()').extract()).strip()
    sku = hxs.select('//span[@itemprop="productID"]/text()').extract()[0]
    img = hxs.select('//img[@class="default-image"]/@src').extract()
    category = response.meta.get('category')
    price = extract_price(
        hxs.select(
            '//*[@id="product-price"]//span[@itemprop="price"]/text()'
        ).extract()[0])
    brand_values = hxs.select(
        '//*[@id="product-right-col"]//span[@itemprop="brand"]/text()'
    ).extract()
    brand = brand_values[0] if brand_values else ''

    sizes = hxs.select('//*[@id="100000000045"]//input')
    colors = hxs.select('//*[@id="100000000046"]//input')

    if sizes or colors:
        size_variations = [
            [node.select('./@value').extract()[0],
             node.select('./following-sibling::label/span/text()').extract()[0]]
            for node in sizes
        ]
        color_variations = []
        for node in colors:
            color_id = node.select('./@value').extract()[0]
            onclick = node.select('./@onclick').extract()[0]
            # The colour label is the last quoted argument of the handler.
            quoted = re.findall("(?sim)'(.*?)'", onclick)
            color_variations.append([color_id, quoted[-1]])

        if sizes and colors:
            combos = itertools.product(size_variations, color_variations)
        else:
            single_axis = color_variations if colors else size_variations
            combos = ((variation,) for variation in single_axis)
    else:
        # No variations: one "combination" with nothing appended.
        combos = [()]

    for combo in combos:
        product_identifier = sku + ''.join('_' + var[0] for var in combo)
        name = product_name + ''.join(' ' + var[1] for var in combo)

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', product_identifier)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        # From here on the loader's processed price is reused.
        price = loader.get_output_value('price')
        loader.add_value('shipping_cost', get_shipping_cost(price))
        loader.add_value('brand', brand)
        if img:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img[0]))
        loader.add_value('category', category)
        yield loader.load_item()
def parse_page(self, response):
    """Parse a product page and fan out to its colour/size variant pages.

    Price, brand and SKU come from the JSON in the ``data-ec-product``
    attribute.  A request back into this callback is yielded for every
    colour swatch link and every size option link.  The item for the
    current page is yielded when the page shows an active option, or when
    the page has no colour or size links at all.

    Changes from the previous version: the stock fallback only catches
    ``IndexError`` (the missing-node case) instead of every exception, an
    unused ``sizes`` query was removed, and the active-size query is run
    once instead of twice.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    identifier = hxs.select('//input[@name="productCodePost"]/@value').extract()
    if not identifier:
        identifier = hxs.select('//input[@name="productCode"]/@value').extract()

    active_sizes = hxs.select('//li[contains(@class, "classification-option") and contains(@class, "active")]/a/text()').extract()
    active_swatches = hxs.select('//li[contains(@class, "swat-list-item-active")]/a/@title').extract()
    active_options = active_sizes + active_swatches

    item_dict = json.loads(hxs.select('//div[@class="prod-detail"]/@data-ec-product').extract()[0])
    price = item_dict['price']
    brand = item_dict['brand']
    sku = item_dict['id']

    image_url = hxs.select('/html/head/meta[@property="og:image"]/@content').extract()
    if image_url:
        image_url = image_url[0]

    try:
        stock = hxs.select('//div[@itemprop="availability"]/text()').extract()[0]
    except IndexError:
        # No availability microdata; fall back to the in-stock banner.
        stock = hxs.select('//div[@class="in-stock"]/text()').extract()[0]

    # Drop the first and last breadcrumb entries.
    categories_list = hxs.select('//div[@class="breadcrumb hfma"]/ul/li//span[@itemprop="title"]/text()')[1:-1].extract()

    loader = ProductLoader(item=Product(), response=response)
    title = hxs.select('//h1[@itemprop="name"]/text()').extract()[0]
    if active_options:
        size = ' '.join(active_sizes)
        # Append the active size unless the title already ends with it.
        if not title.upper().endswith(size.strip().upper()):
            title += ' ' + size

    loader.add_value('name', title)
    loader.add_value('price', price)
    loader.add_value('brand', brand)
    loader.add_value('sku', sku)
    loader.add_value('category', categories_list)
    loader.add_value('image_url', image_url)
    loader.add_value('url', response.url)
    if 'In Stock Online' not in stock:
        loader.add_value('stock', 0)
    loader.add_value('identifier', identifier)
    item = loader.load_item()

    colors = hxs.select('//div[@class="prod-var-group"]/ul[@class="swat-list swat-list-colour var-change-list"]/li/a/@href').extract()
    for color in colors:
        yield Request(urljoin_rfc(base_url, color), callback=self.parse_page)

    sizes = hxs.select('//li[contains(@class, "classification-option")]/a/@href').extract()
    for size in sizes:
        yield Request(urljoin_rfc(base_url, size), callback=self.parse_page)

    # With variant links present the item is only emitted from a page that
    # has an active option; without any variant links it is always emitted.
    if active_options or not (colors or sizes):
        yield item
def parse_product(self, response):
    """Parse a product page; defer to the sellers listing when one exists.

    Pages showing the ``anchorUnavailable`` link yield nothing.  If the page
    has marketplace seller links, a request to the seller-listings endpoint
    is yielded (handled by ``parse_sellers``) carrying the product fields in
    ``meta``.  Otherwise a single product is yielded, with the seller name
    (when shown) appended to the identifier and used as the dealer.

    SKU heuristic: the largest number appearing in the product name; when
    that is below 100 (or there is none) the "Mfg Part#" cell is used.

    Changes from the previous version: a missing manufacturer link or
    breadcrumb no longer raises ``IndexError`` (brand/category fall back to
    an empty string, in line with how price and shipping are handled), and
    the digit regex is a raw string.
    """
    hxs = HtmlXPathSelector(response)
    if hxs.select('//a[@id="anchorUnavailable"]'):
        return

    sellers_url = 'http://www.rakuten.com/PR/SellerListingsAjax.aspx?sku=%s'
    name = hxs.select('//div[@id="product-title"]/h1/text()').extract()[0]

    numbers = [int(number) for number in re.findall(r"\d+", name)]
    sku = max(numbers) if numbers else 0
    if sku < 100:
        sku = ''.join(
            hxs.select('//th[contains(text(), "Mfg Part#")]/../td/text()').
            extract()).strip()

    brand = hxs.select(
        '//th[contains(text(), "Manufacturer")]/../td/a/text()').extract()
    brand = brand[0] if brand else ''
    category = hxs.select(
        '//div[@class="product-breadcrumbs"]//a/text()').extract()
    category = category[-1] if category else ''
    image_url = hxs.select('//img[@id="productmain"]/@src').extract()
    # The identifier is mandatory: without it neither branch below can work.
    identifier = hxs.select(
        '//th[contains(text(), "SKU")]/../td/text()').extract()[0]

    price = hxs.select(
        '//div[@class="main-price"]/span[@itemprop="price"]/text()'
    ).extract()
    price = price[0] if price else 0
    shipping = hxs.select(
        '//div[@class="main-price"]/span[not(@itemprop="price")]/text()'
    ).extract()
    shipping = shipping[0] if shipping else 0

    sellers = hxs.select(
        '//div[@id="seller-contact"]//a[@itemprop="seller"]')
    if sellers:
        yield Request(sellers_url % identifier,
                      callback=self.parse_sellers,
                      meta={
                          'name': name,
                          'brand': brand,
                          'category': category,
                          'identifier': identifier,
                          'sku': sku,
                          'image_url': image_url,
                          'url': response.url
                      })
        return

    seller_name = hxs.select(
        '//a[@id="anchorMarketplaceShipsFrom"]/text()').extract()
    seller_name = seller_name[0] if seller_name else ''

    loader = ProductLoader(item=Product(), response=response)
    if seller_name:
        loader.add_value('identifier', identifier + '-' + seller_name)
    else:
        loader.add_value('identifier', identifier)
    loader.add_value('name', name)
    loader.add_value('category', category)
    loader.add_value('brand', brand)
    loader.add_value('sku', sku)
    loader.add_value('url', response.url)
    loader.add_value('price', price)
    loader.add_value('shipping_cost', shipping)
    loader.add_value('image_url', image_url)
    loader.add_value('dealer', 'Rak - ' + seller_name if seller_name else '')
    yield loader.load_item()
def parse_product(self, response):
    """Parse a product page, expanding the embedded ``jProductData`` options.

    A base item is loaded from the page markup.  When the page body contains
    a ``var jProductData = ...;`` assignment, one copy of the base item is
    yielded per entry in ``product.items`` with its own name, URL, price and
    identifier; otherwise the base item itself is yielded if it has an
    identifier.

    Option identifiers are the part number with every ``S`` removed and a
    dot inserted after each group of three characters.

    Changes from the previous version: the name/type lookups only tolerate
    ``IndexError`` (missing node) rather than any exception, the base item
    is checked with ``.get('identifier')`` so a page without an item number
    is skipped instead of raising ``KeyError``, and the breadcrumb stripping
    no longer depends on the Python 2-only ``unicode`` builtin.
    """
    base_url = get_base_url(response)

    try:
        name = response.xpath(
            '//div[@id="name"]/text()').extract()[0].strip()
    except IndexError:
        name = ''
    try:
        desc = response.xpath(
            '//div[@id="type"]/text()').extract()[0].strip()
    except IndexError:
        desc = ''
    if desc:
        name = name + ' ' + desc

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('shipping_cost', 7.50)
    loader.add_value('url', response.url)

    price = response.xpath('//span[@id="price1"]/text()').extract()
    if price:
        price = extract_price(price[0])
        loader.add_value('price', price)

    image_url = response.xpath('//img[@id="productImg"]/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin(base_url, image_url[0]))

    # Skip the first breadcrumb entry.
    category = [
        crumb.strip() for crumb in
        response.xpath('//ul[@id="breadCrumbs"]//li/a/text()').extract()
    ][1:]
    category = self._get_unified_category(category, name, price)
    if category and ('Side tables' in category[-1]):
        category = ['Tables', 'Side Tables']
    loader.add_value('category', category)

    # In stock unless the "not shoppable online" notice is visible.
    stock = response.xpath(
        '//*[@id="dispNotShopableOnlineText"]/@style').extract()
    if not stock or 'display:none' in stock[0]:
        stock = 1
    else:
        stock = 0
    loader.add_value('stock', stock)

    loader.add_xpath('sku', '//div[@id="itemNumber"]/text()')
    loader.add_xpath('identifier', '//div[@id="itemNumber"]/text()')
    item = loader.load_item()

    options_data = re.search(r'var jProductData = (.*);', response.body)
    if not options_data:
        if item.get('identifier'):
            yield item
        return

    product_data = json.loads(options_data.groups()[0])
    for option in product_data['product']['items']:
        option_item = deepcopy(item)
        option_item['name'] = option['name']

        description = [option['type']]
        description.extend(option['validDesign'])
        description = ' '.join(description).strip()
        if description:
            option_item['name'] += ' ' + description

        # Append the drop-down label unless the name already contains it.
        option_text = response.xpath('//option[@value="' +
                                     option['catEntryId'] +
                                     '"]/text()').extract()
        option_text = option_text[0].strip() if option_text else ''
        if option_text and option_text.upper() not in option_item['name'].upper():
            option_item['name'] += ' ' + option_text

        option_item['url'] = urljoin(base_url, option['url'])
        option_item['price'] = Decimal(
            option['prices']['normal']['priceNormal']['rawPrice'])

        identifier = option['partNumber'].replace('S', '')
        identifier = '.'.join(identifier[i:i + 3]
                              for i in range(0, len(identifier), 3))
        option_item['identifier'] = identifier
        option_item['sku'] = identifier
        if option_item['identifier']:
            yield option_item
def parse_product(self, response):
    """Yield one product per option combination listed in the page's childMap.

    Four JavaScript object literals (``childMap``, ``prices``, ``skus``,
    ``stockStatuses``) are pulled out of the raw body with regexes and
    decoded as JSON.  Every combination of the option drop-down values is
    joined with ``_`` and looked up in ``childMap``; combinations with no
    entry are skipped.  A log line is written when nothing was yielded.
    """
    hxs = HtmlXPathSelector(response)
    body = response.body

    childMap = json.loads(re.search('\'childMap\': (.*),', body).group(1))
    prices = json.loads(re.search('\'prices\': (.*),', body).group(1))
    skus = json.loads(re.search('\'skus\': (.*),', body).group(1))
    stockStatuses = json.loads(
        re.search('\'stockStatuses\': (.*),', body).group(1))

    # One list of (value, label) pairs per drop-down, ignoring empty values.
    selects = []
    for dropdown in hxs.select('//div[@class="product-options"]//select'):
        choices = []
        for opt in dropdown.select('.//option'):
            value = opt.select('./@value').extract()[0]
            if value:
                choices.append((value, opt.select('./text()').extract()[0]))
        if choices:
            selects.append(choices)
    if not selects:
        selects = [[('', ''), ('%', '')]]

    # Make every key containing '_%' also reachable without that part.
    aliases = dict((key.replace('_%', ''), value)
                   for key, value in childMap.items() if '_%' in key)
    childMap.update(aliases)

    image_url = None
    found = False
    for combination in itertools.product(*selects):
        values = [choice[0] for choice in combination]
        labels = [choice[1] for choice in combination]
        code = childMap.get('_'.join(values))
        if not code:
            continue
        code = str(code)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_value('name', labels)
        loader.add_value('sku', skus[code])
        loader.add_value('identifier', skus[code])
        loader.add_value('price', prices[code][0]['purchase'])
        loader.add_value('url', response.url)
        loader.add_value('brand', 'Le Creuset')

        in_stock = 'In stock' in stockStatuses.get(code, '')
        loader.add_value('stock', '1' if in_stock else '0')

        cheap = loader.get_output_value('price') < 45
        loader.add_value('shipping_cost', '4.95' if cheap else '0')

        loader.add_xpath('category',
                         '//div[@class="crumbs"]/a[position()>2]/text()')
        image_url = hxs.select(
            '//div[@id="product-image"]//img/@src').extract()
        if image_url:
            loader.add_value(
                'image_url',
                urljoin_rfc(get_base_url(response), image_url[0]))

        item = loader.load_item()
        item['metadata'] = LeCreusetMeta()
        found = True
        yield item

    if not found:
        self.log('No products on %s' % response.url)
def parse_product(self, response):
    """Parse a product page and request each non-selected variant.

    Pages without a ``pid`` input are logged and skipped.  For every
    ``<option>`` of every ``<select>``: a selected option has its label
    appended to the item name; any other option produces a request to
    ``parse_option`` carrying a deep copy of the item as built so far and
    the base name in ``meta``.  The page's own item is yielded last, only if
    it has a price.  Items priced at 59.99 or less get a shipping cost of
    1.99.

    Change from the previous version: ``'Home'`` is removed from the
    breadcrumb only when present, so a breadcrumb without it no longer
    raises ``ValueError`` and aborts the page.
    """
    # NOTE(review): as written this replace (and the two below) substitutes
    # '&' with '&', i.e. does nothing.  Presumably an HTML entity was meant;
    # confirm against the site before changing.
    html = response.body.replace('&', '&')
    hxs = HtmlXPathSelector(text=html)

    identifier = hxs.select('//input[@id="pid"]/@value').extract()
    if not identifier:
        self.log('PRODUCT WITHOUT IDENTIFIER: ' + response.url)
        return

    loader = ProductLoader(item=Product(), response=response)
    name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0]
    if name.startswith(':'):
        name = name[1:]
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand', ''))

    categories = hxs.select(
        '//a[@class="breadcrumb-element"]/@href/../text()').extract()
    if 'Home' in categories:
        categories.remove('Home')
    loader.add_value('category', categories)

    loader.add_value('sku', identifier[0])
    loader.add_value('identifier', identifier[0])
    loader.add_xpath('image_url',
                     '//div[@class="product-primary-image"]/a/@href')
    loader.add_xpath(
        'price',
        '//div[@id="product-content"]//span[@class="price-sales"]/meta/@content'
    )

    out_of_stock = hxs.select('//p[contains(@class, "not-available")]')
    if out_of_stock:
        loader.add_value('stock', 0)
    if loader.get_output_value('price') <= 59.99:
        loader.add_value('shipping_cost', 1.99)

    item = loader.load_item()
    if item.get('price', None) and item['price'] <= 59.99:
        item['shipping_cost'] = 1.99

    for option in hxs.select('//select'):
        for variant in option.select('./option'):
            if variant.select('./@selected'):
                var_name = variant.select(
                    './text()').extract()[0].strip().replace('&', '&')
                item['name'] += ' ' + var_name
            else:
                option_url = variant.select(
                    './@value').extract()[0].replace(
                        '&', '&') + '&Quantity=1&uuid=&format=ajax'
                meta = response.meta
                # Snapshot the item: later selected options keep mutating it.
                meta['item'] = deepcopy(item)
                meta['base_name'] = name
                yield Request(option_url,
                              callback=self.parse_option,
                              meta=meta)

    if item.get('price', None):
        yield item
def parse_product(self, response):
    """Parse a product page into its configurable or custom-option variants.

    Size-box links are re-queued through this callback.  Then:

    * If the page embeds a ``spConfig`` (``Product.Config``) JSON blob, one
      product is built per ``childProducts`` entry.  Entries with an id are
      sent on to ``parse_options`` via the ``oi/ajax/co`` endpoint; an entry
      with an empty id is yielded directly.
    * Otherwise the page-level product is combined with every combination
      of its ``product-custom-options`` drop-downs (descriptions, prices and
      ids are accumulated); with no such options the plain product is
      yielded.

    Directly yielded items carry ``metadata['net_price']``, the price
    divided by 1.2, as a string.

    Changes from the previous version:
      * the brand was read with ``extract()[0]`` and then indexed with
        ``[0]`` again, which kept only its first character and raised
        ``IndexError`` when the manufacturer row was missing; it is now the
        full first match or ``''``;
      * the ``spConfig`` regex is a raw string;
      * nested option selectors use ``.xpath`` like the rest of the method.
    """
    base_url = get_base_url(response)

    def with_net_price(item, gross_price):
        # Attach gross_price / 1.2 (as a string) under metadata['net_price'].
        meta_ = Meta()
        meta_['net_price'] = str(gross_price / Decimal('1.2'))
        item['metadata'] = meta_
        return item

    for url in response.xpath('//a[contains(@class,"size-boxes")]/@href').extract():
        yield Request(urljoin_rfc(base_url, url), callback=self.parse_product)

    product_name = response.xpath('//h1[@itemprop="name"]/text()').extract()[0]
    product_image = response.xpath('//a[@id="zoom-btn"]/@href').extract()
    if product_image:
        product_image = urljoin_rfc(base_url, product_image[0])

    product_brand = response.xpath("//table[@id='product-attribute-specs-table']/tbody/"
                                   "tr[th[text()='Manufacturer']]/td/text()").extract()
    product_brand = product_brand[0] if product_brand else ''

    product_config_reg = re.search(r'var spConfig = new Product.Config\((\{.*\})\);', response.body)
    product_identifier = response.xpath('//input[@name="product"]/@value').extract()[0]

    if product_config_reg:
        products = json.loads(product_config_reg.group(1))
        for identifier, product in products['childProducts'].items():
            product_loader = ProductLoader(item=Product(), response=response)
            if identifier:
                product_loader.add_value('identifier', product_identifier + '-' + identifier)
            else:
                product_loader.add_value('identifier', product_identifier)
            product_loader.add_value('price', product[u'finalPrice'])

            # Append the label of every attribute option this child belongs to.
            option_name = product_name
            for attribute in products[u'attributes'].values():
                for option in attribute['options']:
                    if identifier in option['products']:
                        option_name += ' ' + option['label']
            # Parenthesised fragments are stripped from the final name.
            product_loader.add_value('name', re.sub(r' \((.+?)\)', r'', option_name))
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', product_brand)
            product_loader.add_value('image_url', product_image)

            if identifier:
                yield Request('http://www.bedworld.net/oi/ajax/co/?id=' + identifier + '&pid=' + product_identifier,
                              meta={'item': product_loader.load_item()},
                              callback=self.parse_options)
            else:
                price = product_loader.get_output_value('price')
                yield with_net_price(product_loader.load_item(), price)
        return

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('name', re.sub(r' \((.+?)\)', r'', product_name))
    product_loader.add_value('brand', product_brand)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('url', response.url)
    product_loader.add_value('image_url', product_image)
    price = response.xpath('//span[@id="product-price-' + product_identifier + '"]//text()').re(r'([\d.,]+)')
    price = price[0] if price else 0
    product_loader.add_value('price', price)

    # One list of option dicts per custom-option drop-down.
    option_elements = []
    for dropdown in response.xpath('//select[contains(@class, "product-custom-options")]'):
        options = []
        for dropdown_option in dropdown.xpath('option[@value!=""]'):
            options.append({
                'identifier': dropdown_option.xpath('@value').extract()[0],
                # Keep only the text before any "+<surcharge>" suffix.
                'desc': dropdown_option.xpath('.//text()').extract()[0].split('+')[0],
                'price': dropdown_option.xpath('@price').extract()[0],
            })
        option_elements.append(options)

    final_options = []
    if option_elements:
        for combined_option in itertools.product(*option_elements):
            final_options.append({
                'desc': ''.join(option['desc'] for option in combined_option),
                'price': sum((extract_price(option['price']) for option in combined_option),
                             Decimal(0)),
                'identifier': ''.join('-' + option['identifier'] for option in combined_option),
            })

    if final_options:
        for opt in final_options:
            opt_product = product_loader.load_item()
            opt_product['name'] += ' ' + normalize_space(opt['desc'])
            opt_product['price'] += opt['price']
            opt_product['identifier'] += opt['identifier']
            yield with_net_price(opt_product, Decimal(opt_product['price']))
    else:
        price = product_loader.get_output_value('price')
        yield with_net_price(product_loader.load_item(), price)
def parse_product(self, response):
    """Parse a used-equipment product page, expanding configurable options.

    The product id comes from the ``product`` input, falling back to the
    add-to-cart form action; if neither is present the same URL is requested
    again.  The price is the main price plus optional pennies span, plus any
    cashback amount shown.  When the page embeds a ``spConfig`` blob one
    product is yielded per child product, priced at the page price plus that
    child's option surcharges; otherwise a single product is yielded.

    Changes from the previous version:
      * the id lookup only tolerates ``IndexError`` (missing input) instead
        of any exception;
      * option surcharges are summed per child product.  Previously each
        attribute overwrote the surcharge of the one before, so with more
        than one attribute the result depended on dict iteration order.
    """
    base_url = get_base_url(response)
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()

    try:
        product_identifier = response.xpath(
            '//input[@name="product"]/@value').extract()[0].strip()
    except IndexError:
        product_identifier = response.xpath(
            '//form[@id="product_addtocart_form"]/@action').re(
                r'/product/(\d+)')
        if not product_identifier:
            # NOTE(review): this retry is unbounded (dont_filter=True); a
            # page that never exposes an id would be re-requested forever.
            yield Request(response.url,
                          callback=self.parse_product,
                          dont_filter=True)
            return
        product_identifier = product_identifier[0]

    product_name = response.xpath(
        '//h2[@itemprop="name"]/text()').extract()[0]
    brand = response.meta.get('brand', '')
    category = 'Used Equipment'
    sku = response.xpath('//div[@class="quickfind"]/text()').extract()
    sku = sku[0].replace('Quick find', '').strip() if sku else ''

    price = response.xpath(
        '//*[@id="product-price-{}"]/div/span[@class="price"]/text()'.
        format(product_identifier)).extract()[0]
    price_pennies = response.xpath(
        '//*[@id="product-price-{}"]/div/span[@class="price"]/span[@class="price-pennies"]/text()'
        .format(product_identifier)).extract()
    if price_pennies:
        price += price_pennies[0]
    price = extract_price(price)

    cashback = response.xpath('//div[@class="cashback"]/text()').extract()
    if cashback:
        price += extract_price(cashback[0])

    def build_product(identifier, name, amount):
        # Load one product; everything but id, name and price is page-level.
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('name', name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)
        product_loader.add_value('sku', sku)
        product_loader.add_value('price', amount)
        return product_loader.load_item()

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if not options_config:
        yield build_product(product_identifier, product_name, price)
        return

    product_data = json.loads(options_config.groups()[0])
    products = {}
    prices = {}
    for attr in product_data['attributes'].values():
        for option in attr['options']:
            option_price = extract_price(option['price'])
            for product in option['products']:
                products[product] = ' '.join(
                    (products.get(product, ''), option['label']))
                prices[product] = prices.get(product, 0) + option_price

    for identifier, option_name in products.items():
        yield build_product(product_identifier + '_' + identifier,
                            product_name + ' ' + option_name,
                            price + prices[identifier])
def parse_product(self, response):
    """Parse a search-result or product page keyed by "Part No:".

    * A "0 results" / "no search results" banner is logged and ends parsing.
    * A results grid yields one request per listed product (same callback).
    * A "no longer available" notice is logged and ends parsing.
    * Otherwise a product is loaded and handed to
      ``self.yield_item_with_metadata`` with a ``manufacturers_no`` metadata
      entry.

    Shipping is ``2.95`` for prices below 20, otherwise ``0``.  The brand
    falls back to the first word of the product name when no supplier logo
    title is present.

    Change from the previous version: a page without a main image no longer
    raises ``IndexError`` on ``img[0]``; it gets an empty ``image_url``,
    the same value already used for over-long image URLs.
    """
    hxs = HtmlXPathSelector(response)

    warning = ''.join(
        hxs.select(
            '//div[@class="InfoBanner" and contains(text(), "has returned 0 results")]//text()'
        ).extract())
    if not warning:
        warning = ''.join(
            hxs.select(
                '//div[@class="noSearchResultsFound"]/text()').extract())
    if warning:
        self.log(warning)
        return

    many = hxs.select(
        '//div[@id="SearchResults"]//div[@class="categoryGridTitle"]/a/@href'
    ).extract()
    if many:
        for url in many:
            yield Request(urljoin(get_base_url(response), url),
                          callback=self.parse_product)
        return

    if hxs.select(
            '//div[@class="color:red" and contains(text(), "this item is no longer available")]'
    ):
        self.log('Item not available [%s]' % (response.url))
        return

    loader = ProductLoader(item=Product(), selector=hxs)
    comms_no = hxs.select(
        '//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()'
    ).extract()[0].upper()
    loader.add_value('identifier', comms_no)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//div[@id="productTitle"]//text()')
    loader.add_xpath(
        'price', '//div[@id="productMainPrice"]/span[@id="price"]/text()')
    loader.add_xpath(
        'sku',
        '//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()'
    )

    # Drop the first and last breadcrumb text nodes.
    category = hxs.select(
        '//div[@class="newbreadcrumbText"]//text()').extract()[1:-1]
    loader.add_value('category', category)

    img = hxs.select('//span[@id="mainImage"]/a/img/@src').extract()
    # Image sources of 255 characters or more are discarded.
    if img and len(img[0]) < 255:
        loader.add_value('image_url',
                         urljoin(get_base_url(response), img[0]))
    else:
        loader.add_value('image_url', '')

    loader.add_xpath('brand', '//div[@id="supplierLogo"]/img/@title')
    if not loader.get_output_value('brand'):
        loader.add_value('brand',
                         loader.get_output_value('name').split()[0])

    if loader.get_output_value('price') < 20:
        loader.add_value('shipping_cost', '2.95')
    else:
        loader.add_value('shipping_cost', '0')

    in_stock = 'IN STOCK' in ''.join(
        hxs.select(
            '//div[@id="stockCheck"]/div/text()').extract()).upper()
    loader.add_value('stock', '1' if in_stock else '0')

    # Manufacturer number: try progressively looser locations, ending with
    # the part number itself.
    manufacturers_no = []
    for xpath in (
            '//span[@id="manufactNo"]/text()',
            '//tr[td[contains(text(), "Manufacturer No:")]]/td[not(@class)]/text()',
            '//tr[td[contains(text(), "Manufacturer No:")]]/td[2]//text()',
            '//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()',
    ):
        manufacturers_no = hxs.select(xpath).extract()
        if manufacturers_no:
            break
    manufacturers_no = manufacturers_no[0].strip()
    m = sku_regex.search(manufacturers_no)
    if m:
        manufacturers_no = m.group(1)

    product = loader.load_item()
    product['metadata'] = {'manufacturers_no': manufacturers_no}
    # NOTE(review): the return value is not yielded from this generator;
    # confirm that yield_item_with_metadata emits the item itself.
    self.yield_item_with_metadata(product)
    return
def parse(self, response):
    """Yield one product item per non-winter tyre tile on a results page.

    For every grid-view tile the callback loads name, brand, category,
    identifier, URL, image and price through ``ProductLoader``, fills a
    ``MicheldeverMeta`` from the tile's size text and ``data-filter-*``
    attributes, and yields the item when ``is_product_correct`` accepts it.
    Tiles whose size text cannot be parsed are logged and skipped.

    :param response: the downloaded results page.
    """
    tiles = response.xpath(
        '//div[contains(@class, "tyres_search_results_tyre") and @data-viewtype="grid"]'
    )
    for tile in tiles:
        # Winter tyres are not collected.
        if tile.xpath('@data-filter-season').extract()[0] == 'Winter':
            continue

        name = tile.xpath(
            './/div[contains(@class, "tyre-model text-center")]/text()'
        ).extract()[0]
        brand = tile.xpath('@data-filter-brand').extract()[0]

        loader = ProductLoader(item=Product(), selector=tile)
        loader.add_value('name', brand + ' ' + name)
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        identifier = tile.xpath('@data-tyreid').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)

        image_url = tile.xpath(
            './/div[contains(@class, "tyre-image")]//img/@src').extract()
        if image_url:
            loader.add_value(
                'image_url',
                urljoin(get_base_url(response), image_url[0]))

        price = tile.xpath(
            './/div[contains(@class, "tyre-pricing-information")]/div/text()'
        ).re(r'[\d,.]+')
        loader.add_value('price', price[0] if price else '0.00')

        tyresize_text = tile.xpath(
            './/div[contains(@class, "tyre-size")]/text()'
        ).extract()[0].strip()

        # Preferred form carries a bracketed load rating; fall back to the
        # short form without it. Checking the match objects explicitly
        # replaces the old bare ``except:`` that hid unrelated errors.
        size_match = re.search(
            r'(\d+)\/(\d+)(\w{1})(\d+)\s\((\d+)\)', tyresize_text, re.I)
        if size_match:
            width, aspect, speed_rating, rim, load_rating = \
                size_match.groups()
        else:
            size_match = re.search(
                r'(\d+)\/(\d+)(\w{1})(\d+)', tyresize_text, re.I)
            if not size_match:
                # One malformed tile should not abort the rest of the page.
                self.log('Unable to parse tyre size %r [%s]'
                         % (tyresize_text, response.url))
                continue
            width, aspect, speed_rating, rim = size_match.groups()
            load_rating = ''

        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = aspect
        metadata['rim'] = rim
        metadata['speed_rating'] = speed_rating
        metadata['width'] = width
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        metadata['alternative_speed_rating'] = ''

        reinforced = tile.xpath('@data-filter-reinforced').extract()[0] == 'Y'
        metadata['xl'] = 'Yes' if reinforced else 'No'

        # Run-flat is flagged either by the tile attribute or by the name.
        run_flat_found = is_run_flat(loader.get_output_value('name'))
        run_flat = tile.xpath('@data-filter-runflat').extract()[0] == 'Y'
        metadata['run_flat'] = \
            'Yes' if run_flat or run_flat_found else 'No'

        manufacturer_mark = tile.xpath(
            './/span[contains(@title, "Homologated for fitment to certai")]/@title'
        ).re(r'Homologated for fitment to certain (.*) cars\.')
        metadata['manufacturer_mark'] = find_man_mark(
            manufacturer_mark[0]) if manufacturer_mark else ''

        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))

        # Read each label attribute by name. The previous union query
        # returned attributes in document order, which is
        # implementation-dependent, and failed to unpack when one was
        # absent. The r/g/d -> fuel/grip/noise mapping follows the order
        # the original unpacking was written in.
        for key, suffix in (('fuel', 'r'), ('grip', 'g'), ('noise', 'd')):
            values = tile.xpath(
                '@data-filter-tyreefficiency' + suffix).extract()
            metadata[key] = values[0] if values else ''

        item = loader.load_item()
        item['metadata'] = metadata

        if not is_product_correct(item):
            continue

        item['metadata']['mts_stock_code'] = find_mts_stock_code(
            item, spider_name=self.name, log=self.log)

        yield item