def parse_product(self, response):
    """Yield one product per dropdown option, or a single product when the
    page has no option dropdown.

    When the option ``<select>`` carries an ``onchange`` attribute, each
    option's price is searched for in the raw response body; otherwise
    every option is emitted at the main page price.  Items whose loader
    produces no price are not yielded.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select(u'//div[@class="product-right"]//div[@class="pp-name"]/h1/text()').extract()[0].strip()
    main_price = hxs.select(u'//div[@class="product-right"]//div[@class="pp-price"]/span/span/text()').extract()[0]
    product_options = hxs.select(u'//select[@class="ekm-productoptions-dropdown-option"]')

    if not product_options:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', main_price)
        if loader.get_output_value('price'):
            yield loader.load_item()
        return

    body = response.body.replace('\xc2', ' ')
    # Always bind the flag: it used to be assigned only inside an ``if``,
    # so dropdowns without ``onchange`` raised UnboundLocalError below.
    set_option_price = bool(product_options.select(u'../select[@onchange]'))

    for option in product_options.select(u'./option'):
        name_with_option = name + u' %s' % option.select(u'./text()').extract()[0].strip()
        option_value = option.select(u'./@value').extract()[0]

        if set_option_price:
            # Escape the value so regex metacharacters in it match literally.
            match = re.search(
                r"== '%s'.*?_EKM_PRODUCTPRICE.*?= '([\d\.]+?)'" % re.escape(option_value),
                body, re.DOTALL)
            if match is None:
                # No price for this option in the body: skip it instead of
                # aborting the remaining options with an AttributeError.
                continue
            price = match.group(1)
        else:
            price = main_price

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name_with_option)
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        if loader.get_output_value('price'):
            yield loader.load_item()
def parse_product(self, response):
    """Yield one product per entry of the options list, or a single
    product when the page has no options list.

    The base price is the special price when present, otherwise the
    regular price.  Each option's surcharge (if any) is added to the base
    price after stripping the pound sign from both.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select(u'//div[@class="product-name"]/h1/text()').extract()[0]

    price_texts = hxs.select(u'//p[@class="special-price"]/span[@class="price"]/text()').extract()
    if not price_texts:
        price_texts = hxs.select(u'//span[@class="regular-price"]/span[@class="price"]/text()').extract()
    base_price = price_texts[0]

    option_rows = hxs.select(u'//ul[@class="options-list"]/li')
    if not option_rows:
        # Simple product: emit it at the base price as scraped.
        single = ProductLoader(item=Product(), response=response)
        single.add_value("url", response.url)
        single.add_value("name", name)
        single.add_value("price", base_price)
        if single.get_output_value("price"):
            yield single.load_item()
        return

    for row in option_rows:
        item_loader = ProductLoader(item=Product(), response=response)
        item_loader.add_value("url", response.url)

        label = row.select(u'./span[@class="label"]/label/text()').extract()[0]
        item_loader.add_value("name", name + u" %s" % label)

        surcharge = row.select(u'./span[@class="label"]/label/span/span/text()').extract()
        if surcharge:
            surcharge = surcharge[0].replace(u"\xa3", u"")
        base_price = base_price.replace(u"\xa3", u"")

        total = Decimal(base_price) + (Decimal(surcharge) if surcharge else Decimal("0.00"))
        item_loader.add_value("price", total)
        if item_loader.get_output_value("price"):
            yield item_loader.load_item()
def parse_product(self, response):
    """Load the product from the page; when variation options exist,
    request the price of each variation instead of yielding the item.

    Each variation request carries a copy of the base item (with the
    option label appended to its name) in ``meta['product']`` and is
    handled by ``self.parse_price``.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    for field, xpath in (
        ('name', u'//h2/text()'),
        ('price', u'//em[contains(@class,"ProductPrice")]/text()'),
        ('sku', u'//span[@class="VariationProductSKU"]/text()'),
        ('category', u'//div[@id="ProductBreadcrumb"]/ul/ul/li[2]/a/text()'),
        ('image_url', u'//div[@class="ProductThumbImage"]/a/img/@src'),
        ('brand', u'//div[@class="Value"]/a/text()'),
    ):
        product_loader.add_xpath(field, xpath)
    product_loader.add_value('shipping_cost', '')

    variations = hxs.select(u'//div[@class="DetailRow"]//ul/li/label/input/../..')
    if not variations:
        yield product_loader.load_item()
        return

    product_id = hxs.select(u'//input[@name="product_id"]/@value').extract()[0]
    base_item = product_loader.load_item()
    for variation in variations:
        label = variation.select(u'.//input/../text()[2]').extract()
        if not label:
            # Fall back to the two <span> children next to the input.
            label = variation.select(u'concat(.//input/../span[1]/text(),.//input/../span[2]/text())').extract()
        option_value = variation.select(u'.//input/@value').extract()

        item = Product(base_item)
        item['name'] = (item['name'] + ' ' + label[0].strip()).strip()
        target = ('http://www.midwestunlimited.com/remote.php'
                  + '?w=GetVariationOptions&productId=' + product_id
                  + '&options=' + option_value[0])
        yield Request(target, meta={'product': item}, callback=self.parse_price)
def parse_products(self, response):
    """Yield products from a listing page in either of two layouts.

    The grid layout (``div.grid-25``) is tried first; when it yields no
    nodes the table layout (``tr.prd first``) is used.  Prices are passed
    through ``self._encode_price``.  Entries without any price text are
    skipped.
    """
    hxs = HtmlXPathSelector(response)

    grid_cells = hxs.select('//*[@id="area-2"]//div[@class="grid-25"]')
    if grid_cells:
        for product in grid_cells:
            price = product.select('div/div/p[@class="prd-amount"]/strong/text()').extract()
            if not price:
                # Nothing to price this entry with; skip rather than raise.
                continue
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('url', 'div/h3/a/@href')
            if product.select('div/h3/a/abbr/@title'):
                # Abbreviated titles keep the full name in the title attribute.
                loader.add_xpath('name', 'div/h3/a/abbr/@title')
            else:
                loader.add_xpath('name', 'div/h3/a/text()')
            loader.add_value('price', self._encode_price(price[0]))
            yield loader.load_item()
        return

    for product in hxs.select('//*[@id="area-2"]//tr[@class="prd first"]'):
        price = product.select('td/p/strong/text()').extract()
        if not price:
            price = product.select('td/div/p/strong/text()').extract()
        if not price:
            # Previously ``price`` stayed unbound here (UnboundLocalError on
            # the first row) or kept the previous row's value, attaching a
            # wrong price to this product.
            continue
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('url', 'td/h3/a/@href')
        loader.add_xpath('name', 'td/h3/a/text()')
        loader.add_value('price', self._encode_price(price[0]))
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product(s) described on a product page.

    A page with a single specification table yields one item.  Otherwise
    every ``div.productInformation`` block yields its own item, with the
    manufacturer code appended to the name when it is not already part
    of it.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    prod_name = hxs.select('//div[@id="productSpecification"]/div/table/tr[1]/td[2]/text()').extract()
    if prod_name:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', urljoin_rfc(base_url, response.url))
        loader.add_value('name', prod_name[0])
        sku = hxs.select('//div[@id="productSpecification"]/div/table/tr[2]/td[2]/text()').extract()
        if sku:
            loader.add_value('sku', sku[0])
            loader.add_value('identifier', sku[0])
        price = ''.join(hxs.select('//div[@id="productAddToCart"]/div/b/text()').extract())
        if price:
            loader.add_value('price', price)
        yield loader.load_item()
        return

    # Several products listed on one page.
    for prod in hxs.select('//div[@class="productInformation"]'):
        href = prod.select('./a/@href').extract()
        if not href:
            # The original indexed ``href[0]`` before testing it, so a block
            # without a link raised IndexError instead of being skipped.
            continue

        mpn = ''.join([code for code in prod.select('p/text()').extract()
                       if 'Manufacturer Code' in code]).strip().split(':')[-1]

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', urljoin_rfc(base_url, href[0]))

        name = prod.select('./a/text()').extract()
        if name:
            if mpn not in name[0]:
                loader.add_value('name', ' '.join((name[0], mpn)))
            else:
                loader.add_value('name', name[0])

        sku_html = prod.select('./p[1]').extract()
        if sku_html:
            # The first run of digits in the first paragraph is the SKU.
            match = re.search(r'(\d+)', sku_html[0])
            if match:
                loader.add_value('sku', match.group(1))
                loader.add_value('identifier', match.group(1))

        price = ''.join(prod.select('./p/b/text()').extract()).split('(')[0]
        if price:
            loader.add_value('price', price)
        yield loader.load_item()
def parse_product(self, response):
    """Follow sub-product links, or load the product and its attribute
    variations from the page.

    If the page links to sub-products, each link is re-queued through
    this same callback and nothing else is emitted.  Otherwise the
    product is loaded; for radio-style or dropdown attributes one price
    request per choice is issued (handled by ``self.parse_price``), and
    with no attributes the item itself is yielded.
    """
    hxs = HtmlXPathSelector(response)

    links = hxs.select(u'//table[@class="ropetable" or @class="dbitable"]//td/a/@href').extract()
    if not links:
        own_prefix = response.url.rstrip('/')
        links = [href for href in hxs.select(u'//table//a/@href').extract()
                 if href.startswith(own_prefix)]
    if links:
        for link in links:
            yield Request(link, meta=response.meta, callback=self.parse_product)
        return

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1/text()')
    product_loader.add_xpath('price', u'//em[contains(@class,"ProductPrice")]/text()')
    product_loader.add_xpath('sku', u'//div[@id="sku"]/text()')
    product_loader.add_value('category', response.meta['category'])
    product_loader.add_xpath('image_url', u'//div[@class="ProductThumbImage"]//img/@src')
    product_loader.add_xpath('brand', u'//div[@class="DetailRow"]/div/a/text()')
    product_loader.add_xpath('shipping_cost', u'//div[@class="DetailRow"]/div[contains(text(),"Shipping")]/../div[2]/text()')

    input_groups = hxs.select(u'//div[@class="productAttributeList"]//ul/li/label/input/../../..')
    dropdowns = hxs.select(u'//div[@class="productAttributeList"]//select')
    # FIXME http://www.ropeandrescue.com/conterra-tac-longbow-ranger-rescue-pack/
    # checkbox support?

    product_id = hxs.select(u'//input[@name="product_id"]/@value').extract()[0]
    product_orig = product_loader.load_item()

    def variation_request(label, field_name, field_value):
        # Copy the base item, extend its name and ask the site for the price.
        variant = Product(product_orig)
        variant['name'] = (variant['name'] + ' ' + label.strip()).strip()
        return Request('http://www.ropeandrescue.com/remote.php'
                       + '?w=getProductAttributeDetails&product_id=' + product_id
                       + '&' + urllib.quote(field_name) + '=' + field_value,
                       meta={'product': variant}, callback=self.parse_price)

    if input_groups:
        for group in input_groups:
            labels = group.select(u'.//input/../span/text()').extract()
            values = group.select(u'.//input/@value').extract()
            field_names = group.select(u'.//input/@name').extract()
            for idx, label in enumerate(labels):
                yield variation_request(label, field_names[idx], values[idx])
    elif dropdowns:
        labels = dropdowns.select(u'./option[@value!=""]/text()').extract()
        values = dropdowns.select(u'./option[@value!=""]/@value').extract()
        field_name = dropdowns.select(u'./@name').extract()[0]
        for idx, label in enumerate(labels):
            yield variation_request(label, field_name, values[idx])
    else:
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield products whose part number matches the searched SKU.

    ``response.meta["sku"]`` is the SKU being searched for.  When the page
    has a ``skuIdSelection`` dropdown, each option is a candidate
    sub-product; it is yielded only when ``self._get_sku`` resolves a part
    number for it that is contained in the searched SKU.  Without the
    dropdown the main product is yielded when any listed part number is
    contained in the searched SKU.
    """
    hxs = HtmlXPathSelector(response)
    search_sku = response.meta["sku"]

    main_name = hxs.select('//span[@id="mainProductName"]/text()').extract()
    main_price = hxs.select("//dd[@class='price']/text()").extract()
    # Both lists are indexed right below, so stop when EITHER is empty.
    # The original tested ``and`` and raised IndexError when only one of
    # them was missing.
    if not main_name or not main_price:
        return

    main_name = main_name[0].strip()
    main_price = main_price[0].strip()
    # The fractional part of the price is rendered in a nested <span>.
    dec = hxs.select("//dd[@class='price']/span/text()").extract()
    if dec:
        main_price += dec[0]

    skus = []
    sku_text = hxs.select("//strong[text()='Mfg Part Number(s):']/../text()").extract()
    if sku_text:
        skus += [sku.strip() for sku in sku_text[0].split(", ")]

    sub_products = hxs.select('//select[@id="skuIdSelection"]/option')
    if sub_products:
        for p in sub_products:
            p_parts = p.select(".//text()").extract()[0].split("-")
            # An option label may end in "- $price"; otherwise the main
            # price applies.
            if p_parts[-1].strip().startswith("$"):
                price = p_parts[-1].strip()
            else:
                price = main_price

            sku_id = p.select(".//@value").extract()[0]
            sub_product_node = hxs.select('//input[@name="skuId" and @value="%s"]/../div' % sku_id)
            sku = None
            if sub_product_node:
                sku = self._get_sku(sub_product_node.select(".//text()").extract()[0], skus)

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("url", response.url)
            loader.add_value("name", main_name + " " + "".join(p_parts[:-1]).strip())
            loader.add_value("price", price)
            if sku:
                loader.add_value("sku", search_sku)
                if sku in search_sku:
                    yield loader.load_item()
    else:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value("url", response.url)
        loader.add_value("name", main_name)
        loader.add_value("price", main_price)
        if skus:
            loader.add_value("sku", search_sku)
            if any(sku in search_sku for sku in skus):
                yield loader.load_item()
def parse_product(self, response):
    """Yield one product per ``div.Item`` node on the page.

    Each item's name is the page heading followed by the item's own text
    (when it has any); the price is collected from either of two price
    locations inside the item node.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    headings = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
    if not headings:
        return

    base_name = headings[0].strip()
    page_url = urljoin_rfc(get_base_url(response), response.url)

    for node in hxs.select('//div[@class="Item"]'):
        loader = ProductLoader(item=Product(), selector=node)
        loader.add_value('url', page_url)

        variant_text = ''.join(node.select('./text()').extract())
        if variant_text:
            full_name = base_name + ' ' + variant_text.strip()
        else:
            full_name = base_name
        loader.add_value('name', full_name)

        for price_xpath in ('.//span[@class="price"]/text()',
                            './div[@class="price"]/span/text()'):
            loader.add_xpath('price', price_xpath)
        yield loader.load_item()
def parse(self, response):
    """Yield one item per spotlight block on the shop home page.

    Two page layouts are supported, differing by one level of ``div``
    nesting.  The first layout whose name XPath matches supplies the name,
    price and URL; an item is yielded for every block even when neither
    layout matches.
    """
    BASE_URL = 'http://www.virginmobile.com/vm/'
    # (name xpath, price xpath, link xpath) for each known layout, in the
    # order they are tried.
    layouts = (
        ('div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()',
         'div/div/div/div/div/div/p/span/text()',
         'div/div/div/div/div/p/a/@href'),
        ('div/div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()',
         'div/div/div/div/div/div/div/p/span/text()',
         'div/div/div/div/div/div/p/a/@href'),
    )

    hxs = HtmlXPathSelector(response)
    for block in hxs.select('//div[@class="webapp_shophome_3col_spotlight"]'):
        loader = ProductLoader(item=Product(), selector=block)
        for name_xpath, price_xpath, link_xpath in layouts:
            if not block.select(name_xpath):
                continue
            loader.add_xpath('name', name_xpath)
            loader.add_xpath('price', price_xpath)
            link = block.select(link_xpath)
            if link:
                loader.add_value(
                    'url',
                    urljoin_rfc(BASE_URL, link.extract()[0], response.encoding))
            break
        yield loader.load_item()
def parse_page(self, response):
    """Queue the next listing page and yield the products on this one.

    The first node matched by ``self.products_xpath`` is skipped.  After
    the products have been emitted, a log message is written whenever the
    number of products is not exactly 30.
    """
    base_url = get_base_url(response)
    make_absolute = functools.partial(urljoin_rfc, base_url)
    hxs = HtmlXPathSelector(response)

    # next page
    if self.products_nextpage_xpath:
        next_links = hxs.select(self.products_nextpage_xpath).extract()
        if next_links:
            yield Request(urljoin_rfc(base_url, next_links[0]), callback=self.parse_page)

    # products
    found = 0
    if self.products_xpath:
        for found, node in enumerate(hxs.select(self.products_xpath)[1:], 1):
            loader = ProductLoader(selector=node, item=Product())
            loader.add_xpath('price', ".//div[@class='storeitem_price']/span[@class='storeitem_firstprice']/text()", comas2dots)
            # Identifier and SKU are both the numeric id embedded in the link.
            for field in ('identifier', 'sku'):
                loader.add_xpath(field, "./div/a/@href", first, re=r"\-(\d+)\.html")
            loader.add_xpath('url', "./div/a/@href", first, make_absolute)
            loader.add_xpath('name', "./div/a/b/text()")
            loader.add_xpath('name', "./div/a/text()")
            yield loader.load_item()

    if found != 30:
        log.msg("Less than 30 products in %s %s" % (response.url, found))
def parse_products(self, response):
    """Yield a product for every search-result row that has a name, a URL
    and a price.

    Rows missing any of the three are logged as errors and skipped.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select("//div[@class='box-caracteristic-search']/div[@class='table-wrap']/form/table/tbody/tr")
    for row in rows:
        names = row.select("td[@class='prd-details']/h3/a/text()").extract()
        if not names:
            logging.error("ERROR! No name! %s" % response.url)
            continue
        name = names[0]

        links = row.select("td[@class='prd-details']/h3/a/@href").extract()
        if not links:
            logging.error("ERROR! NOT FOUND URL! URL: %s. NAME: %s" % (response.url, name))
            continue
        url = self._urljoin(response, links[0])

        prices = row.select("td[@class='prd-amount-details']/div/p[@class='prd-amount']/strong/text()").extract()
        if not prices:
            logging.error("ERROR! NOT FOUND PRICE! URL: %s. NAME: %s" % (response.url, name))
            continue

        item_loader = ProductLoader(item=Product(), response=response)
        item_loader.add_value('name', name)
        item_loader.add_value('url', url)
        item_loader.add_value('price', prices[0])
        yield item_loader.load_item()
def parse_products(self, response):
    """Queue the next results page and yield the products on this one.

    A product without a price node is emitted with the price "0.0".
    """
    hxs = HtmlXPathSelector(response)

    next_links = hxs.select('//div[@id="center-main"]//a[@class="right-arrow"]/@href')
    if next_links:
        next_url = self._get_products_url(response, next_links[0].extract())
        yield Request(next_url, callback=self.parse_products)

    for details in hxs.select('//div[@id="center-main"]//div[@class="details"]'):
        loader = ProductLoader(item=Product(), selector=details)
        loader.add_xpath("name", "a/text()")
        loader.add_xpath("sku", 'div[@class="sku"]/span/text()')

        # few prices were under div class desc
        price_nodes = details.select('.//div[@class="price-row"]/span[@class="price-value"]/span/text()')
        loader.add_value("price", price_nodes[0].extract() if price_nodes else "0.0")

        href = details.select("a/@href")[0].extract()
        loader.add_value("url", urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()
def parse_product(self, response):
    """Yield one product per priced unit type for every search result.

    Each ``dd.prices`` block of a result is probed for the unit types
    "Item", "Bottle" and "Case".  When the block has a ``td.sale`` cell,
    only sale cells are considered.  The unit type is appended to the
    product name.
    """
    if not isinstance(response, HtmlResponse):
        return

    product_types = [u'Item', u'Bottle', u'Case']
    hxs = HtmlXPathSelector(response)
    for product in hxs.select(u'//dl[@class="search_result"]'):
        url = product.select(u'./dt/a[@class="#"]/@href').extract()[0]
        url = urljoin_rfc(get_base_url(response), url)
        base_name = product.select(u'./dt/a[@class="#"]/text()').extract()[0].strip()

        for option in product.select(u'.//dd[@class="prices"]'):
            # Build the XPath template in stages; the last ``%s`` is left
            # for the unit type.
            price_xpath = u'.//td[%s]/following-sibling::td[1]/text()'
            if option.select(u'.//td[@class="sale"]'):
                price_xpath %= u'@class="sale" and %s'
            price_xpath %= u'contains(text(),"%s")'

            for product_type in product_types:
                price = option.select(price_xpath % product_type)
                if not price:
                    continue
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('url', url)
                # Concatenate rather than using the scraped name as a
                # ``%`` format string: a name containing a percent sign
                # made the original ``name % product_type`` raise.
                loader.add_value('name', base_name + u' ' + product_type)
                loader.add_value('price', price.extract())
                yield loader.load_item()
def parse_product(self, response):
    """Yield the product shown on the page and, on the first visit, post
    the form once for every entry of the item dropdown.

    Responses produced by those form posts carry ``requested`` in their
    meta, so they do not fan out again.  When the dropdown exists, the
    text of its selected option is appended to the product name.  The
    offer price is preferred; the regular price is the fallback.  Nothing
    is yielded without a price.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select(u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract()[0].strip()
    dropdown_options = hxs.select(u'//select[@class="mpv_itemalst"]//option')

    first_visit = not u'requested' in response.meta
    if dropdown_options and first_visit:
        for entry in dropdown_options:
            payload = {
                u'ctl00$MainContent$ItemAList': entry.select(u'./@value').extract()[0],
                u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                u'__EVENTARGUMENT': u'',
            }
            yield FormRequest.from_response(
                response,
                formname=u'aspNetForm',
                formdata=payload,
                meta={u'requested': True},
                dont_click=True,
                callback=self.parse_product)

    if dropdown_options:
        selected = dropdown_options.select(u'../option[@selected]/text()').extract()[0]
        name += u' %s' % selected.strip()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_xpath('price', u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse(self, response):
    """Yield the products of a listing page and queue the next page.

    Names, URLs and prices are extracted as three parallel lists and
    combined positionally; prices have surrounding whitespace and the
    pound sign stripped.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    site_base = get_base_url(response)

    # getting product details from product list
    titles = hxs.select('//h4/a/@title').extract()
    hrefs = hxs.select('//h4/a/@href').extract()
    raw_prices = hxs.select('//td[@class="ProductPrice"]/h4/text()').extract()
    clean_prices = [text.strip().strip(u'\xa3') for text in raw_prices]

    for title, href, amount in zip(titles, hrefs, clean_prices):
        absolute = urljoin_rfc(site_base, href)
        if not absolute:
            continue
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', absolute)
        loader.add_value('name', title)
        loader.add_value('price', amount)
        yield loader.load_item()

    # pages
    next_links = hxs.select('//a[@class="NextPage"]/@href').extract()
    if next_links:
        yield Request(urljoin_rfc(site_base, next_links[0]))
def parse_page(self, response):
    """Yield the products on the page, then follow pagination or, when
    there is none, the refine-category links in the sidebar.

    The last pagination link is followed only when ``self._is_next``
    accepts it.
    """
    def absolute(href):
        # All links are resolved against the site root.
        return urljoin_rfc('http://www.dolphinmusic.co.uk/', href, response.encoding)

    hxs = HtmlXPathSelector(response)
    for entry in hxs.select('//div[@class="item"]'):
        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_xpath('name', 'h2/a/text()')
        href = entry.select('h2/a/@href').extract()[0]
        loader.add_value('url', absolute(href))
        loader.add_xpath('price', 'div[@class="pricing"]/p[@class="price"]/text()')
        yield loader.load_item()

    pager_links = hxs.select('//*[@id="categoryMain"]/div[@class="pagination"]/ul/li/a/@href').extract()
    if pager_links:
        candidate = pager_links[-1]
        if self._is_next(candidate):
            yield Request(absolute(candidate), callback=self.parse_page)
        return

    for href in hxs.select('//*[@id="sidebar"]/ul[@id="refineCat"]/li/a/@href').extract():
        yield Request(absolute(href), callback=self.parse_page)
def parse_product(self, response):
    """Yield one product per ``div.sku-details`` block on the page.

    Each item's name is the page heading followed by the block's SKU
    description (when present).  The regular price is preferred and the
    special price is the fallback; an item with neither is yielded
    without a price.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select('//h1[@class="item"]/span/text()').extract()
    if not name:
        return

    url = urljoin_rfc(get_base_url(response), response.url)
    base_name = name[0].strip()
    # The original also built a page-level loader here that was never
    # yielded; it has been removed as dead code.

    for item in hxs.select('//div[@class="sku-details"]'):
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', url)

        n = base_name
        sku = ''.join(item.select('.//span[@class="sku-description"]//text()').extract())
        if sku:
            n += ' ' + sku.strip()
        loader.add_value('name', n)

        price = item.select('./span[@class="price"]/text()').extract()
        if not price:
            price = item.select('./span[@class="special-price"]/text()').extract()
        # Guard the index: a block with neither span used to raise
        # IndexError and abort the remaining blocks.
        if price:
            loader.add_value('price', price[0])
        yield loader.load_item()
def parse_product(self, response):
    """Yield the single product described in the ``ProductDetails`` block."""
    details_fields = (
        ("name", '//div[@id="ProductDetails"]//h2/text()'),
        ("price", '//div[@id="ProductDetails"]//em[contains(@class,"ProductPrice")]/text()'),
        ("sku", '//div[@id="ProductDetails"]//span[contains(@class,"VariationProductSKU")]/text()'),
    )
    item_loader = ProductLoader(item=Product(), response=response)
    item_loader.add_value("url", response.url)
    for field, xpath in details_fields:
        item_loader.add_xpath(field, xpath)
    yield item_loader.load_item()
def parse_product(self, response):
    """Yield one product per combination of the page's dropdown options.

    Every ``<select>`` in an ``input-box`` contributes a group of
    ``(label, surcharge)`` pairs, skipping its first option.  The groups
    are combined by ``multiply``; each combination's label is appended to
    the product name and its surcharge added to the base price.
    """
    hxs = HtmlXPathSelector(response)
    opt_groups = []

    def fix_options(o):
        # ``o`` is an option label split on '$': [label] when the option
        # has no surcharge, [label, amount, ...] otherwise.  An explicit
        # length check replaces the original bare ``except:``, which
        # would also have hidden unrelated errors.
        if len(o) > 1:
            return (o[0], o[1].replace(',', ''))
        return (o[0], '0')

    for option in hxs.select(u'//div[@class="input-box"]//select'):
        opt_list = option.select(u'./option[position() != 1]/text()').extract()
        opt_list = [o.replace('+$', '$').split('$') for o in opt_list]
        opt_groups.append([fix_options(o) for o in opt_list])

    for opt_name, opt_price in multiply(opt_groups):
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h2[@class="title"]/text()')
        product_loader.add_xpath('price', u'//span[contains(@class,"sale-price")]/text()')
        product_loader.add_xpath('sku', u'substring-after(//span[contains(@class,"meta-sku")]/text(),":")')
        product_loader.add_xpath('category', u'//ul[@class="breadcrumb"]/li[2]/a/@title')
        product_loader.add_xpath('image_url', u'//div[@class="teaser-large"]/img/@src')
        product_loader.add_xpath('brand', u'substring-after(//div[@class="product-meta"]/span[contains(text(),"Manufacturer:")]/text(),":")')
        product_loader.add_value('shipping_cost', '')

        product = product_loader.load_item()
        product['name'] = (product['name'] + ' ' + opt_name).strip()
        product['price'] = product['price'] + Decimal(opt_price)
        yield product
def parse_products(self, response):
    """Follow brand and sub-category links and yield listed products.

    Brand links are followed only until ``self.brand_crawled`` is set,
    which happens after the brand links of the first parsed page have
    been emitted.  Relative links are resolved against ``base_url``.
    """
    def follow(href):
        # Links not starting with "http" are resolved against base_url.
        if not re.search('^http', href):
            href = urljoin_rfc(base_url, href)
        return Request(href, callback=self.parse_products)

    hxs = HtmlXPathSelector(response)

    if not self.brand_crawled:
        for brand_href in hxs.select('//*[@class="infoBox-categories"]//a/@href').extract():
            yield follow(brand_href)
        self.brand_crawled = True

    # Is it another subcategory page?
    for category_href in hxs.select('//div[@id="catView"]//a/@href').extract():
        yield follow(category_href)

    # Is it products page?
    for entry in hxs.select('//div[@id="productView"]/ul/li[@class="product"]'):
        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_xpath('name', './/h2/a/text()')
        loader.add_xpath('price', './/h3/a/text()')
        loader.add_xpath('url', './/h2/a/@href')
        yield loader.load_item()
def parse_page(self, response):
    """Queue the next listing page and yield the products on this one.

    All XPaths come from spider attributes: ``products_nextpage_xpath``
    (optionally narrowed by ``products_nextpage_re``), ``products_xpath``
    and the ``product_name`` / ``product_url`` / ``product_price`` XPath
    collections.
    """
    base_url = get_base_url(response)
    make_absolute = functools.partial(urljoin_rfc, base_url)
    hxs = HtmlXPathSelector(response)

    # products next page
    if self.products_nextpage_xpath:
        next_nodes = hxs.select(self.products_nextpage_xpath)
        if self.products_nextpage_re:
            candidates = next_nodes.re(self.products_nextpage_re)
        else:
            candidates = next_nodes.extract()
        if candidates:
            yield Request(urljoin_rfc(base_url, candidates[0]), callback=self.parse_page)

    # products
    if not self.products_xpath:
        return
    for node in hxs.select(self.products_xpath):
        loader = ProductLoader(selector=node, item=Product())
        for name_xpath in self.product_name or ():
            loader.add_xpath('name', name_xpath)
        for url_xpath in self.product_url or ():
            loader.add_xpath('url', url_xpath, first, make_absolute)
        for price_xpath in self.product_price or ():
            loader.add_xpath('price', price_xpath, comas2dots)
        yield loader.load_item()
def parse_categories(self, response):
    """Descend into sub-categories, or scrape products from a leaf page.

    A page with sub-category links only queues those links.  A page
    without them yields its search-result products and then queues the
    "Next>" page when there is one.
    """
    hxs = HtmlXPathSelector(response)

    category_links = hxs.select('//div[@class="section_190"]/a/@href').extract()
    if category_links:
        for link in category_links:
            yield Request(link, callback=self.parse_categories)
        return

    for result in hxs.select('//div[@class="list_search_result"]'):
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_xpath('name', 'div[@class="list_search_detail"]/'
                                 'div[@class="list_search_info"]/p/a/'
                                 'span/text()')
        loader.add_xpath('url', 'div[@class="list_search_detail"]/'
                                'div[@class="list_search_info"]/p/a/@href')
        loader.add_xpath('price', 'div[@class="list_search_detail"]/'
                                  'div[@class="list_search_actionblock"]/'
                                  'p/span[@class="list_search_price"]/text()')
        yield loader.load_item()

    pager = hxs.select('//div[@class="formfloatright"]/'
                       'strong/a[text()="Next>"]/@href').extract()
    if pager:
        yield Request(pager[-1], callback=self.parse_categories)
def parse(self, response):
    """Yield the page's product when its title contains every word of the
    searched name.

    The searched name comes from ``response.meta['name']`` with ``+``
    treated as a space; the comparison is case-insensitive and
    word-based.  Products whose name contains "apparelsave" are not
    yielded.
    """
    hxs = HtmlXPathSelector(response)
    buybox = hxs.select('//table[@class="buybox"]')
    if not buybox:
        return

    loader = ProductLoader(item=Product(), selector=buybox)
    titles = buybox.select('.//h1[@class="stylename"]/text()').extract()
    if not titles:
        return

    title = titles[0]
    wanted = response.meta['name'].lower().replace('+', ' ')
    log.msg(title.lower() + ' - ' + wanted)

    title_words = title.lower().strip().split(' ')
    for word in wanted.split(' '):
        if word not in title_words:
            # At least one searched word is missing from the title.
            return

    price = "".join(buybox.select('.//span[@class="price"]/span/text()').re(r'([0-9\,\. ]+)')).strip()
    loader.add_value('name', title)
    loader.add_value('url', response.url)
    loader.add_value('price', price)
    loader.add_value('sku', response.meta['sku'])
    if 'apparelsave' not in loader.get_output_value('name').lower():
        yield loader.load_item()
def parse(self, response):
    """Yield the first listed product whose name contains every word of
    the searched name (``response.meta['name']``).

    The product name is assembled as "brand name style" from three spans.
    The sale price is preferred; the plain price text is the fallback.
    Products whose name contains "apparelsave" are not yielded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    products = hxs.select('//div[@class="productCellWrapper"]')
    if not products:
        return
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        brand = "".join(product.select('.//div[@class="productBrandTitleColor"]/a/span[@class="brand"]/text()').extract()).strip()
        style = "".join(product.select('.//div[@class="productBrandTitleColor"]/a/span[@class="styleName color"]/text()').extract()).strip()
        name = "".join(product.select('.//div[@class="productBrandTitleColor"]/a/span[@class="styleName name"]/text()').extract()).strip()
        name = brand + ' ' + name + ' ' + style
        # Case-insensitive, word-based comparison: ``diff`` holds the
        # searched words that do not occur in the product name.
        product_words = name.lower().split(' ')
        search_words = response.meta['name'].lower().split()
        diff = [w for w in search_words if not w in product_words]
        if not diff:
            url = product.select('.//div[@class="productBrandTitleColor"]/a/@href').extract()[0]
            # Prefer the sale price; fall back to the plain price text.
            price = "".join(product.select('.//div[@class="price"]/span[@class="salePrice"]/text()').re(r'([0-9\,\. ]+)')).strip()
            if not price:
                price = "".join(product.select('.//div[@class="price"]/text()').re(r'([0-9\,\. ]+)')).strip()
            loader.add_value('name', name)
            loader.add_value('url', urljoin_rfc(base_url,url))
            loader.add_value('price', price)
            loader.add_value('sku', response.meta['sku'])
            if not 'apparelsave' in loader.get_output_value('name').lower():
                yield loader.load_item()
                # NOTE(review): assumed to stop after the first yielded
                # match; the nesting of this ``break`` should be confirmed.
                break
"""
def parse(self, response):
    """Yield listed products, then follow sub-categories and pagination.

    A row with a quantity input is a buyable product and is yielded
    directly; a row without one is followed to ``self.parse_sub``.  The
    category menu is walked one nesting level deeper per followed
    sub-category, tracked through ``meta['level']``.
    """
    hxs = HtmlXPathSelector(response)
    site_base = get_base_url(response)

    for row in hxs.select(u'//tr[contains(@class,"product-item")]'):
        product_loader = ProductLoader(item=Product(), selector=row)
        product_loader.add_xpath('name', u'.//td[@class="productListingNewName"]/b/a/text()')

        raw_price = row.select(u'.//span[@class="js_price_tax"]/text()').extract()[0]
        # Drop '.' characters and turn ',' into the decimal point.
        product_loader.add_value('price', raw_price.strip().replace('.', '').replace(',', '.'))

        href = row.select(u'.//td[@class="productListingNewName"]/b/a/@href').extract()[0]
        product_url = urljoin_rfc(site_base, href)
        product_loader.add_value('url', product_url)

        # If quantity field is not present on page, there are subproducts
        has_quantity = row.select(u'.//input[@name="products_qty"]').extract()
        if not has_quantity:
            yield Request(product_url, callback=self.parse_sub)
        else:
            yield product_loader.load_item()

    level = response.meta.get('level', 1)
    menu_xpath = u'//div[@class="box-content"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
    for category_href in hxs.select(menu_xpath).extract():
        yield Request(urljoin_rfc(site_base, category_href), meta={'level': level+1})

    next_links = hxs.select(u'//li[@class="page-next"]/a/@href').extract()
    if next_links:
        yield Request(urljoin_rfc(site_base, next_links[0]), meta={'level': level})
def parse_products(self, response):
    """Yield listed products, then follow pagination or sub-categories.

    Products whose price text mentions "From" or "Now" are followed to
    ``self.parse_options``; the others are yielded with the listed price
    (an empty string when no price is found).  Sub-category links are
    followed only when the page has no "Next" link.
    """
    hxs = HtmlXPathSelector(response)
    site_base = get_base_url(response)

    for entry in hxs.select('//li[@class="item" or @class="item lastItem"]'):
        name = entry.select('div/h3/a/span/text()').extract()[0]
        url = entry.select('div/h3/a/@href').extract()
        if url:
            url = urljoin_rfc(site_base, url[0])

        from_label = ''.join(entry.select('div/p[@class="price money"]/span/abbr/text()').extract()).strip()
        price_text = ''.join(entry.select('div/p[@class="price money"]/text()').extract()).strip()
        if ('From' in from_label) or ('Now' in price_text):
            yield Request(url, callback=self.parse_options, meta={'name':name})
            continue

        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_value('name', name)
        loader.add_value('url', url)
        amounts = entry.select('div/p[@class="price money"]/span/span/text()').extract()
        if not amounts:
            amounts = entry.select('div/p[@class="price money"]/ins/span/text()').extract()
        if not amounts:
            amounts = ['']
        loader.add_value('price', amounts[0])
        yield loader.load_item()

    next_links = hxs.select('//a[@rel="nofollow" and span/text()="Next \xc2\xbb"]/@href'.decode('utf')).extract()
    if next_links:
        yield Request(urljoin_rfc(site_base, next_links[0]), callback=self.parse_products)
        return

    for category_href in hxs.select('//*[@id="categoryNavigation"]/li/ul/li/a/@href').extract():
        yield Request(urljoin_rfc(site_base, category_href), callback=self.parse_products)
def parse_product(self, response):
    """Yield the single product described on the page.

    The brand is taken, in order of preference, from a "BrandName" link,
    from the first sidebar brand the product name starts with, or from
    the middle part of the page title.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1[@class="productDetailHeader"]/text()')

    if not hxs.select(u'//span[@class="productDetailSelling"]/text()'):
        product_loader.add_value('price', '')
    else:
        product_loader.add_xpath('price', u'//span[@class="productDetailSelling"]/text()')

    product_loader.add_xpath('sku', u'//input[@type="hidden" and (@name="hidProductId" or @name="inv")]/@value')
    product_loader.add_xpath('category', u'//td[@class="smallPrint"]/a[position()=2 and contains(text(),"Products")]/../a[3]/text()')

    image_links = hxs.select(u'//a[@class="smallPrint" and @rel="lightbox"]/@href').extract()
    if image_links:
        product_loader.add_value('image_url', urljoin_rfc(get_base_url(response), image_links[0]))

    if hxs.select(u'//a[contains(@href,"BrandName")]/@href'):
        product_loader.add_xpath('brand', u'substring-after(//a[contains(@href,"BrandName")]/@href,"=")')
    else:
        sidebar_brands = [text.strip() for text in
                          hxs.select(u'//strong[@class="sideBarText"]/text()').extract()]
        matched = False
        for candidate in sidebar_brands:
            if product_loader.get_output_value('name').startswith(candidate):
                product_loader.add_value('brand', candidate)
                matched = True
                break
        if not matched:
            # No sidebar brand prefixes the name: use the page title.
            product_loader.add_xpath('brand', u'normalize-space(substring-before(substring-after(//title/text(), " - "), " - "))')

    yield product_loader.load_item()
def parse_product(self, response):
    """Yield one ``Product`` per entry of the ``products-list`` ordered list.

    Non-HTML responses are ignored. The SKU is only recorded when the
    product code (text with its first three characters dropped) is present
    in ``self.skus``. A pack size, when shown, is appended to the name as
    `` x<size>u.``.
    """
    if not isinstance(response, HtmlResponse):
        return

    selector = HtmlXPathSelector(response)
    rows = selector.select(
        u'//ol[@id="products-list" and @class="products-list"]//li[contains(@class,"item")]')
    for row in rows:
        item_loader = ProductLoader(item=Product(), selector=row)

        hrefs = row.select(u'.//h2[@class="product-name"]/a/@href').extract()
        item_loader.add_value('url', urljoin_rfc(get_base_url(response), hrefs[0]))

        code_texts = row.select(
            u'.//small[child::b[contains(text(),"Product Code:")]]/text()').extract()
        if code_texts:
            code = code_texts[0].strip()[3:]
            if code in self.skus:
                item_loader.add_value('sku', code)

        title = row.select(u'.//h2[@class="product-name"]/a/text()').extract()[0].strip()
        pack_texts = row.select(
            u'.//small[child::b[contains(text(),"Pack Size:")]]/text()').extract()
        if pack_texts:
            title = title + u' x' + pack_texts[0].strip() + u'u.'
        item_loader.add_value('name', title)

        amounts = row.select(
            u'.//div[@class="price-box"]/span[contains(@class,"regular-price")]/span[@class="price"]/text()'
        ).re(u'[\d\.,]+')
        # Drop thousands separators before handing the price to the loader.
        item_loader.add_value('price', amounts[0].replace(',', ''))

        yield item_loader.load_item()
def parse_products(self, hxs, response):
    """Yield items (or follow-up requests) for every ``promoCell`` on a page.

    * A cell without an "Our Price" value produces a ``Request`` for the
      product URL handled by ``self.parse_products2``.
    * If a cell's price does not start with ``$`` and the page has been
      retried fewer than three times (``meta['ret']``), the page itself is
      re-requested and processing of the page stops.
    * Otherwise the loaded item is yielded.
    """
    price_xpath = './/p[@class="para3"]/text()'
    price_pattern = 'Our Price: (.*)'

    cells = hxs.select(
        '//div[@class="productList clear"]//div[starts-with(@class, "promoCell")]')
    for cell in cells:
        loader = ProductLoader(item=Product(), selector=cell)

        fragments = cell.select('.//p[@class="para1"]//text()').extract()
        title = ' '.join([fragment.strip() for fragment in fragments])
        title = re.sub(' +', ' ', title)

        loader.add_xpath('url', './/a[starts-with(@class, "border")]/@href')
        loader.add_value('name', title)
        loader.add_xpath('sku', './/p[@class="border"]/text()', re='Item: (.*)')
        loader.add_xpath('price', price_xpath, re=price_pattern)

        if not loader.get_output_value('price'):
            yield Request(loader.get_output_value('url'), callback=self.parse_products2)
            continue

        raw_price = cell.select(price_xpath).re(price_pattern)[0]
        attempts = response.meta.get('ret', 0)
        if not raw_price.startswith('$') and attempts < 3:
            # Retry the whole page, counting the attempt in the request meta.
            yield Request(response.url, dont_filter=True, meta={'ret': attempts + 1})
            return

        yield loader.load_item()
def parse_products(self, response):
    """Yield a ``Product`` for every featured-product table on the page.

    The name is "<brand> <product link text>". The price is read from the
    ``SalePrice1`` span, falling back to ``variantprice1``; dots are removed
    and the comma is turned into a decimal point. Finally the last paging
    link without a class attribute is followed with this same callback.
    """
    selector = HtmlXPathSelector(response)

    # Candidate price locations, in order of preference.
    price_xpaths = (
        'tr/td/div/div[@class="featuredProductPrice"]/span/span[@class="SalePrice1"]/text()',
        'tr/td/div/div[@class="featuredProductPrice"]/span/span[@class="variantprice1"]/text()',
    )

    for table in selector.select('//div/div/table'):
        title = ''.join(
            table.select('tr/td/div[@class="featuredProductLinks"]/a/text()').extract())
        if not title:
            continue

        loader = ProductLoader(item=Product(), selector=table)
        brand = ''.join(table.select('tr/td/div[@class="featuredMIS"]/a/text()').extract())
        loader.add_value('name', ' '.join((brand, title)))

        hrefs = table.select('tr/td/div[@class="featuredProductLinks"]/a/@href').extract()
        loader.add_value('url', urljoin_rfc(get_base_url(response), hrefs[0]))

        price = ''
        for xpath in price_xpaths:
            price = ''.join(table.select(xpath).extract()).replace('.', '').replace(',', '.')
            if price:
                break
        loader.add_value('price', price)
        yield loader.load_item()

    page_links = selector.select('//div[@class="pagingdiv"]/a[not(@class)]/@href').extract()
    if page_links:
        target = urljoin_rfc(get_base_url(response), page_links[-1])
        yield Request(target, callback=self.parse_products)
def parse_product(self, response):
    """Scrape a product page and yield the product or its option requests.

    A base item is loaded from the page (id, name, price, image, brand,
    category, stock). When ``self.get_options(response)`` reports options,
    one ``FormRequest`` per option is yielded, aimed at the site's
    ``remote.php`` endpoint with a copy of the base item in ``meta['item']``
    and handled by ``self.parse_option``; otherwise the base item itself is
    yielded.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    product_ids = selector.select(
        '//form[@id="productDetailsAddToCartForm"]/input[@name="product_id"]/@value'
    ).extract()
    if not product_ids:
        log.msg('### No product ID at ' + response.url, level=log.INFO)
    else:
        loader.add_value('identifier', product_ids[0])
        loader.add_value('sku', product_ids[0])

    titles = selector.select('//div[@id="ProductDetails"]/div/h1/text()').extract()
    if not titles:
        log.msg('### No name at ' + response.url, level=log.INFO)
    else:
        loader.add_value('name', titles[0].strip())

    # price: first whitespace-separated token of the inc-tax price, else 0
    price_texts = selector.select(
        '//span[@class="ProductDetailsPriceIncTax"]/text()').extract()
    if price_texts:
        loader.add_value('price', extract_price(price_texts[0].split()[0]))
    else:
        loader.add_value('price', 0)

    # image_url
    images = selector.select('//div[@class="ProductThumbImage"]//img[1]/@src').extract()
    if images:
        loader.add_value('image_url', images[0])

    # brand: first known brand the raw title starts with
    for brand in self.brands:
        if titles and titles[0].startswith(brand):
            loader.add_value('brand', brand)
            break

    # category: second breadcrumb entry
    crumbs = selector.select('//div[@id="ProductBreadcrumb"]/ul/li/a/text()').extract()
    if len(crumbs) > 1:
        loader.add_value('category', crumbs[1])

    # stock: a "Call for pricing" notice means out of stock
    call_for_pricing = selector.select(
        '//div[@class="ProductPriceWrap"]//em[text()="Call for pricing"]')
    loader.add_value('stock', 0 if call_for_pricing else 1)

    # process options
    product = loader.load_item()
    options = self.get_options(response)
    if not options[0][0]:
        yield product
        return

    base_identifier = product['identifier']
    base_name = product['name']
    for option in options:
        variant = copy.deepcopy(product)
        variant['identifier'] = base_identifier + option[0]
        variant['name'] = base_name + ' ' + option[1]
        option[2].update({'w': 'getProductAttributeDetails'})
        request = FormRequest.from_response(response,
                                            formnumber=1,
                                            formdata=option[2],
                                            meta={'item': variant},
                                            callback=self.parse_option)
        yield request.replace(url='http://sxpro.co.uk/remote.php')
def parse_product(self, response):
    """Scrape a single product page and yield it once per identifier.

    The page is skipped (with a log line) when it has no
    ``itemprop="identifier"`` span. Brand and category are fixed to
    "Le Creuset"; shipping is 2.49 below a price of 20 and 5.95 below 50.
    Promotional text and the feature description are stored under
    ``metadata``. Identifiers already present in ``self.id_seen`` are logged
    as duplicates and not yielded again.

    The original body carried an option-handling loop after an unconditional
    ``return`` (``options`` was hard-coded to ``None``); that unreachable
    code has been removed.
    """

    def _plain_text(fragments):
        # Turn HTML fragments into text: <br> becomes a newline, blank
        # fragments are skipped, and the pieces are joined by newlines.
        return '\n'.join([
            lxml.html.fromstring(s.replace('<br>', '\n').strip()).text_content()
            for s in fragments if len(s.strip()) > 0
        ])

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    tmp = hxs.select('//span[@itemprop="identifier"]/text()').extract()
    if not tmp:
        log.msg('### No product ID at ' + response.url, level=log.INFO)
        return
    loader.add_value('identifier', tmp[0].strip())
    loader.add_value('sku', tmp[0])

    name = ''
    tmp = hxs.select('//span[@itemprop="name"]/h1/text()').extract()
    if tmp:
        name = tmp[0].strip()
        loader.add_value('name', name)
    else:
        log.msg('### No name at ' + response.url, level=log.INFO)

    # price (0 when the page shows none)
    price = 0
    tmp = hxs.select('//span[@itemprop="price"]/text()').extract()
    if tmp:
        price = extract_price(tmp[0].strip().replace(',', ''))
    loader.add_value('price', price)

    # stock: either an "In Stock: yes" cell or an "availability" label
    stock = 0
    tmp = hxs.select('//td[strong="In Stock: "]/text()').extract()
    if tmp and 'yes' in ''.join(tmp).lower():
        stock = 1
    tmp = hxs.select(
        '//td[span/@itemprop="identifier"]/preceding-sibling::td/text()').extract()
    if tmp and 'availability' in ''.join(tmp).lower():
        stock = 1
    loader.add_value('stock', stock)

    # image_url
    tmp = hxs.select('//img[@itemprop="image"]/@src').extract()
    if not tmp:
        tmp = hxs.select('//td[@width="350"]/img/@src').extract()
    if tmp:
        loader.add_value('image_url', urljoin(response.url, tmp[0]))

    loader.add_value('brand', 'Le Creuset')
    loader.add_value('category', 'Le Creuset')

    # shipping_cost
    if price < 20:
        loader.add_value('shipping_cost', 2.49)
    elif price < 50:
        loader.add_value('shipping_cost', 5.95)

    # promotional
    promotional = []
    tmp = hxs.select('//td[strong/font//span/@itemprop="price"]/text()').extract()
    if tmp:
        savings = re.findall(r'\(Save - \d+%\)', ''.join(tmp))
        if savings:
            promotional.append(savings[0])
    tmp = hxs.select('//td[@bgcolor="#C00000"]').extract()
    if tmp:
        promotional.append(_plain_text(tmp).strip())

    features = ''
    tmp = hxs.select(
        '//td[strong="Features:"]/span[@itemprop="description"]').extract()
    if tmp:
        features = _plain_text(tmp)

    loader.add_value('metadata', {
        'promotional': promotional,
        'features': features
    })

    product = loader.load_item()
    identifier = product.get('identifier', None)
    if not identifier:
        log.msg('### No product ID at ' + response.url, level=log.INFO)
    elif identifier in self.id_seen:
        log.msg('### Duplicate product ID at ' + response.url, level=log.INFO)
    else:
        self.id_seen.append(identifier)
        yield product
def parse_product(self, response):
    """Load one ``Product`` from a product page and yield it.

    Stock is set to 0 when no "Buy" image button is present. The price comes
    from the ``buyprice`` span, falling back to the buy-box cell, and
    defaults to '0.00'; a shipping cost of '4.95' is added when the loaded
    price is below 50.00. The identifier is the hidden ``sn`` input or, if
    absent, whatever follows ``/sn/`` in the URL; it is also appended to
    ``self.crawled_ids``.
    """
    selector = HtmlXPathSelector(response)
    page_base = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=selector)

    titles = selector.select('//tr[@class="mainprodboxtitle"]/td/h1/text()').extract()
    loader.add_value('name', titles)
    loader.add_value('url', response.url)

    buy_button = selector.select(
        '//div[@class="mainprodbox"]//input[@type="image" and @alt="Buy"]')
    if not buy_button:
        loader.add_value('stock', 0)

    price_texts = selector.select(
        '//div[@class="mainprodbox"]//span[@class="buyprice"]/text()').extract()
    if not price_texts:
        price_texts = selector.select(
            '//table[@class="buybox"]//td[@align="right" and @class="buyboxlightblue"]/text()'
        ).extract()
    loader.add_value('price', price_texts if price_texts else '0.00')

    loaded_price = loader.get_output_value('price')
    if Decimal(loaded_price or '0.00') < Decimal('50.00'):
        loader.add_value('shipping_cost', '4.95')

    loader.add_value('brand', selector.select('//img[contains(@src,"logo")]/@alt').extract())

    for crumb in selector.select('//tr[@class="mainprodboxtitle"]/td/a/text()').extract():
        loader.add_value('category', crumb)

    codes = selector.select(
        '//div[@class="mainprodbox"]//td[@class="text_small"]/text()').re('Product Code: (.*)')
    loader.add_value('sku', codes)

    hidden_ids = selector.select(
        '//div[@class="mainprodbox"]//input[@type="hidden" and @name="sn"]/@value').extract()
    if hidden_ids:
        identifier = hidden_ids[0]
    else:
        url_match = re.search('/sn/(.*)', response.url)
        identifier = url_match.group(1) if url_match else None
    loader.add_value('identifier', identifier)

    images = selector.select(
        '//div[@class="mainprodbox"]//a[contains(@href,"popup")]/img/@src').extract()
    if images:
        loader.add_value('image_url', urljoin_rfc(page_base, images[0]))

    self.crawled_ids.append(identifier)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per option row of every ``div.variant`` on the page.

    A base item (name, url, brand from ``response.meta``, categories after
    the first two breadcrumbs, sku, image, shipping '0.00') is loaded once
    and deep-copied for each option. Each copy gets:

    * identifier ``<product name>-<variant name>-<option name>``;
    * name extended by the variant name and, unless it equals the variant
      name case-insensitively, the option name;
    * the variant's "now" price (or its ``p.price span`` text), defaulting
      to 0.00, with shipping '1.99' below 30.00;
    * stock 0 when the row has no ``stock instock`` cell;
    * the variant's first image.

    Pages without variants are only logged.

    All text is formatted as unicode. The earlier latin-1 encode/decode
    round trip through a byte-string template raised on names containing
    characters outside ASCII/latin-1.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand'))

    categories = hxs.select(
        '//div[@id="breadcrumbs"]/div[@class="crumbs"]/span/a/span/text()').extract()
    for category in categories[2:]:
        loader.add_value('category', category)

    loader.add_value('sku', hxs.select('//meta[@itemprop="sku"]/@content').extract())

    image_url = hxs.select('//div[@id="product-image"]//img/@src').extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_url[0]))

    # The product name doubles as the identifier prefix of every option.
    identifier = loader.get_output_value('name')
    loader.add_value('shipping_cost', '0.00')
    item = loader.load_item()

    variants = response.xpath('//div[@class="variant"]')
    if not variants:
        self.log('PRODUCT WITHOUT OPTIONS: ' + response.url)
        return

    for variant in variants:
        variant_name = variant.select(
            './/div[@class="title"]/h4/text()')[0].extract().strip()

        # The price belongs to the variant, so look it up once per variant.
        price_text = (variant.xpath('.//span[@class="now"]/text()').extract_first()
                      or variant.css('p.price span::text').extract_first())
        price = extract_price(price_text) if price_text else Decimal('0.00')

        for option in variant.select('.//tr'):
            option_name = option.select(
                './/td[@class="name"]/text()')[0].extract().strip()

            option_item = deepcopy(item)
            option_item['identifier'] = u'{}-{}-{}'.format(
                identifier, variant_name, option_name)

            # Avoid repeating the option name when it equals the variant name.
            suffix = u'' if option_name.lower() == variant_name.lower() else option_name
            option_item['name'] += u' {} {}'.format(variant_name, suffix)
            option_item['name'] = option_item['name'].strip()

            option_item['price'] = price
            if Decimal(option_item['price']) < Decimal('30.00'):
                option_item['shipping_cost'] = '1.99'

            if not option.select('.//td[@class="stock instock"]'):
                option_item['stock'] = 0

            option_item['image_url'] = variant.select('.//img/@src')[0].extract()
            yield option_item
def parse(self, response):
    """Yield one tariff ``Product`` per contract plan found on a device page.

    Two page layouts are handled:

    * ``form#command`` plans (excluding "Pay As You Go."), identified by
      ``<productCode>-<packageCode without "ContractD">-<duration>``;
    * ``li.visible`` plans, kept only when their memory size occurs in
      ``response.meta['device_name']`` and their colour occurs in the URL
      (underscores read as spaces), identified by
      ``<code after "ContractD" in the plan link>-<duration>``.

    Each product's price is the upfront cost and its ``metadata`` is a
    ``TelecomsMeta`` holding the monthly cost, tariff name, duration and the
    module-level ``operator`` / ``channel`` / ``net_gen`` values.
    """
    hxs = HtmlXPathSelector(response)
    device_name = response.meta['device_name']

    def _meta(monthly, plan_name, months):
        # Shared metadata record for both page layouts.
        record = TelecomsMeta()
        record['device_name'] = response.meta['device_name']
        record['monthly_cost'] = monthly
        record['tariff_name'] = plan_name
        record['contract_duration'] = months
        record['operator'] = operator
        record['channel'] = channel
        record['network_generation'] = net_gen
        return record

    # Layout 1: one <form id="command"> per plan.
    name = ' '.join(
        hxs.select('//div[contains(@class, "deviceTitle")]/text()').extract()).strip()
    forms = hxs.select(
        '//form[@id="command" and div[contains(@class, "planName")]/text()!="Pay As You Go."]'
    )
    for form in forms:
        loader = ProductLoader(selector=form, item=Product())
        tariff_name = form.select(
            'div[contains(@class, "planName")]/text()').extract()[0]
        monthly_cost = form.select(
            'div//div[contains(@class, "priceColumn")]/div[contains(@class, "price")]/text()'
        ).extract()[0]
        duration_text = form.select(
            'div//li[contains(text(), "months")]/text()').extract()[0]
        duration = duration_text.split(u' ')[0].replace(u'\xa0months', '')
        product_code = form.select('input[@name="productCode"]/@value').extract()[0]
        tariff_code = form.select('input[@name="packageCode"]/@value').extract()[0]

        identifier = (product_code + '-' + tariff_code.replace('ContractD', '')
                      + '-' + str(duration))
        loader.add_value('identifier', identifier)
        loader.add_value('name', device_name + ' - ' + tariff_name)
        loader.add_value('url', response.url)
        loader.add_value('brand', name.split()[0])

        upfront = form.select(
            'div//div[contains(@class, "upfrontPrice")]/span/text()').extract()[0]
        loader.add_value('price', upfront)

        pictures = hxs.select(
            '//div[@class="devicePicturePanel"]/div/a/img/@src').extract()
        if pictures:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), pictures[0]))

        product = loader.load_item()
        product['metadata'] = _meta(monthly_cost.replace(u'\u00a3', ''),
                                    tariff_name, duration)
        yield product

    # Layout 2: one <li class="visible"> per plan, with data-* attributes.
    entries = hxs.select('//li[@class="visible"]')
    if not entries:
        return

    name = hxs.select('//h1[@class="main-title section"]/text()').extract()[0]
    for entry in entries:
        mem_size = entry.select('@data-memory').extract()[0]
        colour = entry.select('@data-colour').extract()[0]
        matches_device = (mem_size in response.meta['device_name']
                          and colour in response.url.replace('_', ' '))
        if not matches_device:
            continue

        loader = ProductLoader(selector=entry, item=Product())
        tariff_name = entry.select('@data-planname').extract()[0]
        monthly_cost = entry.select('@data-monthly-cost').extract()[0]
        duration = entry.select(
            'div/div/p[contains(text(), "month contract")]/em/text()').extract()[0]
        plan_link = entry.select(
            'div/div[@class="links"]/a[@class="chevron-link cta"]/@href').extract()[0]
        tariff_code = re.search('ContractD(\w+)', plan_link).group(1)

        loader.add_value('identifier', tariff_code + '-' + str(duration))
        loader.add_value('name', device_name + ' - ' + tariff_name)
        loader.add_value('url', response.url)
        loader.add_value('brand', name.split()[0])
        loader.add_value('price', entry.select('@data-upfront-cost').extract()[0])

        pictures = hxs.select(
            '//a[contains(@class, "product-imag") and @data-colour="'
            + colour + '"]/img/@src').extract()
        if pictures:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), pictures[0]))

        product = loader.load_item()
        product['metadata'] = _meta(monthly_cost, tariff_name, duration)
        yield product
def parse(self, response):
    """Crawl category menus, scrape listed products and follow pagination.

    Yields, via ``self._proxyRequest``, a request for every sub-category
    link of the two menu groups matched below and for the "Suivant" (next)
    page, plus one ``Product`` for every listed product that has both a name
    and a price. Non-HTML responses are logged and ignored.

    The price is the integer part from the ``priceM`` div joined with the
    digits of its ``<sup>`` child. The ``<sup>`` lookup is relative to the
    current product (``.//``). The previous absolute ``//`` path searched
    the whole document, so every product received the decimals of the first
    product on the page.
    """
    if not isinstance(response, HtmlResponse):
        self.log('ERROR: BAD HtmlResponse!!! URL:{}'.format(response.url))
        return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # Sub-categories of the "Jardin ... Outillage" menu group.
    categories = hxs.select(
        '//div[contains(@class,"bg_U15 menugroup") and contains(@alt,"Jardin") and contains(@alt,"Outillage")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    # Sub-categories of the "Entretien ... maison" menu group.
    categories += hxs.select(
        '//div[contains(@class,"bg_U4 menugroup") and contains(@alt,"Entretien") and contains(@alt,"maison")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    for url in categories:
        yield self._proxyRequest(urljoin_rfc(base_url, url))

    # Products.
    products = hxs.select(
        u'//div[@id="productList"]//div[contains(@class,"plProductView")]')
    for product in products:
        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_xpath('url', './/a[contains(@class,"plPrName")]/@href')
        product_loader.add_xpath('name', './/a[contains(@class,"plPrName")]/text()')
        # The category is the page heading, hence the absolute path.
        product_loader.add_xpath('category', '//div[@class="productListTitle"]/h1/text()')
        product_loader.add_xpath(
            'image_url', './/div[contains(@class, "plProductImg")]//img/@data-src')
        product_loader.add_xpath('sku', './@data-sku')
        product_loader.add_xpath(
            'identifier',
            './/input[contains(@name, "ProductPostedForm.ProductId")]/@value')

        price = product.select(
            u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/text()'
        ).extract()
        if price:
            decimals = product.select(
                u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/sup/text()'
            ).re(u'(\d+)')
            if decimals:
                price = price[0] + '.' + decimals[0]
        product_loader.add_value('price', price)

        if (product_loader.get_output_value('name')
                and product_loader.get_output_value('price')):
            yield product_loader.load_item()

    # Pagination.
    next_page = hxs.select(
        u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]/@href'
    ).extract()
    if next_page:
        yield self._proxyRequest(urljoin_rfc(base_url, next_page[0]))
def parse_list(self, response):
    """Parse a category page: expand it, descend into it, or scrape it.

    * When the page-size selector offers a "limit=all" option that is not
      the selected one, the "all" variant of the page is requested.
    * Every sub-category link is requested with this same callback.
    * Only when there are no sub-categories are the products of the grid
      scraped. A product whose numeric id can be read from its
      ``product-price-<id>`` element is yielded directly; otherwise its page
      is requested and handed to ``self.parse_options`` with the partially
      loaded item in ``meta['item']``.

    Shipping is '0.00' when the price reaches ``self.free_shipping_over``
    and '5.00' otherwise. The category field is the brand from
    ``response.meta`` when set, else the breadcrumb categories after the
    first one.
    """
    # Ask for the "show all" version of the page if it is not already shown.
    limiter_selected = response.xpath(
        '//div[@class="limiter"]/select/option[@selected]/@value').extract()
    limiter_all = response.xpath(
        '//div[@class="limiter"]/select/option[contains(@value, "limit=all")]/@value'
    ).extract()
    if limiter_all and limiter_selected and limiter_selected[0] != limiter_all[0]:
        yield Request(response.urljoin(limiter_all[0]),
                      callback=self.parse_list,
                      meta=response.meta)

    sub_category_urls = response.xpath(
        '//div[@class="category-item-center"]'
        '//span[@class="product-name"]/a/@href').extract()
    for url in sub_category_urls:
        yield Request(response.urljoin(url),
                      callback=self.parse_list,
                      meta=response.meta)
    if sub_category_urls:
        return

    # Page-level values are the same for every product: compute them once.
    product_brand = response.meta.get('brand', '')
    product_category = [
        text.strip() for text in response.xpath(
            '//div[contains(@class, "breadcrumbs")]//li[contains(@class, '
            '"category")]/a/text()').extract()
    ][1:]

    products = response.xpath(
        '//ul[contains(@class, "products-grid")]/li[contains(@class, "item")]')
    for product_xs in products:
        product_name = ''.join(
            product_xs.xpath(
                './/*[contains(@class, "product-name")]//text()').extract()).strip()
        product_url = response.urljoin(
            product_xs.xpath(
                './/*[contains(@class, "product-name")]//a/@href').extract()[0])
        # The last number in the price box is used as the price.
        product_price = extract_price_eu(
            product_xs.xpath('.//*[@class="price-box"]//text()').re(r'[\d\.,]+')[-1])
        product_image_url = [
            response.urljoin(src) for src in product_xs.xpath(
                './/*[contains(@class, "product-image")]//img/@src').extract()
        ]
        product_out_of_stock = bool(
            product_xs.xpath(
                './/*[contains(@class, "availability") and contains(@class, "out-of-stock")]'
            ))
        if product_price >= self.free_shipping_over:
            product_shipping_cost = '0.00'
        else:
            product_shipping_cost = '5.00'

        ids = product_xs.xpath(
            './/*[contains(@id, "product-price-")]/@id').re(r'(\d+)')
        product_identifier = ids[0] if ids else None

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('url', product_url)
        loader.add_value('price', product_price)
        loader.add_value('shipping_cost', product_shipping_cost)
        loader.add_value('image_url', product_image_url)
        loader.add_value('brand', product_brand)
        loader.add_value('category', product_brand or product_category)
        if product_out_of_stock:
            loader.add_value('stock', 0)

        if product_identifier is not None:
            loader.add_value('identifier', product_identifier)
            loader.add_value('sku', product_identifier)
            yield loader.load_item()
        else:
            # No id on the listing: finish the item on the product page.
            item = loader.load_item()
            yield Request(item['url'],
                          meta={'item': item},
                          callback=self.parse_options)
def parse_item(self, response):
    """Scrape a product page and request stock data for all its variants.

    The callback builds one ``Product`` per option combination (or a single
    product when the page has no options), collects them in a list and
    yields one ``Request`` to ``self.ajax_stock_url`` carrying that list in
    ``meta['product']``; ``self.parse_stock`` is the callback.

    If the product title is missing, the page is re-requested up to eleven
    times (``meta['retry']`` 0..10) before giving up.

    Options come from a ``stdselect`` dropdown or, failing that, from the
    ``pdinfoblock`` dropdowns, plus any visual (image) options. Several
    option groups are combined as a cartesian product, with descriptions and
    identifiers joined by ' - ' and costs summed.

    When more than one row of ``self.hhe_df`` shares the client product's
    'Wayfair' value, the SKU of the row whose 'Wayfair Cost' is closest to
    the option price is used. The closest-match search tracks "no candidate
    yet" with ``None``; the earlier code used a difference of 0 for that, so
    an exact price match was overwritten by the following row.
    """
    meta = response.meta
    client_product = meta['client_product']
    categories = response.css('.ProductDetailBreadcrumbs-item::text').extract()
    sku = client_product['Item Number']

    image_url = response.xpath(
        '//div[contains(@class, "main-carousel")]//a/@data-original-src').extract()
    if not image_url:
        image_url = response.xpath(
            '//img[contains(@class, "ProductDetailImagesBlock-carousel-image")]/@src'
        ).extract()

    prod_id = response.xpath('//input[@name="sku"]/@value').extract()
    prod_id = prod_id[0] if prod_id else ''

    try:
        name = response.xpath(
            '//h1/span[contains(@class, "ProductDetailInfoBlock-header-title")]/text()'
        ).extract()[0]
    except IndexError:
        retry = meta.get('retry', 0)
        if retry <= 10:
            meta['retry'] = retry + 1
            self.log('ERROR >>> No name found, retry URL: ' + response.url)
            yield Request(response.url,
                          dont_filter=True,
                          callback=self.parse_item,
                          meta=meta)
        else:
            self.log('ERROR >>> Gave up retrying URL: ' + response.url)
        return

    name += response.xpath('//h1/text()').extract()[-1].strip()
    brand = client_product.get('Brand', '')

    products_collected = []
    sku_list = []

    def _collect(product_loader):
        # Load the item, attach empty review metadata and remember it.
        product = product_loader.load_item()
        metadata = HouseholdEssentialsMeta()
        metadata['reviews'] = []
        product['metadata'] = metadata
        products_collected.append(product)
        sku_list.append(product['identifier'])

    # ---- collect option groups -------------------------------------------
    option_elements = []
    dropdown_options = response.xpath(
        '//select[contains(@class, "stdselect")]/option[@value!="XXXXXXXXXX"]')
    if dropdown_options:
        options = []
        for dropdown_option in dropdown_options:
            desc = dropdown_option.xpath('.//text()').extract()[0]
            # Cost is the @cost attribute or a "+$N" amount in the label.
            cost = (dropdown_option.xpath('@cost').extract()
                    or re.findall(r'\+\$([\d.]+)', desc))
            options.append({
                'identifier': dropdown_option.xpath('@value').extract()[0],
                'sku': '',
                'desc': desc,
                'cost': cost[0] if cost else '0',
            })
        option_elements.append(options)
    else:
        dropdown_elements = response.xpath(
            '//div[@class="pdinfoblock"]/div[@class="fl"]//select')
        for dropdown in dropdown_elements:
            options = []
            for dropdown_option in dropdown.xpath('option[@value!="XXXXXXXXXX"]'):
                options.append({
                    'identifier': dropdown_option.xpath('@value').extract()[0],
                    'sku': '',
                    'desc': dropdown_option.xpath(
                        './/text()').extract()[0].split('-')[0],
                    'cost': dropdown_option.xpath('@cost').extract()[0],
                })
            option_elements.append(options)

    image_options = response.css('.option_select_wrap .visual_option_wrap')
    if image_options:
        options = []
        for image_option in image_options:
            options.append({
                'identifier': image_option.xpath('@data-pi-id').extract()[0],
                'sku': '',
                'desc': image_option.xpath('@data-name').extract()[0],
                'cost': image_option.xpath('@data-cost').extract()[0],
            })
        option_elements.append(options)

    # ---- build products ---------------------------------------------------
    if option_elements:
        if len(option_elements) > 1:
            options = []
            for combined_option in itertools.product(*option_elements):
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get(
                        'desc', '') + ' - ' + option['desc']
                    final_option['cost'] = final_option.get(
                        'cost', 0) + float(option['cost'])
                    final_option['identifier'] = final_option.get(
                        'identifier', '') + ' - ' + option['identifier']
                options.append(final_option)
        else:
            options = option_elements[0]

        products_matched = self.hhe_df[
            self.hhe_df['Wayfair'] == client_product['Wayfair']]
        # SKU not unique: the matching client SKU is picked per option below.
        sku_is_ambiguous = (not products_matched.empty
                            and products_matched.count()['Wayfair'] > 1)

        for option in options:
            price = response.xpath(
                '//*[@class="dynamic_sku_price"]/span/text()').extract()[0]
            option_price_value = self.option_price(price, str(option['cost']))

            option_sku = sku
            if sku_is_ambiguous:
                best_diff = None
                for _, row in products_matched.iterrows():
                    wf_price = Decimal(row['Wayfair Cost'].replace('$', '').strip())
                    price_diff = abs(option_price_value - wf_price)
                    if best_diff is None or price_diff < best_diff:
                        option_sku = str(row['Item Number'])
                        best_diff = price_diff

            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', name + ' ' + option['desc'])
            product_loader.add_value('sku', option_sku)
            identifier = response.xpath('//input[@name="sku"]/@value').extract()[0]
            product_loader.add_value('identifier',
                                     identifier + '-' + option['identifier'])
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', categories)
            if image_url:
                product_loader.add_value('image_url', image_url[0])
            product_loader.add_value('url', response.url)
            product_loader.add_value('price', option_price_value)
            _collect(product_loader)
    else:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name)
        product_loader.add_value('sku', sku)
        product_loader.add_xpath('identifier', '//input[@name="sku"]/@value')
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', categories)
        if image_url:
            product_loader.add_value('image_url', image_url[0])
        price = response.xpath(
            '//span[@data-id="dynamic-sku-price"]/text()').extract_first()
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        _collect(product_loader)

    # ---- stock lookup -----------------------------------------------------
    transaction_id = re.findall(r'"transactionID":"(.*)",', response.body)[0]
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': response.url,
        'X-Requested-With': 'XMLHttpRequest'
    }
    params = urlencode({
        'bpss': 'yes',
        'skulist': '~^~'.join(sku_list),
        'kitmode': '0',
        'postalcode': '67346',
        '_txid': transaction_id
    })
    yield Request(self.ajax_stock_url + '?' + params,
                  headers=headers,
                  dont_filter=True,
                  meta={
                      'product': products_collected,
                      'prod_id': prod_id,
                      'prod_url': response.url
                  },
                  callback=self.parse_stock)
# parse_product: scrape every product table found on a page.
#
# Non-HTML responses are ignored. Links under div.product_list are requested
# without an explicit callback. Product tables are those containing a row
# with a two-column cell that holds an <h2>. The category is the page-heading
# <h1>, falling back to the first text of div#frag and then to the last text
# of the breadcrumbs paragraph.
#
# Per product table:
#   * image_url is the first <img> src joined to the base URL, or '' if none;
#   * the general price is the "actlarge" span text; if absent, the figure
#     after "Price inc UK Mainland Carriage ... £" divided by 1.2 and rounded
#     to two decimals;
#   * a name containing "special offer" is cut at that phrase when the
#     remaining text still contains "ref:";
#   * with <option> entries and no general price, one item is produced per
#     option whose text contains a "£<amount>"; the SKU is the "(Ref: ...)"
#     value from the full heading plus either the option's leading code or
#     ".inc<counter>";
#   * otherwise a single item is produced from the general price (the table
#     is skipped when there is none), with the "(Ref: ...)" value as SKU.
# The identifier always equals the SKU, and items whose SKU is listed in
# INVALID_PRODUCTS are not yielded.
#
# NOTE(review): the 1.2 divisor presumably removes 20% VAT - confirm.
# NOTE(review): option prices matching \d{1,2},\d{2} have ',' replaced by
#   '.', while general prices have ',' removed - confirm this asymmetry is
#   intended.
# NOTE(review): the bare ``except:`` clauses hide any error raised by the
#   selectors, not just a missing element.
# NOTE(review): the original indentation is lost in this copy of the file, so
#   the exact nesting of ``name = new_name.replace(...)`` and ``idx += 1``
#   cannot be determined from here.
def parse_product(self, response): if not isinstance(response, HtmlResponse): return hxs = HtmlXPathSelector(response) base_url = get_base_url(response) prod_lists = hxs.select( '//div[@class="product_list"]/div/h3/a/@href').extract() if prod_lists: for url in prod_lists: url = urljoin_rfc(get_base_url(response), url) yield Request(url) products = hxs.select( u'//table[child::tr[child::td[@colspan="2" and child::h2]]]') if products: try: category = hxs.select('//div[@class="page-heading"]/h1/text()' ).extract()[0].strip() except: try: category = hxs.select( '//div[@id="frag"]//text()').extract()[0].strip() except: category = hxs.select( '//p[@class="text_breadcrumbs"]//text()').extract( ).pop() for product in products: try: image_url = urljoin_rfc( base_url, product.select('.//img/@src').extract()[0]) except: image_url = '' multiple_options = product.select(u'.//select/option') general_price = product.select( u'.//span[@class="actlarge"]/text()').extract() general_price = general_price[0] if general_price else None if not general_price: general_price = product.select(u'.//*/text()').re( u'Price inc UK Mainland Carriage.*?\:.*?\xa3([\d\.,]*)') general_price = str(round(float(general_price[0]) / 1.2, 2)) if general_price else None log.msg(u'Product with: Price inc UK Mainland Carriage') if multiple_options and general_price: options_text = u' '.join( product.select(u'.//select/option/text()').extract()) if u'\xa3' in options_text: log.msg( u'Product with both option and general price: [%s]' % response.url) name = product.select(u'.//h2/text()')[0].extract().strip() name_complete = ''.join(product.select(u'.//h2//text()').extract()) if 'special offer' in name.lower(): special_offer_starts_at = name.lower().index('special offer') new_name = name[:special_offer_starts_at].strip() if 'ref:' in new_name.lower(): self.log("Found special offer") self.log("Before: '%s'" % name) self.log("After: '%s'" % new_name) name = new_name.replace(u' (Ref', u' \xa0(Ref') if 
multiple_options and not general_price: idx = 1 for option in multiple_options: option_text = option.select(u'./text()')[0].extract() loader = ProductLoader(item=Product(), selector=product) price = re.search(u'\xa3([\d\.,]+)', option_text) if price: price = price.group(1) else: continue regex = r'[\d]{1,2},[\d]{2}' if re.search(regex, price): price = price.replace(',', '.') loader.add_value('name', name + u' %s' % option_text.strip()) loader.add_value('category', category) loader.add_value('image_url', image_url) loader.add_value('url', response.url) loader.add_value('price', price) m = re.search(r'\(Ref:\s*([^\)]+)\)', name_complete, re.I) if m: optsku = option_text.strip().lower().replace( 'code', '').strip('-. ').split('-')[0] if optsku: loader.add_value('sku', m.group(1) + optsku) else: loader.add_value('sku', m.group(1) + ".inc" + str(idx)) idx += 1 loader.add_value('identifier', loader.get_output_value('sku')) if loader.get_output_value('sku') not in INVALID_PRODUCTS: yield loader.load_item() else: loader = ProductLoader(item=Product(), selector=product) loader.add_value('url', response.url) loader.add_value('name', name) loader.add_value('category', category) loader.add_value('image_url', image_url) if not general_price: continue regex = r'[\d]{1,2},[\d]{2}' if re.search(regex, general_price): general_price = general_price.replace(',', '') loader.add_value('price', general_price) m = re.search(r'\(Ref:\s*([^\)]+)\)', name_complete, re.I) if m: loader.add_value('sku', m.group(1)) loader.add_value('identifier', loader.get_output_value('sku')) # if loader.get_output_value('price'): if loader.get_output_value('sku') not in INVALID_PRODUCTS: yield loader.load_item()
def parse_product(self, response):
    """Yield the product on the page, or one item per configurable child.

    Product data is read from the ``spConfig`` JavaScript object when it is
    present, otherwise from the page markup.  A product without ``spConfig``
    is yielded as a single item; otherwise one item is yielded for every
    entry of ``childProducts``, named with the labels of its attributes.

    ``self.products_ids`` maps identifier -> first name seen, so an
    identifier always keeps the name it was first yielded with.

    :param response: product page response (``meta['category']`` optional).
    :returns: generator of loaded product items.
    """
    hxs = HtmlXPathSelector(response)

    def keep_first_seen_name(product):
        # Reuse the name recorded for this identifier, or record this one.
        ident = product['identifier']
        if ident in self.products_ids:
            product['name'] = self.products_ids[ident]
        else:
            self.products_ids[ident] = product['name']
        return product

    brand = hxs.select(
        '//span[@class="title-designer-info"]/a/text()').extract()
    brand = brand[0] if brand else ''
    category = response.meta.get('category') or ''

    config = re.search(r'var spConfig = new Product.Config\((.*})\);',
                       response.body)
    options = json.loads(config.group(1)) if config else None

    if options:
        product_name = options['productName']
        price = options['basePrice']
        image_url = options['imageUrl']
        identifier = options['productId']
    else:
        product_name = hxs.select(
            '//span[@itemprop="name"]/text()')[0].extract()
        price = hxs.select(
            '//form//p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        if not price:
            price = hxs.select(
                '//form//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
        # Page prices use "." for thousands and "," for decimals.
        price = price[0].replace('.', '').replace(',', '.')
        image_url = hxs.select('//img[@id="image-main"]/@src')[0].extract()
        identifier = hxs.select(
            '//input[@name="product"]/@value')[0].extract()

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_value('name', product_name)
    product_loader.add_value('brand', brand)
    product_loader.add_value('image_url', image_url)
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('category', category)
    product_loader.add_value('sku', identifier)
    price = re.search(r'([\d\.]+)', price).group(1)
    product_loader.add_value('price', price)
    product_loader.add_value(
        'shipping_cost',
        self.get_shipping_cost(
            float(product_loader.get_output_value('price'))))

    if not options:
        yield keep_first_seen_name(product_loader.load_item())
        return

    # Collect, for every child product id, the labels of all its attributes.
    labels_by_child = {}
    for attr in options['attributes'].values():
        for opt in attr['options']:
            for child_id in opt['products']:
                labels_by_child.setdefault(child_id, []).append(opt['label'])
    option_names = dict(
        (child_id, ' '.join(labels))
        for child_id, labels in labels_by_child.items())

    # ``.items()`` works on both Python 2 and 3; a missing ``childProducts``
    # key now yields nothing instead of raising AttributeError on None.
    for child_id, child in (options.get('childProducts') or {}).items():
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_value(
            'name', '%s %s' % (product_name, option_names[child_id]))
        product_loader.add_value('image_url', child['imageUrl'])
        product_loader.add_value('identifier', child_id)
        product_loader.add_value('sku', identifier)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product_loader.add_value('price', child['finalPrice'])
        product_loader.add_value(
            'shipping_cost',
            self.get_shipping_cost(
                float(product_loader.get_output_value('price'))))
        yield keep_first_seen_name(product_loader.load_item())
def parse_products(self, response):
    """Yield the products of one JSON API page, then request the next page.

    The response body is JSON with the product list under
    ``response.products``.  Products without a price are skipped.  Promo
    metadata is derived from the product tags and from the previous crawl's
    values in ``self.meta_df``.  When the page holds any product, the page
    at ``offset + 12`` is requested with this same callback.

    :param response: API response; ``meta`` carries ``u_id``, ``u_cat`` and
        ``offset``.
    :returns: generator of product items followed by one ``scrapy.Request``.
    """
    payload = json.loads(response.body)
    entries = payload['response']['products']
    if not entries:
        return

    u_id = response.meta['u_id']
    u_cat = response.meta['u_cat']
    offset = response.meta['offset']

    for entry in entries:
        loader = ProductLoader(item=Product(), response=response)
        if not entry['price']:
            continue

        loader.add_value('identifier', entry['id'])
        loader.add_value('name', entry['title'])
        loader.add_value('sku', entry['id'])
        # Prices use "." for thousands and "," for decimals.
        raw_price = entry['price']['value']
        for old, new in ((' ', ''), ('.', ''), (',', '.')):
            raw_price = raw_price.replace(old, new)
        loader.add_value('price', raw_price)
        loader.add_value(
            'image_url',
            response.urljoin(entry['featured_image']['source']))
        loader.add_value('url', entry['url'])
        loader.add_value('brand', entry['brand']['name'])
        if entry['variants'][0]['inventory_quantity'] == '0':
            loader.add_value('stock', 0)
        loader.add_value('category', entry['category'])

        metadata = SonaeMeta()
        titles = [tag['title'] for tag in entry['tags']]
        promo = any([u'promo' in title.lower() for title in titles])
        exclusive_online = any(
            [u"PromoçãoOnline" in title.title().replace(' ', '')
             for title in titles])

        # Promo dates recorded for this product on the previous crawl.
        prev_meta = {}
        if self.meta_df is not None and not self.meta_df.empty:
            if entry['id'] in self.meta_df.index:
                prev_meta = self.meta_df.loc[entry['id']]
        promo_start = prev_meta.get('promo_start')
        promo_end = prev_meta.get('promo_end')

        today = datetime.datetime.now().strftime('%Y-%m-%d')
        metadata['extraction_timestamp'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M')

        if promo:
            # Keep the start of a still-running promo; otherwise start today.
            if promo_start and not promo_end:
                metadata['promo_start'] = promo_start
            else:
                metadata['promo_start'] = today
            metadata['promo_end'] = ''
        elif promo_start:
            metadata['promo_start'] = promo_start
            metadata['promo_end'] = promo_end if promo_end else today

        if exclusive_online:
            metadata['exclusive_online'] = 'Yes'

        item = loader.load_item()
        item['metadata'] = metadata
        yield item

    next_offset = offset + 12
    next_url = ('http://www.phonehouse.pt/api.php/getProducts/' +
                '/'.join((u_id, u_cat, str(next_offset))))
    yield scrapy.Request(
        next_url,
        callback=self.parse_products,
        meta={'u_id': u_id, 'u_cat': u_cat, 'offset': next_offset})
def parse_product(self, response):
    """Build one product item with promo metadata from a product page.

    Name, price, stock, breadcrumb categories, brand and identifier are read
    from the page.  Promo start/end dates are carried over from
    ``self.meta_df`` (the previous crawl) and updated according to whether
    an old-price element is present on the page.

    :param response: product page response.
    :returns: generator yielding a single item with ``metadata`` attached.
    """
    loader = ProductLoader(item=Product(), response=response)
    metadata = SonaeMeta()

    loader.add_xpath(
        'image_url',
        '//img[contains(@class, "product-detail-img-main")]/@src')
    loader.add_value('url', response.url)

    title = response.xpath('//h1/text()').extract()[0]
    loader.add_value('name', title.strip())

    # Remove every whitespace character from the displayed price.
    raw_price = response.xpath(
        '//span[@class="item-price"]/text()').extract()[0]
    compact_price = ''.join(raw_price.strip().split())
    loader.add_value('price', extract_price(compact_price))

    if response.xpath(
            u'//div[@class="product-btns-panel"]/button[contains(text(), "Indisponível")]'):
        loader.add_value('stock', 0)

    # The first breadcrumb entry is skipped.
    breadcrumb = response.xpath(
        '//ol[@class="breadcrumb"]/li/a/text()').extract()
    for crumb in breadcrumb[1:]:
        loader.add_value('category', crumb)

    brand = response.xpath('//div[h1]/h3/text()').extract()
    if brand:
        loader.add_value('brand', brand[0])

    # NOTE: no shipping cost is set by this callback.

    product_id = response.xpath('//input[@name="Id"]/@value').extract()[0]
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)

    # Promo dates recorded for this product on the previous crawl.
    prev_meta = {}
    if self.meta_df is not None and not self.meta_df.empty:
        if product_id in self.meta_df.index:
            prev_meta = self.meta_df.loc[product_id]

    promo = response.xpath(
        '//span[@class="item-old-price"]/span[@class="item-old-price"]/text()')
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    today = datetime.datetime.now().strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M')

    if promo:
        # Keep the start of a still-running promo; otherwise start today.
        if promo_start and not promo_end:
            metadata['promo_start'] = promo_start
        else:
            metadata['promo_start'] = today
        metadata['promo_end'] = ''
    elif promo_start:
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = promo_end if promo_end else today

    item = loader.load_item()
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page; yield the product or a stock request for options.

    The identifier is the last "-"-separated token of the URL's file name.
    Pages whose identifier is already in ``self.seen_ids`` are skipped.  If
    the page has options, one copy of the product is prepared per option and
    handed to ``self.parse_stock`` through a single stock-lookup request;
    otherwise the product itself is yielded.

    :param response: product page response.
    :returns: generator yielding one item or one ``Request``.
    """
    log.msg(response.url)
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    identifier = response.url.split('/')[-1].split('-')[-1].split('.')[0]
    log.msg('Identifier: %s' % identifier)
    log.msg(repr(self.seen_ids))
    if identifier in self.seen_ids:
        return
    self.seen_ids.append(identifier)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('identifier', identifier)
    loader.add_value(
        'sku', hxs.select('//p[@class="pmeta"]/text()').re(r'(\d+)'))

    # The first text node is the name; a second one, when present, is extra
    # text appended to the final item name.
    name = hxs.select('//div[@class="prod-box"]/h1//text()').extract()
    extra_data = name[1].strip() if len(name) > 1 else ''
    loader.add_value('name', name[0])

    price_text = hxs.select(
        '//h5[@class="product-price"]//div[contains(@id,"StaticPrice")]/span/text()[normalize-space()]'
    )[0].extract()
    loader.add_value('price', re.sub('[\r\n\t]+', ' ', price_text))

    image_url = hxs.select('//img[@class="product-image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

    loader.add_value('brand', 'Le Creuset')

    # The first two breadcrumb links are skipped.
    category = hxs.select('//ul[@class="breadcrumbs"]')[0].select(
        './/a/text()').extract()
    loader.add_value('category', ' > '.join(category[2:]))

    # Shipping tiers: under 20 -> 2.00, 20 up to 40 -> 4.99, else none set.
    price = Decimal(loader.get_output_value('price'))
    if price < 20.00:
        loader.add_value('shipping_cost', '2.00')
    elif 20.00 <= price < 40.00:
        loader.add_value('shipping_cost', '4.99')

    product = loader.load_item()

    options = hxs.select(
        './/select/option[contains(@class,"%s")]' % identifier)
    if not options:
        product['name'] += ' %s' % extra_data
        yield product
        return

    sid = hxs.select(
        '//input[@type="hidden" and @name="SID"]/@value')[0].extract()
    # Fix: the last parameter used to read "×tamp=" (a mangled
    # "&timestamp="), which glued it onto the SID value.
    stock_url = ('http://www.hartsofstur.com/cgi-bin/st000001.pl'
                 '?ACTION=GETSTOCK&REF=%(identifier)s&SID=%(sid)s'
                 '&timestamp=%(timestamp)s')

    items = []
    for option in options:
        item = copy.deepcopy(product)
        option_name = option.select('./text()')[0].extract().strip()
        option_identifier = option.select('./@class').re(r'_(\d+)_')[0]
        self.seen_ids.append(option_identifier)
        item['identifier'] = "%s_%s" % (identifier,
                                        option_identifier.strip())
        item['name'] += ' %s %s' % (option_name, extra_data)
        item['name'] = item['name'].strip()
        items.append(item)

    yield Request(
        stock_url % {
            'identifier': identifier,
            'sid': sid,
            'timestamp': int(time.time()),
        },
        meta={'items': items},
        callback=self.parse_stock)
def parse_product(self, response):
    """Yield one item per displayable SKU found in the page's ``productJson``.

    The ``var productJson = ...;`` script is decoded with ``demjson``.  On a
    decode error the page is retried through ``self.retry``; when the script
    is missing the page is skipped.  Every SKU uses its own price and colour
    image when it has them, otherwise the product-level price and image.

    :param response: product page response.
    :returns: generator of product items (or a single retry request).
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    try:
        data = re.findall(
            "var productJson = (.*);",
            hxs.select(
                '//script[contains(text(), "var productJson = ")]/text()'
            ).extract().pop(),
            flags=re.DOTALL)
        data_json = demjson.decode(
            data[0].replace("\n", "").replace("[,{", "[{"))
    except (TypeError, ValueError, XMLSyntaxError, demjson.JSONDecodeError):
        request = self.retry(response, "Unknown error on " + response.url)
        if request:
            yield request
        return
    except IndexError:
        # No productJson script on the page.
        return

    name = data_json['productTitle']
    identifier = data_json['productId']
    sku = identifier

    # Product-level fallbacks.  ``base_price`` is initialised so that a SKU
    # without a price can no longer hit an unbound local.
    base_price = None
    if 'price' in data_json['priceData']:
        base_price = extract_price(data_json['priceData']['price'])
    main_img = data_json['mainImageURL']

    category = " > ".join(
        hxs.select(
            '//div[@id="breadcrumbs"]/div/div/a[text()!="Home"]/text()'
        ).extract())

    colors = dict(
        (color['id'], color['mainImageURL'])
        for color in data_json['availableColors']
        if 'mainImageURL' in color)

    warranty = hxs.select(
        '//select[@id="warranty_0"]/option/text()').extract()
    warranty_price = ''
    if warranty:
        warranty_match = re.search(r'Replacement - .([\d\.]+)', warranty[-1])
        warranty_price = warranty_match.group(1) if warranty_match else ''

    biw_metadata = BIWMeta()
    biw_metadata['warranty'] = warranty_price

    for item in data_json['skus']:
        # Fall back to the product-level price/image instead of whatever the
        # previously processed SKU happened to use.
        price = extract_price(item['price']) if item['price'] else base_price
        if not price:
            continue
        if "NO_DISPLAY" in item['avail']:
            continue
        img = colors.get(item['colorId'], main_img)

        # Placeholder size/colour values are left out of the item name.
        size = item.get('size', '')
        if size.lower() == 'one size':
            size = ''
        color = item.get('color', '')
        if color.lower() == 'one color':
            color = ''

        product = Product()
        product['category'] = category
        product['sku'] = sku
        product['url'] = response.url
        product['stock'] = 1 if "IN_STOCK" in item['avail'] else 0
        product['metadata'] = biw_metadata
        if img:
            product['image_url'] = urljoin_rfc(base_url, img)

        loader = ProductLoader(item=product, response=response)
        loader.add_value('identifier',
                         "%s-%s" % (identifier, item['sku_id']))
        loader.add_value('name', "%s %s %s" % (name, color, size))
        loader.add_value('price', price)
        yield loader.load_item()
def parse_product(self, response):
    """Parse a product page and yield the item or a follow-up request.

    Depending on spider settings the result is:

    * a request to the "all sellers" page (``parse_sellers``) when other
      sellers are wanted and a sellers link exists;
    * a request for reviews (``parse_reviews``) when ``collect_reviews`` is
      set;
    * otherwise the buy-box item itself.

    When ``self.pixmania_direct`` is set, products sold by any dealer other
    than 'Pixmania.com' are skipped.

    :param response: product page response.
    :returns: generator yielding one item or one ``Request``.
    """
    hxs = HtmlXPathSelector(response)

    product_name = hxs.select(
        '//span[@itemprop="name"]/text()').extract()[0].strip()
    # A missing brand is expected; check for it explicitly rather than
    # hiding every possible error behind a bare ``except:``.
    brand = hxs.select('//span[@itemprop="brand"]/text()').extract()
    product_brand = brand[0].strip() if brand else ''
    image_url = hxs.select('//img[@itemprop="image"]/@src').extract()[0]
    category = hxs.select(
        '//div[@class="breadcrumb"]//a/span[@itemprop="title"]/text()'
    ).extract()[-1]
    categories = hxs.select(
        '//ul[@class="simple"]/li/a/span[@itemprop="title"]/text()'
    ).extract()[1:]
    identifier = response.url.split('/')[-1].split('-')[0]
    price = hxs.select(
        '//div[@itemprop="offers"]//ins[@itemprop="price"]/text()'
    ).extract()[0]
    stock = hxs.select(
        '//div[contains(@class, "productDetail")]//div[contains(@class, "availability")]//strong[contains(@class, "available")]/i[@class="icon-ok"]'
    ).extract()
    sellers_url = hxs.select(
        '//a[contains(@href, "all_seller")]/@href').extract()

    # The dealer name is taken from the first of these that matches.
    dealer = (
        hxs.select(
            '//p[contains(@class, "sellby")]/a/strong/text()').extract()
        or hxs.select(
            '//p[contains(@class, "sellby")]/strong/text()').extract()
        or hxs.select(
            '//section[@class="col3"]//p/strong/text()').extract())
    dealer = dealer[0].strip() if dealer else 'Pixmania.com'

    if self.pixmania_direct and dealer != 'Pixmania.com':
        return

    l = ProductLoader(item=Product(), response=response)
    l.add_value('identifier', identifier)
    if self.use_main_id_as_sku:
        l.add_value('sku', identifier)
    if self.append_brand_to_name:
        l.add_value('name', product_brand + ' ' + product_name)
    else:
        l.add_value('name', product_name)
    if self.full_category_path:
        l.add_value('category', categories)
    else:
        l.add_value('category', category)
    l.add_value('brand', product_brand)
    l.add_value('url', response.url)
    l.add_value('image_url', image_url)
    if not stock:
        l.add_value('stock', 0)

    if not self.only_buybox and not self.pixmania_direct and sellers_url:
        yield Request(sellers_url[0].strip(),
                      callback=self.parse_sellers,
                      meta={'product': l.load_item()})
        return

    l.add_value('price', self._encode_price(price))
    l.add_value('dealer', 'Pix - ' + dealer)
    item = l.load_item()
    # The dealer name is part of the identifier of a buy-box item.
    item['identifier'] += '-' + dealer

    if self.collect_reviews:
        reviews_url = add_or_replace_parameter(
            self.reviews_url, 'filter.q0', 'productid:eq:%s' % identifier)
        reviews_url = add_or_replace_parameter(reviews_url, 'offset.q0', '0')
        yield Request(reviews_url,
                      meta={'products': [item]},
                      callback=self.parse_reviews)
    else:
        yield item
def parse_item(self, response):
    '''Parse page of particular product.

    A product with one or two option drop-downs produces one POST request to
    ``remote.php`` per option value (or per pair of values); the answers are
    handled by ``self.handle_json_response``.  A product with any other
    number of drop-downs is yielded directly with the price shown on the
    page.

    :param response: product page response; ``meta['brand']`` is required.
    :returns: generator of ``FormRequest`` objects or a single product item.
    '''
    hxs = HtmlXPathSelector(response)
    product_category = hxs.select(
        "//div[@id='ProductBreadcrumb']/ul/li/a/text()").extract()[1]
    product_name = hxs.select(
        "//div[@id='ProductDetails']//h1/text()").extract()[0]
    price_text = hxs.select(
        "//span[@class='ProductDetailsPriceIncTax']/text()").extract()
    product_id = hxs.select(
        "//input[@name='product_id']/@value").extract()[0]
    product_brand = response.meta['brand']
    product_image = hxs.select(
        "//div[@class='ProductThumbImage']/a/img/@src").extract()[0]
    product_details = hxs.select(
        "//form[@id='productDetailsAddToCartForm']").extract()

    # A product without a displayed price is reported as out of stock.
    if price_text:
        amount = re.findall(r'[0-9,]+\.[0-9]{2}', price_text[0])[0]
        product_price = float(amount.replace(',', ''))
        stock_status = 1
    else:
        product_price = 0.00
        stock_status = 0

    option_fields = []     # form field name of each option drop-down
    option_values = []     # per drop-down: selectable values plus ''
    options_to_check = []  # [{title: value}, ...] over all drop-downs
    if product_details:
        value_re = re.compile(r'value="(.+?)"')
        title_re = re.compile(r'>(.+?)<\/option>')
        for option_box in hxs.select(
                "//form[@id='productDetailsAddToCartForm']//div[@class='productOptionViewSelect']"):
            option_fields.append(option_box.select(
                "select[@class='validation']/@name").extract()[0])
            option_html = option_box.select(
                "*/option[not(@value='') and not(contains(text(), 'None'))]"
            ).extract()
            values = [''.join(value_re.findall(html)) for html in option_html]
            titles = [''.join(title_re.findall(html)) for html in option_html]
            # '' is added so "nothing selected" is requested as well.
            option_values.append(values + [''])
            options_to_check.extend(
                {title: value} for title, value in zip(titles, values))

    # On the website there are only products with 0, 1 or 2 options.
    num_options = len(option_fields)
    if num_options == 2:
        combinations = [(first, second)
                        for first in option_values[0]
                        for second in option_values[1]]
    elif num_options == 1:
        combinations = [(value,) for value in option_values[0]]
    else:
        l = ProductLoader(item=Product(), response=response)
        l.add_value('price', product_price)
        l.add_value('stock', stock_status)
        l.add_value('identifier', product_id)
        l.add_value('category', product_category)
        l.add_value('url', response.url)
        l.add_value('name', product_name)
        l.add_value('image_url', product_image)
        l.add_value('brand', product_brand)
        yield l.load_item()
        return

    for combination in combinations:
        post_data = {
            'actions': 'add',
            'product_id': product_id,
            'w': 'getProductAttributeDetails',
        }
        meta = {
            'num_options': num_options,
            'options': options_to_check,
            'product_name': product_name,
            'product_id': product_id,
            'stock_status': stock_status,
            'product_url': response.url,
            'product_image': product_image,
            'category': product_category,
            'product_brand': product_brand,
        }
        for position, (field, value) in enumerate(
                zip(option_fields, combination)):
            post_data[field] = value
            # Produces the 'value_01' / 'value_02' keys read by the callback.
            meta['value_%02d' % (position + 1)] = value
        yield FormRequest('http://www.sxpro.co.uk/remote.php',
                          formdata=post_data,
                          method='POST',
                          callback=self.handle_json_response,
                          meta=meta,
                          dont_filter=True)
def parse_item(self, response):
    """Yield one item per configurable option, or the simple product itself.

    Option data is read from the ``"productConfig"`` JSON embedded in the
    page.  Each option is priced at ``retail_price - saving_price`` and is
    out of stock when it carries a ``stockAlertUrl`` entry.  When no option
    data can be parsed, a single item is built from the regular price.

    :param response: product page response; ``meta['name']`` is required and
        ``meta['categories_tmp']`` is used when the page has no breadcrumbs.
    :returns: generator of loaded product items.
    """
    hxs = HtmlXPathSelector(response)
    name = response.meta['name']
    url = response.url
    sku = hxs.select("//p[@itemprop='identifier']/@content").extract()[
        0].replace('sku:', '').strip()
    brand = ''.join(
        hxs.select("//span[@itemprop='brand']/text()").extract())
    image_url = ''.join(
        hxs.select(
            "//div[@class='product-img-box']//img[@id='image']/@src"
        ).extract())

    # The first breadcrumb link is skipped.
    categories = [
        crumb.select("./span/text()").extract()[0]
        for crumb in hxs.select("//div[@class='breadcrumbs']//a")[1:]
    ]
    if not categories:
        categories = response.meta['categories_tmp']

    # NOTE(review): this XPath has no leading "//", so it looks like it can
    # never match from the document root and the body-text fallback is what
    # actually decides the stock -- confirm before changing it to "//span".
    availability = hxs.select(
        "span[@itemprop='availability']/text()").extract()
    if availability:
        stock = 1 if availability[0].strip() == 'In stock' else 0
    else:
        stock = 0 if 'out of stock' in response.body.lower() else 1

    options = None
    options_names = {}
    try:
        options = json.loads(
            re.findall(r'"productConfig":(.+?),"productAttributes',
                       response.body)[0])
        attributes = json.loads(
            '{' + re.findall(r'{"attributes":.*("options.+?}]})',
                             response.body)[0])
        # Map every product id to the label of the option it belongs to.
        options_names = dict(
            (product_id, entry['label'])
            for entry in attributes['options']
            for product_id in entry['products'])
    except (IndexError, KeyError, TypeError, ValueError):
        logging.error('No options found')
        options = None

    if not options:
        price = hxs.select(
            "//span[@class='regular-price']/span[@class='price']/text()"
        ).extract()[0].strip()[1:].replace('[', '').replace(']', '')
        l = ProductLoader(item=Product(), response=response)
        l.add_value('image_url', image_url)
        l.add_value('url', url)
        l.add_value('price', price)
        l.add_value('stock', stock)
        l.add_value('brand', brand)
        l.add_value('identifier', sku)
        l.add_value('sku', sku)
        l.add_value('name', name)
        for category in categories:
            l.add_value('category', category)
        yield l.load_item()
        return

    amount_re = re.compile('>(.+?)<')
    for option_id in options:
        option = options[option_id]
        # Dropping non-ASCII characters removes the currency symbol.
        saving_price = amount_re.findall(
            option['saving_price'])[0].encode('ascii', 'ignore')
        retail_price = amount_re.findall(
            option['retail_price'])[0].encode('ascii', 'ignore')
        option_price = round(float(retail_price) - float(saving_price), 2)
        option_sku = sku + '-' + str(option_id)

        # A missing label used to raise TypeError on concatenation.
        option_name = options_names.get(option_id)
        full_name = name + ' ' + option_name if option_name else name

        # A stock-alert URL is only present for unavailable options.
        option_stock = 0 if 'stockAlertUrl' in option else 1

        l = ProductLoader(item=Product(), response=response)
        l.add_value('image_url', image_url)
        l.add_value('url', url)
        l.add_value('price', option_price)
        l.add_value('stock', option_stock)
        l.add_value('brand', brand)
        l.add_value('identifier', option_sku)
        l.add_value('sku', option_sku)
        l.add_value('name', full_name)
        for category in categories:
            l.add_value('category', category)
        yield l.load_item()
def parse_product_list(self, response):
    """Parse a listing page or a product page and yield requests/products.

    If the page has ``h2.product-name`` links, every link is requested with
    this same callback.  Otherwise option groups are collected from the
    option ``<select>`` elements and from the ``spConfig`` JavaScript
    object, and one product is yielded per tuple produced by
    ``multiply(opt_groups)``.  Finally the "next page" link, if any, is
    followed.

    NOTE(review): the nesting of this function was reconstructed from a
    source whose indentation was lost -- confirm against the original file.
    """
    hxs = HtmlXPathSelector(response)
    #cats = hxs.select(u'//div[@id="RightColumn"]/table/tr/td/center/div[@class="contentsName"]/a/@href').extract()
    products = hxs.select('//h2[@class="product-name"]/a/@href').extract()
    if products:
        # Listing page: follow each product link with this same callback.
        for url in products:
            #if url.split('.')[-1].lower() not in ('htm', 'html'): # Contains links to PDFs as well
            #    continue
            #url = urljoin_rfc(get_base_url(response), url)
            yield Request(url, callback=self.parse_product_list)
    else:
        # One list of fix_options() results per option group.
        opt_groups = []
        # def fix_options(what, o):
        #     try:
        #         return (what + ':' + o[0], o[1].replace(',', ''))
        #     except:
        #         return (what + ':' + o[0], '0')
        option_names = hxs.select(
            '//fieldset[@class="product-options"]/dl/dt/label/text()'
        ).extract()
        for i, option in enumerate(
                hxs.select(
                    '//select[contains(@class, "product-custom-option") or contains(@class, "required-entry")]'
                )):
            # Label of the i-th option group, without the colon.
            what = option_names[i].strip().replace(':', '')
            # Option texts; the first remaining one is dropped by [1:].
            opt_list = option.select(
                u'./option[@value!="PleaseSelect" and @value!="Please Select" and text()!=""]/text()'
            ).extract()[1:]
            option_ids = option.select(
                u'./option[@value!="PleaseSelect" and @value!="Please Select" and @value!=""]/@value'
            ).extract()
            # Each text is split on '+' into [label, price] ('0' when there
            # is no '+'), then the option id is appended.
            # NOTE(review): a two-list map() behaves differently on Python 2
            # (pads with None) and Python 3 (lazy, truncates) -- verify
            # which interpreter this runs on.
            opt_list = map(lambda x, y: x + [y], [
                o.split('+') if len(o.split('+')) > 1 else o.split('+') + ['0']
                for o in opt_list
            ], option_ids)
            if opt_list:
                opt_groups.append(
                    [self.fix_options(what, o) for o in opt_list])
        # Extract option from JavaScript code
        try:
            js_options = ''
            for line in response.body.split('\n'):
                if "spConfig = new Product.Config(" in line:
                    js_options = line.split(
                        'spConfig = new Product.Config(')[1].split(');')[0]
            # js_options is still '' when no spConfig line exists; the
            # resulting json.loads error is absorbed by the bare except.
            json_options = json.loads(js_options)
            for item in json_options['attributes'].iteritems():
                options = item[-1]['options']
                option_ids = []
                opt_list = []
                for option in options:
                    option_ids.append(option['id'])
                    opt_list.append(option['label'] + '+' + option['price'])
                # NOTE(review): ``i`` is the index left over from the
                # <select> loop above, so every JSON attribute gets the same
                # label; if that loop never ran, ``i`` is unbound and the
                # NameError is swallowed by the bare except below.
                what = option_names[i].strip().replace(':', '')
                opt_list = map(lambda x, y: x + [y], [
                    o.split('+') if len(o.split('+')) > 1 else o.split('+') + ['0']
                    for o in opt_list
                ], option_ids)
                opt_groups.append(
                    [self.fix_options(what, o) for o in opt_list])
        except:
            # NOTE(review): bare except -- hides every error in the block
            # above, not only a missing spConfig object.
            log.msg('No JSON options: ' + response.url)
        if len(opt_groups) > 4:
            self.log("WARNING: Too many options, using base price only")
            opt_groups = []
        # multiply() is defined elsewhere; here it is unpacked as
        # (name suffix, extra price, option id) tuples.
        for opt_name, opt_price, opt_id in multiply(opt_groups):
            product_loader = ProductLoader(item=Product(), selector=hxs)
            # The string below is disabled code kept as a no-op expression.
            ''' if not hxs.select(u'//div[@class="buybox"]'): self.log("WARNING: NOT A PRODUCT") return '''
            product_loader.add_value('url', response.url)
            product_loader.add_xpath('name', u'//h1/text()')
            # Price source, in order: "Sale Price" row, span.price in a
            # table cell, then the itemRegPrice block.
            if hxs.select(
                    '//tr[td/text()="Sale Price"]/td[text()!="Sale Price"]/text()'
            ):  #FIXME: fix the other prices
                product_loader.add_xpath(
                    'price',
                    u'//tr[td/text()="Sale Price"]/td[text()!="Sale Price"]/text()'
                )
            elif hxs.select('//td/span[@class="price"]/text()'):
                product_loader.add_xpath(
                    'price', u'//td/span[@class="price"]/text()')
            else:
                product_loader.add_xpath(
                    'price', u'//div[@class="itemRegPrice"]/span/font/text()')
            sku = hxs.select('//tr[th/text()="MPN"]/td/text()').extract()
            sku = sku[0] if sku else ''
            product_loader.add_value('sku', sku)
            product_loader.add_xpath(
                'category',
                u'//div[@class="breadcrumbs"]/ul/li[contains(@class, "category")]/a/text()'
            )
            product_loader.add_xpath(
                'image_url',
                u'//div[@class="product-img-box"]//div[@class="prolabel-wrapper"]/a/img/@src'
            )
            # product_loader.add_xpath('brand', u'substring-after(//div[@class="product-meta"]/span[contains(text(),"Manufacturer:")]/text(),":")')
            product_loader.add_value('shipping_cost', '')
            identifier = hxs.select(
                '//input[@name="product"]/@value').extract()[0]
            # The option id, when present, is appended to the identifier.
            if opt_id:
                product_loader.add_value('identifier',
                                         identifier + '-' + opt_id)
            else:
                product_loader.add_value('identifier', identifier)
            product = product_loader.load_item()
            product['name'] = (product['name'] + ' ' + opt_name).strip()
            if not 'price' in product:
                product['price'] = Decimal(0)
                self.log('ERROR price is not set, setting to default 0')
            else:
                # The option surcharge is added to the base price.
                product['price'] = product['price'] + Decimal(opt_price)
            yield product
    # ``next`` shadows the builtin of the same name inside this function.
    next = hxs.select('//a[@class="next i-next"]/@href').extract()
    if next:
        yield Request(next[0], callback=self.parse_product_list)
def parse(self, response):
    """Crawl category and pagination links and yield the listed products.

    Category and pagination links are requested with this same callback.
    Each ``article`` product on the page becomes one item, tagged with the
    text of the page's ``span.current`` element as its category.  Entries
    without a name, URL or price are skipped (the latter two are logged).

    :param response: category/listing page response.
    :returns: generator of ``Request`` objects and product items.
    """
    hxs = HtmlXPathSelector(response)
    base = get_base_url(response)

    category_links = hxs.select(
        "//nav[contains(@class, 'section_nav')]/ul/li//a/@href").extract()
    for link in category_links:
        yield Request(urljoin_rfc(base, link), callback=self.parse)

    page_links = hxs.select("//ul[@class='pagination']//a/@href").extract()
    for link in page_links:
        yield Request(urljoin_rfc(base, link), callback=self.parse)

    # Evaluated once; every product on the page shares this category.
    category = hxs.select('//span[@class="current"]/text()').extract()[0]

    for article in hxs.select("//article[contains(@class, 'product')]"):
        title = article.select(
            ".//div/header[@class='productTitle']/a/text()").extract()
        if not title:
            continue
        name = re.sub(r"[\s]+", " ", title[0].strip())

        # Prefer the hidden form id, then the description paragraph id.
        form_ids = article.select(
            'div/div/input[@name="sFUPID"]/@value').extract()
        if form_ids:
            identifier = form_ids[0]
        else:
            desc_ids = article.select(
                'div[@class="productAdditional"]/p/@id').extract()
            identifier = desc_ids[0].split('desc_')[-1] if desc_ids else ''

        links = article.select(
            ".//div/div/header[@class='productTitle']/a/@href").extract()
        if not links:
            logging.error("ERROR! NO URL! URL: %s. NAME: %s" %
                          (response.url, name))
            continue

        prices = article.select(
            ".//div//span[@class='currentPrice']/ins/text()").extract()
        if not prices:
            logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" %
                          (response.url, name))
            continue

        loader = ProductLoader(item=Product(), selector=article)
        loader.add_value('name', name)
        loader.add_value('url', links[0])
        loader.add_value('identifier', identifier)
        loader.add_value('price', prices[0].strip())
        loader.add_value('category', category)
        loader.add_xpath('image_url', 'a/img/@src')
        yield loader.load_item()
def parse_cat(self, response):
    """Crawl a category page: sub-categories, products and pagination.

    A page with neither sub-categories nor products is re-requested up to
    ten times (counted in ``meta['retry']``).  Products that link to an
    options page are followed with this same callback; all others are
    yielded directly, unless their brand is in ``self.ignore_brands`` or
    they have no identifier.

    :param response: category page response (``meta['category']`` optional).
    :returns: generator of ``Request`` objects and product items.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    subcats = hxs.select(
        '//div[contains(@class,"category-fourgrid") or contains(@class,"sub-category-grid")]//a/@href'
    ).extract()
    productsxs = hxs.select(
        '//div[contains(@class,"product-list-row") and div[contains(@class, "product-info")]]'
    )

    if not subcats and not productsxs:
        # An empty page is retried a limited number of times.
        retry = int(response.meta.get('retry', 0))
        if retry < 10:
            new_req = response.request.copy()
            new_req.meta['retry'] = retry + 1
            new_req.dont_filter = True
            yield new_req
        return

    for url in subcats:
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_cat,
                      meta=response.meta)

    for productxs in productsxs:
        product_options_link = productxs.select(
            './/div[@class="form-row"]/a/@href').extract()
        if product_options_link:
            yield Request(urljoin_rfc(base_url, product_options_link[0]),
                          callback=self.parse_cat,
                          meta=response.meta)
            continue

        loader = ProductLoader(item=Product(), selector=productxs)
        loader.add_value(
            'price',
            ''.join(
                productxs.select('.//div[@class="price"]//text()').extract()))
        if productxs.select(
                './/img[@alt="In stock" or contains(@alt,"days delivery") or contains(@alt,"Day Delivery") or contains(@alt,"Hour Delivery")]'
        ):
            loader.add_value('stock', 1)
        else:
            loader.add_value('stock', 0)
        loader.add_xpath('identifier', './/p[@class="code"]/text()')
        product_url = productxs.select(
            './/h3[@class="product-name"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, product_url))
        loader.add_xpath('name', './/h3[@class="product-name"]/a/text()')
        loader.add_value('category', response.meta.get('category'))
        loader.add_value(
            'sku',
            self.map_sku(''.join(
                productxs.select('.//p[@class="code"]/text()').extract())))
        img = productxs.select(
            './/div[@class="primaryImageDiv"]//img/@src').extract()
        if img:
            loader.add_value(
                'image_url',
                urljoin_rfc(base_url,
                            img[0].replace('/medium/', '/large/')))
        loader.add_xpath('brand', './/img[@class="brand-image"]/@alt')

        # A product without a brand image has no brand value, so guard the
        # ``.strip()`` call against None.
        brand = (loader.get_output_value('brand') or '').strip().upper()
        if brand in self.ignore_brands:
            log.msg('Ignoring %s product: %s' % (brand, response.url))
            # Fix: skip only this product.  The former ``return`` also
            # dropped the remaining products and the pagination links.
            continue

        item = self.add_shipping_cost(loader.load_item())
        if item.get('identifier', '').strip():
            yield item

    for url in hxs.select('//ul[@class="pager"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_cat,
                      meta=response.meta)
def parse_product(self, response):
    """Yield the product on the page, or one item per variant row.

    Pages without a brand or a title are ignored.  Each row of the variant
    selector produces a copy of the base product whose identifier (and name,
    when the row has a label) is extended with the variant's value.
    Identifiers already in ``self.idents`` are not yielded again.

    :param response: product page response.
    :returns: generator of product items.
    """
    # Imported here because this callback previously referenced ``base_url``
    # without ever defining it.
    from scrapy.utils.response import get_base_url

    hxs = HtmlXPathSelector(response)

    brands = hxs.select('//*[@itemprop="brand"]/text()').extract()
    titles = hxs.select('//h1[@class="product-title"]/span/text()').extract()
    if not brands or not titles:
        # Not a product page.
        return
    brand_name = brands[0]
    name = titles[-1].strip()

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_value('name', brand_name + ' ' + name)
    product_loader.add_xpath(
        'sku',
        u'//div[@class="additional-product-no" and contains(text(), "Manufacturer Item no.")]',
        re=r'Manufacturer Item no\. (.*)')
    product_loader.add_xpath('identifier', u'//body/@data-vw-id')

    price = hxs.select(
        '//div[contains(@class, "artikel-detail")]//*[@itemprop="price"]/text()'
    ).extract()
    price = price[0] if price else ''
    # Prices use "." for thousands and "," for decimals.
    product_loader.add_value('price',
                             price.replace('.', '').replace(',', '.'))

    category = hxs.select(
        '//nav[@id="breadcrumb"]//a/text()').extract()[-1]
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand_name.strip())

    # Fix: ``base_url`` was undefined, so the resulting NameError was
    # swallowed by ``except: pass`` and no image URL was ever stored.
    image_src = hxs.select('//img[@itemprop="image"]/@src').extract()
    if image_src:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), image_src[0]))

    product = product_loader.load_item()

    options = hxs.select(
        '//div[@id="variantselector"]//tr[@class="variant"]')
    if not options:
        if product['identifier'] not in self.idents:
            self.idents.append(product['identifier'])
            yield product
        return

    for opt in options:
        p = Product(product)
        label = opt.select(u'.//td[2]/label/text()').extract()
        if label:
            # Rows without a label keep the base name.
            p['name'] = p['name'] + ' ' + label[0]
        p['identifier'] = p['identifier'] + '-' + opt.select(
            u'.//input/@value').extract()[0]
        if p['identifier'] not in self.idents:
            self.idents.append(p['identifier'])
            yield p
def parse_product(self, response):
    """Parse a product page; yield the product or one item per table row.

    When the name node is missing the page is re-requested while the
    ``retry`` counter in ``response.meta`` is at most 3, and nothing else is
    yielded for that response.  Pages without an identifier yield nothing.
    """
    url = response.url
    name = response.xpath('//span[@itemprop="name"]/text()').extract()
    if not name:
        retry = response.meta.get('retry', 0)
        if retry <= 3:
            # Carry the category along so the retried page is not loaded
            # with an empty one.
            yield Request(url,
                          dont_filter=True,
                          callback=self.parse_product,
                          meta={'retry': retry + 1,
                                'category': response.meta.get('category', '')})
        # Nothing useful can be built from a page without a name.
        return

    l = ProductLoader(item=Product(), response=response)
    l.add_value('name', name[0].strip())

    price = response.xpath(
        '//p[@class="special-price"]/span[@class="price"]/text()').extract()
    if not price:
        price = response.xpath(
            '//span[@class="regular-price"]/span[@class="price"]/text()'
        ).extract()
    # An empty list is passed through unchanged when no price node exists.
    l.add_value('price', price[0] if price else price)

    sku = response.xpath(
        '//div[@class="product-shop--sku"]/h4/span/text()').extract()
    l.add_value('sku', sku[0] if sku else '')

    identifier = (
        response.css('div.nosto_product span.product_id::text').extract()
        or response.xpath('//input[@id="entity_id"]/@value').extract())
    if not identifier:
        return
    l.add_value('identifier', identifier[0])

    l.add_value('category', response.meta.get('category', ''))
    l.add_value('image_url',
                response.xpath('//span[@class="image_url"]/text()').extract())
    l.add_value('url', url)
    l.add_xpath('brand', '//span[@class="brand"]/text()')
    if response.xpath(
            '//div[contains(@class, "availability-box")]/p[contains(@class, "out-of-stock")]'):
        l.add_value('stock', 0)
    item = l.load_item()

    options = response.xpath('//table[@id="super-product-table"]/tbody/tr')
    if not options:
        yield item
        return

    for option in options:
        option_item = deepcopy(item)
        option_item['name'] = option.xpath('td[1]/text()').extract()[0]
        option_price = option.xpath(
            'td//span[@class="price"]/text()').extract()
        option_item['price'] = (
            extract_price(option_price[0]) if option_price else 0)
        option_id = option.xpath('td//input/@name').re(r'\[(.*)\]')
        if not option_id:
            # Rows without an input fall back to the price span id and are
            # marked as out of stock.
            option_id = option.xpath('td//span/@id').re('product-price-(.*)')
            option_item['stock'] = 0
        if not option_id:
            continue
        option_item['identifier'] += '-' + option_id[0]
        yield option_item
def parse_product(self, response):
    """Yield one product per purchasable option on a product page.

    A 405 response is treated as an anti-bot block: the request is replayed
    (up to 9 retries) and nothing else is yielded.  Otherwise product data is
    read from the embedded JSON ``data`` attribute when present, and from the
    HTML markup when not.  Every product gets a shipping cost (2.95 when the
    price is below 10, else 0), a shipping-inclusive price and a
    ``FragranceDirectMeta`` metadata object.
    """
    if response.status == 405:
        retries = response.meta.get('retries', 0)
        # ``redirect_urls`` is absent when the 405 was not reached through a
        # redirect; fall back to the response URL in that case.
        url = (response.meta.get('redirect_urls') or [response.url])[0]
        if retries >= 9:
            self.logger.error(
                'Gave up retrying avoid antibot captcha for %s' % url)
            return
        self.logger.debug('DistilNetworks antibot captcha. Retrying %s' % url)
        yield response.request.replace(
            dont_filter=True,
            url=url,
            meta={'retries': retries + 1, 'dont_merge_cookies': True})
        return

    if response.url in self.old_urls:
        self.old_urls.remove(response.url)

    category_xpath = (
        "//div[@id='breadcrumb']"
        "//li[position() > 1 and position() < last()]//text()")
    json_brand_xpath = "//div[@class='v2-gallery-block']//img/@alt"
    html_brand_xpath = "//a[@class='product-brand']//img/@alt"

    def add_shipping_and_price(loader, price):
        """Add the shipping cost and the shipping-inclusive price."""
        shipping_cost = extract_price('2.95') if price < 10 else 0
        loader.add_value('shipping_cost', shipping_cost)
        loader.add_value('price', price + shipping_cost)

    def finish(loader):
        """Load the item and attach promotion / ex-VAT metadata to it."""
        product = loader.load_item()
        promotion = response.xpath(
            "//div[@id='product-offer-tab']//h3//text()").extract()
        metadata = FragranceDirectMeta()
        if promotion:
            metadata['promotion'] = promotion[0]
        if product.get('price'):
            metadata['price_exc_vat'] = (
                Decimal(product['price']) / Decimal('1.2'))
        product['metadata'] = metadata
        return product

    def load_json_entry(entry, name, add_bare_price):
        """Build a product from one entry of the embedded JSON data."""
        loader = ProductLoader(item=Product(), response=response)
        price = extract_price(entry['prices']['price']['amount'])
        if add_bare_price:
            # NOTE(review): 'price' receives two values on this path (bare,
            # then shipping-inclusive); which one the item keeps depends on
            # ProductLoader's output processor -- confirm.
            loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('stock', entry['stock']['is_in_stock'])
        loader.add_xpath('category', category_xpath)
        loader.add_xpath('brand', json_brand_xpath)
        add_shipping_and_price(loader, price)
        loader.add_value('sku', entry['sku'])
        loader.add_value('identifier', entry['sku'])
        # The image path is a value, not an XPath expression.
        loader.add_value('image_url', entry['main_image']['large_path'])
        return finish(loader)

    options_data = response.xpath(
        "//div[@class='v2-product-subproducts']//@data").extract()
    if options_data:
        options_data = json.loads(options_data[0])
        product_name = options_data['name']
        if not options_data.get('sku', 0):
            return
        sub_products = options_data['sub_products']
        if sub_products:
            # Iterate the sub-product entries themselves, not the keys of
            # the top-level object.
            if isinstance(sub_products, dict):
                sub_products = sub_products.values()
            for sub_option in sub_products:
                yield load_json_entry(
                    sub_option,
                    "{product} {option}".format(product=product_name,
                                                option=sub_option['option1']),
                    add_bare_price=False)
        else:
            yield load_json_entry(options_data, product_name,
                                  add_bare_price=True)
        return

    product_name = response.xpath("//h1[@class='fn']//text()").extract()[0]
    sku = ''.join(
        response.xpath(
            "//form[@name='notifications']//input[@name='p']/@value"
        ).extract())

    options = response.xpath("//div[contains(@class, 'sub-products')]/div")
    if options:
        for sub_option_2 in options:
            sku_option = ''.join(
                sub_option_2.xpath("./label/@data-sub-sku").extract())
            loader = ProductLoader(item=Product(), response=response)
            price = extract_price(
                sub_option_2.xpath("./label/@data-subprice").extract()[0])
            if not price:
                price = extract_price(''.join(
                    response.xpath(
                        '//p[@class="price-info"]//span[@class="Price"]/text()'
                    ).extract()).strip())
            # NOTE(review): bare price first, shipping-inclusive price later;
            # see ProductLoader's output processor for which one wins.
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            option_name = sub_option_2.xpath(
                "./label/@data-option").extract()[0]
            loader.add_value(
                'name',
                u"{product} {option}".format(product=product_name,
                                             option=option_name))
            stock = ''.join(
                sub_option_2.xpath(
                    "./label/@data-stock").extract()).strip().lower()
            loader.add_value(
                'stock', '1' if stock in ['limited', 'in stock'] else '0')
            loader.add_xpath('category', category_xpath)
            loader.add_xpath('brand', html_brand_xpath)
            add_shipping_and_price(loader, price)
            loader.add_value('sku', sku_option)
            loader.add_value('identifier', '{}_{}'.format(sku, sku_option))
            # NOTE(review): this selects a child *element* named
            # data-image-large, unlike the ./label/@data-* attributes read
            # above -- confirm whether an attribute was intended.
            img = ''.join(
                sub_option_2.xpath("./data-image-large").extract())
            if not img:
                img = ''.join(
                    response.xpath("//img/@data-original-large").extract())
            if img:
                loader.add_value('image_url', 'http:' + img)
            yield finish(loader)
        return

    options = response.xpath('//option[@data-name]')
    if options:
        for opt in options:
            loader = ProductLoader(item=Product(), response=response)
            product_image_json = opt.xpath('@data-image').extract()
            if product_image_json:
                product_image_data = json.loads(product_image_json[0])
                loader.add_value('image_url', product_image_data['default'])
            if opt.xpath('@data-stock').extract()[0] == 'Out of Stock':
                loader.add_value('stock', 0)
            option_name = opt.xpath('@data-name').extract()[0]
            loader.add_value('name', product_name + ' ' + option_name)
            price_data = json.loads(opt.xpath('@data-price').extract()[0])
            loader.add_value('price', price_data['price'])
            option_sku = opt.xpath('@value').extract()[0]
            loader.add_value('sku', option_sku)
            loader.add_value('identifier', sku + '_' + option_sku)
            loader.add_xpath('category', category_xpath)
            loader.add_xpath('brand', html_brand_xpath)
            loader.add_value('url', response.url)
            add_shipping_and_price(loader, loader.get_output_value('price'))
            yield finish(loader)
        return

    if not sku:
        return

    price = ''.join(
        response.xpath(
            '//p[@class="price-info"]//span[@class="Price"]/text()'
        ).extract()).strip()
    if price == '':
        price = ''.join(
            response.xpath(
                "//span[@class='Price ']//span[@class='Price-integer' or @class='Price-decimal']//text()"
            ).extract())
    if price == '':
        self.log("Error! No price! URL: {}".format(response.url))
        return
    price = extract_price(price)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', product_name)
    stock = ''.join(
        response.xpath("//span[@class='stock-level']//text()").extract()
    ).strip()
    loader.add_value(
        'stock', '1' if stock.lower() in ['limited', 'in stock'] else '0')
    loader.add_xpath('category', category_xpath)
    loader.add_xpath('brand', html_brand_xpath)
    add_shipping_and_price(loader, price)
    loader.add_xpath(
        'sku', "//form[@name='notifications']//input[@name='p']/@value")
    loader.add_xpath(
        'identifier',
        "//form[@name='notifications']//input[@name='p']/@value")
    loader.add_xpath('image_url', "//img/@data-original-large")
    yield finish(loader)
def parse_product(self, response):
    """Parse a product page, fan out to its option pages and yield the item.

    A 504 response, or a 500 response whose ``retry`` counter is below 3,
    whose URL contains a port number is re-requested without the port.
    Pages without a product id are handed to ``self.retry``.  Identifiers of
    yielded items are appended to ``self.new_ids``.
    """
    # ``and`` binds tighter than ``or``: the retry limit only applies to
    # status 500, exactly as in the unparenthesised condition.
    server_error = response.status == 504 or (
        response.status == 500 and response.meta.get('retry', 0) < 3)
    if server_error:
        if re.findall(r':\d+', response.url):
            retry_meta = response.meta.copy()
            retry_meta['retry'] = retry_meta.get('retry', 0) + 1
            new_url = re.sub(r":\d+", "", response.url)
            log.msg('ERROR >>> Redirect, port number in url : ' + new_url)
            yield Request(new_url,
                          dont_filter=True,
                          callback=self.parse_product,
                          meta=retry_meta)
            return

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_ids = hxs.select('//span[@itemprop="productID"]/text()').extract()
    if not product_ids:
        retry_request = self.retry(
            response,
            "ERROR >>> No identifier for product URL: " + response.url)
        if retry_request:
            yield retry_request
        return
    identifier = product_ids[0]

    # Each value of the attribute lookup replaces the current id in the URL
    # to form an option URL; option pages are not expanded again.
    metadata_json = re.findall("PRODUCT_METADATA_JSON = (.*);", response.body)
    if metadata_json and response.meta.get('check_options', True):
        lookup = demjson.decode(
            metadata_json[0])['attributeDefinition']['attributeLookup']
        for option_id in lookup.values():
            yield Request(response.url.replace(identifier, str(option_id)),
                          callback=self.parse_product,
                          meta={'check_options': False, 'dont_retry': True})

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)

    price_nodes = (
        hxs.select(
            '//div[@class="pricingReg"]/span[@itemprop="price"]/text()'
        ).extract()
        or hxs.select('//span[@id="ajaxPrice"]/text()').extract())
    loader.add_value('price', price_nodes[0] if price_nodes else 0)
    loader.add_xpath('name', '//h1[@class="product-title__title"]/text()')

    image_nodes = hxs.select('//img[@id="mainImage"]/@src').extract()
    if image_nodes:
        loader.add_value('image_url', image_nodes[0])

    categories = []
    breadcrumb_json = re.findall("var BREADCRUMB_JSON = (.*);", response.body)
    if breadcrumb_json:
        breadcrumb = demjson.decode(breadcrumb_json[0])
        categories = breadcrumb['bcEnsightenData'][
            'contentSubCategory'].split('>')
    loader.add_value('category', categories)

    brand_nodes = hxs.select('//h2[@itemprop="brand"]/text()').extract()
    loader.add_value('brand', brand_nodes[0].strip() if brand_nodes else '')
    loader.add_value(
        'sku', response.xpath('//script/text()').re('"modelNumber":"(.+?)"'))

    has_price = bool(loader.get_output_value('price'))
    if not has_price or 'OUT OF STOCK ONLINE' in response.body.upper():
        loader.add_value('stock', 0)

    item = loader.load_item()
    if hxs.select('//span[@class="discontinuedItem show"]'):
        item['price'] = 0
        item['stock'] = 0
    if item['identifier']:
        self.new_ids.append(item['identifier'])
        yield item
def parse_product(self, response):
    """Parse a product page, yield its item and follow alternative colours.

    When the page has no name node it is passed to ``self.parse_dvf``; if
    that returns nothing the URL is recorded in ``self.errors``.  Values
    present in ``response.meta`` (``url``, ``sku``, ``brand``, ``category``)
    take precedence over what is scraped from the page.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    meta = response.meta

    name = hxs.select('//h1[@itemprop="name"]/text()').extract()
    if not name:
        item = self.parse_dvf(response)
        if not item:
            self.errors.append("Name not found on " + response.url)
        else:
            yield item
        return

    l = ProductLoader(item=Product(), response=response)
    brand_nodes = hxs.select('//h2[@itemprop="brand"]/a/text()').extract()
    page_brand = brand_nodes[0] if brand_nodes else ''
    title = name[-1]
    l.add_value('name', page_brand + ' ' + title if page_brand else title)

    url = meta.get('url') or response.url
    l.add_value('url', url)

    # Identifier sources, in order of preference: hidden product id input,
    # itemprop sku content, then the "product/<id>/" URL segment.
    identifier = hxs.select(
        '//input[@id="productId" and @value!=""]/@value').extract()
    if not identifier:
        identifier = hxs.select('//*[@itemprop="sku"]/@content').extract()
    if not identifier:
        identifier = re.findall("product/([^/]*)/", url)
    if identifier:
        identifier = identifier[0]
    l.add_value('identifier', identifier)

    sku = meta.get('sku', None)
    if not sku:
        sku_nodes = hxs.select('//meta[@itemprop="sku"]/@content').extract()
        sku = sku_nodes[0] if sku_nodes else ''
    l.add_value('sku', sku)

    l.add_value('brand', meta.get('brand') or page_brand)

    image_url = hxs.select('//img[@id="medium-image"]/@src').extract()
    if image_url:
        l.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    l.add_value('category', meta.get('category'))

    price = hxs.select('//span[@itemprop="price"]/text()').extract()
    l.add_value('price', extract_price(price[0]) if price else 0)

    if hxs.select('//div[@class="sold-out-message"]/span/text()').extract():
        l.add_value('stock', 0)
    yield l.load_item()

    # Each entry is a complete href string; join the whole string, not its
    # first character.
    for color_href in hxs.select(
            '//div[@id="alternative-colors"]/a/@href').extract():
        yield Request(urljoin_rfc(base_url, color_href),
                      callback=self.parse_product)
def parse(self, response):
    """Build and return a single product item from a product page.

    Identifier, SKU and price come from the ``#``-separated ``data-weight``
    attribute of the size selector when it exists (fields 4 and 5), and from
    the ``data-sku`` block otherwise.  Stock is 1 when a price was found and
    0 otherwise; a shipping cost of 5.95 is added when the price is below 40.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    # URLs for which the second size label is used instead of the first.
    second_size_urls = [
        'http://www.thebodyshop.fr/parfums/best-sellers/eau-de-toilette-white-musk.aspx',
    ]

    sizes = hxs.select(
        '//ul[@class="size-selector"]//label[1]/@data-weight').extract()
    if sizes:
        # Fall back to the first label when there is no second one.
        use_second = response.url in second_size_urls and len(sizes) > 1
        dd = (sizes[1] if use_second else sizes[0]).split('#')
        loader.add_value('identifier', dd[4])
        loader.add_value('sku', dd[4])
        price = extract_price(dd[5])
    else:
        sku = ''.join(hxs.select("//div[@data-sku]/@data-sku").extract())
        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        price = extract_price(''.join(
            hxs.select(
                "//div[@data-sku]//p[contains(concat('',@class,''), 'price ')]//text()"
            ).extract()))
    loader.add_value('price', price)

    title = hxs.select('//h1[@class="title"]/@title').extract()
    if title:
        loader.add_value('name', title[0].strip())
    else:
        log.msg('### No name at ' + response.url, level=log.INFO)

    loader.add_value('stock', 1 if price else 0)

    image = hxs.select('//img[@class="product"]/@src').extract()
    if image:
        loader.add_value('image_url', urljoin(response.url, image[0].strip()))

    loader.add_value('brand', 'THE BODY SHOP')

    # The first breadcrumb entry is skipped.
    crumbs = hxs.select(
        '//nav[@id="breadcrumb_product"]/ul/li/a/text()').extract()
    for crumb in crumbs[1:]:
        loader.add_value('category', crumb)

    if price < 40:
        loader.add_value('shipping_cost', 5.95)

    product = loader.load_item()
    offer_text = hxs.select(
        "//div[@id='product-offers']/p[2]//text()").extract()
    product['metadata'] = ' '.join(
        [x.strip() for x in offer_text if x.strip()])
    return product
def parse_product(self, response):
    """Parse an item page and yield the product or one item per variation.

    Responses that contain a result-set block are delegated to
    ``self.parse``.  Pages without a title yield nothing.  When the
    variation data is found but cannot be decoded, the request is replayed
    up to 10 times (``retry_no`` in the request meta).
    """
    if response.xpath('//div[@id="ResultSetItems"]'):
        for x in self.parse(response):
            yield x
        return

    first_name = ' '.join(
        response.xpath('//*[@id="itemTitle"]/text()').extract()).strip()
    if not first_name:
        return

    identifier = response.url.split('?')[0].split('/')[-1]

    breadcrumb = response.xpath(
        '//ul[@itemtype="http://schema.org/Breadcrumblist"]')
    if breadcrumb:
        category = breadcrumb[0].xpath(
            './/span[@itemprop="name"]/text()').extract()[1:]
    else:
        category = []
    if category and self.just_last_category:
        category = category.pop()

    seller_id = ''.join(
        response.xpath('.//*[contains(@class, "si-content")]'
                       '//a/*[@class="mbg-nw"]/text()').extract())

    # (label text, path below the label's next sibling), tried in order
    # until one yields a non-blank value.
    brand_queries = (
        ('Brand', '//text()'),
        ('Brand', '/h2/text()'),
        ('Brand', '/h3/text()'),
        ('Marke', '//text()'),
        ('Hersteller', '//text()'),
        ('Marque', '//text()'),
    )
    brand = []
    for label, tail in brand_queries:
        texts = response.xpath(
            '//*[@class="attrLabels" and contains(text(), "%s")]'
            '/following-sibling::*[1]%s' % (label, tail)).extract()
        brand = [s for s in texts if s.strip() != '']
        if brand:
            break

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('name', first_name)
    product_loader.add_value('identifier', identifier)
    if self.id_as_sku:
        product_loader.add_value('sku', identifier)
    product_loader.add_value('category', category)
    product_loader.add_value('dealer', 'eBay - ' + seller_id)
    product_loader.add_value('brand', brand)
    product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
    product_loader.add_value('url', response.url)

    price = None
    for price_xpath in ('//*[@id="prcIsum"]/text()',
                        '//*[@id="mm-saleDscPrc"]/text()'):
        found = response.xpath(price_xpath).extract()
        if found:
            price = found[0].strip()
            break
    if price is None:
        for key in ('binPrice', 'bidPrice'):
            # The prefix must be lazy and stay inside the quoted value;
            # a greedy ``.*`` leaves only the last character for the group.
            match = re.search(r'"%s":"[^"]*?([\d\.,]+)",' % key,
                              response.body)
            if match:
                price = match.group(1)
                break
    if price is None:
        self.log('No price found => %s' % response.url)
        return
    product_loader.add_value('price', self.extract_price(price))

    # Shipping cost is best-effort: a missing or unparsable value is skipped.
    shipping = response.xpath(
        '//*[@id="shippingSection"]//td/div/text()').extract()
    if shipping and shipping[0]:
        if 'free' in shipping[0].lower():
            product_loader.add_value('shipping_cost', 0)
        else:
            try:
                product_loader.add_value('shipping_cost',
                                         self.extract_price(shipping[0]))
            except Exception:
                self.log('Could not parse shipping cost => %s' % response.url)

    if self._extract_stock_amount:
        in_stock = ''.join(
            response.xpath('//*[@id="qtySubTxt"]//text()').extract())
        # Keep the longest run of digits found in the quantity text.
        stock = ''
        for match in re.finditer(r"([\d]+)", in_stock):
            if len(match.group()) > len(stock):
                stock = match.group()
        if 'More than' in in_stock:
            stock = 11
        if stock:
            product_loader.add_value('stock', stock)

    product_ = product_loader.load_item()

    options_variations = []
    # NOTE(review): every double quote is stripped from the body here, yet
    # the pattern below contains double quotes -- confirm the literal that
    # was meant to be removed (possibly the HTML entity for a quote).
    sel = Selector(text=response.body.replace('"', ''))
    try:
        json_var_map = unicode(
            sel.xpath('//*/text()').re(r'("menuItemMap":{.*}.*),'
                                       '"unavailableVariationIds"')[0])
    except (IndexError, UnicodeError):
        json_var_map = None

    if json_var_map is not None:
        try:
            variations = demjson.decode(
                '{' + re.sub(r',"unavailableVariationIds".*', '',
                             json_var_map) + '}')
            menu_map = variations['menuItemMap']
            for key, variation in variations['itemVariationsMap'].items():
                if variation['traitValuesMap']:
                    new_variation = {}
                    for option, value in variation['traitValuesMap'].items():
                        new_variation[option] = menu_map[str(
                            value)]['displayName']
                    options_variations.append({
                        'price': variation['price'],
                        'values': new_variation,
                        'stock': variation['quantityAvailable'],
                        'identifier': '%s:%s' % (identifier, key)
                    })
        except Exception:
            # Undecodable variation data: replay the request a bounded
            # number of times instead of emitting a partial product.
            retry_no = int(response.meta.get('retry_no', 0)) + 1
            if retry_no <= 10:
                self.log('Retrying No. %s => %s' % (retry_no, response.url))
                req = response.request.copy()
                req.meta['retry_no'] = retry_no
                req.dont_filter = True
                yield req
            else:
                self.log('Gave up retrying => %s' % response.url)
            return

    if not options_variations:
        yield product_
        return

    for model in options_variations:
        model_name = first_name + ' ' + ' '.join(
            opt_name.strip().lower() for opt_name in model['values'].values())
        new_product = Product(product_)
        new_product['name'] = model_name
        new_product['identifier'] = model['identifier']
        new_product['price'] = self.extract_price(model['price'])
        new_product['stock'] = model['stock']
        yield new_product
def parse_product(self, response):
    """Parse a product page and yield the product or one item per line item.

    Identifiers of yielded items are appended to ``self.ids`` and items whose
    identifier is already there are skipped.  Pages without an ``itemId``
    input yield nothing.
    """
    hxs = HtmlXPathSelector(response)

    def first_price(node, prefix):
        """Return the first price text found under *node*, or None."""
        for tail in ('//div[@class="price pos1priceDisplayStyleOverride"]/text()',
                     '//p[@itemprop="price"]/text()',
                     '//span[@itemprop="price"]/text()'):
            found = node.select(prefix + tail).extract()
            if found:
                return found[0]
        return None

    item_ids = hxs.select('//input[@name="itemId"]/@value').extract()
    if not item_ids:
        return

    l = ProductLoader(item=Product(), response=response)
    l.add_value('identifier', item_ids[0])

    page_sku = hxs.select('//span[@id="MpsShortSku"]/text()').re(r'#(\w+).')
    l.add_value('sku', page_sku[0] if page_sku else '')

    brand = hxs.select('//span[@class="product-designer"]/text()').extract()
    l.add_value('brand', brand[0].strip() if brand else '')

    l.add_value(
        'name',
        ''.join(hxs.select('//h1[@itemprop="name"]/text()').extract()).strip())
    l.add_value('url', response.url)

    image_url = hxs.select('//div[@class="img-wrap"]/img/@src').extract()
    if image_url:
        l.add_value('image_url', image_url[0])
    l.add_value('category', response.meta.get('category', ''))
    l.add_value('price', first_price(hxs, '') or 0)
    if hxs.select('//div[@class="cannotorder"]'):
        l.add_value('stock', 0)
    base_item = l.load_item()

    sub_items = hxs.select('//div[@class="lineItem"]')
    if not sub_items:
        if base_item['identifier'] not in self.ids:
            self.ids.append(base_item['identifier'])
            yield base_item
        return

    for sub_item in sub_items:
        item = deepcopy(base_item)
        item['price'] = extract_price(first_price(sub_item, '.') or '0')

        heading = sub_item.select('.//h6/text()').extract()
        if heading:
            item['name'] = heading[-1].strip()

        # Look for the SKU inside the line item first; the page-level match
        # is kept as a fallback.
        sku = sub_item.select(
            './/span[@id="MpsShortSku"]/text()').re(r'#(\w+).') or page_sku
        item['sku'] = sku[0] if sku else ''

        sub_id = sub_item.select(
            './/div/input[contains(@id, "prod")]/@value').extract()
        if not sub_id:
            continue
        item['identifier'] = item['identifier'] + '-' + sub_id[0]
        if item['identifier'] not in self.ids:
            self.ids.append(item['identifier'])
            yield item
def parse_product(self, response):
    """Parse a product page and request the first page of its reviews.

    Nothing is yielded for pages without a price or without a numeric id at
    the end of the URL.  The built item travels to
    ``self.parse_product_reviews`` in the request meta under ``product``.
    Expects ``sku`` and ``brand`` in ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)

    # Embedded product data is optional: without it the shipping cost falls
    # back to the on-page message below.
    data = {}
    script = response.xpath(
        '//script/text()[contains(., "product/data")]').extract_first()
    match = re.search('product/data",[ \n]*({.+})', script) if script else None
    if match:
        try:
            data = json.loads(match.group(1))
        except ValueError:
            data = {}

    price = ''.join(
        hxs.select(
            '//div[contains(@class, "js-product-offer-summary")]//div[contains(@class, "price-display")]//text()'
        ).extract())
    if not price:
        price = ''.join(
            response.xpath(
                '//div[@itemprop="offers"]//div[@itemprop="price"][1]//text()'
            ).extract())
    if not price:
        price = ''.join(
            response.xpath(
                '//span[contains(@class, "hide-content-m")]/span[@data-tl-id="Price-ProductOffer"]//text()'
            ).extract())
    # Some products are not available online and these have no price
    if not price:
        return

    id_match = re.search(r'/(\d+)$', response.url)
    if not id_match:
        return

    in_stock = 'out of stock' not in price.lower()

    name_parts = [
        text.strip()
        for text in hxs.select(
            '//h1[contains(@itemprop, "name")]//text()').extract()
        if text.strip()
    ]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name_parts)
    loader.add_value('identifier', id_match.group(1))
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('brand', response.meta['brand'])
    categories = [
        crumb.strip()
        for crumb in hxs.select(
            '//ol[contains(@class, "breadcrumb-list")]//li//a/span/text()'
        ).extract()
    ]
    loader.add_value('category', categories)
    loader.add_value('url', response.url)
    loader.add_xpath(
        'image_url',
        '//img[contains(@class, "js-product-primary-image")]/@src')
    try:
        loader.add_value(
            'shipping_cost',
            data['buyingOptions']['shippingPrice']['displayPrice'])
    except (KeyError, TypeError):
        # Missing key, or a null entry along the path.
        loader.add_css('shipping_cost', 'h2.js-shipping-primary-msg::text')
    loader.add_value('price', price)
    if not in_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()
    item['metadata'] = {}
    yield Request(self._get_reviews_url(item, 1),
                  meta={'product': item, 'page': 1},
                  callback=self.parse_product_reviews)
def parse_product_main(cls, response, self_product_ids,
                       self_matched_identifiers):
    """Parse a product page and yield one item per purchasable option.

    ``self_product_ids`` maps identifier -> name: the first name seen for an
    identifier is stored and reused for every later item with that
    identifier.  Each yielded identifier is added to
    ``self_matched_identifiers``.  When no price is found and the page is not
    marked discontinued, the page is re-requested (``retry`` below 20);
    parsing of the current response still continues.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1[@id="productTitle"]/text()')

    price = response.xpath(
        u'//div[contains(@class, "bem-product-price")]/div[contains(@class, "bem-product-price")]//text()'
    ).re(r'[\d,.]+')
    if not price:
        price = response.xpath(
            u'//*[contains(@class, "unit-price")]/text()').re(r'[\d,.]+')
    price = extract_price(price[0] if price else '0.0')

    if not price:
        discontinued = bool(
            response.xpath('//div[contains(@class, "discontinuedProduct")]'))
        retry = int(response.meta.get('retry', 0))
        if (not discontinued) and retry < 20:
            meta = response.meta.copy()
            meta['retry'] = retry + 1
            yield Request(
                response.url,
                meta=meta,
                dont_filter=True,
                callback=lambda r: cls.parse_product_main(
                    r, self_product_ids, self_matched_identifiers))
    if price:
        product_loader.add_value('price', price)

    product_loader.add_xpath(
        'category', u'//ul[@id="breadcrumbs"]/li[2]/div/a/@title')
    product_loader.add_xpath(
        'image_url', u'concat("http:", //img[@itemprop="image"]/@src)')
    product_loader.add_xpath(
        'brand', u'//span[@itemprop="manufacturer"]/text()')
    product = product_loader.load_item()
    metadata = CRCMeta()
    metadata['rrp'] = cls.extract_rrp(response)
    product['metadata'] = metadata

    def register(prod):
        """Apply the cached name for the identifier and record it as seen."""
        ident = prod['identifier']
        if ident in self_product_ids:
            prod['name'] = self_product_ids[ident]
        else:
            self_product_ids[ident] = prod['name']
        self_matched_identifiers.add(ident)
        return prod

    def from_select_option(option, label=None):
        """Build a product from an <option>; None when it has no value."""
        value = option.xpath(u'./@value').extract()[0]
        if not value:
            return None
        prod = Product(product)
        opt_name = option.xpath(u'normalize-space(./text())').extract()[0]
        # Drop the "- Out of stock" suffix from the option text.
        last_pos = opt_name.find('- Out of stock')
        if last_pos == -1:
            last_pos = len(opt_name)
        parts = [prod['name']]
        if label is not None:
            parts.append(label)
        parts.append(opt_name[:last_pos].strip())
        prod['name'] = ' '.join(parts).strip()
        prod['identifier'] = value
        prod['sku'] = value
        css_class = option.xpath('./@class').extract()
        if css_class and css_class[0].startswith('out'):
            prod['stock'] = 0
        return register(prod)

    identifier = response.xpath(
        '//*[@id="quickBuyBox"]/form/input[@name="id"]/@value').extract()
    if identifier:
        # single option product
        prod = Product(product)
        prod['name'] = (prod['name'] + ' ' + response.xpath(
            u'normalize-space(//*[@id="quickBuyBox"]/form/div[@class="option-text"]/text())'
        ).extract()[0]).strip()
        prod['identifier'] = identifier[0]
        prod['sku'] = identifier[0]
        out_of_stock = response.xpath(
            '//span[contains(@class, "out-of-stock")]')
        if not out_of_stock:
            out_of_stock = response.xpath(
                '//div[@id="productAvailabilityMessage" and contains(@class, "out-of-stock")]'
            )
        if out_of_stock:
            prod['stock'] = 0
        yield register(prod)
        return

    # multiple options product
    option_groups = response.xpath(
        u'//select[@id="productOptionDropDown2"]/optgroup')
    if option_groups:
        for option_group in option_groups:
            label = option_group.xpath('@label').extract()[0]
            for option in option_group.xpath(u'option'):
                prod = from_select_option(option, label)
                if prod is not None:
                    yield prod
        # Root options sit outside every optgroup, so no group label is
        # added to their names.
        for option in response.xpath(
                u'//select[@id="productOptionDropDown2"]/option[not(@disabled)]'
        ):
            prod = from_select_option(option)
            if prod is not None:
                yield prod
        return

    options = response.xpath(
        u'//select[@id="productOptionDropDown2"]//option')
    if options:
        for option in options:
            prod = from_select_option(option)
            if prod is not None:
                yield prod
        return

    for option in response.xpath('//input[@name="id"]'):
        value = option.xpath(u'./@id').extract()
        if not value:
            continue
        prod = Product(product)
        prod['name'] = (prod['name'] + ' ' + ' '.join(
            option.xpath(u'./@data-colour|./@data-size').extract())).strip()
        prod['identifier'] = value[0].strip()
        prod['sku'] = value[0].strip()
        css_class = option.xpath('@class').extract()
        # An input without a class attribute counts as not in stock.
        if not (css_class and 'in-stock' in css_class[0]):
            prod['stock'] = 0
        yield register(prod)