def parse_product(self, response):
    """Scrape one product page and yield a single ``Product`` item.

    The price is taken from the ``productPrice`` value found in an inline
    script when that value can be extracted; otherwise it is read from the
    price element inside the ``product-simple`` block.  Stock is ``'1'``
    only when the availability paragraph contains the "En stock" image.

    :param response: the downloaded product page.
    :returns: generator yielding the loaded item.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)

    price = None
    price_script = ''.join(
        hxs.select(
            "//script[contains(., 'productPrice')]/text()").extract())
    script_prices = re.findall(r'productPrice":(\d+)', price_script)
    if script_prices:
        price = extract_price(script_prices[0])
    if price is None:
        # No usable script value: fall back to the visible price so that an
        # empty regex result is never handed to the loader as the price.
        visible_price = ''.join(
            hxs.select(
                "//div[@id='product-simple']//span[contains(concat('',@id,''), 'product-price')]//text()"
            ).extract())
        visible_price = ''.join(re.findall(r'([\d\.,]+)', visible_price))
        price = extract_price_eu(visible_price)
    loader.add_value('price', price)

    loader.add_value('url', response.url)
    loader.add_xpath('name', "//h1/text()")

    in_stock = hxs.select(
        "//p[@class='availability']/img[@alt='En stock']").extract()
    loader.add_value('stock', '1' if in_stock else '0')

    loader.add_xpath(
        'category',
        "//div[@class='breadcrumbs']//li[1< position()]//a/text()")
    loader.add_xpath(
        'brand',
        "//table[@class='data-table']//tr[contains(., 'Marque')]/td/text()")
    loader.add_value('shipping_cost', "0")

    # The hidden "product" input supplies both the SKU and the identifier.
    sku = ''.join(
        hxs.select(
            "//input[@type='hidden' and @name='product']/@value").extract())
    loader.add_value('sku', sku.strip())
    loader.add_value('identifier', sku)
    loader.add_xpath('image_url', "//a[@class='MagicZoomPlus']/img/@src")
    yield loader.load_item()
def parse_product(self, response):
    """Follow variant links and yield the product shown on this page.

    Colour variants are requested first, then size variants, each handled by
    this same callback.  Nothing more is yielded when the page has no price.
    The identifier is the MD5 hex digest of the catalogue code prefixed with
    one space; an item is yielded only the first time its identifier is seen
    (tracked in ``self.identifiers_viewed``).
    """
    base_url = get_base_url(response)

    color_links = response.xpath(
        '//p[@id="color_variants"]//a/@href').extract()
    for href in color_links:
        yield Request(urljoin_rfc(base_url, href),
                      callback=self.parse_product)

    size_links = response.xpath(
        '//p[@id="size_variants"]//a/@href').extract()
    for href in size_links:
        yield Request(urljoin_rfc(base_url, href),
                      callback=self.parse_product)

    loader = ProductLoader(item=Product(), response=response)
    price_nodes = response.xpath(
        '//b[contains(@class, "pro-price")]/text()').extract()
    if not price_nodes:
        return

    # Drop every whitespace character inside the price before parsing it.
    compact_price = ''.join(price_nodes[0].split()).strip()
    loader.add_value('price', extract_price_eu(compact_price))
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

    # The "Obserwuj" label marks the product as out of stock.
    watch_label = response.xpath('//span[text()="Obserwuj"]')
    loader.add_value('stock', '0' if watch_label else '1')

    loader.add_xpath('category',
                     '//nav[@id="breadcrumbs"]/a[position()>1]/text()')
    loader.add_xpath('brand', '//*[@id="catalog-info"]//a/b/text()')
    loader.add_value('shipping_cost', "0")

    catalog_texts = response.xpath(
        '//*[@id="catalog-info"]//b/text()').extract()
    # The leading space is part of the hashed value and must be kept.
    sku = ' ' + catalog_texts[-1].strip()
    loader.add_value('sku', sku.strip())
    loader.add_value('identifier', hashlib.md5(sku).hexdigest())
    loader.add_xpath('image_url', '//main[@id="content"]//img/@src')

    product = loader.load_item()
    identifier = product['identifier']
    if identifier not in self.identifiers_viewed:
        self.identifiers_viewed.append(identifier)
        yield product
def parse_product(self, response):
    """Complete the item passed in ``response.meta['product']``.

    When ``response.meta`` carries a ``sku``, the SKU, price, stock and
    brand are filled from the page and the meta data.  Otherwise the SKU is
    guessed from the product name on a best-effort basis and the brand is
    set to ``'Logitech'``.  The category is the second-to-last segment of
    the URL path.

    :param response: product page; ``meta`` must contain ``product``.
    :returns: generator yielding the loaded item.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=response.meta['product'], selector=hxs)
    loader.add_xpath('identifier', '//input[@name="anid"]/@value')
    loader.add_value('url', response.url)

    name = u''.join(
        hxs.select('//h1[@id="test_product_name"]/text()').extract())
    loader.add_value('name', name.strip().replace('\n', ' '))

    sku = response.meta.get('sku', '')
    if sku:
        loader.add_value('sku', sku)
        price_titles = hxs.select(
            '//form/div[@class="price"]/img/@title').extract()
        price = price_titles[0] if price_titles else '0'
        loader.add_value('price', extract_price_eu(price))
        # NOTE(review): the stock test uses the same XPath as the price, so
        # stock is '0' whenever a price image exists.  This looks like a
        # copy/paste slip; behaviour is kept until the correct
        # out-of-stock marker is confirmed.
        loader.add_value('stock', '0' if price_titles else '1')
        loader.add_value('brand', response.meta.get('brand', ''))
    else:
        try:
            guessed_sku = re.search(
                r'\b([A-Z]{1,2})*[\+\-0-9]{2,10}',
                loader.get_output_value('name')).group(0)
        except (AttributeError, TypeError):
            # No match (AttributeError) or no usable name (TypeError):
            # the SKU is optional here, so simply leave it unset.
            pass
        else:
            loader.add_value('sku', guessed_sku)
        loader.add_value('brand', 'Logitech')

    loader.add_value('category', response.url.split('/')[-2])

    img = hxs.select('//td[@id="magiczoomplushook"]//a/@href').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))
    loader.add_value('shipping_cost', '0')
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per purchasable option listed on the product page.

    Each ``ul.buy`` block is an option: the first text node of its price
    cell is the option label (appended to the product name after " - "),
    the second is the price, which is multiplied by 1.22 before being
    stored.  The identifier is the ``id_opzione`` query parameter of the
    option's buy link.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    title = ''.join(
        hxs.select('//*[@id="content"]/h2[1]//text()').extract()).strip()
    title = title.replace(u'\xa0', ' ')

    sku_nodes = hxs.select('//li[@class="codice"]/span/text()').extract()
    sku = sku_nodes[0] if sku_nodes else ''

    images = hxs.select('//div[@class="img-big"]//img/@src').extract()

    breadcrumb = ''.join(
        hxs.select(
            '//*[@id="content"]//p[@class="briciola"]//text()').extract()
    ).strip()
    # The first two breadcrumb entries are skipped.
    category = breadcrumb.split(' / ')[2:]

    for option in hxs.select('//ul[@class="buy"]'):
        loader = ProductLoader(item=Product(), selector=hxs)

        price_texts = option.select(
            './li[@class="prezzo"]//text()').extract()
        suffix = price_texts[0].replace(':', '').strip()
        if suffix != '':
            suffix = ' - ' + suffix
        raw_price = price_texts[1].strip().replace(u'\u20ac', '')
        price = extract_price_eu(raw_price) * Decimal('1.22')

        buy_href = option.select(
            './li[@class="acquista"]/a/@href').extract()[0]
        option_id = url_query_parameter(
            urljoin_rfc(base_url, buy_href), 'id_opzione')

        loader.add_value('identifier', option_id)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_value('name', title + suffix)
        loader.add_value('price', price)
        loader.add_xpath('brand', '//p[@class="desc-prod"]/a/text()')
        if images:
            loader.add_value('image_url', urljoin_rfc(base_url, images[0]))
        loader.add_value('category', category)
        yield loader.load_item()
def parse_cat(self, response):
    """Walk a category page: queue each product box and every paginator link.

    The listing price is stored on a fresh ``Product`` that is passed,
    together with the product-page request, to ``self.fetch_product``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for box in hxs.select('//div[contains(@class, "box-product")]'):
        item = Product()
        price_text = ''.join(
            box.select(
                './/div[contains(@class,"product-price")]/strong/text()'
            ).extract())
        item['price'] = extract_price_eu(price_text)

        # The first link inside the box points to the product page.
        product_href = box.select('.//a/@href').extract()[0]
        request = Request(urljoin_rfc(base_url, product_href),
                          callback=self.parse_product,
                          meta=response.meta)
        yield self.fetch_product(request, self.add_shipping_cost(item))

    for page_href in hxs.select(
            '//div[@class="paginator"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, page_href),
                      callback=self.parse_cat)
def parse_cat(self, response):
    """Walk a category page: queue each listed product and pagination link.

    The price is built from the numeric fragments of the ``span.price``
    text and stored on a fresh ``Product`` handed to ``self.fetch_product``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for entry in hxs.select('//ul[@id="product_list"]/li'):
        item = Product()
        price_digits = entry.select(
            './/span[@class="price"]//text()').re(r'[\d.,]+')
        item['price'] = extract_price_eu(''.join(price_digits))

        product_href = entry.select(
            './/a[@class="product_img_link"]/@href').extract()[0]
        request = Request(urljoin_rfc(base_url, product_href),
                          callback=self.parse_product,
                          meta=response.meta)
        yield self.fetch_product(request, self.add_shipping_cost(item))

    for page_href in hxs.select(
            '//ul[contains(@class, "pagination")]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, page_href),
                      callback=self.parse_cat)
def parse(self, response):
    """Crawl a listing page: follow categories and queue product pages.

    Category and sub-category links are requested with the default
    callback.  For every ``div.articolo`` a partially filled item (url,
    identifier, sku, brand, image, category, price and, for non-positive
    prices, stock ``0``) is passed to ``self.parse_product`` through
    ``meta['item']``.  Finally the last "next" link, if any, is followed.

    :param response: a category / listing page.
    :returns: generator of ``Request`` objects.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for category_href in hxs.select(
            '//div[@id="listaSezioni"]/div/a/@href').extract():
        yield Request(urljoin_rfc(base_url, category_href))

    for sub_category_href in hxs.select(
            '//div[@class="contentGruppi"]/div/div[@class="nome"]/a/@href'
    ).extract():
        yield Request(urljoin_rfc(base_url, sub_category_href))

    products = hxs.select('//div[@class="articolo"]')
    if products:
        # The group heading is page-wide, so read it once rather than once
        # per product.
        category = hxs.select(
            '//div[@class="gruppo"]/text()').extract()[0].strip()

    for product in products:
        loader = ProductLoader(item=Product(), selector=product)

        url = product.select('.//h2/a/@href').extract()
        url = urljoin_rfc(base_url, url[0])
        loader.add_value('url', url)
        loader.add_value('identifier',
                         re.search(r'art/(\d+)_', url).group(1))
        loader.add_xpath('sku', 'p[@class="codfor"]/strong/text()')
        loader.add_xpath('brand', 'p[@class="marca"]/img/@alt')

        image_url = product.select(
            'div[@class="img"]/a/img/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
        loader.add_value('image_url', image_url)
        loader.add_value('category', category)

        price = product.select('p[@class="prezzo"]/text()').extract()
        price = extract_price_eu(price[-1]) if price else 0
        loader.add_value('price', price)
        if price <= 0:
            loader.add_value('stock', 0)

        item = loader.load_item()
        yield Request(item['url'],
                      callback=self.parse_product,
                      meta={'item': item})

    # Select the href attribute: extracting the <a> element itself would
    # join the raw markup onto the base URL instead of the link target.
    next_pages = hxs.select('//a[@class="next"]/@href').extract()
    if next_pages:
        yield Request(urljoin_rfc(base_url, next_pages[-1]))
def parse_product(self, response):
    """Build one ``Product`` item from a product page.

    The SKU is the 3-5 digit run found in the page title (empty string when
    there is none); brand is always "Lego", stock is always 1 and the
    shipping cost is 5.75.  Where a selector returns several values the
    last one is used.
    """

    def get_sku(name):
        # The trailing ".*" swallows the rest of the line after the digits.
        found = re.findall("([0-9]{3,5}).*", name)
        return found.pop() if found else ""

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = hxs.select('//h1/text()').extract()[-1].strip()
    price = hxs.select('//span[@class="bigPrice"]/text()').extract()[-1]
    sku = get_sku(name)
    identifier = hxs.select(
        '//input[@id="surveyObjectId"]/@value').extract()[-1]
    categories = hxs.select(
        '//div[@itemprop="breadcrumb"]/div/a[not(contains(@class, "last"))]/text()'
    ).extract()
    images = hxs.select('//img[@id="imgMain"]/@src').extract()

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value("name", name)
    loader.add_value("identifier", identifier)
    loader.add_value("price", extract_price_eu(price))
    loader.add_value("url", response.url)
    loader.add_value("sku", sku)
    if images:
        loader.add_value("image_url", urljoin_rfc(base_url, images[-1]))
    if categories:
        loader.add_value("category", categories[-1])
    loader.add_value("stock", 1)
    loader.add_value("brand", "Lego")
    loader.add_value('shipping_cost', 5.75)
    yield loader.load_item()
def parse(self, response):
    """Yield one item per ``<product>`` element of the XML feed.

    The category is the last "/"-separated part of the ``Category`` text.
    Shipping is '0' when the loaded price is above 499 and '25' otherwise.
    """
    xxs = XmlXPathSelector(response)
    for node in xxs.select('//product'):
        category_path = node.select('./Category/text()').extract()

        loader = ProductLoader(item=Product(), selector=node)
        loader.add_xpath('identifier', './product-id/text()')
        loader.add_xpath('sku', './product-id/text()')
        loader.add_xpath('url', './product-url/text()')
        loader.add_xpath('name', './product-name/text()')
        loader.add_xpath('brand', './brand/text()')

        price_text = ' '.join(node.select('./price/text()').extract())
        loader.add_value('price', extract_price_eu(price_text))

        if category_path:
            leaf = category_path[0].split('/')[-1].strip()
            loader.add_value('category', leaf)
        loader.add_xpath('image_url', './image-url/text()')
        loader.add_xpath('stock', './stock/text()')

        free_shipping = loader.get_output_value('price') > 499
        loader.add_value('shipping_cost', '0' if free_shipping else '25')
        yield loader.load_item()
def parse_products(self, response):
    """Yield items from the HTML fragment found in a JSON response.

    The response body is JSON whose ``products`` key holds the HTML of the
    listing.  Tiles without a numeric id suffix on their main image are
    skipped.  The SKU is the longest run of digits, commas and dots in the
    product name (the first one on ties, empty when there is none).
    """
    payload = json.loads(response.body)
    hxs = HtmlXPathSelector(text=payload['products'])
    base_url = response.meta.get('base_url')

    for tile in hxs.select('//div[contains(@class, "productItem")]'):
        identifier = tile.select(
            './/*[contains(@id, "main_image")]/@id').re(r'(\d+)$')
        if not identifier:
            continue

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', identifier[0])

        image_src = tile.select(
            './/img[contains(@id, "main_image")]/@data-src').extract()[0]
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_src))

        title = tile.select(
            './/span[contains(@class, "productTitle")]/a/text()'
        ).extract()[0]
        product_loader.add_value('name', title)

        # max() keeps the first of equally long candidates; the leading ''
        # is the result when the name contains no numeric run at all.
        numeric_runs = re.findall(r"([\d,\.]+)", title)
        product_loader.add_value('sku', max([''] + numeric_runs, key=len))

        href = tile.select(
            './/span[contains(@class, "productTitle")]/a/@href'
        ).extract()[0]
        product_loader.add_value('url', urljoin_rfc(base_url, href))

        price_parts = tile.select(
            './/span[@class="price"]/text()').re(r'[\d.,]+')
        product_loader.add_value('price',
                                 extract_price_eu(''.join(price_parts)))
        yield product_loader.load_item()
def parse_cat(self, response):
    """Walk a category page: queue each product and every navigation link.

    Every listed product gets its listing price and a stock of '1' before
    being handed to ``self.fetch_product`` with the product-page request.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for entry in hxs.select('//li[@class="product"]'):
        item = Product()
        price_text = ''.join(
            entry.select('.//div/span[@class="price"]//text()').extract())
        item['price'] = extract_price_eu(price_text)
        item['stock'] = '1'

        product_href = entry.select(
            './/h2[@class="title"]/a/@href').extract()[0]
        request = Request(urljoin_rfc(base_url, product_href),
                          callback=self.parse_product,
                          meta=response.meta)
        yield self.fetch_product(request, self.add_shipping_cost(item))

    for page_href in hxs.select(
            '//div[@class="navigation"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, page_href),
                      callback=self.parse_cat)
def parse_product(self, response):
    """Build one ``Product`` item from a product page.

    Stock is '1' only when the stock-status paragraph shows the
    ``stock_in.png`` image.  The identifier is whatever
    ``self.product_id_regex`` finds in the first script mentioning
    ``id_product``, concatenated into one string.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)

    displayed_price = ''.join(
        hxs.select("//span[@id='our_price_display']//text()").extract())
    # Numeric fragments are re-joined with commas before parsing.
    price_fragments = re.findall('([\d\.,]+)', displayed_price)
    loader.add_value('price', extract_price_eu(','.join(price_fragments)))

    loader.add_value('url', response.url)
    loader.add_xpath('name', "//h1//text()")

    in_stock_img = hxs.select(
        "//p[@id='stock_statut']//img[contains(./@src, 'stock_in.png')]"
    ).extract()
    loader.add_value('stock', '1' if in_stock_img else '0')

    loader.add_xpath(
        'category', "//div[@class='breadcrumb']/a[1 < position()]/text()")
    loader.add_xpath(
        'brand',
        "//div[@id='block_link_manu']//p[contains(., 'Voir tous les produits')]//a/text()"
    )
    loader.add_value('shipping_cost', "0")

    reference = ''.join(
        hxs.select("(//h2[@id='product_reference']//text())[2]").extract())
    loader.add_value('sku', reference.strip())

    id_script = ''.join(
        hxs.select(
            "(//script[contains(., 'id_product')]//text())[1]").extract())
    id_matches = self.product_id_regex.findall(id_script)
    loader.add_value('identifier', ''.join(id_matches))

    loader.add_xpath('image_url', "//img[@id='bigpic']/@src")
    yield loader.load_item()
def parse(self, response):
    """Crawl a shop listing: follow categories, pages and product tiles.

    Category links and page-number links are re-parsed by this callback.
    Product tiles without a price are skipped; the others get their listing
    price and a stock flag (0 when an "out-of-stock-label" element is
    present, 1 otherwise) before being passed to ``self.fetch_product``.

    :param response: a listing page.
    :returns: generator of requests / ``fetch_product`` results.
    """
    base_url = get_base_url(response)

    for href in response.css('div.product-category a::attr(href)').extract():
        yield Request(urljoin_rfc(base_url, href), callback=self.parse)

    for href in response.css('a.page-number::attr(href)').extract():
        yield Request(urljoin_rfc(base_url, href), callback=self.parse)

    for productxs in response.css('div.product'):
        price = productxs.css('span.amount::text').extract_first()
        if not price:
            continue

        product = Product()
        product['price'] = extract_price_eu(price)
        # .xpath() replaces the deprecated .select() alias on Selector
        # objects and matches the .css() calls used above.
        out_of_stock = productxs.xpath(
            './/div[contains(@class, "out-of-stock-label")]')
        product['stock'] = 0 if out_of_stock else 1

        product_href = productxs.xpath('.//a/@href').extract()[0]
        request = Request(urljoin_rfc(base_url, product_href),
                          callback=self.parse_product,
                          meta=response.meta)
        yield self.fetch_product(request, self.add_shipping_cost(product))
def parse(self, response):
    """Crawl menu, pagination and search-result products.

    Menu and pagination links are re-parsed by this callback.  For each
    result tile the last tax-inclusive SEK price is parsed and passed to
    ``self.parse_product`` via ``meta['price']``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    menu_links = hxs.select('//ul[@class="menu"]//a/@href').extract()
    for href in menu_links:
        yield Request(urljoin_rfc(base_url, href), callback=self.parse)

    page_links = hxs.select('//p[@class="pagination"]//a/@href').extract()
    for href in page_links:
        yield Request(urljoin_rfc(base_url, href), callback=self.parse)

    tiles = hxs.select(
        '//div[@id="search_results_products"]/div[starts-with(@id, "product_")]'
    )
    for tile in tiles:
        sek_prices = tile.select(
            './/div[contains(@class,"product_price") and @class!="product_price_percentage_saved"]//span[@class="inc"]/span[@class="SEK"]/text()'
        ).extract()
        price = extract_price_eu(sek_prices[-1])
        product_href = tile.select('.//a/@href').extract()[0]
        yield Request(urljoin_rfc(base_url, product_href),
                      callback=self.parse_product,
                      meta={'price': price})
def parse_product(self, response):
    """Build one ``Product`` item from a product page.

    Identifier, SKU and name are mandatory (a missing node raises
    ``IndexError``).  The category skips the first breadcrumb entry.  Stock
    is always stored as 1: the "En stock" marker is selected but the check
    that used it is disabled.
    """
    hxs = HtmlXPathSelector(response)

    identifier = hxs.select(
        '//input[@name="product_id"]/@value').extract()[0]
    sku = hxs.select(
        '//div[@class="texte_zoom"]/div/div/h2/text()').extract()[0]
    category = hxs.select(
        '//div[@class="breadParent"]/ol/li/a/span/text()').extract()[1:]
    name = hxs.select('//h1/span/text()').extract()[0].strip()
    brand = hxs.select(
        '//div[@class="texte_zoom"]/div/div/a/img/@alt').extract()

    price_text = "".join(
        hxs.select(
            '//div[@id="prixZoom"]//div[@class="ttc"]/span[@itemprop="price"]/span/text()'
        ).extract())
    price = price_text.strip().replace(' ', '')

    image_url = hxs.select(
        '//div[@class="photos"]//img[@itemprop="image"]/@src').extract()
    # Selected but currently unused (see the disabled check below).
    stock = hxs.select(
        '//div[contains(@name,"dispodiv")]/span[contains(text(),"En stock")]'
    )

    l = ProductLoader(item=Product(), response=response)
    l.add_value('identifier', identifier)
    l.add_value('name', name)
    l.add_value('category', category)
    if brand:
        l.add_value('brand', brand)
    l.add_value('sku', sku)
    l.add_value('url', response.url)
    l.add_value('price', extract_price_eu(price))
    # if not stock:
    l.add_value('stock', 1)
    if image_url:
        absolute_image = urljoin_rfc(get_base_url(response),
                                     image_url[0].strip())
        l.add_value('image_url', absolute_image)
    yield l.load_item()
def parse(self, response):
    """Queue every article box on the page and follow the first "more" link.

    A listing price of 0 marks the product as out of stock ('0'); any other
    price means in stock ('1').
    """
    hxs = HtmlXPathSelector(response)

    for box in hxs.select('//div[@class="artbox"]'):
        item = Product()
        price_text = ''.join(
            box.select('.//span[@class="price"]//text()').extract())
        item['price'] = extract_price_eu(price_text)
        item['stock'] = '0' if item['price'] == 0 else '1'

        product_href = box.select('.//a[@class="title"]/@href').extract()[0]
        request = Request(urljoin_rfc(get_base_url(response), product_href),
                          callback=self.parse_product,
                          meta=response.meta)
        yield self.fetch_product(request, item)

    more_links = hxs.select(
        '//div[@class="paging"]//a[@class="navi more"]/@href').extract()
    if more_links:
        # First link only
        yield Request(urljoin_rfc(get_base_url(response), more_links[0]))
def parse_product(self, response):
    """Build one ``Product`` item from a product page.

    The price is the second whitespace-separated token of the first
    ``span.price`` text, the image is the second ``<img>`` on the page and
    the category is the second breadcrumb link.  SKU and identifier both
    come from the cell next to the "Codice prodotto" label.  Stock is
    always 1 and shipping cost always 0.

    :param response: the product page.
    :returns: generator yielding the loaded item.
    :raises IndexError: when the price, image, category or brand node is
        missing from the page.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(selector=hxs, item=Product())

    loader.add_value('name', hxs.select('//h1//text()').extract())
    loader.add_value('url', response.url)

    price = hxs.select('//span[@class="price"]/text()').extract()[0]
    loader.add_value('price', extract_price_eu(price.split()[1]))
    loader.add_value('shipping_cost', 0)

    image_url = hxs.select('//img/@src').extract()[1]
    loader.add_value('image_url', urljoin(base_url, image_url))

    category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()[1]
    loader.add_value('category', category.strip())

    brand = hxs.select(
        '//td[text()="Produttore"]/following-sibling::td[1]/a/text()'
    ).extract()[0].strip()
    loader.add_value('brand', brand)
    loader.add_value('stock', 1)

    # The same table cell supplies both fields.  (An earlier, never-used
    # lookup of the SKU by table position has been dropped.)
    code_xpath = (
        '//td[text()="Codice prodotto"]/following-sibling::td[1]/text()')
    loader.add_xpath('sku', code_xpath)
    loader.add_xpath('identifier', code_xpath)
    yield loader.load_item()
def parse_product(self, response):
    """Follow product variants and yield the product shown on this page.

    Shipping is '15' for prices below 1000 and '0' otherwise.  When the
    page shows no product code, the SKU falls back to the tail of the URL.
    The identifier is the last path segment of the URL.
    """
    hxs = HtmlXPathSelector(response)

    # Options
    variant_links = hxs.select(
        '//h3[contains(text(), "Warianty produktu")]/..//h2[@class="producttitlesimple"]/a/@href'
    ).extract()
    for href in variant_links:
        yield Request(urlparse.urljoin(get_base_url(response), href),
                      callback=self.parse_product)

    loader = ProductLoader(item=Product(), response=response)

    price_text = ''.join(
        hxs.select('//span[@itemprop="price"]/text()').extract())
    price = extract_price_eu(price_text.replace(' ', ''))
    loader.add_value('price', price)
    loader.add_value('url', response.url)
    loader.add_xpath('name',
                     '//h1[contains(@class, "productbittitle")]/text()')

    available = hxs.select(
        '//div[@class="clearfix hidden-xs"]/a[@class="avail"]')
    loader.add_value('stock', '1' if available else '0')

    # The first and last breadcrumb entries are left out.
    breadcrumb_titles = hxs.select(
        '//ol[contains(@class, "breadcrumb")]/a/@title').extract()
    loader.add_value('category', breadcrumb_titles[1:-1])
    loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')

    shipping_cost = '15' if price < Decimal(1000) else '0'
    loader.add_value('shipping_cost', shipping_cost)

    sku = ''.join(
        hxs.select('//p[@class="productcode"]/strong/text()').extract())
    if not sku:
        url_tail = re.findall(re.compile("\/(\d*.)$"), response.url)
        sku = url_tail[0] if url_tail else ''
    loader.add_value('sku', sku.strip())

    loader.add_value('identifier', response.url.split("/")[-1])
    loader.add_xpath('image_url', "//div[@id='main-photo']//img/@src")
    yield loader.load_item()
def parse(self, response):
    """Queue every article on the page and follow all paging links.

    The price comes from the ``alt`` text of the price image.  An article
    is out of stock ('0') when its status image carries the
    ``pagespeed_url_hash`` "2593193988"; otherwise stock is '1'.  The
    product URL is the link target with its query string cut off.
    """
    hxs = HtmlXPathSelector(response)

    for article in hxs.select('//div[@class="single_article"]'):
        item = Product()
        price_alt = ''.join(
            article.select('.//div[@class="price"]/img/@alt').extract())
        item['price'] = extract_price_eu(price_alt)

        sold_out_icon = article.select(
            './/div[@class="status"]/img[contains(@pagespeed_url_hash,"2593193988")]'
        )
        item['stock'] = '0' if sold_out_icon else '1'

        product_href = article.select(
            'substring-before(./a/@href,"?")').extract()[0]
        request = Request(urljoin_rfc(get_base_url(response), product_href),
                          callback=self.parse_product,
                          meta=response.meta)
        yield self.fetch_product(request, item)

    for page_href in hxs.select('//div[@class="paging"]//a/@href').extract():
        yield Request(urljoin_rfc(get_base_url(response), page_href))
def parse_product(self, response):
    """Complete a product item from its detail page.

    Starts from ``response.meta['product']`` when present, otherwise from a
    new ``Product``.  A missing SKU is only logged.  Stock is set to 0 when
    the success label does not mention "lager"; it is left untouched
    otherwise.  The result is passed through ``self.add_shipping_cost``.
    """
    base_item = response.meta.get('product', Product())
    loader = ProductLoader(item=base_item, response=response)
    loader.add_xpath('identifier', '//input[@name="id"]/@value')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1//text()')

    # Article numbers are runs of at least three digits.
    article_numbers = response.xpath(
        '//div[@class="basic-content-body"]//dt[contains(text(), "Artikelnummer")]'
        '/following-sibling::dd/text()').re(r'(\d{3}\d*)')
    if article_numbers:
        loader.add_value('sku', article_numbers)
    else:
        self.log('No SKU for %s' % (response.url))

    loader.add_xpath(
        'category',
        '//ul[contains(@class, "breadcrumbs")]/li[position()=last()-1]/a/text()'
    )

    images = response.xpath('//img[@itemprop="image"]/@src').extract()
    if images:
        loader.add_value('image_url', response.urljoin(images[0]))

    price_parts = response.xpath(
        'normalize-space(//*[@itemprop="price"]/text())').re(r'([\d.,]+)')
    loader.add_value('price', extract_price_eu(''.join(price_parts)))
    loader.add_value('brand', 'Lego')

    stock_labels = response.xpath(
        '//div[@class="product-info"]//em[@class="mod-success"]//text()'
    ).re(r'lager')
    if not bool(stock_labels):
        loader.add_value('stock', 0)

    yield self.add_shipping_cost(loader.load_item())
def parse_product(self, response):
    """Build the product (and its table variants) from a product page.

    Pages without a name yield nothing.  The brand is the first word of the
    name, except that a leading "2" maps to "Flynn".  The category comes
    from ``self.made_products`` (keyed by upper-cased SKU) when the SKU is
    known there, otherwise from the breadcrumbs; if it is still empty it is
    derived from the URL path and ``no_category`` is set.  Known category
    names are then mapped through a fixed table.  When the page has a
    "super-product-table", one item per priced row is produced instead of
    the base product.  All items go through ``self.yield_product``.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    if not loader.get_output_value('identifier'):
        # Fall back to the id suffix of the price element.
        loader.add_xpath(
            'identifier',
            'substring-after(//span[starts-with(@id,"product-price-")]/@id, "product-price-")'
        )
    loader.add_xpath(
        'sku', '//tr/th[contains(text(),"Artikelnummer")]/../td/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//div[@itemprop="name"]//text()')
    loader.add_xpath('image_url', '//meta[@itemprop="image"]/@content')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')

    delivery_prices = hxs.select(
        '//th[contains(text(),"Standardlieferpreis")]//following-sibling::td/span[@class="price"]/text()'
    ).extract()
    if delivery_prices:
        loader.add_value('shipping_cost',
                         extract_price_eu(delivery_prices[0]))

    if not loader.get_output_value('name'):
        return

    first_word = loader.get_output_value('name').split()[0]
    loader.add_value('brand', 'Flynn' if first_word == '2' else first_word)

    in_stock = hxs.select(
        '//span[@itemprop="availability" and @content="in_stock"]')
    loader.add_value('stock', '1' if in_stock else '0')

    sku = loader.get_output_value('sku')
    sku = sku.upper().strip() if sku else ''
    made_product = self.made_products.get(sku, None)

    no_category = False
    if made_product:
        loader.add_value('category', made_product['Category'])
    else:
        loader.add_xpath(
            'category',
            '//div[@class="breadcrumbs"]/ul/li[position()>1]/a/span/text()')
    if not loader.get_output_value('category'):
        # Last resort: use the URL path segments as the category.
        loader.add_value('category',
                         (x.replace('-', ' ')
                          for x in response.url.split('/')[3:-1]))
        no_category = True

    product = loader.load_item()

    catmap = {
        "bedding and bath": "Bed & Bath",
        "beds": "Beds",
        "chairs": "Chairs",
        "homewares accessories": "Home Accessories",
        "lighting": "Lighting",
        "sofas and armchairs": "Sofas",
        "storage": "Storage",
        "tables": "Tables",
    }
    product['category'] = catmap.get(product['category'],
                                     product['category'])

    metadata = MadeMeta()
    if made_product:
        metadata['johnlewis_code'] = made_product['JL product code']
        metadata['next_code'] = made_product['Next product code']
    else:
        metadata['johnlewis_code'] = ''
        metadata['next_code'] = ''
    product['metadata'] = metadata

    variant_rows = hxs.select(
        '//table[@id="super-product-table"]//tr/td[@class="price"]/..')
    if not variant_rows:
        for x in self.yield_product(product, no_category):
            yield x
        return

    for row in variant_rows:
        # Each variant starts as a copy of the base product.
        row_loader = ProductLoader(item=Product(product), selector=row)
        row_loader.add_xpath(
            'identifier',
            'substring-after(.//span[starts-with(@id,"product-price-")]/@id, "product-price-")'
        )
        row_loader.add_value('name', product['name'])
        row_loader.add_xpath('name', './/td[1]/text()')
        row_loader.add_xpath('price', './/span[@property="price"]/@content')
        for x in self.yield_product(row_loader.load_item(), no_category):
            yield x
def extract_product_info(self, product):
    """Collect the listing data of one product tile into a dict.

    :param product: selector of the product node inside a listing page.
    :returns: dict with the keys ``url``, ``marketplace``, ``price``,
        ``promotion_price``, ``out_stock``, ``exclusive_online``,
        ``shipping``, ``identifier``, ``offers_url``, ``offers_count``,
        ``seller``, ``seller_identifier``, ``combined_identifier`` and
        ``is_used``.  ``price`` / ``promotion_price`` stay empty lists when
        the page shows none; ``shipping`` is ``''`` when not shown.
    :raises IndexError: when the tile contains no link.

    Side effect: for marketplace offers the seller id is recorded in
    ``self.seller_ids`` under the lower-cased seller name.
    """
    prod_url = product.xpath('.//a/@href').extract()[0]
    price_section = product.xpath('./../../following-sibling::div')
    marketplace = len(price_section.xpath('.//*[@class="seller"]')) > 0

    is_used = False
    if marketplace:
        state = product.xpath(
            '../..//*[@class="shipping"]/ul/li[span[contains(text(), "Estado")]]/strong/text()'
        ).extract_first()
        # Any stated condition other than "novo" counts as used.
        if state and 'novo' not in state.lower():
            is_used = True

    price = price_section.xpath(
        './/*[@class="floatl"]//*[@class="userPrice"]/text()').extract()
    if price:
        price = price[0].replace(u'\xa0', '')
        price = extract_price_eu(price)

    promotion_price = price_section.xpath(
        './/*[@class="floatl"]//*[@class="oldPrice"]/text()').extract()
    if promotion_price:
        promotion_price = promotion_price[0].replace(u'\xa0', '')
        promotion_price = extract_price_eu(promotion_price)

    offers_url = None
    offers_count = 0
    offers_links = price_section.css('.OffersSumary').xpath(".//a")
    for a in offers_links:
        # A link without a text node yields None; treat it as empty text
        # instead of failing on ``None.strip()``.
        link_title = (a.xpath("text()").extract_first() or u'').strip()
        if u'segunda' in link_title or u'novo' not in link_title:
            continue
        offers_url = a.xpath("@href").extract_first()
        count_text = a.xpath("text()").re_first(u"(?u)(\\d*)\\s*novo")
        # The digit group may be empty ("novo" without a number), which
        # int() cannot parse; count such a link as zero offers.
        offers_count = int(count_text) if count_text else 0

    seller = None
    seller_identifier = None
    if marketplace:
        seller = price_section.xpath(
            './/a[@class="seller"]/text()').extract_first().strip()
        seller_url = price_section.xpath(
            './/a[@class="seller"]/@href').extract_first()
        seller_identifier = get_seller_id_from_url(seller_url)
        self.seller_ids[seller.lower()] = seller_identifier

    out_stock = len(product.xpath('./..//*[@class="Nodispo"]').extract()) > 0

    dispo = product.xpath('./../..').css('.sellerInfos > li')
    if dispo:
        dispo = ' '.join(dispo.css('.Dispo-txt').xpath("text()").extract())
    else:
        dispo = ''
    exclusive_online = u'exclusivo internet' in dispo.lower()

    if marketplace:
        shipping = product.xpath(
            '../..//*[@class="shipping"]/ul/li[span[contains(text(), "Portes")]]/strong/text()'
        )
    else:
        shipping = product.xpath(
            './..//*[@class="Delivery-price"]//text()')
    if shipping:
        shipping = extract_price_eu(''.join(shipping.extract()))
    else:
        shipping = ''

    # Marketplace URLs carry "/mp<digits>/", regular ones end in "/a<digits>".
    identifier = re.search(r'/mp(\d+)/', prod_url)
    if not identifier:
        identifier = re.search(r'/a(\d+)$', prod_url)
    if identifier:
        identifier = 'fcom' + identifier.groups()[0]
    else:
        self.log('Identifier not found {}'.format(prod_url))

    # NOTE(review): when no identifier was found and this is a marketplace
    # offer, the concatenation below raises TypeError (identifier is None).
    # Left unchanged because the intended fallback is not visible here.
    if marketplace:
        combined_identifier = identifier + '-' + seller_identifier
    else:
        combined_identifier = identifier
    combined_identifier = self.get_identifier(combined_identifier)

    result = {
        'url': prod_url,
        'marketplace': marketplace,
        'price': price,
        'promotion_price': promotion_price,
        'out_stock': out_stock,
        'exclusive_online': exclusive_online,
        'shipping': shipping,
        'identifier': identifier,
        'offers_url': offers_url,
        'offers_count': offers_count,
        'seller': seller,
        'seller_identifier': seller_identifier,
        'combined_identifier': combined_identifier,
        'is_used': is_used,
    }
    return result
def parse_product(self, response):
    """Yield the product on this page, or one item per priced option.

    Pages without brand or name, pages flagged as "currently not
    available" and pages without a price yield nothing (the first and last
    case are logged).  The manufacturer item number is stored XOR-encoded
    in a ``data-xencoded`` attribute as ``<key>:<data>`` and is decoded
    here to obtain the SKU.  An identifier is yielded only once per spider
    run (tracked in ``self.identifiers``).

    :param response: the product page.
    :returns: generator yielding ``Product`` items.
    """
    try:
        brand_name = response.xpath(
            '//span[@class="manufacturer"]/text()').extract()[0]
        name = response.xpath(
            '//div[@id="product-box"]//div[@class="title"]/text()'
        ).extract()[0].strip()
    except IndexError:
        # Either node is missing from the page.
        self.log('No brand or name found: %s' % response.url)
        return

    if response.xpath(
            '//div[@class="no-valid-variants" and contains(text(), "this item is currently not available")]'
    ):
        return

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_value('name', brand_name + ' ' + name)

    sku = response.xpath(
        '////div[@class="additional-product-no"]/@data-xencoded').extract()
    if sku:
        sku = sku[0]
        h = HTMLParser.HTMLParser()
        key, data = sku.split(':', 1)
        key = int(key)
        data = h.unescape(data)
        # XOR decoding
        data = [ord(c) ^ key for c in data]
        data = ''.join([chr(c) for c in data])
        sku = re.search(r'Manufacturer Item no\. (.*)', data)
        if sku:
            # e.g. 'Hersteller Artikelnr: 20050/20051'
            sku = sku.group(1)
            product_loader.add_value('sku', sku)

    identifier = response.xpath(
        '//input[@name="vw_id"]/@value').extract()[0]
    product_loader.add_value('identifier', identifier)

    price = response.xpath(
        '//div[@class="current-price"]/span[@class="price"]/text()'
    ).extract()
    if not price:
        price = response.xpath(
            '//table[@class="product-price"]//tr[@class="price"]/td/text()'
        ).extract()
    if not price:
        self.log('No product price found: %s' % response.url)
        return
    price = price[0]
    product_loader.add_value('price', extract_price_eu(price))

    category = response.css('.uk-breadcrumb a::text').extract()[-1]
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand_name.strip())

    # The image is optional: check for it instead of swallowing every
    # exception with a bare ``except``.
    image_src = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_src:
        product_loader.add_value('image_url',
                                 response.urljoin(image_src[0]))

    product = product_loader.load_item()

    options = response.xpath(
        '//div[contains(@id,"artikel_element_prices")]')
    if not options:
        if product['identifier'] not in self.identifiers:
            self.identifiers.append(product['identifier'])
            yield product
        return

    for opt in options:
        p = Product(product)
        p['name'] = opt.xpath(
            './/meta[@itemprop="name"]/@content').extract()[0]
        p['price'] = extract_price(
            opt.xpath('.//meta[@itemprop="price"]/@content').extract()[0])
        # The option identifier extends the base one with the element id
        # suffix.
        p['identifier'] = p['identifier'] + '-' + opt.xpath('@id').re(
            'artikel_element_prices(.*)')[0]
        if p['identifier'] not in self.identifiers:
            self.identifiers.append(p['identifier'])
            yield p
def parse_product(self, response):
    """Scrape a product page and yield the product or its pack options.

    The ``HeaderMinPrices`` span may list several " / "-separated offers
    ("<label> linser per ... kr <price>").  With more than one offer, one
    item is yielded per offer, the label being appended to the name and
    (without whitespace) to the identifier/sku.  Otherwise a single item
    is yielded whose price is the first run of digits after "kr" in the
    first price text found on the page.
    """
    base_url = get_base_url(response)
    product_name = response.xpath(
        '//span[contains(@id, "uxProductName")]/text()').extract()[0]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_xpath(
        'price',
        '//div[contains(@class, "product-view__total-price")]/@data-price')

    images = response.xpath(
        '//img[contains(@id, "uxProductImage")]/@src').extract()
    if images:
        loader.add_value('image_url', images[0])

    loader.add_xpath(
        'brand',
        '//tr[td[contains(text(), "Produsent")]]/td[not(contains(text(), "Produsent"))]/text()'
    )
    loader.add_value('category', response.xpath(
        '//tr[td[contains(text(), "Linsetype")]]/td[not(contains(text(), "Linsetype"))]/text()'
    ).extract())
    loader.add_value('url', response.url)

    # The numeric product id is embedded in inline javascript; two
    # spellings are tried in turn.
    id_matches = (re.findall("return '(\d+)';", response.body)
                  or re.findall("var productId = (\d+);", response.body))
    product_id = id_matches[0]
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)
    item = loader.load_item()

    header_prices = response.xpath(
        '//span[@class="HeaderMinPrices"]/text()').extract()
    offers = header_prices[0].split(' / ') if header_prices else []

    if len(offers) > 1:
        for offer in offers:
            variant = deepcopy(item)
            label = re.findall('(.*) linser per', offer)[0]
            variant['name'] += ' ' + label
            variant['identifier'] += '-' + ''.join(label.split())
            variant['sku'] = variant['identifier']
            offer_price = re.findall('kr (.*)', offer)
            variant['price'] = extract_price_eu(offer_price[0])
            yield variant
        return

    # Single offer: take the first price text available, falling back to
    # the two description blocks.
    price_texts = header_prices
    if not price_texts:
        price_texts = response.xpath(
            '//div[@class="DescriptionExtraAccessories"]//span[contains(text(), "Kr")]/text()'
        ).extract()
    if not price_texts:
        price_texts = response.xpath(
            '//div[@class="DescriptionExtra"]//span[contains(text(), "Kr") or contains(text(), "kr")]/text()'
        ).extract()
    lowered = price_texts[0].lower()
    digits = re.findall('\d+', lowered.split('kr')[-1])[0]
    item['price'] = extract_price_eu(digits)
    yield item
def extract_price(self, price):
    """Parse *price* with the European-format price parser.

    Overrides the default price extraction because the French site writes
    numbers as ``#.###,##``.
    """
    parsed = extract_price_eu(price)
    return parsed
def parse_product(self, response):
    """Scrape a configurable product page.

    Steps, in order:

    1. Return immediately when the page has no ``div#product``.
    2. Request every link found under ``ul.options-types`` (re-entering
       this callback).
    3. Build the product from the ``tc_vars[...]`` assignments found in the
       inline scripts and append the currently selected option labels to
       its name.  The product is yielded only if every ``ul`` option group
       had a checked input.
    4. For every combination of the collected options, request the
       ``productCombinationUpdate`` URL (again handled by this callback).
    """
    if not response.xpath('//div[@id="product"]'):
        return

    for url in response.xpath('//ul[@class="options-types"]//a/@href').extract():
        yield Request(response.urljoin(url), callback=self.parse_product)

    loader = ProductLoader(item=Product(), response=response)
    xpath = '//script/text()'
    pattern = r"tc_vars\['%s'\] = '(.+)'"
    loader.add_xpath('identifier', xpath, re=pattern % 'product_id')
    loader.add_xpath('sku', xpath, re=pattern % 'product_id')
    loader.add_xpath('name', xpath, re=pattern % 'product_name')
    image_url = response.xpath(xpath).re(pattern % 'product_url_picture')
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))
    loader.add_xpath('url', xpath, re=pattern % 'product_url_page')
    loader.add_xpath('price', xpath, re=pattern % 'product_totalprice_ati')
    categories = response.xpath('//nav[@class="breadcrumb"]//span/text()').extract()
    # Drop the first and the last breadcrumb entries.
    loader.add_value('category', categories[1:-1])
    loader.add_value('stock', int(response.xpath(xpath).re(pattern % 'product_instock')[0] == 'Y'))
    product = loader.load_item()

    # Stays True only while every <ul> option group has a checked input.
    # It must not share a name with any loop variable below, otherwise the
    # flag is silently overwritten by a (truthy) selector.
    selection_complete = True
    # One entry per option group: either a list of <li> selectors or a
    # list of dicts describing the <option>s of a <select>.
    options = []
    for group in response.xpath('//div[@class="options-list"]/form/div'):
        for ul in group.xpath('./ul'):
            options.append(ul.xpath('./li'))
            try:
                product['name'] += ' ' + ul.xpath('.//input[@checked]/../label/text()').extract()[0]
            except (IndexError, KeyError):
                selection_complete = False

        if not group.xpath('./div'):
            continue

        if group.xpath('./div/ul'):
            options.append(group.xpath('.//li'))
            product['name'] += ' ' + group.xpath('.//input[@checked]/../label/text()').extract()[0]
        elif group.xpath('./div/select'):
            # Group heading with its leading "<digits>." stripped.
            group_title = group.xpath('./h2/text()').re(r'\d*\.(.+)')[0].strip()
            option = {'name': group_title}
            option['url_name'] = group.xpath('.//select/@name').extract()[0]
            option['price'] = extract_price_eu(group.xpath('.//label/text()').extract()[0])
            select_options = []
            for option_node in group.xpath('.//select/option'):
                entry = option.copy()
                entry['selector'] = option_node
                select_options.append(entry)
            options.append(select_options)
            product['name'] += ' ' + group_title
            product['name'] += ' ' + group.xpath('.//option[@selected]/text()').extract()[0]

    if selection_complete:
        yield product

    struct_id = response.xpath('//form[@id="formCombinationOptions"]/input[1]/@value').extract()
    if not struct_id:
        return
    struct_id = struct_id[0]
    struct_name = response.xpath('//form[@id="formCombinationOptions"]/input[1]/@name').extract()[0]
    url_pattern = 'http://www.maisonsdumonde.com/FR/fr/%s/productCombinationUpdate?%s=%s' % (struct_id, struct_name, struct_id)

    for variant in itertools.product(*options):
        # NOTE(review): `item` is filled in below but never yielded; only the
        # combination URL is requested.  Kept as-is - confirm whether the
        # item was meant to be emitted or whether this bookkeeping is dead.
        item = Product(product)
        url = url_pattern
        head_id = ''
        for option in variant:
            if isinstance(option, dict):
                # <select> option: its value doubles as the quantity.
                url += '&' + option['url_name'] + '=' + option['selector'].xpath('.//@value').extract()[0]
                item['name'] += ' ' + option['name'] + ' ' + option['selector'].xpath('./text()').extract()[0]
                quantity = option['selector'].xpath('./@value').extract()[0]
                item['identifier'] += '-' + quantity
                item['price'] += option['price'] * int(quantity)
            else:
                # <li> option holding an <input>.
                if option.xpath('.//@data-headref'):
                    head_id = option.xpath('.//@data-headref').extract()[0]
                url += '&' + option.xpath('.//@name').extract()[0] + '=' + option.xpath('.//@value').extract()[0]
                item['name'] += ' ' + option.xpath('.//label/text()').extract()[0]
                item['identifier'] += '-' + option.xpath('.//input/@value').extract()[0]
                price = option.xpath('.//span[@class="price"]/text()[preceding-sibling::br]').extract()
                if price:
                    item['price'] += extract_price_eu(price[0])
        if head_id:
            url += '&combinationProduct[head]=%s' % head_id
        yield Request(url, callback=self.parse_product)
def _get_price(self, hxs):
    """Return the price read from the first ``itemprop="price"`` text node.

    ``Decimal('0.0')`` is returned when no such text node exists.
    """
    price_texts = hxs.select('//*[@itemprop="price"]/text()').extract()
    return extract_price_eu(price_texts[0]) if price_texts else Decimal('0.0')
def parse_product(self, response):
    """Scrape a product page, yielding one item per selectable option.

    When the page has an ``Egenskap1`` select, every
    ``Vektor_Rubrikartikel[n] = '...'`` javascript assignment is treated as
    one option: its payload is split on ``!div!`` and name, identifier,
    price markup and stock text are read from fixed positions.  Without
    that select a single item is built from the page elements.

    ``stock`` is set to 0 only when the stock text says
    'Midlertidig utsolgt'; otherwise it is left unset.
    """
    hxs = HtmlXPathSelector(response)
    if 'Egenskap2' in response.body:
        self.log('SECOND!!!!!!!!')

    has_options = hxs.select('//*[@id="OrderFalt"]//select[@name="Egenskap1"]')
    img = hxs.select('//img[@itemprop="image"]/@src').extract()
    # Resolve the image once so that every option gets it; popping from the
    # list inside the option loop would leave all but the first option
    # without an image.
    image_url = urljoin_rfc(get_base_url(response), img[-1]) if img else None
    category = hxs.select('//*[@id="breadcrumb"]//a/text()').extract()[1:]
    brand = hxs.select('//*[@id="VarumarkeFalt"]/a/img/@alt').extract()
    brand = brand[0] if brand else ''

    if has_options:
        search_flags = re.DOTALL | re.IGNORECASE | re.MULTILINE
        for option_match in re.finditer(r"(?sim)Vektor_Rubrikartikel\[\d+\] = '(.*?)';", response.body_as_unicode()):
            loader = ProductLoader(item=Product(), selector=hxs)
            option = option_match.group(1).split('!div!')
            name = option[2]
            product_identifier = option[4]

            # The 'PrisREA' span is tried first, then 'PrisBOLD'
            # (presumably sale price before regular price - TODO confirm).
            price_match = re.search(r'<span class="PrisREA">(\d+)<span>', option[1], search_flags)
            if not price_match:
                price_match = re.search(r'<span class="PrisBOLD">(\d+)<span>', option[1], search_flags)
            if price_match:
                result = price_match.group(1)
            else:
                self.log('ERROR!!!! NO price!')
                result = '0'
            price = extract_price_eu(result)

            stock = option[6]
            if 'Midlertidig utsolgt' in stock:
                loader.add_value('stock', 0)
            loader.add_value('identifier', product_identifier)
            loader.add_value('sku', product_identifier)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            if image_url:
                loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            loader.add_value('brand', brand)
            yield loader.load_item()
    else:
        loader = ProductLoader(item=Product(), selector=hxs)
        product_identifier = hxs.select('//*[@id="ArtnrFalt"]/text()').extract()[0]
        loader.add_value('identifier', product_identifier)
        loader.add_value('sku', product_identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//*[@id="ArtikelnamnFalt"]/text()')
        price = ''.join(hxs.select('//*[@id="PrisFalt"]/meta[@itemprop="price"]/@content').extract())
        price = extract_price_eu(price)
        loader.add_value('price', price)
        if image_url:
            loader.add_value('image_url', image_url)
        loader.add_value('category', category)
        loader.add_value('brand', brand)
        stock = hxs.select('//*[@id="LevtidFaltMeta"]/@content').extract()[0].strip()
        if stock == 'Midlertidig utsolgt':
            loader.add_value('stock', 0)
        yield loader.load_item()
def parse_offers_static_page(self, response):
    """Yield one product per marketplace offer row of the ``#colsMP`` table.

    The first table row is skipped.  Rows are also skipped (with a log
    line) when they are Fnac's own offer, when their status text does not
    contain 'novo', or when price or dealer is missing.  Each yielded
    product copies the shared fields from ``response.meta['product_info']``
    and carries a ``SonaeMeta`` with promotion bookkeeping derived from the
    previous crawl's metadata in ``self.metadata_``.
    """
    rows = response.css('#colsMP tr')
    if rows:
        rows = rows[1:]

    exclusive_online = bool(response.meta.get('exclusive_online'))
    product_info = response.meta['product_info']

    base_identifier = product_info['base_identifier'].replace('mp', '')
    if 'fcom' not in base_identifier:
        base_identifier = 'fcom' + base_identifier
    self.seen.add(base_identifier.replace('fcom', ''))

    shared_fields = ['name', 'category', 'brand', 'url', 'image_url', 'sku']

    for row in rows:
        if row.css('.fnacView'):
            self.log('Skipping Fnac direct product')
            continue

        status = row.css('td.gras').xpath('./text()').extract()
        if status and 'novo' not in status[0].lower():
            self.log('Skipping used product')
            continue

        price_texts = row.css('.userPrice').xpath('./text()').extract()
        if not price_texts:
            self.log('Price not found')
            continue
        price = price_texts[0].replace(u'\xa0', '').strip()

        # Both values stay an empty list when the row has no such cell.
        promotion_price = row.css('.oldPrice').xpath('./text()').extract()
        if promotion_price:
            promotion_price = extract_price_eu(promotion_price[0].replace(u'\xa0', '').strip())
        shipping_cost = row.css('.noir').xpath('./text()').extract()
        if shipping_cost:
            shipping_cost = extract_price_eu(shipping_cost[0].strip())

        dealer = row.css('.bleu_MP')
        if not dealer:
            self.log('Dealer not found')
            continue
        dealer_id = dealer.xpath('./a/@href').extract()[0].split('/')[-1]
        dealer_name = dealer.xpath('./a/strong/text()').extract()[0].strip()

        loader = ProductLoader(item=Product(), selector=row)
        identifier = self.get_identifier(base_identifier + '-' + dealer_id)
        loader.add_value('identifier', identifier)
        loader.add_value('dealer', dealer_name)
        for field in shared_fields:
            loader.add_value(field, product_info[field])
        loader.add_value('price', price)
        if shipping_cost:
            loader.add_value('shipping_cost', shipping_cost)
        product = loader.load_item()

        metadata = SonaeMeta()
        if exclusive_online:
            metadata['exclusive_online'] = 'Yes'
        metadata['delivery_24_48'] = 'Yes'
        if promotion_price:
            metadata['promotion_price'] = str(promotion_price)
        product['metadata'] = metadata

        prev_meta = self.metadata_[identifier] if identifier in self.metadata_ else {}
        promo_start = prev_meta.get('promo_start')
        promo_end = prev_meta.get('promo_end')
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        product['metadata']['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

        if promotion_price:
            # A running promotion keeps its original start date; otherwise
            # the promotion starts today.
            if promo_start and not promo_end:
                product['metadata']['promo_start'] = promo_start
            else:
                product['metadata']['promo_start'] = today
            product['metadata']['promo_end'] = ''
        elif promo_start:
            # The promotion is over: keep its start and close it today
            # unless an end date was already recorded.
            product['metadata']['promo_start'] = promo_start
            product['metadata']['promo_end'] = promo_end if promo_end else today

        yield product
def parse_product(self, response):
    """Scrape a product page, yielding one item per product combination.

    Nothing is yielded when the page has no ``div#primary_block``.

    The brand is guessed from the first text of the short description
    (the part after ' da ' / ' di ' / ' by '); over-long guesses are
    narrowed against the page title and discarded when still implausible.

    When the page body contains ``addCombination(...)`` javascript calls,
    each call becomes one item whose price is rebuilt from the javascript
    pricing variables (base price, per-combination offset, currency rate,
    tax and reductions).  Otherwise a single item is built from the page
    elements.
    """
    hxs = HtmlXPathSelector(response)
    if not hxs.select('//div[@id="primary_block"]'):
        return

    body = response.body
    product_id = hxs.select('//input[@name="id_product"]/@value').extract()[0]
    name = hxs.select('//div[@id="dfCenter"]//h1/text()').extract()[0]
    category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()[1:]
    # Stays an empty list when the page has no main picture.
    image_url = hxs.select('//img[@id="bigpic"]/@src').extract()
    if image_url:
        image_url = image_url[0]
    product_url = response.url

    # --- brand guess -----------------------------------------------------
    brand_texts = hxs.select('//div[@id="short_description_content"]//p[1]//text()').extract()
    product_brand = brand_texts[0] if brand_texts else ''
    product_brand = product_brand.replace(' di ', ' da ')
    product_brand = product_brand.replace(' by ', ' da ')
    if len(product_brand) > 20:
        brand_match = re.search(' da.+?[,.]', product_brand)
        if brand_match:
            product_brand = brand_match.group(0)
    product_brand = product_brand.split(' da ')[-1]
    product_brand = product_brand.strip().strip('.,')
    if len(product_brand) > 20:
        # Still too long: keep only the longest run shared with the title.
        title = hxs.select('//title/text()').extract()[0]
        matcher = SequenceMatcher(a=product_brand.title(), b=title.title())
        longest = matcher.find_longest_match(0, len(matcher.a), 1, len(matcher.b))
        product_brand = matcher.a[longest[0]:longest[0] + longest[-1]].strip()
        if len(product_brand) < 7 or ' ' not in product_brand:
            product_brand = None

    # --- javascript pricing variables -------------------------------------
    def js_decimal(var_name, default):
        # Value of "var <var_name> ... <number>" in the page, or `default`.
        found = re.search(r'var %s\D+([\d\.]+)' % var_name, body)
        return Decimal(found.group(1)) if found else default

    currency_rate = js_decimal('currencyRate', 1)
    tax_rate = js_decimal('taxRate', 0)
    reduction_percent = js_decimal('reduction_percent', 0)
    reduction_price = js_decimal('reduction_price', 0)
    price_tax_excluded = js_decimal('productPriceTaxExcluded', 0)

    default_image_id = re.search(r'var idDefaultImage = (\d+)', body)
    if default_image_id:
        default_image_id = default_image_id.group(1)

    if not re.search('addCombination.*?;', body):
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', product_url)
        loader.add_value('name', name)
        loader.add_value('image_url', image_url)
        loader.add_xpath('price', '//*[@id="our_price_display"]/text()',
                         lambda x: extract_price_eu(x[0]) if x else Decimal('0'))
        loader.add_value('category', category)
        loader.add_value('identifier', product_id)
        loader.add_xpath('sku', '//*[@id="product_reference"]/span/text()')
        loader.add_value('brand', product_brand)
        yield loader.load_item()
        return

    # Lookup table: <option value> -> <option text>.
    option_values = hxs.select('//div[@id="attributes"]//select/option/@value').extract()
    option_labels = hxs.select('//div[@id="attributes"]//select/option//text()').extract()
    options = dict(zip(option_values, option_labels))

    tax_factor = (tax_rate / Decimal('100')) + 1

    # addCombination(5631, new Array('259'), 11, 109.99, 0, -1, 'GGT3050', 0.00, 1);
    for call in re.finditer(r'addCombination\((.*?)\);', body):
        args = call.group(0).split(',')
        offset = Decimal(args[-6])

        # Everything between the first argument and the last seven is an
        # option key; keys that are not in the lookup table are ignored.
        option_texts = []
        for raw_key in args[1:len(args) - 7]:
            option_text = options.get(re.sub(r'[^\d]+', '', raw_key))
            if option_text:
                option_texts.append(option_text.strip())

        price = (price_tax_excluded + offset * currency_rate) * tax_factor
        if reduction_price or reduction_percent:
            price -= price * (reduction_percent / Decimal('100')) + reduction_price
        price = round(price, 2)

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', product_url)
        loader.add_value('name', name + ' ' + ' '.join(option_texts))

        # Swap the default image id inside the picture URL for the
        # combination's own image id, when both are known and differ.
        image_id = args[-4].strip(" '")
        if (image_url and default_image_id
                and image_id != "-1" and image_id != default_image_id):
            loader.add_value('image_url', image_url.replace(
                '-' + default_image_id + '-', '-' + image_id + '-'))
        else:
            loader.add_value('image_url', image_url)

        loader.add_value('brand', product_brand)
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('identifier', '%s-%s' % (product_id, re.search(r'(\d+)', args[0]).group(1)))
        loader.add_value('sku', args[-3].strip("' ").decode('utf8'))
        yield loader.load_item()