def parse_shipping_cost(self, response):
    """Set the item's shipping cost from the cart page, then clear the cart entry.

    The product is read from ``response.meta['product']``. Its ``item`` gets
    ``shipping_cost`` from the "Consegna standard" shipment block (0 when that
    block is not on the page) and is yielded. Afterwards the "remove entry"
    URL is looked up in the raw page body; when it is found, a ``FormRequest``
    for this product's cart entry is yielded with ``self.parse_sync_shipping``
    as callback and the remaining ``collect_products`` in its meta.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    item = meta['product']['item']

    shipping_cost = hxs.select(
        '//div[@class="shipment-method" and '
        'div/div/label/text()="Consegna standard"]'
        '//span[@class="amount"]/text()').extract()
    item['shipping_cost'] = (
        extract_price_eu(shipping_cost[0]) if shipping_cost else 0)
    yield item

    remove_regex = (r',removeEntryResourceURL:"(.*)"'
                    r',updateItemQuantityResourceURL:"')
    match = re.search(remove_regex, response.body)
    if match is None:
        # No removal URL in the page: nothing more to do for this item.
        return
    # The greedy group may span several occurrences of the key; keep only
    # the text after the last one.
    remove_item = match.group(1).split('removeEntryResourceURL:"')[-1]

    cart_entry = hxs.select(
        '//div[@data-product="' + item['identifier'] +
        '"]/@data-cartentry').extract()

    # NOTE: slicing raises TypeError if 'collect_products' is absent from meta.
    remaining = meta.get('collect_products')[1:]
    yield FormRequest(
        remove_item,
        formdata={'cartentry': cart_entry},
        callback=self.parse_sync_shipping,
        dont_filter=True,
        meta={'collect_products': remaining})
def parse(self, response):
    """Build and yield one product item from a product detail page.

    Fields loaded: ``identifier`` (page SKU, or the last URL path segment when
    the page has none), ``name``, ``sku`` (from ``response.meta`` when set),
    ``price`` (``'0'`` is parsed when no price node is found), ``url`` and
    ``image_url`` (first matching image, resolved against the base URL;
    omitted when the page has no matching image).
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    meta = response.meta
    loader = ProductLoader(response=response, item=Product())

    identifier = hxs.select('//dd[@itemprop="sku"]/text()').extract()
    identifier = identifier[0] if identifier else response.url.split('/')[-1]
    loader.add_value('identifier', identifier)

    name = hxs.select('//h1[@class="detail__title"]/text()').extract()
    if not name:
        name = hxs.select('//h1[@itemprop="name"]/text()').extract()
    # NOTE: raises IndexError when neither heading is present on the page.
    loader.add_value('name', name[0].strip())

    price = hxs.select(
        '//img[@class="buybox__pricetag"]/@alt|//*[@itemprop="price"]/text()'
    ).extract()
    price = price[0] if price else '0'

    sku = meta.get('sku')
    if sku:
        loader.add_value('sku', sku)

    loader.add_value('price', extract_price_eu(price))
    loader.add_value('url', response.url)

    # Extract the image explicitly instead of indexing inside a loader
    # processor, which raised IndexError on pages without an image.
    images = hxs.select(
        '//div[contains(@class, "product-images")]/img/@src'
        '|//img[@itemprop="image"]/@src').extract()
    if images:
        loader.add_value('image_url', urljoin_rfc(base_url, images[0]))

    yield loader.load_item()
def parse_products(self, response):
    """Yield one product item for every ``.listing_item`` on a listing page.

    The category is the text of the breadcrumb links with the first link
    dropped. The identifier and SKU are both the last path segment of the
    cleaned product URL. The merchant image's ``alt`` text is stored under
    ``item['metadata']['Dealer']``.

    Listings without a link are skipped, because no identifier can be derived
    for them. Optional nodes (image, delivery cost, availability) are simply
    left out when missing.
    """
    category = response.css('.breadcrumbs').xpath(
        './/a/text()').extract()[1:]

    for product in response.css('.listing_item'):
        url = product.css('.listing_item_name').xpath(
            '@href').extract_first()
        if not url:
            # Joining a missing href would return the listing page URL and
            # produce a bogus identifier, so skip the entry instead.
            continue
        url = url_query_cleaner(response.urljoin(url))
        sku = url.split('/')[-1]

        loader = ProductLoader(item=Product(), selector=product)

        image_url = product.css('.listing_item_image').xpath(
            'img/@src').extract_first()
        if image_url and 'noimage' not in image_url:
            loader.add_value('image_url', image_url)

        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        loader.add_value('url', url)
        loader.add_xpath('name', './/a[@class="listing_item_name"]/text()')
        loader.add_xpath(
            'price', './/span[@class="listing_item_basic_price"]/text()')
        loader.add_value('category', category)

        shipping_cost = product.css('.listing_item_delivery_costs').xpath(
            'text()').extract_first()
        if shipping_cost is not None:
            loader.add_value('shipping_cost', extract_price_eu(shipping_cost))

        # extract_first() returns None for a missing node; fall back to ''
        # so the membership test cannot raise TypeError.
        availability = product.css('.listing_item_availability').xpath(
            'text()').extract_first() or ''
        if 'Non disponibile' in availability:
            loader.add_value('stock', 0)

        item = loader.load_item()
        dealer = product.css('.listing_item_merchant_name').xpath(
            'img/@alt').extract_first()
        item['metadata'] = {'Dealer': dealer}
        yield item
def parse_product(self, response):
    """Build and yield one product item from a product sheet page.

    Loads name, url, price (0 when the price node is missing), a fixed
    shipping cost of 0, image, category (breadcrumb texts joined with
    ``' > '`` after removing the first ``'Home'`` entry), brand, stock,
    sku and identifier.

    The image and stock fields are only set when their source nodes exist.
    Stock is 0 when the availability attribute equals ``'out_of_stock'`` and
    1 for any other value.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(selector=hxs, item=Product())

    loader.add_xpath('name', '//div[@id="sheetBoxTopDetails"]//h1/span/text()')
    loader.add_value('url', response.url)

    price = hxs.select('//h3/span[@id="md_price"]/text()').extract()
    loader.add_value('price', extract_price_eu(price[0]) if price else 0)
    loader.add_value('shipping_cost', 0)

    # Guarded: indexing the empty result used to raise IndexError.
    image_url = hxs.select('//img[@id="sheetMainImage"]/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin(base_url, image_url[0]))

    category = hxs.select(
        '//div[@id="breadcrumbs"]/span[@id="md_category"]/a/text()'
    ).extract()
    if 'Home' in category:
        category.remove('Home')
    loader.add_value('category', ' > '.join(category))

    loader.add_xpath('brand', '//td[@id="md_brand"]/text()')

    # Guarded for the same reason as the image lookup above.
    availability = hxs.select(
        '//span[@id="md_availability"]/@content').extract()
    if availability:
        loader.add_value(
            'stock', 0 if availability[0] == 'out_of_stock' else 1)

    loader.add_xpath('sku', '//td[@id="md_mpn"]/text()')
    loader.add_xpath(
        'identifier',
        '//div[@id="sheetBoxTopDetails"]//tr[@class="code"]/td/text()')
    yield loader.load_item()
def parse_page(self, response):
    """Follow pagination links and yield a product item per listed product.

    Every ``page_page`` link is requested again with this same callback.
    For each node matched by ``self.products_xpath`` the price is taken from
    the "sans promo" block, falling back to the regular price block.
    Products without a price or without a link are skipped and a warning is
    appended to ``self.errors``.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    # Sub-category traversal is disabled; kept here for reference:
    # subcats = hxs.select('//div[@class="child_cat"]/@onclick').re(r'(http.*html)')
    # for url in subcats:
    #     yield Request(urljoin_rfc(base_url, url), callback=self.parse_page)

    pages = hxs.select(
        '//a[contains(@class, "page_page")]/@href').extract()
    for url in pages:
        yield Request(urljoin_rfc(base_url, url), callback=self.parse_page)

    for z in hxs.select(self.products_xpath):
        pprice = z.select(
            './div[@class="prix_sans_promo"]'
            '/div[@class="prix_vente_sans_promo"]/text()').extract()
        if not pprice:
            pprice = z.select(
                './div[@class="prix"]/div[@class="prix_vente"]/text()'
            ).extract()
        if not pprice:
            self.errors.append('WARNING: No price in %s' % response.url)
            continue
        price = pprice[0]

        # Explicit emptiness check instead of a bare ``except`` around [0].
        product_url = z.select(
            './div[@class="title"]/h2/a/@href').extract()
        if not product_url:
            self.errors.append('WARNING: No url in %s' % response.url)
            continue

        loader = ProductLoader(selector=z, item=Product())
        loader.add_xpath('identifier',
                         './/div[contains(@id, "im_prod_")]/@id',
                         re=r'im_prod_(\d+)')
        loader.add_xpath('name', './div[@class="title"]/h2/a/text()')
        loader.add_value('url', urljoin_rfc(base_url, product_url[0]))
        loader.add_value('price', extract_price_eu(price.replace(' ', '')))
        yield loader.load_item()
def parse(self, response):
    """Build and yield one product item from a product page.

    The identifier is the last path segment of the ``btn-large`` link's href
    and the name is the stripped ``span24`` heading text. ``sku`` comes from
    ``response.meta`` and is only set when present there. ``'0'`` is parsed
    as the price when no ``actual-price`` node exists.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    loader = ProductLoader(response=response, item=Product())

    # NOTE: both lookups below raise IndexError if the node is missing.
    button_href = hxs.select(
        '//a[contains(@class, "btn-large")]/@href').extract()[0]
    loader.add_value('identifier', button_href.split('/')[-1])

    name = hxs.select(
        '//div[@class="span24"]/h1/text()').extract()[0].strip()
    loader.add_value('name', name)

    # A request without 'sku' in its meta used to fail with KeyError.
    sku = meta.get('sku')
    if sku:
        loader.add_value('sku', sku)

    price = hxs.select('//p[@class="actual-price"]/text()').extract()
    price = price[0] if price else '0'
    loader.add_value('price', extract_price_eu(price))

    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@class="main-packart"]/@src')
    yield loader.load_item()
def parse_product(self, response):
    """Build and yield one product item from a product page.

    The name is every non-blank text node under ``product-name``, stripped
    and joined with single spaces. The SKU is guessed from the name: it is
    used only when the all-digit tokens longer than two characters reduce to
    exactly one distinct value, otherwise it is ``''``. The category is the
    last breadcrumb link text. The price is 0 when no price node is found.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)

    identifier = hxs.select(
        '//meta[@itemprop="productID"]/@content').re(r'sku: *(\d+)')

    name_parts = hxs.select(
        u'//div[@class="product-name"]//text()').extract()
    name = u' '.join(
        part.strip() for part in name_parts if part.strip() != u'')

    numeric_tokens = [
        token for token in name.split(' ')
        if token.isdigit() and len(token) > 2
    ]
    sku = numeric_tokens[0] if len(set(numeric_tokens)) == 1 else ''

    category = hxs.select(
        u'//div[@class="breadcrumbs"]//li/a/text()').extract()
    category = category[-1].strip() if category else ''

    brand = hxs.select('//meta[@itemprop="brand"]/@content').extract()
    brand = brand[0].strip() if brand else ''

    # The identifier is added once; the original added the same value twice.
    loader.add_value('identifier', identifier)
    loader.add_value('name', name)
    loader.add_value('brand', brand)
    loader.add_value('category', category)
    loader.add_value('sku', sku)
    loader.add_value('url', response.url)

    # Default to 0 rather than raising IndexError when there is no price.
    price = hxs.select(u'//span[@class="price"]/text()').extract()
    loader.add_value('price', extract_price_eu(price[0]) if price else 0)

    image = hxs.select(
        u'//div[contains(@class, "img-box")]//img/@src').extract()
    loader.add_value('image_url', image[0] if image else '')

    yield loader.load_item()
def extract_price(self, price):
    """Parse *price* with the European-format price helper.

    This is an override: the French site writes numbers as ``#.###,##``, so
    the parsing is delegated to ``extract_price_eu``.
    """
    parsed = extract_price_eu(price)
    return parsed