def parse_products(self, response):
    """Parse a product listing page and yield one item per product tile.

    When the page exposes a "next" pagination link, a request for that page
    is yielded first, reusing this callback and ``response.meta``.  Each
    tile is then loaded into a ``Product``.  If the identifier is present in
    ``self.products_meta``, the stored category (and the stored SKU, when
    non-empty) override the scraped values.  The promotion window stored in
    ``SonaeMeta`` is derived from the previous metadata together with
    ``response.meta['promo']``.

    Side effects: calls ``self._update_product_meta(product)`` and adds the
    identifier to ``self.collected_ids`` for every yielded product.

    Raises:
        IndexError: if a tile has no name, URL or ``product-price-`` id.
    """
    next_page = response.xpath('//div[@class="pages"]//a[@class="next"]/@href').extract()
    if next_page:
        yield Request(response.urljoin(next_page[0]),
                      meta=response.meta,
                      callback=self.parse_products)

    products = response.xpath('//ul[contains(@class, "products-grid")]/li[contains(@class, "item")]')
    for product_xs in products:
        name = product_xs.xpath('div[@class="product-info"]/h2[@class="product-name"]/a/text()').extract()[0]
        url = product_xs.xpath('div[@class="product-info"]/h2[@class="product-name"]/a/@href').extract()[0]
        identifier = product_xs.xpath(
            'div[@class="product-info"]/div[@class="actions"]//*[contains(@id, "product-price-")]/@id'
        ).re(r'(\d+)')[0]
        price = ''.join(product_xs.xpath(
            'div[@class="product-info"]/div[@class="actions"]//*[contains(@id, "product-price-")]/span/text()'
        ).re(r'[\d\.,]+'))
        brand = product_xs.xpath('div[@class="product-info"]/p[@class="product-brand"]/img/@title').extract()
        image_url = product_xs.xpath('.//img[contains(@id, "product-collection-image-")]/@src').extract()
        out_stock = bool(product_xs.xpath('.//i[contains(@class, "icon-stock-outs")]').extract())

        # An explicit emptiness check replaces the former bare ``except:``,
        # which also swallowed unrelated errors.  '0' remains the "no SKU"
        # marker, so a scraped SKU that is literally "0" is still skipped.
        sku_texts = product_xs.xpath('div[@class="product-info"]/div[@class="product-sku"]/text()').extract()
        sku = sku_texts[0].strip() if sku_texts else '0'

        l = ProductLoader(item=Product(), response=response)
        if image_url:
            l.add_value('image_url', response.urljoin(image_url[0]))
        l.add_value('url', url)
        l.add_value('name', name)
        l.add_value('identifier', identifier)
        l.add_value('price', extract_price_eu(price))
        l.add_value('brand', brand)
        if sku != '0':
            l.add_value('sku', sku)
        if out_stock:
            l.add_value('stock', 0)

        product = l.load_item()
        product['metadata'] = SonaeMeta()

        if identifier in self.products_meta:
            prev_meta = self.products_meta[identifier]
            if prev_meta['sku']:
                product['sku'] = prev_meta['sku']
            product['category'] = prev_meta['category']
        else:
            prev_meta = {}

        promo = response.meta.get('promo', False)
        promo_start = prev_meta.get('promo_start')
        promo_end = prev_meta.get('promo_end')

        # Sample the clock once so the date and the timestamp always agree.
        now = datetime.datetime.now()
        today = now.strftime('%Y-%m-%d')
        product['metadata']['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')

        if promo:
            # Keep the start of a promotion that is still open (start set,
            # no end); otherwise a new promotion starts today.
            product['metadata']['promo_start'] = promo_start if promo_start and not promo_end else today
            product['metadata']['promo_end'] = ''
        elif promo_start:
            # No promotion now: carry the old start and close it today
            # unless it already has an end date.
            product['metadata']['promo_start'] = promo_start
            product['metadata']['promo_end'] = today if not promo_end else promo_end

        self._update_product_meta(product)
        self.collected_ids.add(product['identifier'])
        yield product
def extract_product(self, hxs):
    """Build a ``Product`` item from one listing entry.

    The identifier is taken from the fourth ``/``-separated segment of the
    product link (text before the first ``-``).  Entries whose identifier is
    not in ``self.products`` are skipped.

    Args:
        hxs: selector scoped to a single product entry.

    Returns:
        The loaded item with ``metadata`` attached, or ``None`` when the
        identifier is not in ``self.products``.

    Raises:
        IndexError: if the entry has no product link or the link has fewer
            than four ``/``-separated segments.
    """
    loader = ProductLoader(Product(), selector=hxs)
    url = hxs.xpath('.//a[@class="product_img_link"]/@href').extract()[0].split('?')[0]
    identifier = url.split('/')[3].split('-')[0]
    if identifier not in self.products:
        return

    price = hxs.xpath('.//span[@itemprop="price"]/text()').extract_first('0')
    price = price.replace(' ', '').replace(',', '.')

    if self.meta_df is not None and not self.meta_df.empty and identifier in self.meta_df.index:
        prev_meta = self.meta_df.loc[identifier]
    else:
        prev_meta = {}

    promo = hxs.xpath('.//span[@class="promo-box"]')
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # Sample the clock once so the date and the timestamp always agree.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    metadata = SonaeMeta()
    metadata['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')
    if promo:
        # Keep the start of a still-open promotion, else it starts today.
        metadata['promo_start'] = promo_start if promo_start and not promo_end else today
        metadata['promo_end'] = ''
    elif promo_start:
        # Promotion is over: close it today unless already closed.
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = today if not promo_end else promo_end

    loader.add_xpath('name', './/span[@class="list-name"]/text()')
    loader.add_value('identifier', identifier)
    loader.add_value('url', url)
    loader.add_value('price', price)

    # The last dash-separated URL token is used as the SKU only when it
    # parses as an integer and is longer than 10 characters.
    sku = url.split('-')[-1].replace('.html', '')
    try:
        int(sku)
    except ValueError:
        sku = ''
    else:
        if len(sku) <= 10:
            sku = ''
    loader.add_value('sku', sku)

    loader.add_xpath('image_url', './/a[@class="product_img_link"]/img/@src')
    stock = hxs.xpath('.//span[@class="avail-label"]/text()').extract_first()
    if not stock:
        loader.add_value('stock', 0)
    loader.add_value('brand', self.products[identifier]['brand'])
    loader.add_value('category', self.products[identifier]['category'])

    item = loader.load_item()
    item['metadata'] = metadata
    return item
def parse_product(self, response):
    """Parse a product detail page and yield a single ``Product`` item.

    The identifier is read from the ``/id/<digits>`` part of the URL.  If it
    is present in ``self.products_meta``, the stored category (and the
    stored SKU, when non-empty) override the scraped values.  The promotion
    window in ``SonaeMeta`` is derived from the previous metadata together
    with ``response.meta['promo']``.

    Side effects: calls ``self._update_product_meta(product)`` and adds the
    identifier to ``self.collected_ids``.

    Raises:
        IndexError: if the name, the ``/id/`` URL part or the price node is
            missing.
    """
    name = response.xpath('//div[@class="product-name"]/h1/text()').extract()[0]
    url = response.url
    identifier = re.findall(r'/id/(\d+)', response.url)[0]
    price_text = response.xpath(
        '//div[@class="product-essential"]//div[contains(@class, '
        '"add-to-cart-wrapper")]//*[contains(@id, "product-price-")]/span/text()'
    ).extract()[0]
    price = ''.join(re.findall(r'[\d\.,]+', price_text))
    brand = response.xpath('.//div[@class="product-brand"]//img/@title').extract()
    image_url = response.xpath('//img[@id="image-main"]/@src').extract()
    out_stock = bool(response.xpath(
        './/span[@class="shipping-run"]/i[contains(@class, "icon-stock-outs")]').extract())

    # An explicit emptiness check replaces the former bare ``except:``,
    # which also swallowed unrelated errors.
    sku_texts = response.xpath('//div[@class="product-sku"]//text()').extract()
    sku = sku_texts[0].strip() if sku_texts else ''

    l = ProductLoader(item=Product(), response=response)
    if image_url:
        l.add_value('image_url', response.urljoin(image_url[0]))
    l.add_value('url', url)
    l.add_value('name', name)
    l.add_value('identifier', identifier)
    l.add_value('price', extract_price_eu(price))
    l.add_value('brand', brand)
    if sku:
        l.add_value('sku', sku)
    if out_stock:
        l.add_value('stock', 0)

    product = l.load_item()
    product['metadata'] = SonaeMeta()

    if identifier in self.products_meta:
        prev_meta = self.products_meta[identifier]
        if prev_meta['sku']:
            product['sku'] = prev_meta['sku']
        product['category'] = prev_meta['category']
    else:
        prev_meta = {}

    promo = response.meta.get('promo', False)
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # Sample the clock once so the date and the timestamp always agree.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    product['metadata']['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')

    if promo:
        # Keep the start of a still-open promotion, else it starts today.
        product['metadata']['promo_start'] = promo_start if promo_start and not promo_end else today
        product['metadata']['promo_end'] = ''
    elif promo_start:
        # Promotion is over: close it today unless already closed.
        product['metadata']['promo_start'] = promo_start
        product['metadata']['promo_end'] = today if not promo_end else promo_end

    self._update_product_meta(product)
    self.collected_ids.add(product['identifier'])
    yield product
def get_product_from_cache(self, response, product_data):
    """Rebuild a ``Product`` from ``self.products`` plus fresh offer data.

    Brand, SKU, image, name and category come from the entry stored in
    ``self.products``; URL, price, shipping and promotion data come from
    ``product_data``.

    Args:
        response: response handed to the ``ProductLoader``.
        product_data: mapping read for the keys ``identifier``,
            ``shipping``, ``url``, ``price``, ``exclusive_online`` and
            ``promotion_price``.

    Returns:
        A ``Product`` carrying a populated ``SonaeMeta`` under ``metadata``.
    """
    identifier = product_data['identifier']
    cached = self.products[identifier]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('brand', cached['brand'].decode('utf-8'))
    loader.add_value('sku', cached['sku'].decode('utf-8'))
    loader.add_value('image_url', cached['image_url'])
    loader.add_value('name', cached['name'])
    loader.add_value('category', self.products[identifier]['category'].split(' > '))
    loader.add_value('dealer', 'Fnac')
    shipping = product_data['shipping']
    if shipping:
        loader.add_value('shipping_cost', shipping)
    loader.add_value('url', product_data['url'])
    # The price is passed on with a decimal comma.
    loader.add_value('price', str(product_data['price']).replace('.', ','))

    product = Product(loader.load_item())
    product['metadata'] = SonaeMeta()
    product['metadata']['delivery_24_48'] = 'Yes'
    if product_data['exclusive_online']:
        product['metadata']['exclusive_online'] = 'Yes'

    promotion_price = product_data['promotion_price']
    if promotion_price:
        product['metadata']['promotion_price'] = str(promotion_price).replace(',', '.')

    prev_meta = self.metadata_[identifier] if identifier in self.metadata_ else {}
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    today = datetime.datetime.now().strftime('%Y-%m-%d')
    product['metadata']['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    # A truthy promotion price is what marks the product as on promotion.
    if promotion_price:
        if promo_start and not promo_end:
            product['metadata']['promo_start'] = promo_start
        else:
            product['metadata']['promo_start'] = today
        product['metadata']['promo_end'] = ''
    elif promo_start:
        product['metadata']['promo_start'] = promo_start
        product['metadata']['promo_end'] = promo_end if promo_end else today

    return product
def parse_product(self, response):
    """Parse a product page and yield one ``Product`` with promo metadata.

    The product id doubles as the SKU.  A product counts as on promotion
    when a visible, non-empty ``reduction_amount_display`` span is present.
    Previous promotion dates are looked up in ``self.meta_df`` by
    identifier.

    Raises:
        IndexError: if the price, product id or name node is missing.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    price_texts = hxs.select('//*[@id="our_price_display"]/text()').extract()
    loader.add_value('price', extract_price(price_texts[0]))

    identifier = hxs.select('//*[@id="product_page_product_id"]/@value').extract()[0]
    loader.add_value('identifier', identifier)

    loader.add_value('name', hxs.select('//h1[@itemprop="name"]/text()').extract()[0])
    loader.add_value('sku', identifier)

    images = hxs.select('//*[@id="bigpic"]/@src').extract()
    if images:
        loader.add_value('image_url', urljoin_rfc(base_url, images[0]))

    breadcrumb = hxs.select(
        '//div[@class="breadcrumb clearfix"]//a[not(@class)]/text()').extract()
    loader.add_value('category', breadcrumb)
    loader.add_value('url', response.url)

    availability = hxs.select('//*[@id="availability_value"]/text()').extract()
    if availability and availability[0] == u'Este produto não se encontra em stock':
        loader.add_value('stock', 0)

    product = loader.load_item()
    metadata = SonaeMeta()

    have_meta = self.meta_df is not None and not self.meta_df.empty
    if have_meta and identifier in self.meta_df.index:
        prev_meta = self.meta_df.loc[identifier]
    else:
        prev_meta = {}

    promo = response.xpath(
        '//p[@id="reduction_amount" and not(contains(@style,"display:none"))]'
        '/span[@id="reduction_amount_display" and text()!=""]')
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    today = datetime.datetime.now().strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    if promo:
        # An open promotion keeps its original start date.
        if promo_start and not promo_end:
            metadata['promo_start'] = promo_start
        else:
            metadata['promo_start'] = today
        metadata['promo_end'] = ''
    elif promo_start:
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = promo_end if promo_end else today

    product['metadata'] = metadata
    yield product
def parse_product(self, response):
    """Parse a product page and yield one ``Product`` with promo dates.

    Nothing is yielded when the page is not a product page (no
    ``<body id="product">`` by XPath and no such text in the raw body).
    Promotion start/end are taken from the page itself and are only
    recorded when exactly two ``dd-mm-YYYY`` dates parse successfully.
    ``response.meta['category']`` is prepended to the scraped categories
    unless it already is the first one.
    """
    if not response.xpath('//body[@id="product"]') and 'body id="product"' not in response.body:
        return

    promo_dates = response.xpath(
        '//div[@class="pl_promoinfo_product_promo"]/span[@class="date"]/text()'
    ).extract()
    promo_start, promo_end = (None, None)
    try:
        promo_dates = [datetime.datetime.strptime(d, '%d-%m-%Y') for d in promo_dates]
        # Unpacking anything other than exactly two dates raises ValueError
        # as well, leaving both values at None.
        promo_start, promo_end = promo_dates
    except ValueError:
        pass

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('identifier', '//input[@id="product_page_product_id"]/@value')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

    # NOTE(review): extract_first() returns None when the price node is
    # missing, so the .replace() below raises AttributeError for such pages.
    # Left as is; confirm whether a default price is wanted instead.
    price = response.xpath('//span[@id="our_price_display"]/text()').extract_first()
    loader.add_value('price', price.replace(' ', ''))

    loader.add_xpath('sku', '//span[@itemprop="sku"]/text()')
    loader.add_xpath('sku', '//script/text()', re="productReference='(.+?)'")

    category = response.css('.navigation_page ::attr(title)').extract()
    main_category = response.meta.get('category')
    if not category or category[0].strip() != main_category:
        category = [main_category] + category
    loader.add_value('category', category)

    loader.add_xpath('image_url', '//img[@id="bigpic"]/@src')
    loader.add_xpath('brand', '//a[@itemprop="brand"]/span/text()')
    if not response.css('.primary_block .avail3'):
        loader.add_value('stock', 0)

    metadata = SonaeMeta()
    if promo_start and promo_end:
        metadata['promo_start'] = promo_start.strftime('%Y-%m-%d')
        metadata['promo_end'] = promo_end.strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    item = loader.load_item()
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page driven by its inline ``dataLayer`` JSON.

    Name, price, brand, product id and reference code are read from the
    ``dataLayer`` object embedded in a script tag; the EAN found in the
    details list is used as SKU.  When the identifier is present in
    ``self.products``, its ``' > '``-separated value replaces the
    breadcrumb categories scraped from the page.

    Raises:
        IndexError: if no ``dataLayer = [...]`` assignment is found.
        ValueError: if the captured text is not valid JSON.
        KeyError: if an expected ``dataLayer`` key is missing.
    """
    # Raw string: ``\[`` in a normal literal is an invalid escape sequence.
    description_field = response.xpath(
        "//script[contains(text(), 'var dataLayer')]/text()"
    ).re(r'dataLayer = \[(.*)\];')[0]
    description_field = json.loads(description_field)

    name = description_field['productName']
    price = description_field['productPrice']
    brand = description_field['productBrand']
    categories = response.css('.breadcrumbs').xpath('li/a/text()').extract()
    ean = response.css('.w-product-details').xpath(
        ".//li[span[contains(text(), 'EAN')]]/span[@class='details-value']"
    ).xpath('text()').extract_first()
    image_url = response.xpath("//img[@id='product-main-image']/@src").extract_first()
    identifier = description_field['productId']
    ref_code = description_field['productSKU']
    two_four_days = bool(response.css('.w-product__availability').xpath(
        './/p[contains(text(), "2 a 4")]'))

    l = ProductLoader(item=Product(), response=response)
    if image_url:
        l.add_value('image_url', response.urljoin(image_url))
    l.add_value('url', response.url)
    l.add_value('name', name)
    l.add_value('price', price)
    l.add_value('brand', brand)
    l.add_value('sku', ean)
    l.add_value('identifier', identifier)

    if identifier in self.products:
        # Known products take their category path from self.products.
        categories = self.products[identifier].split(' > ')
    for category in categories:
        l.add_value('category', category.strip())

    product = l.load_item()
    product['metadata'] = SonaeMeta()
    product['metadata']['exclusive_online'] = 'No'
    if two_four_days:
        product['metadata']['delivery_48_96'] = 'Yes'
    else:
        product['metadata']['delivery_96_more'] = 'Yes'
    if ref_code:
        product['metadata']['ref_code'] = ref_code
    yield product
def parse_items(self, response):
    """Parse a product page and yield one ``Product`` item.

    Identifier (MPN) and SKU (EAN) are read from the attributes of the
    flixfacts loader ``<script>`` tag.  Up to three breadcrumb entries
    (positions 1-3) are used as categories.  The product is flagged
    ``exclusive_online`` when an "unavailable" span mentions
    "Collect in store".

    Raises:
        IndexError: if the flixfacts script, name, price, MPN, brand or
            the ``col3`` section is missing.
    """
    hxs = HtmlXPathSelector(response=response)
    description_field = hxs.select(
        "//script[@src = 'http://media.flixfacts.com/js/loader.js']").extract()[0]
    name = hxs.select("//span[@itemprop = 'name']/text()").extract()[0].encode('ascii', 'ignore')
    price = hxs.select("//meta[@property = 'og:price:amount']/@content").extract()[0]
    identifier = re.findall(r'data-flix-mpn="(.+?)"', description_field)[0]

    # Explicit emptiness checks replace the former bare ``except:`` blocks.
    ean_matches = re.findall(r'data-flix-ean="(\d*)"', description_field)
    sku = ean_matches[0] if ean_matches else ""

    categories = hxs.select("//div[@class = 'breadcrumb']/a/span/text()").extract()[1:4]
    brand = hxs.select("//span[@itemprop = 'brand']/text()").extract()[0]

    stock_html = hxs.select("//section[@class = 'col3']").extract()[0]
    stock = 0 if 'Out of stock' in stock_html else 1

    images = hxs.select("//div[@id = 'currentView']//img[@itemprop = 'image']/@src").extract()
    image_url = images[0] if images else ""

    l = ProductLoader(item=Product(), response=response)
    l.add_value('image_url', image_url)
    l.add_value('url', response.url)
    l.add_value('name', name)
    l.add_value('price', price)
    l.add_value('stock', stock)
    # Fix: the loop used to add the whole ``categories`` list on every
    # iteration, so each category was loaded once per breadcrumb entry.
    for category in categories:
        l.add_value('category', category)
    l.add_value('brand', brand)
    l.add_value('sku', sku)
    l.add_value('identifier', identifier)

    product = l.load_item()
    product['metadata'] = SonaeMeta()
    if hxs.select('//span[@class="unavailable" and contains(text()[2], "Collect in store")]'):
        product['metadata']['exclusive_online'] = 'Yes'
    yield product
def parse_offers_static_page(self, response):
    """Yield one ``Product`` per third-party "novo" offer row.

    Rows of the ``#colsMP`` table are read after dropping the first one.
    Rows marked ``.fnacView``, rows whose status text lacks "novo", and
    rows without a price or dealer are logged and skipped.  Shared product
    fields come from ``response.meta['product_info']``; each offer's
    identifier is ``<fcom base id>-<dealer id>`` passed through
    ``self.get_identifier``.

    Side effects: adds the base identifier (without ``fcom``) to
    ``self.seen``.
    """
    rows = response.css('#colsMP tr')
    if rows:
        rows = rows[1:]

    exclusive_online = bool(response.meta.get('exclusive_online'))
    product_info = response.meta['product_info']

    base_identifier = product_info['base_identifier'].replace('mp', '')
    if 'fcom' not in base_identifier:
        base_identifier = 'fcom' + base_identifier
    self.seen.add(base_identifier.replace('fcom', ''))

    for row in rows:
        if row.css('.fnacView'):
            self.log('Skipping Fnac direct product')
            continue

        status = row.css('td.gras').xpath('./text()').extract()
        if status and 'novo' not in status[0].lower():
            self.log('Skipping used product')
            continue

        price_texts = row.css('.userPrice').xpath('./text()').extract()
        if not price_texts:
            self.log('Price not found')
            continue
        price = price_texts[0].replace(u'\xa0', '').strip()

        old_price_texts = row.css('.oldPrice').xpath('./text()').extract()
        promotion_price = None
        if old_price_texts:
            promotion_price = extract_price_eu(
                old_price_texts[0].replace(u'\xa0', '').strip())

        shipping_texts = row.css('.noir').xpath('./text()').extract()
        shipping_cost = None
        if shipping_texts:
            shipping_cost = extract_price_eu(shipping_texts[0].strip())

        dealer = row.css('.bleu_MP')
        if not dealer:
            self.log('Dealer not found')
            continue
        # The dealer id is the last path segment of the dealer link.
        dealer_id = dealer.xpath('./a/@href').extract()[0].split('/')[-1]
        dealer_name = dealer.xpath('./a/strong/text()').extract()[0].strip()

        loader = ProductLoader(item=Product(), selector=row)
        identifier = self.get_identifier(base_identifier + '-' + dealer_id)
        loader.add_value('identifier', identifier)
        loader.add_value('dealer', dealer_name)
        for field in ['name', 'category', 'brand', 'url', 'image_url', 'sku']:
            loader.add_value(field, product_info[field])
        loader.add_value('price', price)
        if shipping_cost:
            loader.add_value('shipping_cost', shipping_cost)
        product = loader.load_item()

        metadata = SonaeMeta()
        if exclusive_online:
            metadata['exclusive_online'] = 'Yes'
        metadata['delivery_24_48'] = 'Yes'
        if promotion_price:
            metadata['promotion_price'] = str(promotion_price)
        product['metadata'] = metadata

        prev_meta = self.metadata_[identifier] if identifier in self.metadata_ else {}
        promo_start = prev_meta.get('promo_start')
        promo_end = prev_meta.get('promo_end')

        today = datetime.datetime.now().strftime('%Y-%m-%d')
        product['metadata']['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

        # A truthy promotion price is what marks the offer as on promotion.
        if promotion_price:
            if promo_start and not promo_end:
                product['metadata']['promo_start'] = promo_start
            else:
                product['metadata']['promo_start'] = today
            product['metadata']['promo_end'] = ''
        elif promo_start:
            product['metadata']['promo_start'] = promo_start
            product['metadata']['promo_end'] = promo_end if promo_end else today

        yield product
def parse_product(self, response):
    """Parse a product page; yield variant requests and the product.

    Every ``<option>`` of the variant selector is re-queued to this same
    callback with ``dont_redirect`` set.  The page itself is then parsed
    into a ``Product``; nothing more is yielded if the page has no
    ``productId`` input.  When a shipment-details product id is found, the
    item is handed to ``self.parse_shipping`` through an AJAX request
    instead of being yielded directly.

    Raises:
        IndexError: if the product name node is missing.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    variant_urls = hxs.select(
        '//div[@class="stretch clearfix box"]/select/option/@value').extract()
    for variant_url in variant_urls:
        yield Request(urljoin_rfc(base_url, variant_url),
                      callback=self.parse_product,
                      meta={'dont_redirect': True})

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value(
        'name',
        hxs.select('//*[@id="centerC"]/h1/span[@itemprop="name"]/text()').extract()[0])

    identifier = hxs.select(
        '//div[@class="pd-container-right"]//form[@class="addBasketItem"]//input[@name="productId"]/@value'
    ).extract()
    if not identifier:
        return
    product_id = identifier[0]
    loader.add_value('identifier', product_id)
    loader.add_value('url', response.url)

    price_texts = hxs.select('//noscript/span/text()').extract()
    if price_texts:
        price = extract_price(price_texts[0])
    else:
        price = '0'
    loader.add_value('price', price)

    stock_texts = hxs.select('//*[@id="first3"]/p/span/text()').extract()
    stock = stock_texts[0] if stock_texts else ''

    # The first breadcrumb entry is dropped.
    for category in hxs.select('//*[@id="infoblock"]/div/a/text()').extract()[1:]:
        loader.add_value('category', category)

    brand_alts = hxs.select('//div[@class="pd-brand box"]/a/img/@alt').extract()
    loader.add_value('brand', brand_alts[0] if brand_alts else '')

    image_srcs = hxs.select('//*[@id="showPic"]/@src').extract()
    loader.add_value(
        'image_url', urljoin_rfc(base_url, image_srcs[0]) if image_srcs else '')

    product = loader.load_item()

    promotion_price = hxs.select(
        u'//p[contains(text(), "Preço Regular")]/strike/text()').re(r'[\d,.]+')

    metadata = SonaeMeta()
    metadata['exclusive_online'] = 'No'
    if promotion_price:
        # Drop thousands dots, then turn the decimal comma into a dot.
        metadata['promotion_price'] = promotion_price[0].replace('.', '').replace(',', '.')
    metadata['stock'] = stock

    have_meta = self.meta_df is not None and not self.meta_df.empty
    if have_meta and product_id in self.meta_df.index:
        prev_meta = self.meta_df.loc[product_id]
    else:
        prev_meta = {}

    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    today = datetime.datetime.now().strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    # A struck-through regular price is what marks an active promotion.
    if promotion_price:
        if promo_start and not promo_end:
            metadata['promo_start'] = promo_start
        else:
            metadata['promo_start'] = today
        metadata['promo_end'] = ''
    elif promo_start:
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = promo_end if promo_end else today

    product['metadata'] = metadata

    shipping_pid = hxs.select('//span[@id="shipmentDetails"]/@data-productid').extract()
    if not shipping_pid:
        yield product
    else:
        shipping_url = 'https://www.redcoon.pt/req/ajax/mod/ShopShipment/pid/' + shipping_pid[0]
        yield Request(shipping_url,
                      headers={'X-Requested-With': 'XMLHttpRequest'},
                      callback=self.parse_shipping,
                      meta={'product': product})
def parse_product(self, response):
    """Parse a search-result or product page.

    Product links found in a search listing are re-queued to this same
    callback.  If the page carries a product name, it is parsed into a
    ``Product``: the product id doubles as SKU, a shipping cost of
    ``3.00`` is added when the price is at most 48.99, and a struck-through
    price marks the product as on promotion.

    Side effects: removes ``response.url`` from ``self.old_urls`` when
    present.
    """
    if response.url in self.old_urls:
        self.old_urls.remove(response.url)

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    products = hxs.select('//div[@id="searchItem"]//h3[@id="skuName"]/a/@href').extract()
    for url in products:
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_product,
                      meta=response.meta)

    loader = ProductLoader(item=Product(), response=response)

    identifier = hxs.select('//div[@itemprop="productID"]/text()').extract()
    # The id is the last space-separated token of the productID text.
    identifier = identifier[0].strip().split(' ')[-1].strip() if identifier else ''
    sku = identifier

    stock = hxs.select("//link[@itemprop='availability']/@href").extract()
    stock = stock[0] if stock else ''
    stock = 0 if 'OutOfStock' in stock else 1

    name = hxs.select("//h1[@itemprop='name']/text()").extract()
    name = name[0] if name else ''
    if not name:
        return

    # Fix: de-duplicate while keeping breadcrumb order; ``list(set(...))``
    # returned the categories in arbitrary order.
    seen_categories = set()
    categories = []
    for category in hxs.select(
            '//div[@id="skuBreadCrumbs"]//span[@itemprop="title"]/text()').extract():
        if category not in seen_categories:
            seen_categories.add(category)
            categories.append(category)

    image_url = hxs.select('//img[@id="SkuPageMainImg"]/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

    # Raw string: ``\[`` in a normal literal is an invalid escape sequence.
    brand = re.findall(r'brandName":\["(.+?)"\]', response.body)
    brand = brand[0] if brand else ''

    price = hxs.select('//span[@itemprop="price"]/text()').extract()
    # Drop thousands dots, then turn the decimal comma into a dot.
    price = price[0].replace('.', '').replace(',', '.').strip() if price else '0.00'
    loader.add_value('price', price)

    price = loader.get_output_value('price')
    if price:
        price = Decimal(price)
        if price <= 48.99:
            loader.add_value('shipping_cost', '3.00')

    loader.add_value('stock', stock)
    loader.add_value('brand', brand.decode('utf-8'))
    loader.add_value('url', response.url)
    loader.add_value('image_url', image_url)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_value('name', name)
    for category in categories:
        loader.add_value('category', category)

    product = loader.load_item()
    metadata = SonaeMeta()

    promotional_price = hxs.select(
        '//div[@id="SkuSaveStory"]//span[contains(@class, "strike") '
        'and contains(@class, "darkGray")]/text()').re(r'[\d,.]+')
    if promotional_price:
        metadata['promotion_price'] = promotional_price[0].replace('.', '').replace(',', '.')

    if self.meta_df is not None and not self.meta_df.empty and identifier in self.meta_df.index:
        prev_meta = self.meta_df.loc[identifier]
    else:
        prev_meta = {}

    promo = promotional_price
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # Sample the clock once so the date and the timestamp always agree.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')

    if promo:
        # Keep the start of a still-open promotion, else it starts today.
        metadata['promo_start'] = promo_start if promo_start and not promo_end else today
        metadata['promo_end'] = ''
    elif promo_start:
        # Promotion is over: close it today unless already closed.
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = today if not promo_end else promo_end

    metadata['delivery_24'] = 'Yes'
    product['metadata'] = metadata
    yield product
def parse_product(self, response):
    """Parse a product page and yield one ``Product`` with promo metadata.

    The item identifier is the page's ``product_id`` with ``-new``
    appended; the same suffixed value is used to look up previous
    promotion dates in ``self.meta_df``.  Promotion state comes from
    ``response.meta['promo']``.  Stock is set to 0 unless the availability
    text contains "Disponível para Encomenda".

    Raises:
        IndexError: if the price, product id, SKU or availability node is
            missing.
    """
    loader = ProductLoader(item=Product(), response=response)

    price_text = response.xpath(
        '//div[@class="product-info"]//span[@class="price-fixed"]/text()').extract()[0]
    loader.add_value('price', extract_price(price_text))

    identifier = response.xpath('//input[@name="product_id"]/@value').extract()[0]
    lookup_id = identifier + '-new'
    loader.add_value('identifier', lookup_id)

    loader.add_value(
        'name', response.xpath('//div[@class="product-info"]//h1/text()').extract_first())

    sku = response.xpath(
        '//div[@class="description"]'
        '/span[contains(text(), "digo do produto")]'
        '/following-sibling::text()[1]').extract()[0].strip()
    loader.add_value('sku', sku)

    brand_texts = response.xpath(
        '//div[@class="description"]'
        '/span[contains(text(), "Fabricantes")]'
        '/following-sibling::a[1]/text()').extract()
    loader.add_value('brand', brand_texts[0].strip() if brand_texts else '')

    stock_text = response.xpath(
        '//div[@class="description"]'
        '/span[contains(text(), "Disponibilidade")]'
        '/following-sibling::text()[1]').extract()[0].strip()
    if u'Dispon\xedvel para Encomenda' not in stock_text:
        loader.add_value('stock', 0)

    image_links = response.xpath(
        '//div[@class="product-info"]//div[contains(@class, "image")]/a/@href').extract()
    if image_links:
        loader.add_value('image_url', image_links[0])

    # The first and last breadcrumb entries are dropped.
    loader.add_value(
        'category', response.xpath('//div[@class="breadcrumb"]/a/text()').extract()[1:-1])
    loader.add_value('url', response.url)

    product = loader.load_item()
    product['metadata'] = SonaeMeta()

    have_meta = self.meta_df is not None and not self.meta_df.empty
    if have_meta and lookup_id in self.meta_df.index:
        prev_meta = self.meta_df.loc[lookup_id]
    else:
        prev_meta = {}

    promo = response.meta.get('promo', False)
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    today = datetime.datetime.now().strftime('%Y-%m-%d')
    product['metadata']['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

    if promo:
        # An open promotion keeps its original start date.
        if promo_start and not promo_end:
            product['metadata']['promo_start'] = promo_start
        else:
            product['metadata']['promo_start'] = today
        product['metadata']['promo_end'] = ''
    elif promo_start:
        product['metadata']['promo_start'] = promo_start
        product['metadata']['promo_end'] = promo_end if promo_end else today

    yield product
def parse_product(self, response):
    """Parse a product page and yield one ``Product`` with promo metadata.

    The campaign price ("Preço Campanha") is preferred over the regular
    price row.  A non-empty discount-block text marks the product as on
    promotion.  Stock is set to 0 unless the first
    ``data-product-availability`` attribute equals ``'1'``.

    Raises:
        IndexError: if no price row, product id or name is found.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)

    price = response.xpath(
        u'//tr[th[contains(text(), "Preço Campanha")]]/td/span/text()').extract()
    if not price:
        price = response.xpath(
            u'//tr[th[contains(text(), "Preço")]]/td/span/text()').extract()
    price = extract_price(price[0])
    product_loader.add_value('price', price)

    identifier = response.xpath('//input[@name="products_id"]/@value').extract()[0]
    product_loader.add_value('identifier', identifier)

    name = response.xpath('//div[@id="my_header"]//h2/text()').extract()[0].strip()
    product_loader.add_value('name', name)

    # Raw string: ``\[`` in a normal literal is an invalid escape sequence.
    sku = response.xpath('//span[@class="smallText"]/text()').re(r'EAN\[(.*)\]')
    sku = sku[0] if sku else ''
    product_loader.add_value('sku', sku)

    image_url = response.xpath('//a[@rel="fancybox"]/img/@src').extract()
    if image_url:
        product_loader.add_value('image_url', response.urljoin(image_url[0]))

    # Only the last three header links are used as categories.
    category = response.xpath('//div[@id="my_header"]//a/text()').extract()[-3:]
    product_loader.add_value('category', category)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('brand', '//div/@data-product-manufacture')

    metadata = SonaeMeta()
    promo = response.xpath(
        '//div[@class="discount_block"]/span[@class="discount_block_text" and text()]')

    if self.meta_df is not None and not self.meta_df.empty and identifier in self.meta_df.index:
        prev_meta = self.meta_df.loc[identifier]
    else:
        prev_meta = {}

    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # Sample the clock once so the date and the timestamp always agree.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')

    if promo:
        # Keep the start of a still-open promotion, else it starts today.
        metadata['promo_start'] = promo_start if promo_start and not promo_end else today
        metadata['promo_end'] = ''
    elif promo_start:
        # Promotion is over: close it today unless already closed.
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = today if not promo_end else promo_end

    # Fix: the value used to be indexed twice (``extract()[0]`` and then
    # ``stock[0]``), which raised IndexError when the attribute was absent
    # and compared only its first character.  A missing attribute now
    # counts as out of stock.
    availability = response.xpath('//div/@data-product-availability').extract()
    in_stock = bool(availability) and availability[0] == '1'
    if not in_stock:
        product_loader.add_value('stock', 0)

    product = product_loader.load_item()
    product['metadata'] = metadata
    yield product
def parse_products(self, response):
    """Parse one page of the JSON product API and request the next page.

    Products without a truthy ``price`` are skipped.  A tag whose title
    contains "promo" marks the product as on promotion; a tag whose
    title-cased, space-stripped title contains "PromoçãoOnline" marks it
    ``exclusive_online``.  As long as the page returned products, the next
    page (offset + 12) is requested with the same ``u_id`` / ``u_cat``.
    """
    products = json.loads(response.body)['response']['products']
    if not products:
        return

    u_id = response.meta['u_id']
    u_cat = response.meta['u_cat']
    offset = response.meta['offset']

    for product in products:
        loader = ProductLoader(item=Product(), response=response)
        if not product['price']:
            continue

        loader.add_value('identifier', product['id'])
        loader.add_value('name', product['title'])
        loader.add_value('sku', product['id'])
        # Drop spaces and thousands dots, then use a decimal dot.
        price = product['price']['value'].replace(' ', '').replace('.', '').replace(',', '.')
        loader.add_value('price', price)
        loader.add_value('image_url', response.urljoin(product['featured_image']['source']))
        loader.add_value('url', product['url'])
        loader.add_value('brand', product['brand']['name'])
        if product['variants'][0]['inventory_quantity'] == '0':
            loader.add_value('stock', 0)
        loader.add_value('category', product['category'])

        metadata = SonaeMeta()
        promo = False
        exclusive_online = False
        for tag in product['tags']:
            if u'promo' in tag['title'].lower():
                promo = True
            if u"PromoçãoOnline" in tag['title'].title().replace(' ', ''):
                exclusive_online = True

        have_meta = self.meta_df is not None and not self.meta_df.empty
        if have_meta and product['id'] in self.meta_df.index:
            prev_meta = self.meta_df.loc[product['id']]
        else:
            prev_meta = {}

        promo_start = prev_meta.get('promo_start')
        promo_end = prev_meta.get('promo_end')

        today = datetime.datetime.now().strftime('%Y-%m-%d')
        metadata['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

        if promo:
            # An open promotion keeps its original start date.
            if promo_start and not promo_end:
                metadata['promo_start'] = promo_start
            else:
                metadata['promo_start'] = today
            metadata['promo_end'] = ''
        elif promo_start:
            metadata['promo_start'] = promo_start
            metadata['promo_end'] = promo_end if promo_end else today

        if exclusive_online:
            metadata['exclusive_online'] = 'Yes'

        item = loader.load_item()
        item['metadata'] = metadata
        yield item

    next_offset = offset + 12
    next_url = ('http://www.phonehouse.pt/api.php/getProducts/'
                + u_id + '/' + u_cat + '/' + str(next_offset))
    yield scrapy.Request(next_url,
                         callback=self.parse_products,
                         meta={'u_id': u_id, 'u_cat': u_cat, 'offset': next_offset})
def parse_product(self, response):
    """Parse a product page into a single item with promotion metadata.

    When the body contains ``SQLSTATE`` the page is treated as a failed
    render: the request returned by ``self._retry(response)`` is yielded if
    there is one, otherwise the failure is logged, and nothing is parsed.

    Otherwise the item is filled from the page (name, identifier, EAN as
    sku, brand, breadcrumb categories, url, image, price and, when no
    availability label is present, a zero stock) and a ``SonaeMeta`` with
    the extraction timestamp and promotion dates is attached. Previous
    promotion dates come from the row of ``self.meta_df`` indexed by the
    identifier, if any.

    :param response: product page response.
    """
    if 'SQLSTATE' in response.body:
        retry_req = self._retry(response)
        if retry_req:
            yield retry_req
        else:
            self.log('Error parsing {}'.format(response.url))
        return

    name = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
    identifier = response.xpath(
        '//*[@id="product_page_product_id"]/@value').extract_first()
    image_url = response.xpath('//*[@id="bigpic"]/@src').extract_first()
    # Falls back to '0' when the price node is missing.
    price = response.xpath(
        '//*[@id="our_price_display"]/text()').extract_first('0')
    sku = response.xpath(
        '//label[text()="EAN "]/../span/text()').extract_first()
    brand = response.xpath(
        '//label[text()="Fabricante "]/../span/text()').extract_first()
    categories = response.xpath(
        '//div[@class="breadcrumb clearfix"]//a/text()').extract()
    stock = response.xpath(
        '//span[@class="avail-label"]/text()').extract_first()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_value('brand', brand)
    loader.add_value('category', categories)
    loader.add_value('url', response.url)
    # Inline "data:image" sources are not stored as image URLs.
    if image_url and not image_url.strip().startswith('data:image'):
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_value('price', extract_price(price.replace(' ', '')))
    if not stock:
        loader.add_value('stock', 0)
    item = loader.load_item()

    meta_df = self.meta_df
    if (meta_df is not None and not meta_df.empty
            and identifier in meta_df.index):
        prev_meta = meta_df.loc[identifier]
    else:
        prev_meta = {}
    # NOTE(review): if meta_df holds NaN for a missing promo_end, NaN is
    # truthy and `not promo_end` is False - confirm how meta_df is loaded.
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # A visible, non-empty reduction amount marks an active promotion.
    promo = response.xpath(
        '//p[@id="reduction_amount" and not(contains(@style,"display:none"))]'
        '/span[@id="reduction_amount_display" and text()!=""]')

    # Read the clock once so the promo dates and the extraction timestamp
    # cannot disagree when the crawl straddles midnight.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')

    metadata = SonaeMeta()
    metadata['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')
    if promo:
        # Keep the recorded start only while that promotion is still open.
        if promo_start and not promo_end:
            metadata['promo_start'] = promo_start
        else:
            metadata['promo_start'] = today
        metadata['promo_end'] = ''
    elif promo_start:
        # No promotion now: close a previously open one as of today.
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = promo_end if promo_end else today

    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page into a single item with promotion metadata.

    The price is read from the regular price node and, failing that, from
    the campaign price node. The identifier is whatever follows ``idprod=``
    in the response URL. A ``SonaeMeta`` is attached carrying the old
    (struck-through) price when present, the ``exclusive_online`` flag
    passed through ``response.meta``, the extraction timestamp and the
    promotion dates. Previous promotion dates come from the row of
    ``self.meta_df`` indexed by the identifier, if any.

    :param response: product page response.
    :raises IndexError: when neither price node exists or the URL has no
        ``idprod=`` part.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response=response)

    name = hxs.select('//div[@class="prod-nome"]/text()').extract()
    price = hxs.select('//div[@class="prod-price "]/text()').extract()
    if not price:
        price = hxs.select(
            '//div[@class="prod-price campanha"]/text()').extract()
    price = price[0]
    brand = ''  # this callback never extracts a brand from the page
    # The first breadcrumb entry is skipped.
    categories = hxs.select(
        '//div[@id="breadcrumb"]/ul/li/a/text()').extract()[1:]
    image_url = hxs.select('//div[@id="prod-imagem"]/img/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
    ean = hxs.select('//script[@data-flix-ean]/@data-flix-ean').extract()
    identifier = re.findall('idprod=(.*)', response.url)[0]

    l = ProductLoader(item=Product(), response=response)
    l.add_value('image_url', image_url)
    l.add_value('url', response.url)
    l.add_value('name', name)
    l.add_value('price', extract_price_eu(price))
    l.add_value('brand', brand)
    for category in categories:
        l.add_value('category', category)
    l.add_value('sku', ean)
    l.add_value('identifier', identifier)
    product = l.load_item()

    metadata = SonaeMeta()
    promotion_price = hxs.select(
        '//div[@class="prod-price-old"]/del/text()').re(r'[\d,.]+')
    if promotion_price:
        # Drop '.' separators, then turn the decimal ',' into '.'.
        metadata['promotion_price'] = promotion_price[0].replace(
            '.', '').replace(',', '.')
    if response.meta.get('exclusive_online', 'No') == 'Yes':
        metadata['exclusive_online'] = 'Yes'

    meta_df = self.meta_df
    if (meta_df is not None and not meta_df.empty
            and identifier in meta_df.index):
        prev_meta = meta_df.loc[identifier]
    else:
        prev_meta = {}
    # NOTE(review): if meta_df holds NaN for a missing promo_end, NaN is
    # truthy and `not promo_end` is False - confirm how meta_df is loaded.
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # A campaign price node inside the product data marks a promotion.
    promo = hxs.xpath(
        '//div[@id="prod-data"]//div[@class="prod-price campanha"]')

    # Read the clock once so the promo dates and the extraction timestamp
    # cannot disagree when the crawl straddles midnight.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')
    if promo:
        # Keep the recorded start only while that promotion is still open.
        if promo_start and not promo_end:
            metadata['promo_start'] = promo_start
        else:
            metadata['promo_start'] = today
        metadata['promo_end'] = ''
    elif promo_start:
        # No promotion now: close a previously open one as of today.
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = promo_end if promo_end else today

    product['metadata'] = metadata
    yield product
def parse_product(self, response):
    """Parse a product detail page into a single item with promotion metadata.

    The item gets its image, url, name (first ``<h1>``), price, breadcrumb
    categories (first entry skipped), optional brand and a zero stock when
    the "Indisponível" button is present. The value of the ``Id`` input is
    used as both identifier and sku. A ``SonaeMeta`` with the extraction
    timestamp and promotion dates is attached; previous promotion dates
    come from the row of ``self.meta_df`` indexed by the identifier, if any.

    Shipping cost is not populated: the weight-based calculation that used
    to sit in this method was disabled.

    :param response: product page response.
    :raises IndexError: when the page has no ``<h1>`` text, no item price
        or no ``Id`` input.
    """
    l = ProductLoader(item=Product(), response=response)
    metadata = SonaeMeta()

    l.add_xpath('image_url',
                '//img[contains(@class, "product-detail-img-main")]/@src')
    l.add_value('url', response.url)

    name = response.xpath('//h1/text()').extract()[0].strip()
    l.add_value('name', name)

    # Remove every whitespace character from the displayed price.
    price_text = response.xpath(
        '//span[@class="item-price"]/text()').extract()[0]
    price = ''.join(price_text.strip().split())
    l.add_value('price', extract_price(price))

    out_of_stock = response.xpath(
        u'//div[@class="product-btns-panel"]/button[contains(text(), "Indisponível")]')
    if out_of_stock:
        l.add_value('stock', 0)

    categories = response.xpath(
        '//ol[@class="breadcrumb"]/li/a/text()').extract()[1:]
    for category in categories:
        l.add_value('category', category)

    brand = response.xpath('//div[h1]/h3/text()').extract()
    if brand:
        l.add_value('brand', brand[0])

    product_id = response.xpath('//input[@name="Id"]/@value').extract()[0]
    l.add_value('identifier', product_id)
    l.add_value('sku', product_id)

    meta_df = self.meta_df
    if (meta_df is not None and not meta_df.empty
            and product_id in meta_df.index):
        prev_meta = meta_df.loc[product_id]
    else:
        prev_meta = {}
    # NOTE(review): if meta_df holds NaN for a missing promo_end, NaN is
    # truthy and `not promo_end` is False - confirm how meta_df is loaded.
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # A nested old-price node marks an active promotion.
    promo = response.xpath(
        '//span[@class="item-old-price"]/span[@class="item-old-price"]/text()')

    # Read the clock once so the promo dates and the extraction timestamp
    # cannot disagree when the crawl straddles midnight.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')
    if promo:
        # Keep the recorded start only while that promotion is still open.
        if promo_start and not promo_end:
            metadata['promo_start'] = promo_start
        else:
            metadata['promo_start'] = today
        metadata['promo_end'] = ''
    elif promo_start:
        # No promotion now: close a previously open one as of today.
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = promo_end if promo_end else today

    item = l.load_item()
    item['metadata'] = metadata
    yield item
def parse_item(self, response):
    """Parse a product page and yield one item per dealer named 'Pixmania'.

    URLs ending in ``/index.html`` are skipped. Name, sku and brand are
    read from the inline script that contains ``"prdref"``; the name falls
    back to the ``itemprop="name"`` span, and the sku is taken from
    ``response.meta['sku']`` or ``self.skus`` (keyed by
    ``response.meta['url']``) before the script is consulted.

    Each ``merchant product`` block is one dealer offer; a block without a
    seller name is treated as 'Pixmania', and offers from any other seller
    are ignored. Every yielded item carries a ``SonaeMeta`` with
    ``exclusive_online``, a delivery bucket derived from the "N dias" text
    and, when present, the previous price.

    :param response: product page response; ``meta`` may carry ``sku`` and
        must carry ``url`` when it does not.
    :raises IndexError: when the product id input, the fallback name or the
        script sku is missing.
    """
    self.log('Parsing {} in parse_item'.format(response.url))
    if response.url.endswith("/index.html"):
        return

    # Re-decode the body leniently so invalid UTF-8 bytes are dropped.
    response = response.replace(
        body=response.body.decode('utf-8', errors='ignore'))
    hxs = HtmlXPathSelector(response=response)

    description_field = hxs.select(
        '''//script[contains(text(), '"prdref"')]''').extract()
    description_field = description_field[0] if description_field else ''

    # Only the missing-match case is expected here, so catch IndexError
    # instead of swallowing every exception.
    try:
        name = re.findall(r'"prdname"\,"(.+?)"', description_field)[0]
    except IndexError:
        name = hxs.select("//span[@itemprop = 'name']/text()").extract()[0]

    sku = response.meta.get('sku', None)
    if not sku:
        url_orig = response.meta['url']
        if url_orig in self.skus:
            sku = self.skus[url_orig]
        else:
            sku = re.findall(r'"prdref"\,"(.+?)"', description_field)[0]

    brand = re.findall(r'"prdparam-brand"\,"(.+?)"', description_field)
    identifier = hxs.select(
        "//input[@name = 'sProductId']/@value").extract()[0]

    categories = hxs.select(
        '//div[@class="breadcrumb"]//span[@itemprop="title"]/text()'
    ).extract()
    # Keep at most three non-blank breadcrumb entries, without 'home'.
    categories = [
        c for c in categories if c.strip() and c.strip().lower() != 'home'
    ][:3]

    try:
        image_url = hxs.select(
            "//article[@class='product cancelOverfProduct col9']//img/@src"
        ).extract()[0]
    except IndexError:
        image_url = ""

    for dealer in hxs.select('//div[@class="merchant product"]'):
        seller = dealer.select(
            './/p[@class="soldby"]/strong/a//text()').extract()
        if not seller:
            seller = ['Pixmania']
        if 'Pixmania' not in seller:
            continue

        stock = dealer.select('.//span[@class="available"]')
        price = dealer.select(
            './/span[@class="currentPrice"]//text()').extract()
        price = ''.join(price).replace(',', '.')
        # The last productPrices span is used as the shipping cost.
        shipping = dealer.select(
            './/div[@class="productPrices"]/span/text()').extract()
        shipping = shipping[-1].replace(',', '.') if shipping else '0'
        prod_id = identifier + '-' + seller[0].lower()

        l = ProductLoader(item=Product(), response=response)
        l.add_value('image_url', image_url)
        l.add_value('url', response.url)
        l.add_value('name', name)
        l.add_value('price', price)
        if not stock:
            l.add_value('stock', 0)
        l.add_value('category', categories)
        if brand:
            l.add_value('brand', brand[0])
        l.add_value('shipping_cost', shipping)
        l.add_value('identifier', prod_id)
        l.add_value('dealer', seller)
        l.add_value('sku', sku)
        product = l.load_item()

        metadata = SonaeMeta()
        metadata['exclusive_online'] = 'Yes'

        delivery = dealer.re(r'([\d-]+?) dias')
        if delivery:
            # For a range such as "3-5" only the lower bound is used.
            if '-' in delivery[0]:
                delivery = [delivery[0].split('-')[0]]
            delivery_days = int(delivery[0])
            if delivery_days == 1:
                metadata['delivery_24'] = 'Yes'
            elif delivery_days == 2:
                metadata['delivery_24_48'] = 'Yes'
            elif delivery_days < 5:
                metadata['delivery_48_96'] = 'Yes'
            else:
                metadata['delivery_96_more'] = 'Yes'

        previous_price = dealer.select(
            './/span[@class="previousPrice"]/del/text()').re(r'[\d,.]+')
        if previous_price:
            # Drop '.' separators, then turn the decimal ',' into '.'.
            metadata['promotion_price'] = previous_price[0].replace(
                '.', '').replace(',', '.')

        product['metadata'] = metadata
        yield product
def parse_product(self, response):
    """Parse a product page into a single 'Fnac' dealer item.

    Identifier, EAN (sku), brand and picture URL are read from the
    ``tc_vars`` assignments of an inline script; the identifier is stored
    with a ``fcom`` prefix and added to ``self.seen``. The price comes from
    the sellers tab whose text contains "Fnac". A page without a price is
    skipped unless the identifier is already in ``self.products``, in which
    case the item is emitted with a zero stock.

    The attached ``SonaeMeta`` carries ``delivery_24_48``, the optional
    ``exclusive_online`` flag from ``response.meta``, the old price when
    present, the extraction timestamp and the promotion dates; previous
    promotion dates come from ``self.metadata_[identifier]``, if any.
    ``shipping_cost`` is ``''`` whenever no amount can be read from the
    delivery block.

    :param response: product page response.
    :raises ValueError: when the ``tc_vars`` script, the product id or the
        product name cannot be found.
    :raises IndexError: when the breadcrumb has fewer than two titles.
    """
    self.log("[[TEST]] parse_product")

    description_field = response.xpath(
        "//script[contains(text(), 'tc_vars')]/text()").extract_first()
    if not description_field:
        raise ValueError("Could not find description field: {}".format(
            response.url))

    m = re.findall(r'tc_vars\["product_id"\]\s*=\s*"([^"]*)"',
                   description_field)
    # Fall back to '' so a missing id reaches the ValueError below instead
    # of failing earlier with an IndexError.
    identifier = m[0] if m else ''
    if not identifier:
        raise ValueError("Identifier not found: {}".format(response.url))
    identifier = 'fcom' + identifier

    # NOTE(review): this XPath starts with `//`, so it searches the whole
    # document rather than only `.ProductSummary-title` - confirm intended.
    name = response.css('.ProductSummary-title').xpath(
        "//*[@itemprop='name']/text()").extract_first()
    # Same idea as above: a missing node must raise the ValueError, not an
    # AttributeError from calling .strip() on None.
    name = (name or '').strip()
    if not name:
        raise ValueError("Name not found: {}".format(response.url))
    subname = response.css('.ProductSummary-subTitle').xpath(
        "span[a]/preceding-sibling::span/text()").extract_first()
    if subname:
        name = ' '.join([name, subname])

    m = re.findall(r'tc_vars\["product_EAN"\]\s*=\s*"(\d*)"',
                   description_field)
    sku = m[0] if m else ''

    price = response.xpath(
        '//*[@class="ProductSellers-tabControlText" and contains(text(), "Fnac")]//text()'
    )
    if price:
        price = ' '.join(price.extract()).replace(u'\xa0', '')
        price = re.search(r'([\d,]+)', price, re.MULTILINE | re.DOTALL)
        if price:
            price = price.groups()[0]
            self.log(price)
    if not price and identifier not in self.products:
        self.log('Price not found {}'.format(response.url))
        return
    stock = 1 if price else 0

    breadcrumb_titles = response.css('.Breadcrumb-list').css(
        '.Breadcrumb-item').css('[itemprop=title]')
    category_01 = breadcrumb_titles[1].xpath('text()').extract_first()
    try:
        category_02 = breadcrumb_titles[2].xpath('text()').extract_first()
    except IndexError:
        category_02 = ''

    m = re.findall(r'tc_vars\["product_trademark"\]\s*=\s*"([^"]*)"',
                   description_field)
    brand = m[0] if m else ''

    # '' is the single "no shipping cost found" value, whether the delivery
    # block is missing or holds no amount.
    shipping = ''
    delivery_text = ''.join(
        response.css('.Delivery').xpath('.//text()').extract()).strip()
    shipping_match = re.search(r'([\d,]+)', delivery_text)
    if shipping_match:
        shipping = extract_price_eu(shipping_match.groups()[0])

    m = re.findall(r'tc_vars\["product_picture_url"\]\s*=\s*"([^"]*)"',
                   description_field)
    # A missing picture no longer aborts the product; it is handled like
    # the optional sku and brand above.
    image_url = m[0] if m else ''

    l = ProductLoader(item=Product(), response=response)
    self.seen.add(identifier)
    l.add_value('identifier', identifier)
    l.add_value('name', name)
    l.add_value('url', response.url)
    l.add_value('sku', sku)
    l.add_value('price', price)
    if not stock:
        l.add_value('stock', stock)
    l.add_value('category', category_01)
    l.add_value('category', category_02)
    l.add_value('brand', brand)
    l.add_value('shipping_cost', shipping)
    l.add_value('image_url', image_url)
    l.add_value('dealer', 'Fnac')
    product = l.load_item()

    metadata = SonaeMeta()
    product['metadata'] = metadata
    metadata['delivery_24_48'] = 'Yes'
    if response.meta.get('exclusive_online'):
        metadata['exclusive_online'] = 'Yes'

    promotion_price = response.css('.ProductPriceBox').css(
        '.oldPrice').xpath("text()").extract_first()
    if promotion_price:
        # Strip non-breaking spaces, the euro sign and plain spaces.
        promotion_price = promotion_price.strip().replace(
            u'\xa0', '').replace(u'\u20ac', '').replace(' ', '')
        metadata['promotion_price'] = str(extract_price_eu(promotion_price))

    if identifier in self.metadata_:
        prev_meta = self.metadata_[identifier]
    else:
        prev_meta = {}
    # An old price on the page marks an active promotion.
    promo = promotion_price
    promo_start = prev_meta.get('promo_start')
    promo_end = prev_meta.get('promo_end')

    # Read the clock once so the promo dates and the extraction timestamp
    # cannot disagree when the crawl straddles midnight.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    metadata['extraction_timestamp'] = now.strftime('%Y-%m-%d %H:%M')
    if promo:
        # Keep the recorded start only while that promotion is still open.
        if promo_start and not promo_end:
            metadata['promo_start'] = promo_start
        else:
            metadata['promo_start'] = today
        metadata['promo_end'] = ''
    elif promo_start:
        # No promotion now: close a previously open one as of today.
        metadata['promo_start'] = promo_start
        metadata['promo_end'] = promo_end if promo_end else today

    yield product