def parse_product(self, response):
    """Parse a product page, first expanding any unselected option dropdown.

    While the page still shows a selection dropdown whose selected option has
    value "0", one form post is issued per real option (value != "0") and the
    resulting page is fed back into this callback. Otherwise the product
    fields are loaded from the page and a single item is yielded, carrying a
    ``Promotions`` metadata entry when a promotion text is found.
    """
    hxs = HtmlXPathSelector(response)
    unselected = hxs.select(
        '//div[@class="BBFLW100 pdSelections"]/select/option[@selected="selected"][@value="0"]'
    )
    if unselected:
        pending = unselected.select('../option[@value!="0"]')
        if pending:
            # Only needed when a form post is actually issued, so a page
            # lacking these nodes no longer fails when nothing is posted.
            url = hxs.select('//link[@rel="canonical"]/@href').extract()[0]
            viewstate = hxs.select(
                "//input[@id='__VIEWSTATE']/@value").extract()[0]
            generator = hxs.select(
                "//input[@id='__VIEWSTATEGENERATOR']/@value").extract()[0]
            for option in pending:
                event = option.select('../@name').extract()[0]
                formdata = {
                    '__VIEWSTATE': viewstate,
                    '__VIEWSTATEGENERATOR': generator,
                    '__EVENTTARGET': event,
                    event: option.select('@value').extract()[0],
                }
                yield FormRequest(url,
                                  formdata=formdata,
                                  callback=self.parse_product,
                                  dont_filter=True,
                                  meta={'event': event})
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name', '//h1[@id="h1ProdName"]/text()')
    loader.add_xpath(
        'category',
        '//div[@id="Breadcrumb"]//span[@itemprop="title"]/text()[.!="Home" and .!="Offers"]'
    )
    loader.add_xpath('image_url', '//img[@id="imgProdMainImg"]/@src')
    loader.add_xpath(
        'brand', '//div[@id="pnlManufacturer"]/meta[@itemprop]/@content')
    loader.add_xpath(
        'shipping_cost',
        '//div[@id="pdEstmtdDlvrDesc"]/ul[1]/li[@class="charges"]/text()')
    if not hxs.select(
            '//div[@id="pdStock"]/span[text()="In Stock"]').extract():
        loader.add_value('stock', 0)
    loader.add_xpath('identifier', '//span[@id="lblProdCode"]/text()')
    loader.add_xpath(
        'price',
        '//div[@id="pnlProdPriceNStock"]//span[@itemprop="price"]/text()')
    loader.add_xpath('sku', '//span[@id="lblProdCode"]/text()')
    item = loader.load_item()

    promotions = hxs.select(
        '//div[@class="was-saveprice FL"]/style/text()').re('{content:"(.+)"}')
    if promotions:
        metadata = MetaData()
        metadata['Promotions'] = promotions[0]
        item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Build one product item from a product page.

    The identifier, also used as the SKU, is read from
    ``response.meta['id']``. Stock is set to 0 when the page has no in-stock
    availability marker.
    """
    selector = HtmlXPathSelector(response)
    product_id = response.meta['id']

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//meta[@property="og:title"][1]/@content')
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)
    loader.add_value('url', response.url)
    loader.add_xpath(
        'price',
        '//span[@class="price-including-tax"]/span[@class="price"]/text()')
    loader.add_xpath('image_url', '//div[@class="product-img-box"]//img/@src')
    loader.add_xpath(
        'category',
        '//div[@class="breadcrumbs"]/ul/li[position()>1]/a/text()')

    manufacturer = selector.select(
        '//th[text()="Manufacturer"]/../td/text()').extract()
    if manufacturer:
        loader.add_value('brand', manufacturer[0])

    in_stock = selector.select('//p[@class="availability in-stock"]/span')
    if not in_stock:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Load a product from the "prod-box" section of a page.

    When the page exposes no image source, the same URL is re-requested with
    ``parse_category`` as callback and no item is produced. A shipping cost
    is attached for prices below 20 (2) and below 40 (4.99).
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product_box = hxs.select('//div[@class="prod-box"]')
    breadcrumbs = hxs.select('//ul[@class="breadcrumbs"]')[0]

    loader = ProductLoader(selector=product_box, item=Product())
    loader.add_value('url', response.url)
    loader.add_value(
        'brand',
        breadcrumbs.select(
            './/a[contains(text(), "Brands")]/../following-sibling::li[1]/a/text()'
        ).extract())
    # Breadcrumb entries mentioning "Brand" are not categories
    loader.add_value(
        'category',
        [crumb for crumb in breadcrumbs.select('.//a/text()').extract()
         if "Brand" not in crumb])

    images = hxs.select('//section[@id="one"]//@src').extract()
    if not images:
        yield Request(response.url,
                      callback=self.parse_category,
                      dont_filter=True)
        return
    loader.add_value('image_url', urljoin(base_url, images[0]))

    loader.add_xpath('name', './h1/text()')
    loader.add_xpath('identifier', '//*/@prodref')
    loader.add_xpath('sku', '//*/@prodref')
    available = product_box.select(
        '//*[text()="In Stock" or text()="Low Stock"]')
    if not available:
        loader.add_value('stock', 0)
    loader.add_xpath('price', './/span[@class="product-price"]/text()')

    product = loader.load_item()
    price = product['price']
    if price < 20:
        product['shipping_cost'] = 2
    elif price < 40:
        product['shipping_cost'] = 4.99
    yield product
def parse_product(self, response):
    """Load a product from the page and yield it when its price is positive.

    The price is taken from the non-blank text nodes of the "regular-price"
    span, skipping the first one. The first gallery image whose URL is
    shorter than 1024 characters is used as the image.
    """
    hxs = HtmlXPathSelector(response)
    # A list comprehension rather than ``filter(...)[1:]``: on Python 3
    # ``filter`` returns an iterator, which cannot be sliced.
    price_texts = hxs.select(
        "//span[@class='regular-price']//text()").extract()
    price = [text for text in price_texts if text.strip()][1:]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('price', price)
    loader.add_value('url', response.url)
    loader.add_xpath('name', "//div[@class='product-name']//h1//text()")
    loader.add_xpath(
        'category',
        "//div[@class='breadcrumbs']//li[position() > 1 and position() < last()]/a/text()"
    )
    brand = hxs.select(
        "//div[@class='product-shop']/div[@class='product-name']/a[@class='brand']/text()"
    ).extract()
    loader.add_value('brand', brand)
    loader.add_value('shipping_cost', 0)
    loader.add_xpath('sku', '//li/span[text()="SKU:"]/../text()')
    loader.add_xpath(
        'identifier',
        "//div[@class='product-view']//input[@name='product']/@value")

    image_urls = hxs.select(
        '//img[contains(@class, "gallery-image")]/@src').extract()
    for image_url in image_urls:
        # Skip over-long sources and keep the first acceptable one
        if len(image_url) < 1024:
            loader.add_value('image_url', image_url)
            break

    product = loader.load_item()
    if product['price'] > 0:
        yield product
def parse_node(self, response, node):
    """Build a product item from one XML node.

    The name is the node's ``parent_title`` (or ``title`` when absent) with
    material, colour and size appended in that order when present. Stock is
    set to 0 when availability reads exactly "out of stock".
    """

    def child_text(tag):
        # Children are matched by local name, ignoring any namespace prefix
        return node.xpath('./*[local-name()="%s"]/text()' % tag).extract()

    size = child_text('size')
    color = child_text('color')
    material = child_text('material')
    titles = child_text('parent_title')
    if not titles:
        titles = node.xpath('./title/text()').extract()

    name = titles[0]
    for attribute in (material, color, size):
        if attribute:
            name += u' {}'.format(attribute[0])

    loader = ProductLoader(item=Product(), selector=node)
    loader.add_value('name', name)
    loader.add_xpath('url', './link/text()')
    loader.add_xpath('image_url', './*[local-name()="image_link"]/text()')
    loader.add_xpath('identifier', './*[local-name()="id"]/text()')
    loader.add_xpath('price', './*[local-name()="price"]/text()')
    loader.add_xpath(
        'shipping_cost',
        './*[local-name()="shipping"]/*[local-name()="price"]/text()')
    loader.add_xpath('brand', './*[local-name()="brand"]/text()')
    loader.add_xpath('category',
                     './*[local-name()="google_product_category"]/text()')
    loader.add_xpath('sku', './*[local-name()="mpn"]/text()')

    availability = child_text('availability')
    if availability and availability[0] == 'out of stock':
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Load a product from the page; pages without a product name are skipped.

    Identifier and SKU are the part of the URL after its last underscore.
    The brand comes from ``response.meta['brand']`` when set, otherwise from
    the product-brand link on the page.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    if not loader.get_collected_values('name'):
        return
    loader.add_xpath('price', '//span[@class="full-price"]/text()')

    # NOTE(review): a "low-stock" marker is treated as out of stock here,
    # the same as "no-stock" - confirm that is intended.
    stock = response.xpath(
        '//div[contains(@class, "low-stock")]') or response.xpath(
            '//div[contains(@class, "no-stock")]')
    if stock:
        loader.add_value('stock', 0)

    categories = response.xpath(
        '//ul[@class="the-breadcrumb-list"]//span[@itemprop="title"]/text()'
    ).extract()
    for category in categories:
        if category.title() not in ('Home', 'Search Results'):
            loader.add_value('category', category)

    brand = response.meta.get('brand')
    if not brand:
        # Guarded so a page without a brand link does not raise IndexError
        brand_links = response.xpath(
            '//div[@class="product-brand"]/a/@href').extract()
        brand = brand_links[0] if brand_links else ''
    if brand:
        loader.add_value('brand', brand.strip('/').replace('-', ' '))

    # The code is a literal value taken from the URL, not an XPath
    # expression, so it must be added with add_value.
    product_code = response.url.rpartition('_')[-1]
    loader.add_value('identifier', product_code)
    loader.add_value('sku', product_code)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per variant of a product, or a single item without variants.

    When the page lists SKU attributes, variants are read from the
    ``stockMatrix`` JavaScript assignment; each row holds the option labels
    followed by an element starting with "sku", then an availability string
    and a price. Identifiers are built from the ``PRODUCT_NUMBER`` entry of
    ``response.meta['row']``.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('category',
                     '//li[@typeof="v:Breadcrumb"]/a[@href!="/"]/text()')
    brand = hxs.select('//script[@type="text/javascript"]/text()').re(
        r'brand: *"(.+)"')
    loader.add_value('brand', brand)
    loader.add_xpath('image_url', '//div[@id="amp-originalImage"]/img/@src')
    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_xpath('name',
                     '//input[@name="speedtrapProductDisplayName"]/@value')
    item = loader.load_item()

    if hxs.select(
            '//ul[@class="productOptionsList"]/li[contains(@class, "skuAttribute")]'
    ):
        scripts = hxs.select(
            '//script[contains(text(),"stockMatrix =")]/text()').extract()
        matrix = []
        # A missing script is handled like a failed regex match: no variants
        if scripts:
            # Bare nulls are quoted so every element of a row is a string
            raw = scripts[0].replace('\n', '').replace('null', '"null"')
            match = re.search('stockMatrix = (.*?);', raw, re.DOTALL)
            if match:
                matrix = json.loads(match.group(1))
        for i, variant in enumerate(matrix):
            sku = [elem for elem in variant if elem.startswith('sku')][0]
            sku_idx = variant.index(sku)
            product = Product(item)
            product['name'] = item['name'] + ' - ' + ' '.join(
                variant[:sku_idx]).title()
            product['identifier'] = '{}-{}'.format(
                response.meta.get('row').get('PRODUCT_NUMBER'), i)
            product['sku'] = product['identifier']
            product['price'] = variant[sku_idx + 2]
            product['stock'] = (
                1 if 'Available#Delivery' in variant[sku_idx + 1] else 0)
            yield product
        return

    product_number = response.meta.get('row').get('PRODUCT_NUMBER')
    loader.add_value('identifier', product_number)
    loader.add_value('sku', product_number)
    loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
    in_stock = hxs.select(
        '//meta[@property="product:availability"]/@content[.="In Stock"]')
    loader.add_value('stock', 1 if in_stock else 0)
    yield loader.load_item()
def parse_product(self, response):
    """Follow facet links and load the product described by the page's meta tags.

    Every ``.facet-nav`` link is requested with this same callback. The item
    is yielded only when an identifier was found. Prices below 50 get a
    shipping cost of 3.99.
    """
    for url in response.css('.facet-nav a::attr(href)').extract():
        yield Request(response.urljoin(url), self.parse_product)

    xpath = '//meta[@property="%s"]/@content'
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('identifier', xpath % 'product:retailer_part_no')
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_xpath('name', xpath % 'og:title')
    loader.add_xpath('price', xpath % 'product:price:amount')
    loader.add_xpath('sku', xpath % 'product:retailer_part_no')

    category = response.xpath(
        '//ul[@itemprop="breadcrumb"]//a/text()').extract()
    # Guarded removals: an unconditional list.remove raises ValueError when
    # the crumb is absent, and pop raises IndexError on an empty breadcrumb.
    for unwanted in ('Home', 'Products'):
        if unwanted in category:
            category.remove(unwanted)
    category = category[:-1]  # the last crumb is not used as a category
    loader.add_value('category', category[-3:])

    loader.add_xpath('image_url', xpath % 'og:image')
    loader.add_xpath('brand', xpath % 'product:brand')
    if loader.get_output_value('price') < 50:
        loader.add_value('shipping_cost', '3.99')
    item = loader.load_item()
    if item.get('identifier'):
        yield item
def parse_product(self, response):
    """Load a product, then request its reviews page with the item in meta.

    When the page has no ``productId`` input, stock is set to 0 and the
    identifier is taken from ``productId=...&`` occurrences in the page text.
    The item is not yielded here; it is passed on to ``parse_review_page``.
    """
    loader = ProductLoader(Product(), response=response)

    product_id = response.xpath(
        '//input[@name="productId"]/@value').extract_first()
    if not product_id:
        loader.add_value('stock', 0)
        product_id = response.xpath('//text()').re('productId=(.+?)&')
    loader.add_value('identifier', product_id)

    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_css('name', 'div.productTitleDescriptionContainer h1::text')
    loader.add_css('price', 'p.pricePerUnit::text')
    loader.add_css('sku', 'p.itemCode::text', re='Item code:(.+)')

    breadcrumb = response.xpath(
        '//ul[@id="breadcrumbNavList"]//a/span/text()').extract()
    if 'Home' in breadcrumb:
        breadcrumb.remove('Home')
    loader.add_value('category', breadcrumb)

    image_src = response.css('img#productImageID::attr(src)').extract_first()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src))

    item = loader.load_item()
    item['metadata'] = {'reviews': []}

    review_id = response.xpath('//text()').re_first("productId: '(.+?)'")
    reviews_url = (
        'http://sainsburysgrocery.ugc.bazaarvoice.com/8076-en_gb/%s/reviews.djs?format=embeddedhtml'
        % review_id)
    yield Request(reviews_url,
                  callback=self.parse_review_page,
                  meta={'item': item})
def parse_product(self, response):
    """Load a product, re-requesting the page while its identifier is missing.

    The retry count is carried in ``response.meta['retries']``. Once it
    exceeds 9 a warning is logged and the page is given up on. No item is
    produced for a page without an identifier.
    """
    identifier = response.css('span#thisstkcode::text').extract_first()
    if not identifier:
        retries = response.meta.get('retries', 0)
        if retries > 9:
            self.logger.warning('No identifier found on %s' % response.url)
        else:
            self.logger.debug('Retry %s to get identifier' % response.url)
            meta = response.meta
            meta['retries'] = retries + 1
            # dont_filter must be a keyword argument: passed positionally as
            # a string it collides with the request's url and raises.
            yield response.request.replace(dont_filter=True, meta=meta)
        # Nothing usable can be loaded without an identifier
        return

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')
    price = response.css('span.prodPrice').xpath(
        './/span[@itemprop="price"]/text()').extract_first()
    loader.add_value('price', price)
    # The first breadcrumb entry is skipped
    category = response.css('.breadcrumbs span::text').extract()[1:]
    loader.add_value('category', category)
    loader.add_css('image_url', '.main-product-photo::attr(href)')
    loader.add_css('brand', 'span#thisbrand::text')
    loader.add_css('stock', 'input#data-stock-qty::attr(value)')
    yield loader.load_item()
def parse_product(self, response):
    """Load a product and pick its SKU from the name and keywords.

    The SKU defaults to the product id and is replaced, in priority order, by:
    the last word of the name when it equals the shortest keyword
    (case-insensitively); the shortest keyword when it has no lower-case
    letters; the last word of the name when it has no lower-case letters; the
    longest digit-bearing token of the name, unless that token contains "(".
    """
    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath(
        '//input[@name="product_id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    category = response.xpath(
        '//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
    loader.add_value('category', category)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_xpath(
        'brand',
        '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content'
    )
    if not response.xpath(
            '//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
        loader.add_value('stock', 0)

    sku = identifier
    name = loader.get_output_value('name')
    name_end = re.search(r'\S+$', name).group(0).strip(' ()')
    # A page without a keywords meta tag is treated as having no keywords
    # instead of failing on None.split.
    keywords_meta = response.xpath(
        '//meta[@name="keywords"]/@content').extract_first() or ''
    keywords = [word.strip() for word in keywords_meta.split(',') if word]
    shortest_keyword = min(keywords, key=len) if keywords else 'none'
    from_name = re.findall(r'\S*\d+\S*', name)

    if shortest_keyword.lower() == name_end.lower():
        sku = name_end
    elif shortest_keyword.upper() == shortest_keyword:
        sku = shortest_keyword
    elif name_end.upper() == name_end:
        sku = name_end
    elif from_name:
        sku = max(from_name, key=len)
        # NOTE(review): a parenthesised token is assumed not to be a real
        # code, so the product id is kept - confirm.
        if '(' in sku:
            sku = identifier
    loader.replace_value('sku', sku)
    yield loader.load_item()
# NOTE(review): this callback takes no ``self`` - confirm it is meant to be a
# plain function or staticmethod rather than a spider method.
def parse_product(response):
    """Yield one item per model block found on the page.

    A model block is a "row-fluid" div containing a ``data`` table and a
    "media span" div. The price is the lowest "Cash" figure in the block;
    blocks without any such figure are skipped. The identifier is the model
    name with whitespace replaced by underscores.
    """
    models = response.xpath(
        '//div[contains(@class, "row-fluid") and .//table[@class="data"] and div[contains(@class, "media span")]]'
    )
    for model in models:
        loader = ProductLoader(item=Product(), selector=model)
        name = model.xpath('.//p/strong//text()').extract()[-1].strip()
        if not name:
            name = model.xpath(
                './/p/strong[contains(text(), "Ford")]//text()').extract(
                )[-1].strip()
        loader.add_value('name', name)

        price_texts = model.xpath(
            './/tr[td[contains(text(), "Cash")]]/td[not(contains(text(), "Cash"))]/text()'
        ).re(r'\d+,\d+')
        prices = [extract_price(text) for text in price_texts]
        if not prices:
            # min() of an empty list would abort the remaining models too
            continue
        loader.add_value('price', min(prices))

        image_url = model.xpath(
            './/picture/source/@data-placeholder').extract()
        # The placeholder source has no scheme, so "http:" is prefixed
        image_url = 'http:' + image_url[0] if image_url else ''
        loader.add_value('image_url', image_url)
        loader.add_value('identifier', '_'.join(name.split()))
        loader.add_value('url', response.url)
        yield loader.load_item()
def parse_product(self, response):
    """Load a product from a "productDetail" page; other pages yield nothing.

    SKU and identifier come from the ``PRODUCT_NUMBER`` entry of
    ``response.meta['row']``. Items priced under 25 get a shipping cost of
    1.95.
    """
    selector = HtmlXPathSelector(response)
    feed_row = response.meta['row']
    if not selector.select('//div[@class="productDetail"]'):
        return

    loader = ProductLoader(item=Product(), selector=selector)
    loader.add_value('url', response.url)
    loader.add_value('sku', feed_row['PRODUCT_NUMBER'])
    loader.add_value('identifier', feed_row['PRODUCT_NUMBER'])
    loader.add_xpath(
        'name',
        u'//div[@class="productDescription"]/h3/text()|//div[@class="productDescription"]/h4/text()'
    )
    loader.add_xpath('brand',
                     u'//div[@class="productDescription"]/h2/text()')

    # A purchase button on the page marks the product as in stock
    if selector.select('//input[contains(@class, "purchaseButton")]'):
        loader.add_value('stock', '1')

    loader.add_xpath('category',
                     '//p[@id="breadCrumbs"]/a[position() > 1]/text()')

    image_sources = selector.select(
        u'//img[@class="productImage"]/@src').extract()
    if image_sources:
        absolute = urljoin_rfc(get_base_url(response), image_sources[0])
        loader.add_value('image_url', absolute)

    loader.add_xpath('price', './/span[@class="ourPrice"]/text()')
    item = loader.load_item()
    if item['price'] < 25:
        item['shipping_cost'] = Decimal('1.95')
    yield item
def parse_product(self, response):
    """Load a product and, unless out of stock, submit its order form.

    Stock is read from the ``product_avail = N;`` script assignment. With a
    stock of "0" the item is yielded directly; otherwise the "orderform"
    form is submitted and the item is handed to ``parse_shipping`` via meta.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_xpath('identifier', '//input[@name="productid"]/@value')
    loader.add_value('url', response.url)
    loader.add_css('name', '.descr::text')
    loader.add_css('price', 'span.currency::text')
    loader.add_value('sku', response.meta['sku'])

    thumbnail = response.css(
        'img#product_thumbnail::attr(src)').extract_first()
    if thumbnail:
        loader.add_value('image_url', response.urljoin(thumbnail))

    loader.add_value('brand', response.meta['brand'])
    available = response.css('.quantity script::text').re(
        'product_avail = (\\d+);')[0]
    loader.add_value('stock', available)
    item = loader.load_item()

    if available == '0':
        yield item
        return

    # The product identifier doubles as the cookiejar key for the request
    request_meta = {
        'cookiejar': item['identifier'],
        'item': Product(item),
    }
    yield FormRequest.from_response(response,
                                    formname='orderform',
                                    meta=request_meta,
                                    cookies=self.cookies,
                                    callback=self.parse_shipping,
                                    dont_filter=True)
def parse_product(self, response):
    """Load a product from the page and yield it.

    Both identifier and SKU are the ``product_id`` input value. Stock is set
    to 0 when an availability text is present and does not contain
    "in magazzino".
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', '//h1//text()')
    product_loader.add_xpath('identifier',
                             '//input[@name="product_id"]/@value')
    # NOTE(review): a "Codice" text lookup used to be computed here under a
    # bare except and was never used; the SKU is the product id.
    product_loader.add_xpath('sku', '//input[@name="product_id"]/@value')
    product_loader.add_xpath('image_url', '//img[@id="image"]/@src')

    brand = response.css('.description').xpath(
        './/a/span/text()').extract_first()
    product_loader.add_value('brand', brand)
    category = response.css('.breadcrumb').xpath(
        'li[2]/a/span/text()').extract()
    product_loader.add_value('category', category)

    price = extract_price_eu(
        hxs.select('//div[@class="price"]/span/text()').extract()[0])
    product_loader.add_value('price', price)

    stock = ''.join(
        hxs.select(
            '//div[@class="description"]/span/strong[contains(text(), '
            '"Disponibilit")]/../following-sibling::text()').extract()
    ).strip().lower()
    if stock and 'in magazzino' not in stock:
        product_loader.add_value('stock', 0)
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the option items of a single-product page and of each bundle entry.

    A shared loader carries url, brand and category. Its image is replaced
    for the main product (when the page has a ``product`` article) and for
    every ``bdp-item`` article before delegating to ``parse_options``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    loader = ProductLoader(selector=hxs, item=Product())
    loader.add_value('url', response.url)
    loader.add_xpath(
        'brand', './/dt[text()="Brand"]/following-sibling::dd[1]/text()')
    loader.add_xpath('category',
                     './/div[contains(@class, "breadcrumbs")]//a/text()')

    if hxs.select('//article[@id="product"]'):
        main_images = hxs.select(
            './/div[@id="amplienceContent"]//img/@src').extract()
        loader.replace_value('image_url', urljoin(base_url, main_images[0]))
        page_scripts = hxs.select(
            '//script[@type="text/javascript"]/text()[contains(., "productData")]'
        ).extract()
        for item in self.parse_options(hxs, base_url, loader, page_scripts):
            yield item

    for entry in hxs.select('//article[@class="bdp-item"]'):
        entry_image = entry.select(
            './/a[contains(@id, "mainImage")]/img/@src').extract()[0]
        loader.replace_value('image_url', urljoin(base_url, entry_image))
        entry_scripts = entry.select(
            './div/div[1]//script[@type="text/javascript"]/text()'
        ).extract()
        for item in self.parse_options(entry, base_url, loader,
                                       entry_scripts):
            yield item
def parse_row(self, response, row):
    """Pre-fill a loader from a feed row and request the row's product page.

    The partially filled loader travels to ``parse_product`` in the request
    meta under the ``loader`` key.
    """
    # (item field, row column) pairs copied verbatim from the row
    field_columns = (
        ('identifier', 'Rapid Code'),
        ('name', 'Description'),
        ('sku', 'Manufactures Code'),
        ('brand', 'Brand'),
        ('url', 'URL'),
    )
    loader = ProductLoader(Product(), response=response)
    for field, column in field_columns:
        loader.add_value(field, row[column])
    yield Request(row['URL'], self.parse_product, meta={'loader': loader})
def parse_products(self, response):
    """Yield one item per entry of the ``items`` list in a JSON response body.

    Each entry supplies ``id`` (identifier and SKU), ``nm`` (name), ``p``
    (price), ``l`` (link) and ``img`` (image); link and image are resolved
    against the response URL.
    """
    payload = json.loads(response.body)
    for entry in payload['items']:
        loader = ProductLoader(item=Product(), response=response)
        entry_id = entry['id']
        loader.add_value('identifier', entry_id)
        loader.add_value('sku', entry_id)
        loader.add_value('name', entry['nm'])
        loader.add_value('price', entry['p'])
        loader.add_value('url', response.urljoin(entry['l']))
        loader.add_value('image_url', response.urljoin(entry['img']))
        yield loader.load_item()
def parse_simple_product(self, response):
    """Yield a single item for a product page without options.

    Identifier and SKU are both the value of the ``product`` input; the
    first breadcrumb link is left out of the category.
    """
    product_id_xpath = '//input[@name="product"]/@value'

    loader = ProductLoader(Product(), response=response)
    loader.add_xpath('identifier', product_id_xpath)
    loader.add_value('url', response.url)
    loader.add_css('name', 'div.product-name h1::text')
    loader.add_css('price', 'li.bigPrice span.price::text')
    loader.add_xpath('sku', product_id_xpath)
    breadcrumb_links = response.css('div.breadcrumbs a::text').extract()
    loader.add_value('category', breadcrumb_links[1:])
    loader.add_css('image_url', 'img#image::attr(src)')
    yield loader.load_item()
def parse_frames(self, response):
    """Yield one item per (option column, size row) cell of each price table.

    A table is located through its "Code"/"CODE" header cell. Option names
    sit in the row above the header and size rows below it, up to the next
    header row. Identifiers combine the SKU, the digits of the size text and
    a slug taken from the page URL, suffixed with "-d" until unused.
    """
    base_url = get_base_url(response)

    # The two header layouts place the first SKU column differently
    header_cells = response.xpath('//tr/td[text()="Code"][1]')
    margin = 3
    if not header_cells:
        header_cells = response.xpath('//tr/td[span/text()="CODE"][1]')
        margin = 2
    if not header_cells:
        self.log('No products found on %s' % response.url)

    page_ids = []
    thumbnails = response.xpath(
        '//img[not (contains(@alt, "Doors"))]/@src[contains(., "images-thumb")]'
    ).extract()
    page_slug = response.url.split('/')[-1].split('_')[0].split('.')[0]

    for header in header_cells:
        option_cells = header.xpath(
            './../preceding-sibling::tr[1]/td[position()>1]')
        for column, option_cell in enumerate(option_cells):
            option_name = option_cell.xpath('.//text()').extract()
            # Each option occupies a SKU column followed by a price column
            sku_column = column * 2 + margin
            for row in header.xpath('./../following-sibling::tr'):
                if row.xpath('td[(text()="Code") or (span/text()="CODE")]'):
                    # Reached the next table's header row
                    break
                if not row.xpath('./td[1][contains(.//text(), " x")]'):
                    continue

                loader = ProductLoader(item=Product(), selector=row)
                loader.add_value('name', option_name)
                size_text = row.xpath('td[1]/text()').extract()
                loader.add_value('name', size_text)
                loader.add_xpath('sku', 'td[%d]/text()' % sku_column)
                loader.add_xpath('price', 'td[%d]/text()' % (sku_column + 1))
                sku = loader.get_output_value('sku')
                if not sku:
                    continue

                digits = re.findall('\\d+', size_text[0])
                identifier = sku + '-' + '-'.join(digits)
                identifier += '-' + page_slug
                while identifier in page_ids or identifier in self.ids_seen:
                    identifier += '-d'
                page_ids.append(identifier)
                self.ids_seen.append(identifier)

                loader.add_value('identifier', identifier)
                loader.add_value('url', response.url)
                if thumbnails:
                    loader.add_value('image_url',
                                     urljoin(base_url, thumbnails[0]))
                yield loader.load_item()
def parse_category(self, response):
    """Yield the products listed on a category page, then follow its filters.

    Products priced under 50 get a shipping cost of 9.95 and products with
    an "outOfStock" button get stock 0. Filter pages are requested only when
    the URL has no ``pn`` query parameter and does not match ``/cat_.+/.``.
    """
    last_crumb = response.css('li.last::text').extract()

    for listing in response.xpath('//div[@typeof="Product"]'):
        loader = ProductLoader(Product(), selector=listing)
        loader.add_xpath('identifier', './/*[@property="url"]/@sku')
        link = listing.xpath('.//*[@property="url"]/@href').extract_first()
        loader.add_value('url', response.urljoin(link))
        loader.add_xpath('name', './/*[@property="url"]/text()')
        loader.add_xpath('price', './/*[@property="price"]/text()')
        loader.add_xpath('sku', './/*[@property="url"]/@sku')
        loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a/text()')
        loader.add_value('category', last_crumb)
        loader.add_xpath('image_url', './/*[@property="image"]/@content')
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', '9.95')
        if listing.xpath('.//button[starts-with(@id, "outOfStock")]'):
            loader.add_value('stock', 0)
        yield loader.load_item()

    is_paginated = url_query_parameter(response.url, 'pn')
    if is_paginated or re.search('/cat_.+/.', response.url):
        return

    # Filter ids are exactly five non-space characters long
    filter_ids = response.css('ul.filters input::attr(id)').re('^\\S{5}$')
    for filter_id in filter_ids:
        yield Request(response.url + '/' + filter_id, self.parse_category)
def parse_car(self, response):
    """Yield an item for a car page that shows a price.

    The price is the cell following "Cash Price", or else the text after
    "Manager's Special Price" in an ``h2``; pages with neither yield
    nothing. The identifier is the second-to-last segment of the URL path.
    """
    hxs = HtmlXPathSelector(response)
    identifier = response.url.split('/')[-2]

    price = hxs.select(
        '//td[contains(text(), "Cash Price")]/following-sibling::td/text()'
    ).extract()
    if not price:
        price = hxs.select('//h2/text()').re(
            "Manager's Special Price (.*)")
    if not price:
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    name = hxs.select(
        '//div[@class="textInner"][./h2]/*//strong/text()').extract()
    if name:
        name = name[0]
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('price', price)
    yield loader.load_item()
def parse_product(self, response):
    """Load a product from the page's ``nosto_product`` block.

    Pages without that block are handed to ``parse_category`` instead. The
    category is the path that sorts last among those listed, split on "/".
    Items whose name contains "Ex Display" get a matching metadata entry.
    """
    if not response.xpath('//div[@class="nosto_product"]'):
        for listed in self.parse_category(response):
            yield listed
        return

    hxs = HtmlXPathSelector(response)
    field_xpath = '//div[@class="nosto_product"]/span[@class="%s"]/text()'

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    for field in ('name', 'price', 'image_url', 'brand'):
        loader.add_xpath(field, field_xpath % field)
    loader.add_xpath('identifier', field_xpath % 'product_id')
    loader.add_xpath('sku', '//h6[@class="product-model"]/text()')

    category_paths = hxs.select(field_xpath % 'category').extract()
    if category_paths:
        last_path = max(category_paths)
        loader.add_value('category', last_path.strip('/').split('/'))

    loader.add_value('shipping_cost', 29.99)
    availability = hxs.select(field_xpath % 'availability').extract()
    if 'InStock' not in availability:
        loader.add_value('stock', 0)

    item = loader.load_item()
    if 'Ex Display' in item['name']:
        item['metadata'] = {'Ex Display': 'Ex Display'}
    yield item
def parse_product(self, response):
    """Load a product, or follow the member links of a product-range page.

    A page linking to ``#product-range`` only produces requests for its
    range members. Otherwise one item is yielded, with a shipping cost of
    2.99 when its price is below 30.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    if hxs.select('//a[@href="#product-range"]'):
        member_links = hxs.select(
            '//section[contains(@class, "product-range")]//div/a/@href'
        ).extract()
        for link in member_links:
            yield Request(urljoin(base_url, link),
                          callback=self.parse_product)
        return

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', '//h1[@class="fn c-both"]/text()')
    # NOTE(review): the second tuple element, '0', is passed as an XPath too;
    # presumably a fallback price of zero - confirm.
    price_xpaths = ('//span[@class="cta now-price"]/text()', '0')
    loader.add_xpath('price', price_xpaths)
    if not hxs.select('//select[@id="quantity"]'):
        loader.add_value('stock', 0)

    breadcrumb = hxs.select(
        '//section[@class="breadcrumbs"]//a/text()').extract()[2:-1]
    for skipped in ('in the kitchen', 'baking'):
        if skipped in breadcrumb:
            breadcrumb.remove(skipped)
    loader.add_value('category', breadcrumb)

    loader.add_value('brand', "Lakeland")
    loader.add_xpath('identifier', '//meta[@name="productcode"]/@content')
    loader.add_xpath('sku', '//meta[@name="productcode"]/@content')
    loader.add_xpath('image_url', '//img[@class="main-image"]/@src')
    loader.add_value('url', response.url)

    product = loader.load_item()
    if product.get('price', 30) < 30:
        product['shipping_cost'] = 2.99
    yield product
def parse_product(self, response):
    """Collect the product fields into a dict and load them into one item.

    Stock is set to 0 when the page has no ``span.product-in-store``. The
    brand falls back to the "other products of brand" text when the
    manufacturer entry is missing. The category is the second-to-last
    breadcrumb span.
    """
    hxs = HtmlXPathSelector(response)
    # Keep only "scheme://host" of the base URL for joining image paths
    base_url = '/'.join(get_base_url(response).split('/')[:3])

    product = {}
    product['identifier'] = response.xpath(
        '//input[@name="elementID"]/@value').extract_first()
    if not response.css('span.product-in-store'):
        product['stock'] = 0
    product['name'] = response.xpath(
        '//h1[@itemprop="name"]/text()').extract_first()
    product['price'] = response.xpath(
        '//meta[@itemprop="price"]/@content').extract_first()
    product['url'] = response.url
    product['brand'] = hxs.select(
        u'//dt[contains(., "Производитель")]/following-sibling::dd/span/text()'
    ).extract_first()
    if not product['brand']:
        product['brand'] = response.xpath('//span/text()').re_first(
            u'Другие товары бренда (.+)')

    image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
    if image_url:
        product['image_url'] = urljoin_rfc(base_url, image_url[0].strip())

    product['sku'] = response.xpath(
        u'//span[contains(., "Артикул:")]/following-sibling::span/text()'
    ).extract_first()
    product['category'] = hxs.select(
        '//div[contains(@class, "breadcrumbs")]//span/text()').extract()[-2]

    product_loader = ProductLoaderWithoutSpaces(item=Product(), selector=hxs)
    # items() rather than iteritems(): the latter does not exist on Python 3
    for field, value in product.items():
        product_loader.add_value(field, value)
    yield product_loader.load_item()
def parse_node(self, response, node):
    """Build an item from one XML node whose id is known to ``id_code_map``.

    Nodes with an unknown or missing id are skipped. When the description
    carries a "Pack Size m" figure, the price is multiplied by it and the
    item is passed to ``parse_pack_price`` through a request to the product
    URL. A cost price from ``cost_prices`` is stored in the item metadata
    when it parses as a decimal.
    """
    identifier = node.xpath('./*[local-name()="id"]/text()').extract_first()
    if identifier not in self.id_code_map:
        return
    product_code = self.id_code_map[identifier]

    loader = ProductLoader(item=Product(), selector=node)
    size = node.xpath('./*[local-name()="size"]/text()').extract()
    color = node.xpath('./*[local-name()="color"]/text()').extract()
    material = node.xpath('./*[local-name()="material"]/text()').extract()
    name = node.xpath('./*[local-name()="parent_title"]/text()').extract()
    if not name:
        name = node.xpath('./title/text()').extract()
    name = name[0]
    for attribute in (material, color, size):
        if attribute:
            name += u' {}'.format(attribute[0])

    price = node.xpath('./*[local-name()="price"]/text()').extract_first()
    pack_size = node.xpath('./description/text()').re(
        r'Pack Size m: *([\d.]+)')
    if pack_size:
        price = extract_price(price) * extract_price(pack_size[0])

    loader.add_value('name', name)
    loader.add_xpath('url', './link/text()')
    loader.add_xpath('image_url', './*[local-name()="image_link"]/text()')
    loader.add_value('identifier', identifier)
    loader.add_value('price', price)
    loader.add_xpath(
        'shipping_cost',
        './*[local-name()="shipping"]/*[local-name()="price"]/text()')
    loader.add_xpath('brand', './*[local-name()="brand"]/text()')
    loader.add_xpath('category',
                     './*[local-name()="google_product_category"]/text()')
    loader.add_xpath('sku', './*[local-name()="mpn"]/text()')
    stock = node.xpath('./*[local-name()="availability"]/text()').extract()
    if stock and stock[0] == 'out of stock':
        loader.add_value('stock', 0)
    item = loader.load_item()

    if product_code in self.cost_prices:
        try:
            cost_price = Decimal(self.cost_prices[product_code])
        except (ArithmeticError, TypeError, ValueError):
            # decimal.InvalidOperation is an ArithmeticError; anything else
            # is left to propagate instead of being swallowed.
            self.log('ERROR: unable to set cost price for item %r' % item)
        else:
            item['metadata'] = {'cost_price': str(cost_price)}

    if pack_size:
        yield Request(loader.get_output_value('url'),
                      self.parse_pack_price,
                      meta={'item': item})
    else:
        yield item
def parse_treatment(self, response):
    """Yield one item per priced row below the first "Code" header cell.

    Only the first four rows after the header are considered, and rows
    without a SKU in their second cell are skipped. ``self.treatment`` is
    set to True once all rows have been processed.
    """
    header = response.xpath('//tr/td[(text()="Code")][1]')[0]
    for row in header.xpath('./../following-sibling::tr[position()<5]'):
        loader = ProductLoader(item=Product(), selector=row)
        size_name = row.xpath('td[1]/text()').extract()
        loader.add_value('name', size_name)
        loader.add_xpath('sku', 'td[2]/text()')
        loader.add_xpath('price', 'td[3]/text()')
        if not loader.get_output_value('sku'):
            continue
        loader.add_xpath('identifier', 'td[2]/text()')
        loader.add_value('url', response.url)
        yield loader.load_item()
    # The loop has no break, so the old for/else branch always ran; a plain
    # statement expresses the same thing.
    self.treatment = True
def parse_product(self, response):
    """Follow product tiles, then yield the page's product or its options.

    Tiles whose id (the text after the last "_" of the link) was not seen
    before are requested with this callback. A page without an ``h1`` is
    retried through ``self.retry``. With zero or one option block a single
    product is read from the page; otherwise one product is yielded per
    option block.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for url in hxs.select('//div[@class="product-tile"]//a/@href').extract():
        pid = url.split('_')[-1]
        if pid not in self.parsed_products:
            self.parsed_products.append(pid)
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    name = hxs.select('//h1/text()').extract()
    if not name:
        request = self.retry(response,
                             "No name for product: " + response.url)
        if request:
            yield request
        return
    product_loader.add_value('name', name)
    category = hxs.select(
        '//ol[@class="breadcrumbs"]//a/text()').extract()[1:]
    product_loader.add_value('category', category)
    img = hxs.select('//div[@class="item"]//img/@src').extract()
    if img:
        product_loader.add_value('image_url', urljoin_rfc(base_url, img[0]))
    product = product_loader.load_item()

    options = hxs.select(u'//div[contains(@class, "MainProds")]/ol/li')
    if not options:
        options = hxs.select(
            u'//div[@class="SingColl"]/div[contains(@class, "Prod")]')

    if not options or len(options) == 1:
        prod = Product(product)
        # The last match / last text node is used, as with list.pop()
        prod['sku'] = hxs.select('//div[@class="product-sku"]/text()').re(
            r'Product code: (\w+)')[-1]
        prod['identifier'] = prod['sku']
        prod['price'] = extract_price(
            hxs.select('//div[@class="price-current"]/text()').extract()[-1])
        if prod['identifier']:
            yield prod
    else:
        for opt in options:
            prod = Product(product)
            prod['name'] = opt.select(
                u'normalize-space(.//h2/text())').extract()[0]
            prod['sku'] = opt.select(
                u'normalize-space(substring-after(.//div[@class="code"]/text(), ":"))'
            ).extract()[0]
            prod['identifier'] = prod['sku']
            prod['price'] = extract_price(
                opt.select(u'.//span[@class="Price"]/text()').extract()[0])
            yield prod
def parse_product(self, response):
    """Yield one item per combination of the product's required options.

    Pages without required option inputs are delegated to
    ``parse_simple_product``. Base items come from
    ``parse_product_options_config`` when a ``Product.Config`` script is
    found, otherwise from ``parse_simple_product``. Each combination extends
    the identifier with the option values, appends the option labels to the
    name and adds the options' ``price`` attributes to the price.
    """
    option_inputs = response.css('label.required').xpath(
        '../following-sibling::dd[1]').css('div.input-box').xpath('*[1]')
    config_script = response.xpath('//script/text()').re_first(
        'Product.Config.*?({.+})')

    if not option_inputs:
        for simple_item in self.parse_simple_product(response):
            yield simple_item
        return

    choice_groups = []
    for widget in option_inputs:
        if widget.extract().startswith('<select'):
            # Dropdown: keep its non-empty options, if it has any
            if widget.xpath('option[@value!=""]'):
                choice_groups.append(widget.xpath('option[@value!=""]'))
        else:
            # Any other widget: its list entries are the choices
            choice_groups.append(widget.xpath('li'))

    if config_script:
        base_items = self.parse_product_options_config(response)
    else:
        base_items = self.parse_simple_product(response)

    for base in base_items:
        if not choice_groups:
            yield base
            continue
        for combination in itertools.product(*choice_groups):
            loader = ProductLoader(Product(), response=response)
            loader.add_value(None, base)
            identifier = base['identifier'] + '-' + '-'.join(
                (choice.xpath('.//@value').extract_first()
                 for choice in combination))
            loader.replace_value('identifier', identifier)
            loader.replace_value('sku', identifier)

            total = base['price']
            for choice in combination:
                label = choice.xpath('text()').extract_first(
                ) or choice.xpath('.//label/text()').extract_first()
                # Drop the price surcharge suffix from the label
                label = label.split(u'+£')[0]
                loader.add_value('name', label)
                total += Decimal(choice.xpath('.//@price').extract_first())
            loader.replace_value('price', total)
            yield loader.load_item()
def parse_product(self, response):
    """Load a single product from the page and yield it.

    Stock is 1 when a span with the exact text "In stock" exists, else 0.
    """
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(selector=selector, item=Product())

    loader.add_xpath('name', '//h1/text()')
    loader.add_xpath('price',
                     '//span[contains(@id, "price-including-tax")]/text()')

    if selector.select('//span[text() = "In stock"]'):
        stock_level = 1
    else:
        stock_level = 0
    loader.add_value('stock', stock_level)

    loader.add_xpath(
        'category',
        '//div[@class="breadcrumbs"]//li[@class!="home"]/a//text()')
    loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_xpath('sku', '//meta[@itemprop="sku"]/@content')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@id="image-main"]/@src')
    yield loader.load_item()