def parse_product(self, response):
    """Yield one product item per row of the page's ``stockMatrix``.

    Fields common to every option (url, name, sku, category, image and
    brand) are loaded once from the page.  Each ``stockMatrix`` row is then
    consumed in order: one value per ``.skuAttribute`` element (appended to
    the name), then the identifier, the stock text and the price.  Rows
    whose stock text starts with ``Unavailable`` are skipped.
    """
    common = ProductLoader(item=Product(), response=response)
    common.add_xpath('url', '//link[@rel="canonical"]/@href')
    common.add_xpath('name', '//span[@id="productName"]//text()')
    common.add_xpath('sku', '//span[@id="productEAN"]/text()[last()]')
    common.add_xpath(
        'category',
        '//div[@id="breadcrumb"]/ul/li[position()>1]/a/span/text()')
    common.add_css('image_url', '.productImageItem ::attr(href)')
    brand_text = response.css('.brand ::text').extract_first()
    if brand_text != "null":
        common.add_value('brand', brand_text)
    base_item = common.load_item()

    matrix_pattern = re.compile('stockMatrix = (.+?);', re.DOTALL)
    matrix_json = response.xpath('//script/text()').re(matrix_pattern)
    for row in json.loads(matrix_json[0]):
        option_loader = ProductLoader(item=Product(), response=response)
        option_loader.add_value(None, base_item)
        cells = iter(row)
        last_attribute = ''
        for _ in response.css('.skuAttribute'):
            last_attribute = next(cells)
            option_loader.add_value('name', last_attribute)

        # The colour image input is looked up by the last attribute value.
        swatch = response.xpath(
            '//input[@class="colourImageUrl"][@name="%s"]/@value'
            % last_attribute).extract_first()
        if swatch:
            option_loader.replace_value(
                'image_url',
                'http://media.littlewoods.com/i/littlewoods/%s?$1064x1416_standard$'
                % swatch)

        option_loader.replace_value('identifier', next(cells))
        availability = next(cells)
        if availability.startswith('Unavailable'):
            continue
        option_loader.replace_value(
            'stock', int('Out of stock' not in availability))
        option_loader.replace_value('price', next(cells))
        yield option_loader.load_item()
def parse_product(self, response):
    """Yield the product on the page, or one item per listed variation.

    A base item is loaded from the page markup (brand comes from
    ``response.meta['brand']``).  If the page has no ``.variations``
    element the base item is yielded as is.  Otherwise the JSON in the
    form's ``data-product_variations`` attribute is decoded and every
    variation yields a copy of the base item with its own name suffixes,
    price and identifier.
    """
    product_id = response.xpath('//div[@itemscope]/@id').re('product-(.+)')

    base_loader = ProductLoader(item=Product(), response=response)
    base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    base_loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    base_loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    breadcrumb = response.css('.breadcrumb a::text').extract()
    base_loader.add_value('category', breadcrumb[1:])
    base_loader.add_value('brand', response.meta['brand'])
    base_loader.add_xpath('image_url', '//div/@data-original-img')
    base_loader.add_value('identifier', product_id)
    base_item = base_loader.load_item()

    if not response.css('.variations'):
        yield base_item
        return

    raw_variations = response.xpath(
        '//form/@data-product_variations').extract_first()
    for entry in json.loads(raw_variations):
        entry_loader = ProductLoader(item=Product(base_item),
                                     response=response)
        option_values = entry['attributes'].values()
        # Start from the plain product name, then append each option label.
        entry_loader.replace_value('name', base_item['name'])
        for option_value in option_values:
            entry_loader.add_xpath(
                'name', '//option[@value="%s"]/text()' % option_value)
        entry_loader.replace_value('price', entry['display_price'])
        entry_loader.replace_value('identifier', entry['variation_id'])
        yield entry_loader.load_item()
def parse_product(self, response):
    """Yield items for a product page, expanding the stock matrix if any.

    When the page lists ``skuAttribute`` options, every ``stockMatrix`` row
    becomes its own item identified as ``<PRODUCT_NUMBER>-<row index>``.
    Otherwise a single item is built from the page's hidden inputs and the
    availability meta tag.  ``PRODUCT_NUMBER`` is read from
    ``response.meta['row']``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)  # NOTE(review): never used below

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath(
        'category', '//li[@typeof="v:Breadcrumb"]/a[@href!="/"]/text()')
    brand = hxs.select('//script[@type="text/javascript"]/text()').re(
        'brand: *"(.+)"')
    loader.add_value('brand', brand)
    loader.add_xpath('image_url', '//div[@id="amp-originalImage"]/img/@src')
    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_xpath(
        'name', '//input[@name="speedtrapProductDisplayName"]/@value')
    item = loader.load_item()

    option_nodes = hxs.select(
        '//ul[@class="productOptionsList"]/li[contains(@class, "skuAttribute")]')
    if option_nodes:
        script = hxs.select(
            '//script[contains(text(),"stockMatrix =")]/text()')[0].extract()
        # Every ``null`` is replaced by the string ``"null"`` before parsing.
        script = script.replace('\n', '').replace('null', '"null"')
        found = re.search('stockMatrix = (.*?);', script, re.DOTALL)
        rows = json.loads(found.group(1)) if found else []
        for position, row in enumerate(rows):
            # Cells before the one starting with "sku" are option labels;
            # the two cells after it are read as availability and price.
            sku_cell = [cell for cell in row if cell.startswith('sku')][0]
            sku_at = row.index(sku_cell)
            variant = Product(item)
            variant['name'] = (
                item['name'] + ' - ' + ' '.join(row[:sku_at]).title())
            variant['identifier'] = '{}-{}'.format(
                response.meta.get('row').get('PRODUCT_NUMBER'), position)
            variant['sku'] = variant['identifier']
            variant['price'] = row[sku_at + 2]
            if 'Available#Delivery' in row[sku_at + 1]:
                variant['stock'] = 1
            else:
                variant['stock'] = 0
            yield variant
        return

    product_number = response.meta.get('row').get('PRODUCT_NUMBER')
    loader.add_value('identifier', product_number)
    loader.add_value('sku', product_number)
    loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
    in_stock = hxs.select(
        '//meta[@property="product:availability"]/@content[.="In Stock"]')
    loader.add_value('stock', 1 if in_stock else 0)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per row of the ``.tbl`` options table.

    The first four-digit run in the URL is the base identifier and sku.
    When the page has no option rows, the base item is yielded with a
    price of 0.  Otherwise each row overrides identifier, sku, price and
    name on a copy of the base item.
    """
    page_code = re.search(r'\d\d\d\d', response.url).group(0)

    base_loader = ProductLoader(Product(), response=response)
    base_loader.add_value('identifier', page_code)
    base_loader.add_value('sku', page_code)
    base_loader.add_value('url', response.url)
    base_loader.add_xpath('name', '//header[@class="prodCat"]/h1/text()')
    breadcrumb = response.css('.bread li a::text').extract()[1:]
    breadcrumb.extend(response.css('.bread li:last-child::text').extract())
    base_loader.add_value('category', breadcrumb)
    image_href = response.css('.detimg a::attr(href)').extract_first()
    if image_href:
        base_loader.add_value('image_url', response.urljoin(image_href))
    base_item = base_loader.load_item()

    rows = response.css('.tbl').xpath('.//*[@class="tr"]')
    if not rows:
        base_item['price'] = 0
        yield base_item
        return

    for row in rows:
        row_loader = ProductLoader(Product(), selector=row)
        row_loader.add_value(None, base_item)
        row_code = row.xpath('.//input/@name').extract_first()
        row_loader.replace_value('identifier', row_code)
        row_loader.replace_value('sku', row_code)
        # The ".pr-now" price is loaded first; the plain cell text is added
        # after it as a further candidate.
        row_loader.replace_css('price', '.tc-price .pr-now::text')
        row_loader.add_css('price', '.tc-price::text')
        row_loader.replace_css('name', '.tc-title::text')
        yield row_loader.load_item()
def parse_product(self, response):
    """Yield one item per row of the page's ``table.product-table``.

    Page-level fields are loaded once; every table row then gets its own
    identifier (``<sku>-<md5 of page name + row name>``), sku and price.
    The row name is appended to the product name unless it is already
    contained in it.  Nothing is yielded when the table has no rows.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    sku = response.xpath(
        '//div[@itemprop="description"]/div/div[last()]/text()'
    ).extract_first()
    loader.add_value('identifier', sku)
    loader.add_value('sku', sku)
    category = response.css('.breadcrumbs a::text').extract()[1:]
    category += response.css('.breadcrumbs li:last-of-type::text').extract()
    loader.add_value('category', category)
    image_url = response.css('img.gallery-main-image::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    if not response.css('.in-stock'):
        loader.add_value('stock', 0)
    item = loader.load_item()

    for option in response.css('table.product-table tbody tr'):
        loader = ProductLoader(Product(), selector=option)
        loader.add_value(None, item)
        sku = option.css('span.product-code::text').re(r'\((.+)\)')[0]
        name = option.css('span.product-name::text').extract_first()
        # Hash explicit UTF-8 bytes: md5() cannot take text containing
        # non-ASCII characters.  For ASCII names the digest is unchanged.
        digest = hashlib.md5(
            (item['name'] + name).encode('utf-8')).hexdigest()
        identifier = '-'.join((sku, digest))
        loader.replace_value('identifier', identifier)
        loader.replace_value('sku', sku)
        loader.add_css('price', 'span.product-price-rrp')
        price = option.css('td.product-price').xpath(
            'text()[last()]').extract_first()
        loader.replace_value('price', price)
        if name not in item['name']:
            loader.add_value('name', name)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product, or one item per selectable size.

    Without ``sizeselect`` elements the base item is yielded unchanged.
    With them, each named size is matched to a ``size:<name>`` text node
    from which the sku and quantity are read; the size is stored in the
    item's ``metadata``.
    """
    base_loader = ProductLoader(Product(), response=response)
    base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    base_loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    base_loader.add_xpath('price', '//h2[@itemprop="price"]/text()')
    breadcrumb = response.xpath('//div[@id="breadcrumbs"]/a/text()').extract()
    base_loader.add_value('category', breadcrumb[1:-1])
    image_src = response.css('img.productimage::attr(src)').extract_first()
    if image_src:
        base_loader.add_value('image_url', response.urljoin(image_src))
    base_loader.add_value('shipping_cost', 10)
    base_loader.add_xpath(
        'identifier', '//link[@rel="canonical"]/@href', re=r'\d+$')
    base_loader.add_xpath('sku', '//*/text()', re=r'Product code \#(.+)$')
    unavailable = response.xpath(
        "//*[contains(., 'SOLD OUT') or contains(., 'not available to buy online')]")
    if unavailable:
        base_loader.add_value('stock', 0)
    base_item = base_loader.load_item()

    size_nodes = response.xpath('//*[contains(@class, "sizeselect")]')
    if not size_nodes:
        yield base_item
        return

    for size_node in size_nodes:
        size = size_node.xpath('text()').extract_first()
        if not size:
            continue
        details = response.xpath(
            '//span/text()[contains(., "size:%s")]' % size
        ).extract_first().strip()
        size_sku = re.search(r'sku:(\d+)', details).group(1)
        if size_node.css('.sizeselectsoldout'):
            quantity = 0
        else:
            quantity = re.search(r'qty:(\d+)', details).group(1)
            # A zero quantity on a size not marked sold out is reported as 1.
            if not quantity or not int(quantity):
                quantity = 1

        size_loader = ProductLoader(Product(), response=response)
        size_loader.add_value(None, base_item)
        size_loader.add_value('name', size)
        size_loader.replace_value('identifier', size_sku)
        size_loader.replace_value('sku', size_sku)
        size_loader.replace_value('stock', quantity)
        size_item = size_loader.load_item()
        size_item['metadata'] = {'size': size}
        yield size_item
def parse_product(self, response):
    """Yield the product, or one item per row of the variant table.

    Pages without ``tr[@itemprop="offers"]`` rows yield the base item.
    Otherwise the attribute labels found on the page are matched
    (case-insensitively) to the variant table's column headers, and each
    row contributes those cells to the item name and ``metadata``, plus its
    own price, identifier and sku.

    Raises:
        ValueError: if an attribute label has no matching table header.
    """
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    sku = response.xpath('//input[@id="productSku"]/@value').extract_first()
    loader.add_value('identifier', sku)
    loader.add_value('sku', sku)
    loader.add_xpath('brand', '//span[@itemprop="brand"]/text()')
    category = response.xpath(
        '//div[@class="breadcrumbs"]//li/a/text()').extract()[-3:]
    loader.add_value('category', category)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//span[@id="price-displayed"]/text()')
    image_url = response.xpath('//a[@id="productImage"]/img/@src').extract()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))
    item = loader.load_item()

    # Decide on the simple-product case before touching the variant table:
    # the header lookup below raises when a label has no matching column,
    # which must not prevent a plain product from being yielded.
    options = response.xpath('//tr[@itemprop="offers"]')
    if not options:
        yield item
        return

    attributes = response.xpath(
        '//br/preceding-sibling::label[@for!="input-quantity"]/text()'
    ).extract()
    # A real list is needed for ``.index``.
    headers = [
        header.lower() for header in
        response.xpath('//table[@id="variant-table"]//th/text()').extract()
    ]
    attr_indexes = {headers.index(attr.lower()): attr for attr in attributes}

    for option in options:
        cells = option.xpath('.//td')
        metadata = {}
        option_name = []
        for idx in sorted(attr_indexes):
            value = cells[idx].xpath('.//text()').re_first(r' *\S+.+')
            if value:
                option_name.append(value.strip())
                metadata[attr_indexes[idx]] = value.strip()
        loader = ProductLoader(Product(), selector=option)
        loader.add_value(None, item)
        loader.add_value('name', option_name)
        loader.replace_xpath('price', './/span[@itemprop="price"]/text()')
        # 0 is added after the row price as a further candidate value.
        loader.add_value('price', 0)
        loader.replace_xpath(
            'identifier', './/input[contains(@name, "VariantSku")]/@value')
        loader.replace_xpath(
            'sku', './/input[contains(@name, "VariantSku")]/@value')
        option_item = loader.load_item()
        option_item['metadata'] = metadata
        yield option_item
def parse_product(self, response):
    """Yield the product, or one item per combination of its select options.

    Responses that landed on an error URL (``aspxerrorpath``) are retried
    from the first URL of the redirect chain.  A product whose stock text
    does not contain "Order Now" gets stock 0, and one whose text contains
    "Discontinued" is flagged in ``metadata``.  When the sidebar has
    ``select`` elements, the cartesian product of their options is emitted,
    each with the option values appended to the identifier.
    """
    if 'aspxerrorpath' in response.url:
        yield Request(response.request.meta['redirect_urls'][0],
                      self.parse_product,
                      dont_filter=True)
        return

    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath('//@data-feefo-vendor-ref').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_css('name', 'header.page-title h1::text')
    loader.add_css('price', 'header.product-sidebar__price h2::text')
    loader.add_value('sku', identifier)
    category = response.css('.breadcrumb a::text').extract()
    loader.add_value('category', category[1:-1])
    image_url = response.css(
        '.product-gallery__main-image img::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    # A missing stock element is treated as empty text (hence out of stock)
    # instead of crashing on ``None.title()``.
    stock = response.css('.product-sidebar__stock::text').extract_first() or ''
    stock_title = stock.title()
    if 'Order Now' not in stock_title:
        loader.add_value('stock', 0)
    item = loader.load_item()
    if 'Discontinued' in stock_title:
        item['metadata'] = {"Discontinued?": "Yes"}

    option_types = response.css('.product-sidebar select')
    if not option_types:
        yield item
        return

    options = [
        option_type.xpath('option[@value!="Select"]')
        for option_type in option_types
    ]
    for variant in itertools.product(*options):
        loader = ProductLoader(Product(), response=response)
        loader.add_value(None, item)
        identifier = item['identifier']
        for option in variant:
            loader.add_value('name', option.xpath('text()').extract())
            identifier += '-' + option.xpath('@value').extract_first()
        loader.replace_value('identifier', identifier)
        loader.replace_value('sku', identifier)
        option_item = loader.load_item()
        option_item['metadata'] = item.get('metadata', {})
        yield option_item
def parse_product(self, response):
    """Yield a single product built from the page and its flix attributes.

    Nothing is yielded when the product name is missing.  Identifier and
    sku come from the ``data-flix-ean`` / ``data-flix-mpn`` script
    attributes, falling back to the text next to the "Model" label.  Pages
    showing an ``Exdisplay`` image are flagged in ``metadata``.
    """
    flix_attr = '//script[@type="text/javascript"]/@data-flix-%s'
    title = response.xpath('//td/div[@align="center"]/b/text()').extract()
    if not title:
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', title[0].strip(' ,'))
    loader.add_value('url', response.url)

    # Blank EAN values are discarded before deciding on the fallback.
    ean_values = [
        ean for ean in response.xpath(flix_attr % 'ean').extract()
        if bool(ean.strip())
    ]
    if not ean_values or not ean_values[0]:
        ean_values = response.xpath(
            '//b[contains(text(), "Model :")]/../text()[1]').extract()
    mpn_values = response.xpath(flix_attr % 'mpn').extract()
    if not mpn_values or not mpn_values[0]:
        mpn_values = response.xpath(
            '//b[contains(text(), "Model")]/../text()[1]').extract()
    loader.add_value('identifier', ean_values)
    loader.add_value('sku', mpn_values)

    prices = re.findall(u'POST.+?> *€(.+?) *<', response.body)
    loader.add_value('price', prices)
    loader.add_xpath('category', '//h8//a[position()>1]/text()')
    loader.add_xpath('brand', flix_attr % 'brand')
    warehouse = response.xpath(
        '//button[@value="Central Warehouse"]/../text()').extract_first()
    if not (warehouse and 'Available' in warehouse):
        loader.add_value('stock', 0)

    product = loader.load_item()
    if response.xpath('//img[@alt="Exdisplay"]'):
        product['metadata'] = {'Ex Display': 'Ex Display'}
    yield product
def parse_product(self, response):
    """Yield a product read from the page's ``nosto_product`` block.

    Pages without that block are handed to ``self.parse_category``.  The
    category is the lexicographically greatest category path, split on
    ``/``.  Items whose name contains "Ex Display" are flagged in
    ``metadata``.
    """
    hxs = HtmlXPathSelector(response)
    nosto_field = '//div[@class="nosto_product"]/span[@class="%s"]/text()'

    if not response.xpath('//div[@class="nosto_product"]'):
        for listed in self.parse_category(response):
            yield listed
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    for field in ('name', 'price', 'image_url', 'brand'):
        loader.add_xpath(field, nosto_field % field)
    loader.add_xpath('identifier', nosto_field % 'product_id')
    loader.add_xpath('sku', '//h6[@class="product-model"]/text()')

    category_paths = hxs.select(nosto_field % 'category').extract()
    if category_paths:
        last_path = sorted(category_paths)[-1]
        loader.add_value('category', last_path.strip('/').split('/'))

    loader.add_value('shipping_cost', 29.99)
    availability = hxs.select(nosto_field % 'availability').extract()
    if 'InStock' not in availability:
        loader.add_value('stock', 0)

    product = loader.load_item()
    if 'Ex Display' in product['name']:
        product['metadata'] = {'Ex Display': 'Ex Display'}
    yield product
def parse_product(self, response):
    """Yield a product from ``window.universal_variable`` plus option pages.

    The JSON object assigned to ``window.universal_variable`` supplies url,
    name, price, identifiers, stock and breadcrumb.  Items priced below 30
    get a shipping cost of 3.50.  Every ``<option>`` value on the page is
    then requested with this same callback.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    script = hxs.select(
        '//script[@type="text/javascript"]/text()[contains(., "window.universal_variable")]'
    ).extract()[0]
    script = script.replace('\r\n', '')
    payload = json.loads(
        re.findall('window.universal_variable = ({.+})', script)[0])
    details = payload['product']

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', details['url'])
    loader.add_value('name', details['name'])
    loader.add_value('price', details['unit_price'])
    loader.add_value('identifier', details['sku_code'])
    loader.add_value('sku', details['id'])
    loader.add_value('stock', int(details['stock']))
    loader.add_value('category', payload['page']['breadcrumb'][1:-1])
    image_href = hxs.select(
        '//a[@id="ctl00_con1_ctl00_prodimg1_imglnk1"]/@href').extract()[0]
    loader.add_value('image_url', urljoin(base_url, image_href))

    product = loader.load_item()
    if product['price'] < 30:
        product['shipping_cost'] = 3.50
    yield product

    for option_url in hxs.select('//option/@value').extract():
        yield Request(option_url, callback=self.parse_product)
def parse_doors(self, response):
    """Yield one item per ``div[@itemprop="offers"]`` block on the page.

    Identifiers come from the ``ecomm_prodid`` array embedded in a script
    and are matched to the offer blocks by position.  An offer without its
    own link falls back to the page's canonical URL.  Offers priced below
    750 get a shipping cost of 36.

    Raises:
        IndexError: if the ``ecomm_prodid`` array is missing or shorter
            than the number of offers.
    """
    import ast  # local import: only needed to decode the id array

    url = response.xpath('//link[@rel="canonical"]/@href').extract()
    category = response.xpath(
        '//p[@class="breadcrumbs"]/a[position()>1]/text()').extract()
    ids = response.xpath('//script/text()').re(r'ecomm_prodid.*(\[.+\])')
    # SECURITY: this text comes from the remote page.  It was previously
    # passed to eval(); literal_eval accepts only Python literals, so the
    # scraped site cannot execute code in the crawler.
    ids = ast.literal_eval(ids[0])

    for i, product in enumerate(response.xpath('//div[@itemprop="offers"]')):
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/h3[@itemprop="name"]/a/text()[1]')
        loader.add_value('identifier', ids[i])
        loader.add_value('sku', ids[i])
        loader.add_xpath('price', './/span[@itemprop="price"]/text()')
        local_url = product.xpath('.//h3[@itemprop="name"]/a/@href').extract()
        if local_url:
            local_url = response.urljoin(local_url[0])
        else:
            local_url = url
        loader.add_value('url', local_url)
        image_url = product.xpath('.//a/img/@src').extract()
        # An offer without an image is still emitted, rather than aborting
        # the whole listing with an IndexError.
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url[0]))
        loader.add_value('category', category)
        if not product.xpath(
                'link[@itemprop="availability"][@href="http://schema.org/InStock"]'):
            loader.add_value('stock', 0)
        if loader.get_output_value('price') < 750:
            loader.add_value('shipping_cost', 36)
        yield loader.load_item()
def parse_product_base(self, response):
    """Build and return the product item for a product page.

    Returns ``None`` when the page has no product title.  The brand is the
    first entry of ``self.brands`` that occurs in the category or prefixes
    the name (case-insensitively).  The price defaults to ``0.00`` when the
    price element is missing, and stock is set to 0 when the availability
    text is present and is not "In Stock".
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_xpath = '//div[@id="image-block"]//img[@itemprop="image"]/@src'

    breadcrumb = response.css('div.breadcrumb a span::text').extract()
    category = breadcrumb.pop().strip() if breadcrumb else ''

    try:
        name = response.css(
            'div.primary_block h1::text').extract_first().strip()
    except AttributeError:
        # extract_first() returned None: there is no title to scrape.
        return

    product_brand = ''
    for brand in self.brands:
        lowered = brand.lower()
        if lowered in category.lower() or name.lower().startswith(lowered):
            product_brand = brand
            break

    product_url = urljoin_rfc(base_url, response.url)
    product_id = re.search(r'var id_product\D+(\d+)', response.body)

    price = response.xpath('//span[@id="our_price_display"]//text()').extract()
    price = price.pop() if price else '0.00'

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', product_url)
    loader.add_value('name', name)
    loader.add_value('brand', product_brand)
    # The last matching image is used; a page without one no longer
    # raises IndexError and simply gets no image_url.
    images = hxs.select(image_xpath).extract()
    if images:
        loader.add_value('image_url', urljoin_rfc(base_url, images[-1]))
    loader.add_value('price', price.replace(' ', '').replace(',', '.'))
    loader.add_value('category', category)
    loader.add_xpath('sku', '//p[@id="product_reference"]/span/text()')
    if product_id:
        loader.add_value('identifier', product_id.group(1))
    else:
        loader.add_xpath(
            'identifier', '//form//input[@name="id_product"]/@value')
    stock = response.xpath(
        '//span[@id="availability_value"]/text()').extract_first()
    if stock and stock.title() != 'In Stock':
        loader.add_value('stock', 0)
    return loader.load_item()
def parse_product(self, response):
    """Yield the product on the page if its loaded price is above zero.

    The price is taken from the non-blank text nodes of the
    ``regular-price`` span, skipping the first one.  The first gallery
    image URL shorter than 1024 characters is used as the image.
    """
    hxs = HtmlXPathSelector(response)
    price_texts = hxs.select(
        "//span[@class='regular-price']//text()").extract()
    price_parts = [text for text in price_texts if text.strip()][1:]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('price', price_parts)
    loader.add_value('url', response.url)
    loader.add_xpath('name', "//div[@class='product-name']//h1//text()")
    loader.add_xpath(
        'category',
        "//div[@class='breadcrumbs']//li[position() > 1 and position() < last()]/a/text()")
    brand_texts = hxs.select(
        "//div[@class='product-shop']/div[@class='product-name']/a[@class='brand']/text()"
    ).extract()
    loader.add_value('brand', brand_texts)
    loader.add_value('shipping_cost', 0)
    loader.add_xpath('sku', '//li/span[text()="SKU:"]/../text()')
    loader.add_xpath(
        'identifier',
        "//div[@class='product-view']//input[@name='product']/@value")

    gallery = hxs.select(
        '//img[contains(@class, "gallery-image")]/@src').extract()
    for candidate in gallery:
        if len(candidate) < 1024:
            loader.add_value('image_url', candidate)
            break

    item = loader.load_item()
    if item['price'] > 0:
        yield item
def parse(self, response):
    """Download the Bearmach feed over SFTP and yield one item per CSV row.

    ``bearmach_feed.csv`` is fetched from the clients SFTP host into
    ``bearmach_products.csv`` under ``HERE``.  Each row yields a product
    with a ``BearmachMeta`` metadata object carrying cost price and
    supplier details.  The ``response`` argument is not used.
    """
    file_path = os.path.join(HERE, 'bearmach_products.csv')

    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    try:
        password = "******"
        username = "******"
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        sftp.get('bearmach_feed.csv', file_path)
    finally:
        # Always release the SSH session (and the SFTP channel on it),
        # even when the connection or the download fails.
        transport.close()

    with open(file_path) as f:
        reader = csv.DictReader(f)
        # Header cells may carry surrounding whitespace.
        reader.fieldnames = [field.strip() for field in reader.fieldnames]
        for row in reader:
            part_number = row['Bearmach Part Number'].decode('latin-1')
            loader = ProductLoader(Product(), response=None)
            loader.add_value('identifier', part_number)
            loader.add_value('sku', part_number)
            loader.add_value('name', row['Description'].decode('latin-1'))
            loader.add_value('brand', row['Brand'].decode('latin-1'))
            loader.add_value('price', row['Retail'].decode('latin-1'))
            loader.add_value('category', row['Product Group'])
            item = loader.load_item()

            metadata = BearmachMeta()
            metadata['cost_price'] = str(
                extract_price(row['Cost'].decode('latin-1')))
            metadata['supplier_code'] = row['Supplier Code'].strip()
            metadata['supplier_name'] = row['Supplier Name'].strip()
            item['metadata'] = metadata
            yield item
def parse_product(self, response):
    """Yield a product with the promotion passed in through request meta.

    The price is taken from the ``box-price`` paragraph, falling back to
    the ``itemprop="price"`` span.  The brand comes from the brand image's
    alt text, falling back to the ``h1`` inside the name block, and is only
    stored when its first value is not purely digits.
    """
    loader = ProductLoader(item=Product(), response=response)

    name_parts = response.xpath('//div[@itemprop="name"]/*//text()').extract()
    loader.add_value('name', ' '.join(name_parts))
    loader.add_value('url', response.url)

    image_src = response.xpath('//img[@class="left-image"]/@src').extract()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src[0]))

    price = response.xpath(
        '//div[@itemprop="offers"]/p[@class="box-price"]/b/text()'
    ).extract() or response.xpath(
        '//div[@itemprop="offers"]/span[@itemprop="price"]/text()'
    ).extract()
    loader.add_value('price', price)

    brand = response.xpath('//img[@class="brand"]/@alt').extract() or \
        response.xpath('//div[@itemprop="name"]/h1/text()').extract()
    if brand and not brand[0].isdigit():
        loader.add_value('brand', brand)

    product_id = response.xpath(
        '//input[@type="hidden" and @name="productIdAnalytics"]/@value'
    ).extract()
    loader.add_value('sku', product_id)
    loader.add_value('identifier', product_id)

    item = loader.load_item()
    promotion_meta = SpecSaversMeta()
    promotion_meta['promotion'] = response.meta['promotional_data']
    item['metadata'] = promotion_meta
    yield item
def parse_product(self, response):
    """Yield the product on the page, retrying when the identifier is absent.

    When ``span#thisstkcode`` is missing, the same request is re-issued
    (unfiltered) with an incremented ``retries`` counter in its meta, and
    no item is produced for this response.  Once the counter has passed 9
    a warning is logged and the item is loaded from whatever the page
    offers.
    """
    identifier = response.css('span#thisstkcode::text').extract_first()
    if not identifier:
        retries = response.meta.get('retries', 0)
        if retries > 9:
            self.logger.warning('No identifier found on %s' % response.url)
        else:
            self.logger.debug('Retry %s to get identifier' % response.url)
            meta = response.meta.copy()
            meta['retries'] = retries + 1
            # dont_filter must be a keyword argument: a positional string
            # is forwarded to the Request constructor as its URL and
            # clashes with the ``url`` keyword that replace() supplies.
            yield response.request.replace(dont_filter=True, meta=meta)
            # The retry will produce the item; do not also emit one that
            # has no identifier.
            return

    loader = ProductLoader(Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')
    price = response.css('span.prodPrice').xpath(
        './/span[@itemprop="price"]/text()').extract_first()
    loader.add_value('price', price)
    category = response.css('.breadcrumbs span::text').extract()[1:]
    loader.add_value('category', category)
    loader.add_css('image_url', '.main-product-photo::attr(href)')
    loader.add_css('brand', 'span#thisbrand::text')
    loader.add_css('stock', 'input#data-stock-qty::attr(value)')
    yield loader.load_item()
def parse_node(self, response, node):
    """Yield one product for a feed ``node``.

    The name is the node's ``parent_title`` (or ``title`` when that is
    absent) followed by material, colour and size when present.  Stock is
    set to 0 only when availability reads exactly ``out of stock``.
    """
    def local_text(tag):
        # Feed elements are matched by local name.
        return node.xpath('./*[local-name()="%s"]/text()' % tag).extract()

    loader = ProductLoader(item=Product(), selector=node)
    size = local_text('size')
    color = local_text('color')
    material = local_text('material')
    titles = local_text('parent_title')
    if not titles:
        titles = node.xpath('./title/text()').extract()

    full_name = titles[0]
    for extra in (material, color, size):
        if extra:
            full_name += u' {}'.format(extra[0])
    loader.add_value('name', full_name)

    loader.add_xpath('url', './link/text()')
    loader.add_xpath('image_url', './*[local-name()="image_link"]/text()')
    loader.add_xpath('identifier', './*[local-name()="id"]/text()')
    loader.add_xpath('price', './*[local-name()="price"]/text()')
    loader.add_xpath(
        'shipping_cost',
        './*[local-name()="shipping"]/*[local-name()="price"]/text()')
    loader.add_xpath('brand', './*[local-name()="brand"]/text()')
    loader.add_xpath(
        'category', './*[local-name()="google_product_category"]/text()')
    loader.add_xpath('sku', './*[local-name()="mpn"]/text()')

    availability = local_text('availability')
    if availability and availability[0] == 'out of stock':
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product on the page with a heuristically chosen sku.

    The sku defaults to the product identifier and is replaced, in order
    of preference, by: the last word of the name when it equals the
    shortest keyword; the shortest keyword when it is all upper case; the
    last word of the name when it is all upper case; or the longest
    digit-containing token of the name.  A candidate containing ``(``
    is rejected in favour of the identifier.
    """
    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath(
        '//input[@name="product_id"]/@value').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    category = response.xpath(
        '//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
    loader.add_value('category', category)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    loader.add_xpath(
        'brand',
        '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content')
    if not response.xpath(
            '//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
        loader.add_value('stock', 0)

    sku = identifier
    name = loader.get_output_value('name')
    name_end = re.search(r'\S+$', name).group(0).strip(' ()')
    # A page without a keywords meta tag is treated as having no keywords,
    # so the 'none' fallback below applies instead of an AttributeError.
    raw_keywords = response.xpath(
        '//meta[@name="keywords"]/@content').extract_first() or ''
    keywords = [word.strip() for word in raw_keywords.split(',') if word]
    # The lower-case 'none' placeholder can never pass the upper-case test.
    shortest_keyword = min(keywords, key=len) if keywords else 'none'
    from_name = re.findall(r'\S*\d+\S*', name)

    if shortest_keyword.lower() == name_end.lower():
        sku = name_end
    elif shortest_keyword.upper() == shortest_keyword:
        sku = shortest_keyword
    elif name_end.upper() == name_end:
        sku = name_end
    elif from_name:
        sku = max(from_name, key=len)
    if '(' in sku:
        sku = identifier
    loader.replace_value('sku', sku)
    yield loader.load_item()
def parse_product(self, response):
    """Yield a contact-lens product with its promotion text as metadata.

    The identifier and sku are the ``ecomm_prodid`` number found in the
    raw page body.  The promotion is the space-joined, non-empty text of
    the product's splash caption, or an empty string when there is none.
    """
    base_url = get_base_url(response)  # NOTE(review): never used below

    title_parts = response.xpath(
        '//h1[@class="product-view__title"]/span/text()').extract()
    full_name = ' '.join(part.strip() for part in title_parts)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', full_name)
    loader.add_xpath(
        'price',
        '//div[contains(@class, "product-view__total-price")]/@data-price')
    # NOTE(review): the image URL is read from the @alt attribute.
    image_ref = response.xpath('//img[@itemprop="image"]/@alt').extract()
    if image_ref:
        loader.add_value('image_url', 'http:' + image_ref[0])
    loader.add_xpath(
        'brand',
        '//div[@class="product-view__brand brand"]/img[@class="brand__image"]/@alt')
    loader.add_value('category', 'Kontaktlinser')
    loader.add_value('url', response.url)
    product_id = re.findall(r'"ecomm_prodid":"(\d+)","', response.body)[0]
    loader.add_value('identifier', product_id)
    loader.add_value('sku', product_id)

    promo_meta = SpecSaversMeta()
    splash_texts = response.xpath(
        '//section[contains(@class, "product-view--product-page")]//figcaption[@class="splash__inner"]//text()'
    ).extract()
    if splash_texts:
        stripped = [text.strip() for text in splash_texts]
        promo_text = ' '.join(text for text in stripped if text != '')
    else:
        promo_text = ''
    promo_meta['promotion'] = promo_text

    item = loader.load_item()
    item['metadata'] = promo_meta
    yield item
def parse_product(self, response):
    """Yield a product built from the page and its embedded products JSON.

    Brand, category and price come from the first entry of the
    ``"products"`` array found in the raw page body; the name is assembled
    from the title, lens type and supply elements.  Nothing is yielded
    when the page carries no such array.
    """
    found = re.findall('"products":(.*)}}}', response.body)
    if not found:
        # Without the JSON there is no brand, category or price to read;
        # leave quietly instead of failing on an empty list below.
        return
    product = json.loads(found[0])[0]

    loader = ProductLoader(item=Product(), response=response)
    name = response.xpath(
        '//div[contains(@class,"field-name-title")]/h1/text()').extract()
    name += response.xpath(
        '//div[contains(@class,"field-name-field-cl-lens-type")]/div/span/text()'
    ).extract()
    name += response.xpath(
        '//div[contains(@class,"form-item-cl-supply")]/text()').extract()
    loader.add_value('name', u' '.join([x.strip() for x in name]))
    # The last URL path segment is the identifier.
    loader.add_value('identifier', response.url.split('/')[-1])
    loader.add_value('url', response.url)
    loader.add_value('brand', product['brand'])
    loader.add_value('category', product['category'])
    image_url = response.xpath(
        '//img[contains(@class, "img-responsive")]/@src').extract()
    if image_url:
        loader.add_value('image_url', image_url)
    loader.add_value('price', product['price'])
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product if it matches the searched sku and allowed brands.

    ``response.meta`` supplies ``brand``, ``brands`` and ``sku``.  The page
    is ignored unless its part number equals the searched sku (after
    stripping and upper-casing) and its brand cell is in ``brands``.
    Items priced below 50 get a shipping cost of 5.
    """
    meta = response.meta
    wanted_brand = meta['brand']
    allowed_brands = meta['brands']
    loader = ProductLoader(Product(), response=response)
    wanted_sku = meta['sku']

    page_sku = response.css('.part-number strong::text').extract_first()
    if not (page_sku and page_sku.strip().upper() == wanted_sku):
        return

    page_brand = response.xpath(
        '//tr[th[contains(text(), "Brand")]]/td[contains(@class, "data")]/text()'
    ).extract()[0]
    if page_brand.upper().strip() not in allowed_brands:
        return

    loader.add_value('identifier', page_sku)
    loader.add_value('url', response.url)
    loader.add_css('name', '.product-name .h1::text')
    loader.add_xpath(
        'price', '//span[contains(@id, "price-excluding-tax")]/text()')
    loader.add_value('sku', page_sku)
    breadcrumb = response.css('.breadcrumbs a::text').extract()
    loader.add_value('category', breadcrumb[1:])
    loader.add_css('image_url', 'img#image-main::attr(src)')
    loader.add_value('brand', wanted_brand)
    if response.css('.availability .out-of-stock'):
        loader.add_value('stock', 0)

    product = loader.load_item()
    if product['price'] < 50:
        product['shipping_cost'] = 5
    yield product
def parse_product(self, response):
    """Yield the product, or a form request that continues to shipping.

    The stock quantity is read from the ``product_avail`` assignment in the
    quantity script.  Products with a quantity of ``'0'`` are yielded
    directly; for all others the ``orderform`` is submitted, carrying the
    item in the request meta, with ``self.parse_shipping`` as callback.
    """
    loader = ProductLoader(Product(), response=response)
    loader.add_xpath('identifier', '//input[@name="productid"]/@value')
    loader.add_value('url', response.url)
    loader.add_css('name', '.descr::text')
    loader.add_css('price', 'span.currency::text')
    loader.add_value('sku', response.meta['sku'])
    thumbnail = response.css(
        'img#product_thumbnail::attr(src)').extract_first()
    if thumbnail:
        loader.add_value('image_url', response.urljoin(thumbnail))
    loader.add_value('brand', response.meta['brand'])
    available = response.css('.quantity script::text').re(
        r'product_avail = (\d+);')[0]
    loader.add_value('stock', available)
    product = loader.load_item()

    if available == '0':
        yield product
        return

    # The item's identifier is used as the cookiejar key for the request.
    shipping_meta = {
        'cookiejar': product['identifier'],
        'item': Product(product),
    }
    yield FormRequest.from_response(
        response,
        formname='orderform',
        meta=shipping_meta,
        cookies=self.cookies,
        callback=self.parse_shipping,
        dont_filter=True)
def parse_product(self, response):
    """Yield the product on the page unless its identifier was seen before.

    The identifier is the extracted sku list, or the URL's file name
    without extension when the page shows no sku.  Seen identifiers are
    recorded in ``self.seen_ids``.  Stock is set to 0 when no price could
    be loaded.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)

    sku = hxs.select('//p/span[@itemprop="sku"]/text()').extract()
    # NOTE: when a sku is present the identifier stays the extracted list,
    # which is also the form stored in ``self.seen_ids``.
    identifier = sku
    if not sku:
        identifier = response.url.split('/')[-1].split('.')[0]
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    if identifier in self.seen_ids:
        return
    self.seen_ids.append(identifier)

    name = hxs.select(
        '//h1[@class="first"]/span[@itemprop="name"]/text()'
    ).extract()[0].strip()
    try:
        loader.add_value('name', name)
    except Exception:
        # Fallback kept from the original code; narrowed from a bare
        # ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
        loader.add_value('name', name.decode('utf-8', 'replace'))

    category = hxs.select('//ol[@class="breadcrumb"]//a/text()').extract()
    # Drop the first crumb, then keep at most the last three.
    loader.add_value('category', ' > '.join(category[1:][-3:]))
    image_url = hxs.select('//a[@class="lightbox"]/img/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_value('url', response.url)
    price = hxs.select(
        '//span[@class="price-big orange"]/text()').extract()[0]
    loader.add_value('price', price)
    if not loader.get_output_value('price'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product described by the page's markup and scripts.

    Identifier, sku and price are read from ``'productID'`` and
    ``'productValue'`` entries in the page scripts.  The image is the main
    product image or, failing that, the second thumbnail with ``XSmall``
    replaced by ``Large``; a page with neither gets no image.  Stock is set
    to 0 when the ``BasketTickOn`` marker is absent.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//div[@class="detailstitle"]/text()')
    loader.add_xpath('identifier', '//script/text()',
                     re=r"'productID':'(\w+?)'")
    loader.add_xpath('sku', '//script/text()', re=r"'productID':'(\w+?)'")
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//script/text()',
                     re=r"'productValue':'([\d\.]+?)'")
    loader.add_xpath('category',
                     '//div[@class="breadcrumb"]/a[position()>1]/text()')

    image_url = response.xpath(
        '//div[@class="mainProductImage"]//img/@src').extract()
    if not image_url:
        thumbnails = response.xpath(
            '(//div[@class="thumbnail"])[2]//input[@type="image"]/@src'
        ).extract()
        # Only rewrite the thumbnail when one exists; indexing an empty
        # result used to raise IndexError before the guard below ran.
        if thumbnails:
            image_url = [thumbnails[0].replace('XSmall', 'Large')]
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))

    loader.add_xpath(
        'brand',
        '(//td[contains(h5/text(), "Brand")])[1]/following-sibling::td[1]/span/text()')
    if not response.xpath(
            '//div[@id="availDelTick"]//a[@class="BasketTickOn"]'):
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_products(self, response):
    """Follow product links from a listing page, or emit a placeholder item.

    Each product link found on the page is requested with
    ``self.parse_product`` as callback, forwarding ``response.meta`` and
    attaching ``self.cookies`` under the ``Set-Cookie`` request header.
    When the page lists no products at all, one item with price 0 and
    stock 0 is built from the ``row`` mapping stored in ``response.meta``.
    """
    product_links = response.xpath(
        '//div[@class="productListItem"]/div[@class="productListLink"]/a/@href'
    ).extract()

    if not product_links:
        # Nothing listed: fall back to the data carried in the request meta.
        row = response.meta['row']
        sku = row['SKU VID'].decode('unicode_escape')
        name = ' '.join((
            row['Product Description'],
            row['Size Description'],
            row['HERO NAME'],
            row['HERO NUMBER'],
        ))

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', '')
        loader.add_value('name', name)
        loader.add_value('image_url', '')
        loader.add_value('category', '')
        loader.add_value('brand', row['Merret Department'])
        loader.add_value('price', 0)
        loader.add_value('stock', 0)
        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        yield loader.load_item()
        return

    for link in product_links:
        request = Request(
            response.urljoin(link),
            dont_filter=True,
            callback=self.parse_product,
            meta=response.meta,
        )
        request.headers['Set-Cookie'] = self.cookies
        yield request
def parse_product(self, response):
    """Build a product item and yield it once per identifier.

    The price is assembled from every text node of the
    "price-including-tax" span, with commas turned into dots and
    non-breaking spaces removed.  The item is always given ``stock`` 1.
    It is only yielded when its identifier is not yet in
    ``self.identifiers_collected``; the identifier is recorded on yield.

    Raises:
        IndexError: if the product id input or the ``<h1>`` is missing.
    """
    hxs = HtmlXPathSelector(response)

    # The first breadcrumb entry is skipped.
    breadcrumb = response.xpath(
        '//div[@class="breadcrumbs"]//a/span/text()').extract()[1:]
    identifier = hxs.select('//input[@name="product"]/@value').extract()[0]
    images = hxs.select(
        '//div[@class="product-img-box"]/a[@id="main-image"]/img/@src'
    ).extract()
    title = hxs.select('//h1/text()').extract()[0]
    name = normalize_name(title)

    price_fragments = hxs.select(
        '//div[@class="product-view"]//div[@class="price-box"]//span[contains(@id, "price-including-tax-")]//text()'
    ).extract()
    price = "".join(price_fragments)
    price = price.replace(',', '.').replace(u'\xa0', "").strip()

    sku = hxs.select('//*[@itemprop="sku"]/text()').extract()

    loader = ProductLoader(item=Product(), selector=hxs)
    field_values = (
        ('identifier', identifier),
        ('name', name),
        ('price', price),
        ('sku', sku),
        ('url', response.url),
    )
    for field, value in field_values:
        loader.add_value(field, value)
    if breadcrumb:
        loader.add_value('category', breadcrumb[0])
    if images:
        loader.add_value('image_url', images[0])
    loader.add_value('stock', 1)

    item = loader.load_item()
    if item['identifier'] in self.identifiers_collected:
        return
    self.identifiers_collected.add(item['identifier'])
    yield item
def parse_product(self, response):
    """Build a product item from the ``prod-box`` section of a page.

    Brand and categories come from the breadcrumb list; crumbs containing
    "Brand" are left out of the category.  A page without an image in
    ``section#one`` is not treated as a product: the same URL is requested
    again with ``self.parse_category`` as callback and nothing else is
    yielded.  ``shipping_cost`` is 2 for prices below 20, 4.99 for prices
    below 40, and is left unset otherwise.

    Raises:
        IndexError: if the page has no ``ul.breadcrumbs`` element.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product_box = hxs.select('//div[@class="prod-box"]')
    breadcrumbs = hxs.select('//ul[@class="breadcrumbs"]')[0]

    loader = ProductLoader(selector=product_box, item=Product())
    loader.add_value('url', response.url)

    # The brand is the crumb that follows the "Brands" crumb.
    brand = breadcrumbs.select(
        './/a[contains(text(), "Brands")]/../following-sibling::li[1]/a/text()'
    ).extract()
    loader.add_value('brand', brand)

    crumb_texts = breadcrumbs.select('.//a/text()').extract()
    loader.add_value(
        'category',
        [text for text in crumb_texts if "Brand" not in text],
    )

    images = hxs.select('//section[@id="one"]//@src').extract()
    if not images:
        yield Request(response.url,
                      callback=self.parse_category,
                      dont_filter=True)
        return
    loader.add_value('image_url', urljoin(base_url, images[0]))

    loader.add_xpath('name', './h1/text()')
    loader.add_xpath('identifier', '//*/@prodref')
    loader.add_xpath('sku', '//*/@prodref')

    in_stock = product_box.select(
        '//*[text()="In Stock" or text()="Low Stock"]')
    if not in_stock:
        loader.add_value('stock', 0)

    loader.add_xpath('price', './/span[@class="product-price"]/text()')
    product = loader.load_item()

    # First matching tier wins; no tier matches for prices of 40 and up.
    shipping_tiers = ((20, 2), (40, 4.99))
    for upper_bound, cost in shipping_tiers:
        if product['price'] < upper_bound:
            product['shipping_cost'] = cost
            break

    yield product
def parse(self, response):
    """Yield one product per ``<item>`` element of a product feed.

    The ``g`` prefix is registered for the
    ``http://base.google.com/ns/1.0`` namespace before querying.  The
    category is the ``g:product_type`` text split on ``>`` with its first
    segment dropped.  The price defaults to 0 when the feed entry has
    none, and ``stock`` is set to 0 only for entries whose availability
    text equals ``out of stock``.

    Raises:
        IndexError: if an entry lacks a ``link`` or ``g:availability``.
    """
    response.selector.register_namespace("g", "http://base.google.com/ns/1.0")

    for entry in response.xpath('//item'):
        image_url = ''
        images = entry.xpath('g:image_link/text()').extract()
        if images:
            image_url = images[0]

        category = ''
        product_types = entry.xpath('g:product_type/text()').extract()
        if product_types:
            category = product_types[0].split('>')[1:]

        brand = entry.xpath('g:brand/text()').extract()
        identifier = entry.xpath('g:id/text()').extract()

        name = entry.xpath('title/text()').extract_first()
        if name:
            name = name.replace('...', '').strip()

        price = 0
        prices = entry.xpath('g:price/text()').extract()
        if prices:
            price = extract_price(prices[0])

        url = entry.xpath('link/text()').extract()[0]
        availability = entry.xpath('g:availability/text()').extract()[0]

        loader = ProductLoader(item=Product(), response=response)
        field_values = (
            ('identifier', identifier),
            ('sku', identifier),
            ('name', name),
            ('image_url', image_url),
            ('price', price),
            ('url', url),
            ('brand', brand),
            ('category', category),
        )
        for field, value in field_values:
            loader.add_value(field, value)
        if availability == 'out of stock':
            loader.add_value('stock', 0)

        yield loader.load_item()
def parse_product(self, response):
    """Request every variant page, then yield the item for this page.

    The variant data is a JSON object embedded, with escaped quotes, in
    inline script text.  Each entry of its ``Variants`` list is requested
    with this same callback.  Selected variant values (slots 1 to 3) that
    are neither empty nor ``N/A`` are added to the name and recorded in
    the item's ``metadata`` under their header.  A value containing "size"
    is also stored under ``size`` with its first five characters removed.

    Raises:
        IndexError: if the variant JSON, the price or the availability
            flag cannot be found on the page.
    """
    raw_json = response.xpath('//script/text()').re('{\\\\"Variants.+}')[0]
    data = json.loads(raw_json.replace('\\"', '"'))

    for variant in data['Variants']:
        variant_url = response.urljoin(variant['ProductPLU'])
        yield Request(make_variant_url(variant_url), self.parse_product)

    loader = ProductLoader(item=Product(), response=response)
    plu = response.xpath('//input[@id="ProductPLU"]/@value').extract_first()
    loader.add_value('identifier', plu)
    loader.add_value('sku', plu)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')

    metadata = {}
    for slot in (1, 2, 3):
        selected = data['Variant%dSelected' % slot]
        if not selected or selected == 'N/A':
            continue
        loader.add_value('name', selected)
        metadata[data['Variant%dHeader' % slot]] = selected
        if 'size' in selected.lower():
            metadata['size'] = selected[5:].strip()

    # The last matching text node is taken as the price.
    price_texts = response.css('.price-value .currency::text').extract()
    loader.add_value('price', price_texts.pop())

    crumbs = response.css('.breadcrumb a::text').extract()
    loader.add_value('category', crumbs[1:])
    loader.add_css('image_url', '.product-image::attr(src)')
    loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    loader.add_value('shipping_cost', '7.95')

    available_online = response.css(
        '.product-stock-widget::attr(ng-init)').re('AvailableOnline: (\w+)')[0]
    if available_online != 'true':
        loader.add_value('stock', 0)

    item = loader.load_item()
    item['metadata'] = metadata
    yield item