def parse(self, response):
    """Turn every row of the CSV feed in ``response.body`` into a product.

    The sale price is preferred; the ``Price`` column is consulted only
    when ``extract_price`` gives a falsy result for ``SalePrice``.  A copy
    of the raw CSV row is attached to each item as its ``metadata``.
    """
    name_columns = ('PaperType', 'LaminationType', 'PrintType', 'PaperSize',
                    'FoldingType')

    for record in csv.DictReader(StringIO(response.body)):
        price = (extract_price(record['SalePrice'])
                 or extract_price(record['Price']))
        name = ' - '.join([record[column] for column in name_columns])
        identifier = record['ProductID']

        # The literal string 'NULL' in ProdRange means there is no range to
        # append to the product type.
        if record['ProdRange'] == 'NULL':
            category = record['ProductType']
        else:
            category = record['ProductType'] + ' - ' + record['ProdRange']

        loader = ProductLoader(item=Product(), response=response)
        field_values = (
            ('name', name),
            ('identifier', identifier),
            ('sku', identifier),
            ('price', price),
            ('category', category),
        )
        for field, value in field_values:
            loader.add_value(field, value)

        item = loader.load_item()
        item['metadata'] = record.copy()
        yield item
def parse_product(self, response):
    """Build a single product item from a product page.

    The special (discounted) price is used when present, otherwise the
    first generic price node; the price falls back to 0 when neither
    exists.  The category comes from ``response.meta['category']`` and a
    fixed shipping cost of 25 is always set.  Stock is set to 0 only when
    the sold-out cart button is present.

    The original also read the second breadcrumb entry into a local that
    was never used; that lookup could raise IndexError on pages with a
    short breadcrumb trail, so it has been removed together with the
    unused ``base_url``.
    """
    hxs = HtmlXPathSelector(response)

    loader = ProductLoader(response=response, item=Product())
    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_xpath('sku', '//h3[@class="sku-number"]/text()')
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

    # Prefer the special price; fall back to any price node on the page.
    price_texts = (
        hxs.select(
            '//p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        or hxs.select('//span[@class="price"]/text()').extract()
    )
    price = extract_price(price_texts[0]) if price_texts else 0

    loader.add_value('price', price)
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@id="main-image-image"]/@src')
    loader.add_xpath('brand', '//a[@itemprop="brand"]/text()')
    loader.add_value('category', response.meta['category'])
    loader.add_value('shipping_cost', 25)

    if hxs.select('//button[@class="btn-cart soldout"]'):
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_dealers(self, response):
    """Complete the item from ``response.meta`` with the Best Buy offer.

    Looks through the "new" condition offers on the page.  For the first
    offer whose seller name is BEST BUY (case-insensitive) the stock,
    price and shipping cost of the item are updated, and the search stops.
    If there are no offers and ``response.meta['one_seller']`` is truthy,
    an error is logged and nothing is yielded.  Otherwise the item is
    yielded (and its identifier recorded in ``self.new_ids``) whenever it
    has a truthy identifier.
    """
    item = response.meta['item']

    try:
        offers = HtmlXPathSelector(response).select(
            '//div[@class="product-list" and @data-condition="new"]')
    except Exception:
        offers = []

    if not offers and response.meta['one_seller']:
        log.msg('ERROR >>> ONE SELLER: ' + item['url'])
        return

    for offer in offers:
        seller = ''.join(
            offer.select('.//div[@class="seller-name"]/span/text()').extract()
        ).strip()
        if seller.upper() != 'BEST BUY':
            continue

        log.msg('INFO >>> COLLECT BEST BUY ITEM: ' + item['url'])

        if offer.select('.//div[@class="cart-button" and @data-button-state-id="SOLD_OUT_ONLINE"]'):
            item['stock'] = 0

        price_texts = offer.select(
            './/div[@class="medium-item-price"]//text()').extract()
        if not price_texts:
            # No visible price: use the data-price attribute of the offer.
            log.msg('ADD TO CART PRICE >>> ' + item['url'])
            price_texts = offer.select('@data-price').extract()
        item['price'] = extract_price(price_texts[-1])

        shipping_texts = offer.select(
            './/div[@class="shipping-cost-puck"]//text()').extract()
        if shipping_texts:
            item['shipping_cost'] = extract_price(shipping_texts[0])

        # Only the first Best Buy offer is considered.
        break

    if item['identifier']:
        self.new_ids.append(item['identifier'])
        yield item
def parse_product(self, response):
    """Finish the item from ``response.meta`` and yield it or its options.

    The category is rebuilt from the breadcrumb links, and the name is
    replaced by the breadcrumb product label when one is present.  The
    page price is used only if the item has no truthy price yet.

    Without selectable options the item itself is yielded, after taking
    the SKU from the ``sku-code`` input and the identifier from the
    add-to-cart form action when available.  With options, one deep copy
    of the item is yielded per option, carrying the option's name suffix,
    SKU, identifier and price.
    """
    hxs = HtmlXPathSelector(response)

    category_loader = ProductLoader(item=Product(), response=response)
    for crumb in hxs.select(
            '//div[@class="breadcrumbs"]/ul/li[contains(@class,"category")]/a/text()'
    ).extract():
        category_loader.add_value('category', crumb)
    loaded = category_loader.load_item()

    product = response.meta['item']
    product['category'] = loaded['category']
    base_identifier = product['identifier']

    label = hxs.select(
        '//div[@class="breadcrumbs"]/ul/li[@class="product"]/strong/text()'
    ).extract()
    if label:
        product['name'] = label[0]
    base_name = product['name']

    page_price = hxs.select('//div[@class="price"]/span/text()').extract()
    if not product['price'] and page_price:
        product['price'] = extract_price(page_price[0])

    options = hxs.select(
        '//select[@id="simple-selection"]/option[not(@value="null")]')

    if not options:
        sku_code = hxs.select(
            '//div[@id="product-options"]//input[@id="sku-code"]/@value'
        ).extract()
        if sku_code:
            product['sku'] = sku_code[0]

        action = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').extract()
        if action and '/product/' in action[0]:
            # The identifier is the path segment right after "/product/".
            product['identifier'] = action[0].split(
                '/product/', 1)[1].split('/', 1)[0]

        yield product
        return

    for option in options:
        item = copy.deepcopy(product)

        option_label = option.select('text()').extract()
        if option_label:
            item['name'] = base_name + ' - ' + option_label[0]

        option_sku = option.select('@data-sku').extract()
        if option_sku:
            item['identifier'] = base_identifier + '-' + option_sku[0]
            item['sku'] = option_sku[0]

        # When the option has a value attribute it replaces the
        # SKU-based identifier set just above.
        option_value = option.select('@value').extract()
        if option_value:
            item['identifier'] = option_value[0]

        option_price = option.select('@data-simple-price').extract()
        if option_price:
            item['price'] = round(extract_price(option_price[0]), 2)

        yield item
def parse_product(self, response):
    """Yield the product on this page unless its SKU was already seen.

    The scraped price is multiplied by 1.2 (VAT) before being stored.  A
    shipping cost of '5.00' is added when the scraped (pre-VAT) price is
    below 50.  Brand and category come from ``response.meta``; the brand
    doubles as category when no category was passed.  SKUs are tracked in
    ``self.ids_seen`` so each one is emitted only once.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    brand = meta.get('brand', '')
    category = meta.get('category', '')

    # Index -1 mirrors list.pop(): the last match is used and an empty
    # result raises IndexError.
    sku = hxs.select('//input[@name="product_sku"]/@value').extract()[-1]
    name = hxs.select('//h1[@class="gf_1"]/text()').extract()
    net_price = extract_price(
        hxs.select('//span[@itemprop="price"]/text()').extract()[-1])
    # VAT
    gross_price = extract_price(str(float(net_price) * 1.2))
    images = hxs.select('//img[@id="main_image"]/@src').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    if images:
        loader.add_value('image_url', urljoin(response.url, images[-1]))
    if brand:
        loader.add_value('brand', brand)
    if category or brand:
        loader.add_value('category', category or brand)
    loader.add_value('name', name)
    loader.add_value('price', gross_price)
    if net_price < 50.00:
        loader.add_value('shipping_cost', '5.00')
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)

    if sku in self.ids_seen:
        return
    self.ids_seen.add(sku)
    yield loader.load_item()
def parse_shipping(self, response):
    """Read the shipping terms from the banner, then request the start URLs.

    The banner text must produce exactly two regex captures: the shipping
    cost and the order value under which it applies.  Both are stored on
    the spider as ``self.shipping_cost`` and ``self.free_shipping_over``.
    Any other number of captures makes the tuple unpacking raise
    ValueError.
    """
    banner = response.xpath(
        u'//span[contains(text(),"FREE ECONOMY SHIPPING FOR ORDERS OVER $65.00")]/text()')
    captures = banner.re(u'\((.*?) for orders less than (.*)\)')
    shipping_cost, free_shipping_over = captures

    self.shipping_cost = extract_price(shipping_cost)
    self.free_shipping_over = extract_price(free_shipping_over)

    for start_url in self.start_urls:
        yield scrapy.Request(start_url)
def parse_product(self, response):
    """Build a product item from a product page; price is "HT" price + tax.

    The name passed in ``response.meta`` wins over the one on the page.
    The price is taken from the first offer node whose text contains
    'HT', and the optional "weee" tax amount is added to it.

    The price now defaults to 0 when no such node exists or when it
    contains no digits.  Previously ``price`` was left unbound in that
    case and the ``price + tax`` expression raised NameError.

    Raises:
        IndexError: if the SKU paragraph or the breadcrumb links are
            missing from the page (unchanged from the original).
    """
    hxs = HtmlXPathSelector(response)

    sku = hxs.select('//p[@class="alignright"]/text()').extract()[0]
    sku = sku.replace('[', '').replace(']', '')
    category = hxs.select(
        '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()[-1]
    name = response.meta.get('name')

    price = 0
    price_box = hxs.select(
        "//div[@itemprop='offers']//*[contains(text(),'HT')]/text()").extract()
    if price_box:
        # Drop all whitespace, then capture the first run of digits and
        # separators.
        compact = ''.join(price_box[0].split())
        found = re.findall("[^0-9]*([0-9 .,]+).*", compact)
        if found:
            price = extract_price(found[0].strip())

    tax = hxs.select('//span[@class="weee"]/small/text()').extract()
    tax = extract_price(tax[0]) if tax else 0

    if not name:
        name = hxs.select(
            '//div[@class="product-name"]/h1[@itemprop="name"]/text()'
        ).extract()

    l = ProductLoader(item=Product(), response=response)
    l.add_xpath(
        'identifier',
        '//form[@id="product_addtocart_form"]//input[@name="product"]/@value')
    l.add_value('name', name)
    l.add_value('category', category)
    l.add_xpath('brand', '//div[@class="product-manufacturer"]/a/@title')
    l.add_value('sku', sku)
    l.add_value('url', response.url)
    l.add_value('price', price + tax)
    l.add_value('stock', 1)
    l.add_xpath('image_url', '//p[@class="product-image"]/a/img/@src')
    yield l.load_item()
def parse_product(self, response):
    """Yield priced variants (new options and used copies) of a product.

    ``response.meta['product']`` is the base item.  Without buying options
    the base item itself is yielded when its price is positive.  With
    options, one copy per option is yielded, named "<base name> <option
    label>" and keyed by the option's SKU.  Each used copy listed on the
    page is yielded as a further variant.  Variants whose price is not
    positive are skipped.

    If an option has no visible price and its ``styleInfo`` JSON lacks a
    ``price`` key, the whole callback stops at that point.
    """
    hxs = HtmlXPathSelector(response)
    options = hxs.select('//*[@id="buyingOptions"]/dl/dd/ul/li')
    log.msg('PARSE PRODUCT')
    base_product = response.meta['product']
    used_items = hxs.select('.//li[contains(@id,"usedItem")]')

    if not options and base_product['price'] > 0:
        yield base_product

    for option in options:
        log.msg('PARSE PRODUCT OPTIONS')
        variant = Product(base_product)

        name_parts = (
            base_product['name'],
            ''.join(option.select('div/strong/a/text()').extract()),
        )
        option_values = json.loads(
            option.select(".//var/text()").extract()[0])
        sku = option_values['sku']

        variant['name'] = ' '.join(name_parts)
        variant['sku'] = sku
        variant['identifier'] = sku

        price = ''.join(
            option.select('div/p/span[@class="priceVal"]/text()').extract())
        if not price:
            # No visible price: read it from the option's styleInfo JSON.
            try:
                style_info = json.loads(
                    option.select(
                        'var[contains(@class, "styleInfo")]/text()'
                    ).extract().pop())
                price = style_info['price']
            except KeyError:
                return

        variant['price'] = extract_price(str(price))
        if variant['price'] > 0:
            yield variant

    for used_item in used_items:
        variant = Product(base_product)

        name_parts = (
            base_product['name'],
            '(Used,',
            ''.join(
                used_item.select(
                    './/p[contains(@class,"usedCondition")]/text()'
                ).extract()),
            ')',
        )
        sku = used_item.select('.//fieldset/a/@href').re(
            'url_catalog_ref_id=(.*?)&')[0]

        variant['name'] = ' '.join(name_parts).replace('\n', ' ')
        variant['sku'] = sku
        variant['identifier'] = sku

        # The used price is split on the page: the whole part is the
        # paragraph text, the decimals sit in a <sup> element.
        whole_part = ''.join(
            used_item.select(
                './/p[contains(@class,"usedPrice")]/text()').extract()
        ).replace('\n', '')
        decimal_part = used_item.select(
            './/p[contains(@class,"usedPrice")]/sup[@class="decimalPrice"]/text()'
        )[0].extract()

        variant['price'] = extract_price(
            str('.'.join([whole_part, decimal_part])))
        if variant['price'] > 0:
            yield variant
def parse_product(self, response):
    """Yield the product if its brand and product code match the search.

    ``response.meta`` supplies the searched ``sku``, the accepted
    ``brands`` and optionally a known ``product_brand``.  Nothing is
    yielded when the page has no ``pid`` input, when no brand can be
    determined, or when the brand / product code do not match.

    Shipping cost is 4.99 above 1000 grams and 3.50 otherwise.  It is only
    set when the page states a weight.
    NOTE(review): the source formatting was lost; this assumes the
    shipping-cost choice belongs inside the ``if weight`` branch - confirm.

    Changes from the original: ``url`` is added to the loader once rather
    than twice, and a product name without the "<code> : " prefix is
    treated as a non-match instead of raising IndexError.
    """
    sku = response.meta['sku']
    brands = response.meta['brands']
    loader = ProductLoader(Product(), response=response)

    identifier = response.xpath('//input[@name="pid"]/@value').extract()
    if not identifier:
        return

    product_brand = response.meta.get('product_brand', None)
    if not product_brand:
        # Fall back to the brand printed in the raw page markup.
        found = re.findall('BRAND:</b> (.*)<br><b>W', response.body)
        if not found:
            self.log('>>> ERROR: No brand found: ' + response.url)
            return
        product_brand = found[0].strip()

    codes = response.xpath('//span[@itemprop="name"]/text()').re('(.*) : ')
    product_code = codes[0].strip() if codes else ''

    if (product_brand.upper() not in brands
            or product_code.upper() != sku.upper()):
        return

    identifier = identifier[0]
    name = response.xpath(
        '//span[@itemprop="name"]/text()').extract()[0].strip()
    price = response.xpath(
        '//font[@class="selling_price"]/b/text()').extract()[0]

    loader.add_value('identifier', identifier)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('sku', sku)

    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))
    loader.add_value('brand', product_brand)

    weight = response.xpath(
        '//p[b[contains(text(), "Weight or Volume")]]/span/text()'
    ).extract()
    if weight:
        weight = weight[0].upper()
        # Convert the weight to grams if it is given in KG.
        if 'KG' in weight:
            weight = extract_price(weight) * 1000
        else:
            weight = extract_price(weight)
        if weight > 1000:
            loader.add_value('shipping_cost', 4.99)
        else:
            loader.add_value('shipping_cost', 3.50)

    yield loader.load_item()
def parse_product(self, response):
    """Yield one item per combination of purchase option and add-ons.

    Each purchase form on the page defines a base variant (name suffix,
    base price, product id).  Every ``<select>`` in the form that is not
    labelled "Delivery" contributes a group of ``(price, name, value)``
    choices; an option text of the form "name (price" is split on '(' and
    anything else counts as a free add-on (price 0).  The groups are
    handed to ``multiply`` (defined elsewhere; presumably it yields one
    combined ``(price, name, id)`` per combination - TODO confirm) and one
    item is yielded per result.

    Changes from the original: the bare ``except:`` became
    ``except Exception``, ``import logging`` no longer runs inside the
    loop, and the local that shadowed the builtin ``id`` was renamed.
    """
    import logging

    hxs = HtmlXPathSelector(response)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1/text()')
    product_loader.add_value('category', response.meta.get('category'))

    img = hxs.select(u'//img[@id="gallery-image"]/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    product_loader.add_xpath(
        'brand', u'//div[@class="manufacturer-logo"]/a/img/@alt')
    product = product_loader.load_item()

    for opt in hxs.select(
            u'//div[contains(@class,"purchase-options")]/div/form'):
        prod = Product(product)
        prod['name'] = prod['name'] + ' ' + opt.select(
            u'.//span[@class="option"]/text()').extract()[0].strip()
        prod['price'] = extract_price(
            opt.select(
                u'.//input[contains(@id, "-base-sale-price")]/@value'
            ).extract()[0])
        product_id = opt.select(
            u'.//input[@name="product-id"]/@value').extract()[0]
        prod['sku'] = product_id
        prod['identifier'] = product_id

        opt_groups = []
        for select in opt.select(
                u'.//select/../../label[not(contains(text(),"Delivery"))]/../div/select'):
            opts = []
            for o in select.select(
                    u'./option[not(contains(text(), "None"))]'):
                option = ''.join(o.select('.//text()').extract())
                option_id = o.select('./@value').extract()[0]
                try:
                    # NOTE(review): logs every option text at ERROR
                    # level; looks like leftover debugging - confirm.
                    logging.error(option)
                    name, price = option.split('(')
                    price = extract_price(price)
                except Exception:
                    # No (or more than one) '(' in the text, or an
                    # unparsable price: treat it as a free add-on.
                    name, price = option, 0
                opts.append((price, name, option_id))
            opt_groups.append(opts)

        for opt_price, opt_name, opt_id in multiply(opt_groups):
            p = Product(prod)
            p['name'] = p['name'] + ' ' + opt_name
            p['price'] = p['price'] + Decimal(opt_price).quantize(
                Decimal('1.00'))
            if opt_id:
                p['identifier'] = p['identifier'] + ':' + opt_id
            else:
                p['identifier'] = p['identifier'] + '-'
            yield p
def parse_product(self, response):
    """Yield one item per row of the product's option table.

    A base item (url, name, brand, sku, category, image) is loaded once
    and copied for every table row that is not a "gwp" row.  Each copy
    gets its own identifier, price and stock flag.  When the row price is
    below 30, a shipping cost of 1.95 is set and added onto the price.
    ``metadata['price_exc_vat']`` is the final price divided by 1.2.

    Fixes relative to the original:
      * ``product['price']`` referenced an undefined name and would raise
        NameError if the base item ever had a price; it now reads
        ``item['price']``.
      * ``Product(item)`` copies fields shallowly, so every option shared
        one metadata object and each row overwrote the previous row's
        ``price_exc_vat``.  Each option now gets its own deep copy.
    """
    import copy

    hxs = HtmlXPathSelector(response)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'name',
        '//h1/span[@class="name-name" or @class="name-range" or @class="name-size"]/text()'
    )
    product_loader.add_xpath('brand', u'//h1/a[@class="name-brand"]/text()')
    product_loader.add_xpath('sku', '//meta[@name="esc-sku"]/@content')
    product_loader.add_xpath(
        'category',
        '//div[@class="breadcrumbs"]/ul/li[position() > 1 and position() < last()-1]//a/text()'
    )

    img = hxs.select(
        u'//div[@class="product-image-wrapper"]//img/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    item = product_loader.load_item()

    metadata = FragranceDirectMeta()
    metadata['promotion'] = normalize_space(' '.join(
        hxs.select('//div[@class="bubble-msg-container"]//text()').extract()))
    if item.get('price'):
        metadata['price_exc_vat'] = Decimal(item['price']) / Decimal('1.2')
    item['metadata'] = metadata

    for opt in hxs.select(
            '//table[@id="super-product-table"]//tbody/tr[not(contains(@class,"gwp"))]'):
        p = Product(item)
        # Give this option its own metadata so rows do not overwrite
        # each other's price_exc_vat.
        p['metadata'] = copy.deepcopy(metadata)

        name = normalize_space(''.join(
            opt.select('./td[2]/text()').extract()))
        self.log("NAE %s" % name)
        if name not in p['name']:
            p['name'] = normalize_space(p['name'] + ' ' + name)

        p['identifier'] = opt.select(
            'normalize-space(substring-after(.//div[@class="product-code"]/text(), "#"))'
        ).extract()[0]
        p['price'] = extract_price(''.join(
            opt.select(
                './/span[starts-with(@id,"product-price-")]//text()'
            ).extract()))

        if p['price'] < 30:
            p['shipping_cost'] = extract_price('1.95')
            p['price'] = p['price'] + p['shipping_cost']

        if p['price']:
            p['metadata']['price_exc_vat'] = (
                Decimal(p['price']) / Decimal('1.2'))

        stock_text = ''.join(
            opt.select('.//span[@class="stock-status-main"]/text()').extract())
        p['stock'] = 1 if 'in stock' in stock_text else 0
        yield p
def parse_product(self, response):
    """Yield the product on the page, then one item per JS variant.

    SKU and identifier come from ``response.meta['row']['PRODUCT_NUMBER']``.
    Variants are decoded (with ``demjson``) from the first line of the
    body that contains ``variants:``.  Each variant is a copy of the base
    product with its own name suffix, sku, identifier and price.
    ``metadata['price_exc_vat']`` is the price divided by 1.2.

    Fixes relative to the original:
      * ``Product(product)`` shares the metadata object with the base
        product that was already yielded, so writing a variant's
        ``price_exc_vat`` also rewrote the base product's value.  Each
        variant now gets its own deep copy.
      * The regex group is optional; when it did not match, slicing
        ``None`` raised TypeError.  Such pages are now handled as having
        no variants.
    """
    import copy

    hxs = HtmlXPathSelector(response)

    options = None
    js_line = ''
    for line in response.body.split('\n'):
        if 'variants:' in line:
            js_line = line
            break
    if js_line:
        variants_js = re.search(r'variants:(.*};)?', js_line).group(1)
        if variants_js:
            # Drop the last two captured characters before decoding,
            # as the original did.
            options = demjson.decode(variants_js[:-2].strip())

    product_loader = ProductLoader(item=Product(), selector=hxs)
    row = response.meta['row']
    sku = row['PRODUCT_NUMBER']
    product_loader.add_value('sku', sku)
    product_loader.add_value('identifier', sku)
    product_loader.add_value('url', response.url)

    name = hxs.select('//span[@itemprop="name"]/text()').extract()[0]
    product_loader.add_value('name', name)

    # Skip the first and last breadcrumb links.
    category = hxs.select('//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
    product_loader.add_value('category', category)

    img = hxs.select('//meta[@property="og:image"]/@content').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img.pop()))

    price = hxs.select('//p[@class="productOfferPrice"]/text()').extract()[0]
    price = extract_price(price)
    product_loader.add_value('price', price)

    brand = hxs.select('//*[@id="brandHeader"]/a/@href').extract()
    if brand:
        # The brand is taken from the link path: strip the '/en/'
        # prefix and the final character.
        brand = brand[0].replace('/en/', '')[:-1]
        product_loader.add_value('brand', brand)

    stock = ''.join(
        hxs.select('//div[@class="cvos-availbility-panel"]/p/text()').extract())
    if 'Item is currently out of stock online' in stock:
        product_loader.add_value('stock', 0)

    product = product_loader.load_item()

    metadata = BootsMeta()
    prom = ''.join(
        hxs.select('//div[@class="productSavings"]//text()').extract())
    metadata['promotion'] = prom + ' ' + ''.join(
        hxs.select('//div[@class="primaryItemDeal"]//p/text()').extract())
    if product['price']:
        metadata['price_exc_vat'] = Decimal(product['price']) / Decimal('1.2')
    product['metadata'] = metadata

    # Copy the metadata before the base product is handed to the
    # consumer, so variants start from an untouched snapshot.
    metadata_template = copy.deepcopy(metadata)
    yield product

    if options:
        for k, val in options.items():
            option_name = k.replace('_', ' ')
            option_product = Product(product)
            option_product['metadata'] = copy.deepcopy(metadata_template)
            option_product['name'] = product['name'] + ' ' + option_name
            option_product['sku'] = val['productCode']
            option_product['identifier'] = val['variantId']
            option_product['price'] = extract_price(val['nowPrice'])
            if option_product.get('price'):
                option_product['metadata']['price_exc_vat'] = (
                    Decimal(option_product['price']) / Decimal('1.2'))
            yield option_product
def parse_node(self, response, node):
    """Turn one feed node into a product, if its id is in ``id_code_map``.

    The name is the parent title (or the node title) followed by the
    material, colour and size when present.  If the description contains
    a "Pack Size m:" figure, the price is multiplied by it and the item
    is not yielded directly: a request for the product URL is yielded
    instead, with ``self.parse_pack_price`` as callback.  A cost price
    found in ``self.cost_prices`` is stored in the item's metadata.

    Changes from the original: the bare ``except:`` around the cost-price
    conversion now catches only the errors ``Decimal()`` raises for bad
    input; the id lookup uses ``.xpath`` like the other lookups on the
    same node instead of the deprecated ``.select``; the pack-size
    pattern is a raw string.

    Raises:
        IndexError: if the node has no id or no title at all (unchanged).
    """
    identifier = node.xpath('./*[local-name()="id"]/text()')[0].extract()
    if identifier not in self.id_code_map:
        return
    product_code = self.id_code_map[identifier]

    loader = ProductLoader(item=Product(), selector=node)

    size = node.xpath('./*[local-name()="size"]/text()').extract()
    color = node.xpath('./*[local-name()="color"]/text()').extract()
    material = node.xpath('./*[local-name()="material"]/text()').extract()

    name = node.xpath('./*[local-name()="parent_title"]/text()').extract()
    if not name:
        name = node.xpath('./title/text()').extract()
    name = name[0]
    for attribute in (material, color, size):
        if attribute:
            name += u' {}'.format(attribute[0])

    price = node.xpath('./*[local-name()="price"]/text()').extract_first()
    pack_size = node.xpath('./description/text()').re(
        r'Pack Size m: *([\d.]+)')
    if pack_size:
        price = extract_price(price) * extract_price(pack_size[0])

    loader.add_value('name', name)
    loader.add_xpath('url', './link/text()')
    loader.add_xpath('image_url', './*[local-name()="image_link"]/text()')
    loader.add_value('identifier', identifier)
    loader.add_value('price', price)
    loader.add_xpath(
        'shipping_cost',
        './*[local-name()="shipping"]/*[local-name()="price"]/text()')
    loader.add_xpath('brand', './*[local-name()="brand"]/text()')
    loader.add_xpath('category',
                     './*[local-name()="google_product_category"]/text()')
    loader.add_xpath('sku', './*[local-name()="mpn"]/text()')

    stock = node.xpath('./*[local-name()="availability"]/text()').extract()
    if stock and stock[0] == 'out of stock':
        loader.add_value('stock', 0)

    item = loader.load_item()

    if product_code in self.cost_prices:
        try:
            cost_price = Decimal(self.cost_prices[product_code])
        except (ArithmeticError, TypeError, ValueError):
            # decimal.InvalidOperation is a subclass of ArithmeticError.
            self.log('ERROR: unable to set cost price for item %r' % item)
        else:
            item['metadata'] = {'cost_price': str(cost_price)}

    if pack_size:
        yield Request(loader.get_output_value('url'),
                      self.parse_pack_price,
                      meta={'item': item})
    else:
        yield item
def parse_product(self, response):
    """Yield one item per purchasable row of the add-to-basket table.

    A base item (url, name, category, brand, image) is loaded once and
    copied for each table row.  Three row layouts are recognised by the
    cell that holds the price text:

    * td[4]: name suffix from td[3], code from td[1];
    * td[6]: name suffix from td[4], code from td[2];
    * td[3]: price only, code from td[1].

    Rows whose first cell spans four columns, and rows matching none of
    the layouts, are skipped.  Rows with a blank identifier are dropped.
    """
    hxs = HtmlXPathSelector(response)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1//text()')
    product_loader.add_xpath(
        'category',
        u'normalize-space(//a[@class="Link21" and position()=2]/text())')
    product_loader.add_xpath(
        'brand',
        u'//a[@id="ModelsDisplayStyle4_HlkSeeAllBrandProducts"]/@title')

    img = hxs.select(u'//div[@id="DivModelImage"]/a/@href').extract()
    if not img:
        img = hxs.select(u'//div[@id="DivModelImage"]/a/img/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    product_loader.add_xpath('brand',
                             u'//span[@itemprop="manufacturer"]/text()')
    product = product_loader.load_item()

    rows = hxs.select(
        u'//div[@id="TabContentAddToBasketTab"]//tr[@class="BackGround15"]')
    for row in rows:
        prod = Product(product)

        first_cell = row.select(
            u'normalize-space(./td[1]/text())').extract()[0]
        prod['identifier'] = first_cell
        prod['sku'] = first_cell

        if row.select(u'./td[position()=1 and @colspan="4"]'):
            continue

        price_in_td4 = row.select(u'./td[4]//td[1]/text()').extract()
        if price_in_td4:
            prod['name'] = prod['name'].strip() + ' ' + row.select(
                u'normalize-space(./td[3]/a/text())').extract()[0]
            prod['price'] = extract_price(price_in_td4[0])
        else:
            price_in_td6 = row.select(u'./td[6]//td[1]/text()').extract()
            if price_in_td6:
                prod['name'] = prod['name'].strip() + ' ' + row.select(
                    u'normalize-space(./td[4]/a/text())').extract()[0]
                prod['price'] = extract_price(price_in_td6[0])
                # In this layout the product code sits in the second cell.
                second_cell = row.select(
                    u'normalize-space(./td[2]/text())').extract()[0]
                prod['identifier'] = second_cell
                prod['sku'] = second_cell
            else:
                price_in_td3 = row.select(
                    u'./td[3]//td[1]/text()').extract()
                if not price_in_td3:
                    continue
                prod['price'] = extract_price(price_in_td3[0])

        if prod['identifier'].strip():
            yield prod
def parse_product(self, response):
    """Build a single product item from a product page.

    ``sku``, ``brand``, ``url`` and ``category`` are taken from
    ``response.meta`` when truthy there, otherwise from the page (the
    category only ever comes from meta).  The item name is always
    "<page brand> <page description>", even when the brand field itself
    is overridden by meta.  The sale price is preferred over the regular
    price, with 0 as the last resort.  Shipping cost is fixed at 20.

    Cleanup relative to the original: the unused ``get_base_url`` call
    was removed and each doubled ``meta.get(x) if meta.get(x, None) else
    ...`` lookup became ``meta.get(x) or ...``, which selects the same
    value.

    Raises:
        IndexError: if the page lacks the designer or description nodes,
            or the SKU select when meta carries no sku (unchanged).
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    l = ProductLoader(item=Product(), response=response)

    sku = meta.get('sku') or hxs.select(
        '//div[@class="buy"]//select[@id="ProductItemId"]/@data-wpc'
    ).extract()[0]
    l.add_value('sku', sku)

    identifier = hxs.select(
        '//input[@id="AddToBasketMain"]/@data-event-label').extract()
    l.add_value('identifier', identifier[0] if identifier else sku)

    page_brand = hxs.select(
        '//div[@class="info"]/h2[@class="designer"]/a/text()').extract()[0]
    name = hxs.select(
        '//div[@class="info"]/h3[@class="description"]/text()'
    ).extract()[0].strip()
    l.add_value('name', page_brand + ' ' + name)
    l.add_value('brand', meta.get('brand') or page_brand)
    l.add_value('url', meta.get('url') or response.url)

    image_url = hxs.select(
        '//a[@class="zoom"]/img[@class="product-image" and contains(@src, "_1_")]/@src'
    ).extract()
    if image_url:
        l.add_value('image_url', image_url[0])

    l.add_value('category', meta.get('category'))

    # Sale price first, then the regular price text.
    price_texts = (
        hxs.select(
            '//div[@class="details"]//div[@class="pricing"]/div[@class="price"]/span[@class="sale"]/text()'
        ).extract()
        or hxs.select(
            '//div[@class="details"]//div[@class="pricing"]/div[@class="price"]/text()'
        ).extract()
    )
    l.add_value('price', extract_price(price_texts[0]) if price_texts else 0)
    l.add_value('shipping_cost', 20)

    if hxs.select('//div[@class="detail-allsizesoutofstock visible"]'):
        l.add_value('stock', 0)

    yield l.load_item()
def parse_product(self, response):
    """Load the product, then request its reviews instead of yielding it.

    The loaded item gets a ``TranscatMeta`` with an empty ``reviews``
    list and is passed in the meta of a Bazaarvoice API request, handled
    by ``self.parse_reviews``.  Stock is the first number found in the
    availability text, or 0 when there is none.

    Raises:
        IndexError: if the page has no ``<h1>`` text or no ``itemsArray``
            input.
    """
    xpath = response.xpath

    name = ' '.join(xpath('//h1/text()').extract()[0].split())
    identifier = xpath('//input[@id="itemsArray"]/@value').extract()[0]

    sku_texts = xpath('//span[@itemprop="mpn"]/text()').extract()
    sku = sku_texts[0].strip() if sku_texts else ''

    price_texts = xpath('//span[@itemprop="price"]/text()').extract()
    price = extract_price(price_texts[0]) if price_texts else '0'

    brand_texts = xpath(
        '//dd[contains(@itemtype, "Organization")]//a/text()').extract()
    brand = brand_texts[0].strip() if brand_texts else ''

    # Skip the first and last breadcrumb links.
    categories = xpath('//div[@id="breadcrumb"]//a/text()').extract()[1:-1]
    product_image = xpath('//img[@id="productMainImage"]/@src').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_value('sku', sku)
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('price', price)
    if product_image:
        loader.add_value('image_url', response.urljoin(product_image[0]))
    loader.add_value('brand', brand)
    loader.add_value('category', categories)

    stock_numbers = xpath(
        '//span[contains(@class, "availability")]//text()').re('\d+')
    if stock_numbers:
        loader.add_value('stock', extract_price(stock_numbers[0]))
    else:
        loader.add_value('stock', 0)

    product = loader.load_item()
    metadata = TranscatMeta()
    metadata['reviews'] = []
    product['metadata'] = metadata

    # The product id is spliced between the two halves of the API URL.
    reviews_url = "http://api.bazaarvoice.com/data/batch.json?passkey=tkfeqezs3t1ybjthb77uxbvqd&apiversion=5.5&displaycode=1015-en_us&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A" + identifier + "&filter.q0=contentlocale%3Aeq%3Aen_CA%2Cen_US&sort.q0=submissiontime%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_CA%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_CA%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_CA%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_4827"

    request_meta = {
        'product': product,
        'offset': 0,
        'identifier': identifier,
    }
    yield Request(reviews_url, meta=request_meta,
                  callback=self.parse_reviews)
def parse_product(self, response):
    """Follow linked product tiles, then yield the product(s) on the page.

    Tile links whose id (the part after the last '_') is not yet in
    ``self.parsed_products`` are requested with this same callback.  If
    the page has no ``<h1>`` text, ``self.retry`` is asked for a retry
    request and nothing else is produced.

    Pages with at most one option block yield a single item built from
    the page-level product code and current price; pages with several
    blocks yield one item per block.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for href in hxs.select('//div[@class="product-tile"]//a/@href').extract():
        pid = href.split('_')[-1]
        if pid in self.parsed_products:
            continue
        self.parsed_products.append(pid)
        yield Request(urljoin_rfc(base_url, href),
                      callback=self.parse_product)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)

    name = hxs.select('//h1/text()').extract()
    if not name:
        retry_request = self.retry(
            response, "No name for product: " + response.url)
        if retry_request:
            yield retry_request
        return
    product_loader.add_value('name', name)

    # Everything after the first breadcrumb link.
    category = hxs.select(
        '//ol[@class="breadcrumbs"]//a/text()').extract()[1:]
    product_loader.add_value('category', category)

    img = hxs.select('//div[@class="item"]//img/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    product = product_loader.load_item()

    options = hxs.select(u'//div[contains(@class, "MainProds")]/ol/li')
    if not options:
        options = hxs.select(
            u'//div[@class="SingColl"]/div[contains(@class, "Prod")]')

    if len(options) > 1:
        for opt in options:
            prod = Product(product)
            prod['name'] = opt.select(
                u'normalize-space(.//h2/text())').extract()[0]
            prod['sku'] = opt.select(
                u'normalize-space(substring-after(.//div[@class="code"]/text(), ":"))'
            ).extract()[0]
            prod['identifier'] = prod['sku']
            prod['price'] = extract_price(
                opt.select(u'.//span[@class="Price"]/text()').extract()[0])
            yield prod
        return

    # Zero or one option block: use the page-level code and price.
    prod = Product(product)
    prod['sku'] = hxs.select('//div[@class="product-sku"]/text()').re(
        'Product code: (\w+)').pop()
    prod['identifier'] = prod['sku']
    prod['price'] = extract_price(
        hxs.select('//div[@class="price-current"]/text()').extract().pop())
    if prod['identifier']:
        yield prod
def parse_product(self, response):
    """Yield the product, or one item per configurable child product.

    When the page embeds a ``spConfig`` JSON object, one item is yielded
    per child product id found in the attribute options, with name and
    price taken from ``childProducts``.  Otherwise a single item is built
    from the page itself.  Nothing is yielded when the page has no
    product name.

    Every price is converted as ``price * self.exchange_rate * 1.2 + 100``.
    NOTE(review): the 1.2 factor is presumably VAT and the 100 a fixed
    surcharge - confirm against the business rule.

    Fix relative to the original: the factor was ``Decimal(1.2)``, built
    from a binary float, which is not exactly 1.2.  It is now
    ``Decimal('1.2')``.  The per-product option label string, which was
    built but never used, is no longer computed.

    Raises:
        IndexError: if the page lacks the identifier paragraph, or (in
            the non-configurable case) the hidden ``product`` input.
    """
    vat_factor = Decimal('1.2')

    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    product_name = hxs.select(
        '//h1[@itemprop="name itemreviewed"]/text()').extract()
    if not product_name:
        return
    product_name = product_name[0].strip()

    image_url = hxs.select(
        '//div[@class="product_main"]//img[@itemprop="image photo"]/@src'
    ).extract()
    if not image_url:
        image_url = hxs.select(
            '//img[@itemprop="image photo"]/@src').extract()

    brand = hxs.select('//a[@class="brand-link"]/img/@title').extract()
    sku = hxs.select('//p[@itemprop="identifier"]/@content').extract()[0]
    sku = sku.replace('sku:', '')
    category = response.meta.get('category', '')

    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)

    if options_config:
        product_data = json.loads(options_config.groups()[0])

        # Collect each child product id once; only the ids are needed.
        child_ids = {}
        for attr in product_data['attributes'].values():
            for option in attr['options']:
                for child_id in option['products']:
                    child_ids[child_id] = True

        for identifier in child_ids:
            child = product_data['childProducts'][identifier]
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', identifier)
            loader.add_value('name', child['productName'])
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            price = extract_price(child['price'])
            price = price * self.exchange_rate * vat_factor + 100
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            loader.add_value('category', category)
            if brand:
                loader.add_value('brand', brand[0])
            loader.add_value('sku', sku)
            yield loader.load_item()
        return

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_value('name', product_name)
    if brand:
        loader.add_value('brand', brand[0])
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

    price = hxs.select(
        '//div[@class="price-box"]//span[@class="price"]/text()').extract()
    if price:
        price = extract_price(price[0].replace(',', ''))
        price = price * self.exchange_rate * vat_factor + 100
    else:
        price = 0
    loader.add_value('price', price)
    loader.add_value('category', category)
    loader.add_value('sku', sku)

    identifier = hxs.select(
        '//div[@class="no-display"]//input[@name="product"]/@value'
    ).extract()[0]
    loader.add_value('identifier', identifier)
    yield loader.load_item()
def parse(self, response):
    """Download the latest feed over SFTP and yield one product per row.

    The newest remote file (as chosen by ``get_last_file`` for
    ``self.file_start_with``) is fetched to ``self.xls_file_path``,
    converted to CSV and read row by row.  Rows whose lower-cased
    'BI ProductID' is already in ``self.identifiers`` are skipped.  Text
    cells are decoded with ``unicode(..., errors='ignore')``.  Each item
    carries a ``BIWordlwideMeta`` with the fixed feed columns plus the
    columns configured in ``self.tag_keys``; a tag value of 'NA' is
    stored as 'N/A'.

    Fixes relative to the original:
      * The SFTP client and transport were never closed; both are now
        closed in ``finally`` blocks as soon as the download is done.
      * A missing image cell (``None``) no longer raises AttributeError
        on ``.lower()``; it is treated as an empty image URL.

    NOTE(review): the source formatting was lost; this assumes the final
    ``else`` (blank image_url) pairs with ``if self.image_url_key`` -
    confirm.
    """
    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    password = "******"
    username = "******"
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        try:
            files = sftp.listdir_attr()
            last = get_last_file(self.file_start_with, files)
            sftp.get(last.filename, self.xls_file_path)
        finally:
            sftp.close()
    finally:
        transport.close()

    # Convert the downloaded Excel file to CSV
    excel_to_csv(self.xls_file_path, self.csv_file_path)

    def as_text(value):
        # Decode a raw CSV cell, dropping bytes that cannot be decoded.
        return unicode(value, errors='ignore')

    with open(self.csv_file_path) as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            product_id = row['BI ProductID']
            if product_id.lower() in self.identifiers:
                continue
            self.identifiers.append(product_id.lower())

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('identifier', product_id)
            loader.add_value('sku', product_id)
            loader.add_value('category', as_text(row['BI Product Grp']))
            loader.add_value('category', as_text(row['BI CategoryGroup']))
            loader.add_value('name', as_text(row['BI ProductName']))
            loader.add_value('price', extract_price(row['BI ListPrice']))
            loader.add_value('shipping_cost',
                             extract_price(row['BI Shipping']))
            loader.add_value('brand', as_text(row['BI Brand']))
            loader.add_value('url', '')

            if self.image_url_key:
                image_url = row.get(self.image_url_key) or ''
                # 'NA' in the feed means there is no image.
                if image_url.lower() != 'na':
                    loader.add_value('image_url', image_url)
            else:
                loader.add_value('image_url', '')

            product = loader.load_item()

            metadata = BIWordlwideMeta()
            metadata['dropship_fee'] = as_text(row['BI Dropship Fee'])
            metadata['est_tax'] = as_text(row['BI Est Tax'])
            metadata['ship_weight'] = as_text(row['BI ship Wt'])
            metadata['product_group'] = as_text(row['BI Product Grp'])
            metadata['upc'] = as_text(row['BI UPC #'])
            metadata['mpn'] = as_text(row['BI Model'])
            metadata['item_group'] = as_text(row.get('BI ItemGroup', ''))

            for meta_key, feed_key in self.tag_keys.items():
                tag = as_text(row.get(feed_key, ''))
                metadata[meta_key] = tag if tag != u'NA' else u'N/A'

            product['metadata'] = metadata
            yield product
def parse_product(self, response):
    """Yield the product, or one item per row of its range table.

    The price is the text of the span that precedes the "ex. VAT" label.
    When the loaded item has neither a price nor a SKU, the page is
    treated as a range page: every table row between the header and the
    last row becomes its own item, until a row linking to another product
    is met (a comparison table).  Otherwise the single loaded item is
    yielded.

    Fix relative to the original: ``img[0]`` was read unconditionally and
    raised IndexError on pages where neither image XPath matched.  The
    image is now only added when one was found.
    """
    hxs = HtmlXPathSelector(response)

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1[1]/text()')

    price = ''.join(
        hxs.select(
            u'//div/span[position()=2 and contains(text(),"VAT") and contains(text(),"ex.")]/../span[1]//text()'
        ).extract())
    product_loader.add_value('price', extract_price(price))
    product_loader.add_xpath(
        'sku',
        u'substring-after(//div[contains(text(),"Product Code:")]/text(), ":")'
    )
    product_loader.add_xpath(
        'category',
        u'//span[@class="breadcrumblink" and position()=3]/a/text()')

    img = hxs.select(
        u'//a[starts-with(@id,"img") and contains(@class,"mainImageParent")]/@href'
    ).extract()
    if not img:
        img = hxs.select(
            u'//div[contains(@class,"proPicHolder")]/a/img/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    # The brand is only available inside an inline analytics script.
    js = ''.join(hxs.select(u'//script/text()').extract())
    brand = re.search(u's.prop3=Trim\("(.+)"\);', js)
    if brand:
        product_loader.add_value('brand', brand.group(1))

    product = product_loader.load_item()

    if product['price'] or product['sku']:
        yield product
        return

    rows = hxs.select(
        u'//table/tbody/tr[@id="rangeHeader"]/../tr[position()!=1 and position()!=last()]'
    )
    for i, row in enumerate(rows):
        if row.select(u'./td[2]/a/@href'):
            # Comparison table with links to products
            break
        p = Product(product)
        # The label for row i sits at position i + 2 of the table nested
        # in the first row.
        p['name'] = p['name'] + ' ' + row.select(
            u'../tr[1]//table//tr[%d]/td/div/text()' % (i + 2)).extract()[0]
        p['sku'] = row.select(u'./td[2]/text()').extract()[0]
        p['price'] = extract_price(
            row.select(u'./td/div/div[2]/text()').extract()[0])
        yield p
def parse_size(self, response):
    """Yield the item priced from a JSON response, plus a lens variant.

    The response body is parsed as JSON. The item taken from
    ``response.meta['item']`` receives its price from
    ``discount_price_promotion_display`` and is yielded. When
    ``with_lens_price`` is truthy a second item is yielded with that price,
    a ``-with_lens`` identifier suffix and a `` with lenses`` name suffix.
    """
    item = response.meta['item']
    data = json.loads(response.body)

    item['price'] = extract_price(data['discount_price_promotion_display'])
    yield item

    if data['with_lens_price']:
        # Build the variant on a copy: the base item has already been
        # yielded, so changing it in place would also change the item the
        # consumer received.
        lens_item = item.copy()
        lens_item['price'] = extract_price(data['with_lens_price'])
        lens_item['identifier'] += '-with_lens'
        lens_item['name'] += ' with lenses'
        yield lens_item
def parse_product(self, response):
    """Build a single product item from a product detail page.

    Nothing is yielded when the brand (upper-cased) is listed in
    ``self.ignore_brands`` or when no price can be located; the latter case
    is logged.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    name = hxs.select('//h1//span[@itemprop="name"]/text()').extract()[0]

    brand_nodes = hxs.select('//h1//span[@itemprop="brand"]/text()').extract()
    brand = ''
    if brand_nodes:
        brand = brand_nodes[0].strip()
    if brand.upper() in self.ignore_brands:
        return

    images = hxs.select(
        '//div[@class="productimg_container"]/a/img/@src').extract()

    # Prefer the basket form's product id, fall back to the sku microdata.
    id_nodes = (
        hxs.select(
            '//*[@id="basketform"]//input[@name="product_id"]/@value').extract()
        or hxs.select('//span[@itemprop="sku"]/text()').extract())
    identifier = id_nodes[0]

    # Prefer the ex-VAT price element, fall back to the price microdata.
    price_nodes = (
        response.css('.product_price #exvatprice ::text').extract()
        or response.xpath('//span[@itemprop="price"]/text()').extract())
    if not price_nodes:
        self.log('Warning: no price found! %s' % response.url)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('price', extract_price(price_nodes[0]))

    in_stock = hxs.select(
        '//span[@class="stockstatus"]/span[@class="stock" and contains(text(), "In Stock")]'
    )
    if not in_stock:
        loader.add_value('stock', 0)

    loader.add_value('identifier', identifier)
    loader.add_value('name', name)
    loader.add_value(
        'category',
        hxs.select('//div[@itemprop="breadcrumb"]//a/text()').extract())
    loader.add_value('brand', brand)
    loader.add_xpath('sku', '//td[@itemprop="mpn"]/text()')
    loader.add_value('url', response.url)
    if images:
        loader.add_value('image_url', urljoin_rfc(base_url, images[0]))

    delivery = hxs.select(
        '//div[@class="delivery_text"]/strong/text()').extract()
    if delivery:
        label = delivery[0]
        cost = 0 if label == 'FREE' else extract_price(label)
        loader.add_value('shipping_cost', cost)

    yield loader.load_item()
def parse_shipping_price2(response):
    """Set the product's shipping cost from the order summary and yield it.

    The product comes from ``response.meta['product']``. The shipping cost
    is read from the ``ordershipping`` row and reduced by the amount in the
    ``ordershippingdiscount discount`` row when that row is present. When no
    shipping row is found the product is yielded without touching
    ``shipping_cost``.
    """
    hxs = HtmlXPathSelector(response)
    product = response.meta['product']

    shipping = hxs.select(
        '//tr[@class="ordershipping"]/td[2]/span/text()').extract()
    if shipping:
        shipping_cost = extract_price(shipping[0])
        shipping_discount = hxs.select(
            '//tr[@class="ordershippingdiscount discount"]/td[2]/span/text()'
        ).extract()
        # Only apply the discount to a parsed amount; an empty selection
        # must never be stored or used in arithmetic.
        if shipping_discount:
            shipping_cost -= extract_price(shipping_discount[0])
        product['shipping_cost'] = shipping_cost

    yield product
def parse_product(self, response):
    """Yield one product item per option row of the page's price table.

    Name, brand, image and category are shared by all options. Each row
    contributes its own option name, identifier suffix, price, sku, stock
    flag and an ``rrp`` value stored in a ``CRCMeta`` metadata object.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//*[@id="Pbox"]/nav[1]/a[3]/text()').extract()[0]

    # First configured brand that the product name starts with
    # (case-insensitive), or '' when none matches.
    upper_name = name.upper()
    brand = next(
        (known for known in self.brands
         if upper_name.startswith(known.upper())),
        '')

    identifier = re.findall('roduct.ia\/(\d+)\/', response.url)[0]

    thumbs = hxs.select('//*[@id="thumbnail"]//li/a/@href').extract()
    image_url = ''
    if thumbs:
        image_url = urljoin_rfc(base_url, thumbs[0])

    category = response.meta.get('categories')

    for row in hxs.select('//*[@id="priceBox"]/form/table/tr[position()>1]'):
        item = Product()
        metadata = CRCMeta()
        metadata['rrp'] = extract_price(
            row.select('./td[2]/span[@class="tGrey"]/text()').extract()[0])
        item['metadata'] = metadata

        loader = ProductLoader(item=item, selector=row)
        option_name = row.select('./td/b[@class="name"]/text()').extract()[0]
        loader.add_value('name', ', '.join((name, option_name)))
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)

        # The option id is made of the first two arguments of the button's
        # onclick call, which is either proEmailMe(...) or shopCartAdd(...).
        onclick = row.select('./td/div/button/@onclick').extract()[0]
        marker = 'proEmailMe(' if 'proEmailMe' in onclick else 'shopCartAdd('
        args = onclick.split(marker)[1].split(')')[0].replace("'", '').split(',')
        option_id = args[0].strip() + '_' + args[1].strip()
        loader.add_value('identifier', identifier + '_' + option_id)

        price_nodes = row.select('./td[2]/b/text()').extract()
        loader.add_value('price', extract_price(price_nodes[0]))

        sku_nodes = row.select('./td[1]/div/text()').extract()
        loader.add_value('sku', sku_nodes[0] if sku_nodes else '')

        availability = row.select(
            './td[1]/span[@class="tGrey"]/text()').extract()[0]
        if availability != 'In Stock':
            loader.add_value('stock', 0)

        yield loader.load_item()
def parse(self, response):
    """Fetch ``Lakeland.csv`` over SFTP and yield one product per feed row.

    The file is downloaded to ``HERE + '/Lakeland_products.csv'`` and read
    as a ``|``-delimited CSV with NUL bytes stripped from every line. Each
    row becomes a ``Product`` with a ``LakelandMeta`` metadata object that
    carries the computed margin alongside several feed columns.
    """
    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    password = "******"
    username = "******"
    file_path = HERE + '/Lakeland_products.csv'
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        sftp.get('Lakeland.csv', file_path)
    finally:
        # The connection is only needed for the download; release it
        # before parsing instead of leaving it open.
        transport.close()

    with open(file_path) as f:
        reader = csv.DictReader(
            (line.replace('\x00', '') for line in f), delimiter="|")
        for row in reader:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('sku', row['Unique Product Code'])
            loader.add_value('identifier', row['Unique Product Code'])
            loader.add_value('name', row['Product Name'])
            loader.add_value('category', row['Category'])
            loader.add_value('image_url', row['Image URL'])
            loader.add_value('brand', row['Brand'].decode('latin-1'))
            loader.add_value('url', row['Product Page URL'])

            list_price = str(round(extract_price(row['List Price']), 2))
            cost_price = str(round(extract_price(row['Cost Price']), 2))
            rrp = str(round(extract_price(row['RRP']), 2))
            selling_price = round(extract_price(row['Price']), 2)

            # margin = ((rrp / 1.2 - cost) / selling price) * 1.2, expressed
            # as a percentage; zero when there is no positive selling price.
            if selling_price > 0:
                margin = (
                    (Decimal(rrp) / Decimal('1.2') - Decimal(cost_price))
                    / Decimal(selling_price)) * Decimal('1.2')
            else:
                margin = Decimal('0.00')
            margin *= Decimal('100')
            margin = '{}%'.format(str(round(extract_price(str(margin)), 2)))

            loader.add_value('price', selling_price)
            loader.add_value('stock', row['Stock Availability'])
            loader.add_value('shipping_cost', row['Shipping Cost'])
            item = loader.load_item()

            metadata = LakelandMeta()
            metadata['margin'] = margin
            metadata['promotional_message'] = row['Promotional Message']
            metadata['buyer_name'] = row['Buyer Name']
            metadata['list_price'] = list_price
            metadata['cost_price'] = cost_price
            metadata['asin'] = row['ASIN']
            metadata['dd'] = 'Yes' if row['DD'] == '1' else ''
            metadata['rrp'] = rrp
            item['metadata'] = metadata
            yield item
def parse_product(self, response):
    """Parse a product page into a single item, retrying nameless pages.

    When the page has no product name the same URL is requested again, up
    to three retries (tracked in ``meta['retry_count']``); after that the
    URL is logged and nothing is yielded. The secondary price taken from
    the ``itemprop="price"`` element itself is stored in the item metadata
    under ``Euro Price``.
    """
    name = response.xpath('//h1[@itemprop="name"]/text()').extract()
    if not name:
        retry_count = response.meta.get('retry_count', 0)
        if retry_count < 3:
            # Carry the brand forward: passing a fresh meta dict would
            # otherwise drop it and the retried page would lose its brand.
            yield Request(response.url,
                          dont_filter=True,
                          callback=self.parse_product,
                          meta={'retry_count': retry_count + 1,
                                'brand': response.meta.get('brand')})
        else:
            self.log('Product without name: ' + response.url)
        return
    name = name[0].strip()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('brand', response.meta.get('brand'))

    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))

    available = ''.join(
        response.xpath(
            '//div[@itemprop="offers"]//div[contains(@class,"tr-prod-availability")]//text()'
        ).extract()).strip().upper()
    # An empty availability text leaves the stock value unset.
    if available and 'AVAILABLE IMMEDIATELY' not in available:
        loader.add_value('stock', 0)

    price = response.xpath(
        '//*[@itemprop="price"]/following-sibling::span/text()').extract()
    euro_price = response.xpath(
        '//*[@itemprop="price"]/text()').extract_first()
    price = extract_price(price[0]) if price else 0
    euro_price = extract_price(euro_price) if euro_price else 0
    loader.add_value('price', price)

    category = response.xpath(
        '//ul[@class="tr-sidebar-categories-main"]/li/a/text()').extract()
    if category:
        loader.add_value('category', category[0])

    sku = response.xpath('//input[@name="ar"]/@value').extract()
    sku = sku[0] if sku else ''
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)

    # A flat shipping cost applies up to a price of 165 (integer part).
    if int(price) <= 165:
        loader.add_value('shipping_cost', 8.3)

    item = loader.load_item()
    item['metadata'] = {'Euro Price': euro_price}
    yield item
def parse(self, response):
    """Fetch the latest "BI UK File" over SFTP and yield one product per row.

    The remote directory is listed, ``get_last_file`` picks the file to
    use, and it is downloaded to ``HERE + '/biwuk_products.csv'``. Each CSV
    row becomes a ``Product`` with a ``BIWordlwideMeta`` metadata object.
    """
    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    password = "******"
    username = "******"
    file_path = HERE + '/biwuk_products.csv'
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        files = sftp.listdir_attr()
        last = get_last_file("BI UK File", files)
        sftp.get(last.filename, file_path)
    finally:
        # The connection is only needed for the download; release it
        # before parsing instead of leaving it open.
        transport.close()

    with open(file_path) as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('identifier', row['BI UK ProductID'])
            loader.add_value('sku', row['BI UK ProductID'])

            # The category path uses '>' or '>>' as separator; each level
            # is added as its own category value.
            categories = unicode(
                row.get('BI UK CategoryGroup'), errors='ignore').replace(
                    '>>', '>').replace("'", "").split('>')
            for category in categories:
                loader.add_value('category', category.strip())

            loader.add_value(
                'name', unicode(row['BI UK ProductName'], errors='ignore'))
            loader.add_value('price',
                             extract_price(row['BI UK Delivered Price']))
            loader.add_value('shipping_cost',
                             extract_price(row['BI UK Shipping']))
            loader.add_value('brand', row['BI UK Brand'])
            loader.add_value('url', '')
            loader.add_value('image_url', row['BI UK ImgURL'])
            product = loader.load_item()

            metadata = BIWordlwideMeta()
            metadata['dropship_fee'] = unicode(row['BI UK Dropship Fee'],
                                               errors='ignore')
            metadata['est_tax'] = unicode(row['BI UK Est Tax'],
                                          errors='ignore')
            metadata['ship_weight'] = unicode(row['BI UK ship Wt'],
                                              errors='ignore')
            metadata['product_group'] = unicode(row['BI UK Product Grp'],
                                                errors='ignore')
            metadata['upc'] = unicode(row['BI UK UPC #'], errors='ignore')
            metadata['mpn'] = unicode(row['BI UK Model'], errors='ignore')
            product['metadata'] = metadata
            yield product
def parse_product(self, response):
    """Build a single product item from a product detail page.

    The sku and brand come from ``response.meta`` when a sku was passed in;
    otherwise the sku is read from the page and the brand is set to
    ``Logitech``. The shipping cost is assembled from separate base and
    decimal elements and defaults to ``0.0``.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    title = hxs.select('//span[@itemprop="name"]/text()').extract()[0]
    loader.add_value('name', title.strip())

    identifier = hxs.select(
        '//div[@id="article_cont"]//div[@class="itemcostinfo"]/@itemid'
    ).extract()[0]
    loader.add_value('identifier', identifier)

    meta_sku = response.meta.get('sku', '')
    if meta_sku:
        loader.add_value('sku', meta_sku)
        loader.add_value('brand', response.meta.get('brand', ''))
    else:
        page_sku = hxs.select('//span[@id="hbNrDataSheet"]/text()').extract()
        if page_sku:
            loader.add_value('sku', page_sku[0])
        loader.add_value('brand', 'Logitech')

    loader.add_value('url', response.url)

    # The gallery thumbnails carry the image path inside their onclick
    # handler; without a gallery, use the main detail image instead.
    gallery = hxs.select('//ul[@id="gliderSmall"]/li/a/@onclick').extract()
    if gallery:
        found = re.search(r"imgSrc: '(.*?)',", gallery[0])
        if found:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, found.group(1)))
    else:
        detail_img = hxs.select(
            '//img[@id="itemDetail{}"]/@src'.format(identifier)).extract()
        if detail_img:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, detail_img[0]))

    price_nodes = hxs.select('//meta[@itemprop="price"]/@content').extract()
    loader.add_value('price', extract_price(price_nodes[0]))

    availability = hxs.select(
        '//meta[@itemprop="availability"]/@content').extract()[0].strip()
    if availability == 'out_of_stock':
        loader.add_value('stock', 0)

    crumbs = hxs.select(
        '//*[@id="main-centercontainer"]/div[1]/a[3]/text()').extract()
    if crumbs:
        loader.add_value('category', crumbs[0])

    shipping = '0.0'
    base_part = hxs.select(
        '//div[@class="clear fl b8"]/span[@class="basis fl"]/text()'
    ).extract()
    if base_part:
        shipping = base_part[0].replace(',', '')
        decimal_part = hxs.select(
            '//div[@class="clear fl b8"]/span[@class="decimal fl"]/text()'
        ).extract()
        if decimal_part:
            shipping = shipping + '.' + decimal_part[0]
    loader.add_value('shipping_cost', extract_price(shipping))

    yield loader.load_item()
def parse_product(self, response):
    """Enrich the product from ``response.meta`` and yield it or its variants.

    The image URL and a ``summary`` metadata entry are filled in from the
    page. When the page has no ``BuyPrintNGo`` form the product itself is
    yielded. Otherwise, for each form whose ``productCode`` equals the
    product identifier, one item is yielded per table row that provides a
    title, a variant id and a price.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product = response.meta['product']
    forms = hxs.select('//form[contains(@action, "BuyPrintNGo")]')

    image_url = hxs.select(
        '//div[@id="imagegallerymain"]/div//img/@src').extract()
    if image_url:
        product['image_url'] = urljoin_rfc(base_url, image_url[0])

    # Try progressively broader locations for the summary text and keep
    # the first one that yields anything.
    summary = ' '.join(
        map(unicode.strip,
            hxs.select('//*[@id="summary"]//text()').extract())).strip()
    if not summary:
        summary = ''.join(
            map(unicode.strip,
                hxs.select(
                    '//div[@id="accordion"]//div[contains(@class, "panel-heading") and contains(h4/a/text(), "Summary")]/following-sibling::div//div[contains(@class, "panel-body")]/p/text()'
                ).extract())).strip()
    if not summary:
        summary = ''.join(
            map(unicode.strip,
                hxs.select(
                    '//div[@id="accordion"]//div[contains(@class, "panel-heading") and contains(h4/a/text(), "Summary")]/following-sibling::div//div[contains(@class, "panel-body")]//text()'
                ).extract())).strip()
    if not summary:
        summary = ''.join(
            hxs.select(
                '//*[@id="fine_print"]//*[@class="info_section"]//text()'
            ).extract()).strip()

    if 'metadata' in product:
        product['metadata']['summary'] = summary
    else:
        product['metadata'] = {'summary': summary}

    if not forms:
        yield product
        return

    for form in forms:
        identifier = form.select(
            './/input[@name="productCode"]/@value').extract().pop()
        if product['identifier'] != identifier:
            continue
        for line in form.select('.//table/tr'):
            title = line.select('./td[@class="name"]/text()').extract()
            variant_id = line.select(
                './/input[contains(@name, "hdnVariantName_")]/@value'
            ).extract()
            price = line.select(
                './/input[contains(@name, "RP_")]/@value').extract()
            if title and variant_id and price:
                # Copy only for rows that actually become an item.
                item = deepcopy(product)
                item['name'] = "%s - %s" % (item['name'], title[0])
                item['identifier'] = "%s-%s" % (identifier, variant_id[0])
                item['price'] = extract_price(price[0])
                yield item
def _parse_product_el(self, product_el, base_url):
    """Turn one listing element into a product item and return it.

    The product code derived from the link URL (lower-cased) serves as both
    identifier and sku, and its first three characters select the category
    from ``category_codes``. The code is recorded in ``self.collected``.
    Supplier names from ``self.suppliers`` are attached as metadata when
    the code is known there.
    """
    details = 'a/div[@class="prd_details"]'

    name = product_el.select(
        'a/div[@class="prd_details"]/h2/text()').extract()[0].strip()
    raw_price = product_el.select(
        'a/div[@class="prd_details"]/div[@class="prd_price_area"]/span/text()'
    ).extract()[0].strip()
    price = extract_price(raw_price)
    url = product_el.select("a/@href").extract()[0]
    code = extract_product_code_from_url(url).lower()
    image = product_el.select(
        'a/span[@class="prd_img"]/img/@data-original').extract()[0]

    loader = ProductLoader(selector=product_el, item=Product())
    for field in ('identifier', 'sku'):
        loader.add_value(field, code)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('url', url)
    if image:
        loader.add_value('image_url', urljoin_rfc(base_url, image))
    loader.add_value('category', category_codes.get(code[:3]))

    metadata = BuyAGiftMeta()
    if code in self.suppliers:
        metadata['supplier_name'] = ', '.join(
            self.suppliers[code]['suppliers'])

    product = loader.load_item()
    product['metadata'] = metadata
    self.collected.add(code)
    return product
def parse_product_clearance(self, response):
    """Yield a url/name/price item for a clearance product page.

    Pages without a name, without a price element, or whose price parses
    to a falsy value are logged as errors and produce no item.
    """
    # Computed as in the original flow; the value is not used afterwards.
    URL_BASE = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    name_parts = hxs.select(
        "//div[contains(@class, 'prodDetails')]/h2/text()").extract()
    if not name_parts:
        logging.error("ERROR!! NO NAME PRODUCT PAGE!! %s" % (response.url, ))
        return
    name = " ".join(part.strip() for part in name_parts)

    price_nodes = hxs.select("//div[@class='pricing']/h4/text()").extract()
    price = extract_price(price_nodes[0].strip()) if price_nodes else None
    if not price:
        logging.error("ERROR!! NO PRICE PRODUCT PAGE!! %s %s" %
                      (response.url, name))
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    yield loader.load_item()
def parse_options(self, response):
    """Yield the base product and one product per distinct costed option.

    The response body is JSON holding ``startingPrice`` and ``options``.
    The base product is yielded only when the starting price is truthy.
    Every option value with a truthy ``cost`` and a description not seen
    before yields a product named ``<name> <description>`` priced at the
    starting price plus that cost.
    """
    source = response.meta['loader']
    url = source.get_output_value('url')
    main_name = source.get_output_value('name')

    payload = json.loads(response.body)
    base_price = extract_price(str(payload['startingPrice']))

    def build(name, price):
        # Assemble a url/name/price item.
        item_loader = ProductLoader(response=response, item=Product())
        item_loader.add_value('url', url)
        item_loader.add_value('name', name)
        item_loader.add_value('price', price)
        return item_loader.load_item()

    if base_price:
        yield build(main_name, base_price)

    emitted = []
    for option in payload['options']:
        for value in option['values']:
            if not value.get('cost'):
                continue
            description = value['description']
            if description in emitted:
                continue
            emitted.append(description)
            yield build(main_name + ' ' + description,
                        base_price + extract_price(str(value['cost'])))
def parse_product(self, response):
    """Yield the main product, then every option combination costing more.

    The main item carries the page's name, price, url and sku. The
    ``<select>`` elements under ``div.inn`` are then combined: one option
    is picked from each select, names are concatenated and any
    ``(Add ...)`` surcharge is added to the price. Only combinations whose
    total exceeds the main price are yielded.
    """
    hxs = HtmlXPathSelector(response)
    main_name = hxs.select(
        '//h1[@itemprop="name"]/text()').extract()[0].strip()

    main_price = hxs.select('//span[@itemprop="price"]/text()').extract()
    if not main_price:
        main_price = hxs.select('//input[@name="ppi"]/@value').extract()
    main_price = extract_price(main_price[0])

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('name', main_name)
    loader.add_value('price', main_price)
    loader.add_value('url', response.url)
    loader.add_xpath('sku', '//span[@itemprop="identifier"]/text()')
    yield loader.load_item()

    def _add_options(option_sets, current_name, current_price):
        # Recursively walk the remaining selects, yielding finished
        # combinations once none are left.
        if not option_sets:
            # End of the recursion. Combinations that do not raise the
            # price are dropped here; falling through would index an empty
            # sequence and raise IndexError.
            if current_price > main_price:
                loader = ProductLoader(response=response, item=Product())
                loader.add_value('url', response.url)
                loader.add_value('name', current_name)
                loader.add_value('price', current_price)
                yield loader.load_item()
            return

        options = option_sets[0]
        remaining = option_sets[1:]
        for option in options.select('./option/text()').extract():
            r = re.search(r'(.*)\(Add(.*)\)', option)
            name = current_name
            price = current_price
            if r:
                # "<label> (Add <amount>)": keep the label, add the amount.
                name += ' ' + r.groups()[0].strip()
                price += extract_price(r.groups()[1])
            else:
                name += ' ' + option
            for product in _add_options(remaining, name, price):
                yield product

    option_sets = hxs.select('//div[@class="inn"]/select')
    if option_sets:
        for product in _add_options(option_sets, main_name, main_price):
            yield product
def _add_options(option_sets, current_name, current_price):
    """Recursively yield option combinations priced above ``main_price``.

    One option is taken from the first element of ``option_sets`` and the
    function recurses on the rest, appending the option label to the name
    and adding any ``(Add ...)`` surcharge to the price. ``main_price`` and
    ``response`` are not parameters; they are read from the surrounding
    scope (presumably the enclosing ``parse_product`` -- verify).
    """
    if not option_sets:
        # End of the recursion. Combinations that do not raise the price
        # are dropped here; falling through would index an empty sequence
        # and raise IndexError.
        if current_price > main_price:
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('url', response.url)
            loader.add_value('name', current_name)
            loader.add_value('price', current_price)
            yield loader.load_item()
        return

    options = option_sets[0]
    remaining = option_sets[1:]
    for option in options.select('./option/text()').extract():
        r = re.search(r'(.*)\(Add(.*)\)', option)
        name = current_name
        price = current_price
        if r:
            # "<label> (Add <amount>)": keep the label, add the amount.
            name += ' ' + r.groups()[0].strip()
            price += extract_price(r.groups()[1])
        else:
            name += ' ' + option
        for product in _add_options(remaining, name, price):
            yield product