def _parse_json(self, response):
    """Build product item(s) from the JSON body of a product response.

    The body is parsed as JSON and its first element is used as the product
    record; that record carries a ``skus`` list.

    * When the SKUs expose fewer than two distinct ``currentPrice`` values,
      the item in ``response.meta['product']`` is filled in (upc, price,
      title) and returned.
    * Otherwise one copy of that item is produced per SKU and the list of
      copies is returned.
    """
    product = response.meta['product']
    payload = json.loads(response.body_as_unicode())
    record = payload[0]
    first_upc = record.get('skus')[0].get('productNumber')
    distinct_prices = set(
        sku.get('currentPrice') for sku in record.get('skus'))

    if len(distinct_prices) >= 2:
        self.log("Product %s PRICES= %s " % (first_upc, distinct_prices),
                 DEBUG)
        per_sku_items = []
        for sku in record.get('skus'):
            item = product.copy()
            item['upc'] = sku.get('productNumber')
            sku_price = sku.get('currentPrice')
            cond_set_value(item, 'price',
                           Price('USD', sku_price) if sku_price else None)
            item['title'] = record.get('name')
            item['model'] = "size:" + sku.get('product_size')
            per_sku_items.append(item)
        return per_sku_items

    # Single price for every SKU: describe the product once.
    product['upc'] = first_upc
    shared_price = record.get('currentPrice')
    cond_set_value(product, 'price',
                   Price('USD', shared_price) if shared_price else None)
    cond_set_value(product, 'title', record.get('name'))
    return product
def _get_json_data(self, item):
    """Build a ``SiteProductItem`` from one JSON result entry.

    Only ``item['productInfo']`` is read.  From it the method fills in:

    * ``price`` -- taken from ``priceInfo['salePrice']`` when that key is
      present, otherwise from ``priceInfo['regularPrice']``.  The last number
      found in the text is stored as a USD ``Price``; when the text holds no
      number the price is left unset.
    * ``is_in_store_only`` / ``is_out_of_stock`` -- set to ``True`` when the
      ``displayText`` of a ``channelAvailability`` message contains
      'Not sold online' / 'Out of stock online'.
    * ``upc`` -- handed to ``cond_set_value``.

    :param item: dict holding a ``productInfo`` dict with a ``priceInfo`` key.
    :return: the new ``SiteProductItem``.
    """
    product = SiteProductItem()
    info = item['productInfo']
    price_info = info['priceInfo']

    # 'salePrice' wins; 'regularPrice' is consulted only when there is no
    # 'salePrice' key at all.
    for key in ('salePrice', 'regularPrice'):
        if key not in price_info:
            continue
        amounts = re.findall(r'(/?\d+.\d+)', price_info[key])
        # Text without a number used to raise IndexError on amounts[-1];
        # now the price is simply not set.
        if amounts:
            product['price'] = Price(price=float(amounts[-1]),
                                     priceCurrency='USD')
        break

    for message in info.get('channelAvailability', []):
        if 'displayText' not in message:
            continue
        text = message['displayText']
        if 'Not sold online' in text:
            product['is_in_store_only'] = True
        if 'Out of stock online' in text:
            product['is_out_of_stock'] = True

    cond_set_value(product, 'upc', info.get('upc'))
    return product
def parse_price(self, response):
    """Return the price shown in the ``pd-price`` paragraph as a USD ``Price``.

    The first matching text node is used, with 'NOW' and '$' removed and the
    remainder passed through ``self.clear_text``.  A zero USD price is
    returned when the page has no such node.
    """
    texts = response.xpath(
        '//p[contains(@id, "pd-price")]/text()').extract()
    if not texts:
        return Price(price=0.00, priceCurrency="USD")

    without_markers = texts[0].replace('NOW', '').replace('$', '')
    amount = self.clear_text(without_markers)
    return Price(price=amount, priceCurrency="USD")
def _parse_price(response):
    """Parse the price from ``itemprop`` microdata.

    Reads the ``priceCurrency`` content attribute and the ``price`` text
    (filtered through ``FLOATING_POINT_RGEX``).  Returns a ``Price`` built
    from the first match of each, or a zero USD ``Price`` when either one is
    missing.
    """
    currencies = response.xpath(
        '*//*[@itemprop="priceCurrency"]/@content').extract()
    amounts = response.xpath('*//*[@itemprop="price"]/text()').re(
        FLOATING_POINT_RGEX)

    if not (currencies and amounts):
        return Price(price=0, priceCurrency='USD')
    return Price(price=amounts[0], priceCurrency=currencies[0])
def parse_marketplace_json(self, response):
    """Collect marketplace sellers/prices and queue the related-products request.

    The response body is searched for the fragment that follows
    ``parentItem":"<seller_id>"``; backslashes are stripped from it and it is
    parsed as HTML.  Seller names are read from image ``alt`` attributes and
    seller link titles, and paired by position with the integer and
    superscript parts of the ``price-current`` entries.  Each pair is stored
    in ``product['marketplace']`` as ``{"price": Price, "name": seller}``;
    sellers without a price at their position get a 0.0 USD price.

    A request for ``self.RELATED_PRODUCTS`` is then appended to
    ``meta['reqs']`` (or a fresh list) and the queue is handed to
    ``self.send_next_request``.

    :return: ``product`` when the fragment cannot be extracted/parsed,
        otherwise whatever ``self.send_next_request`` returns.
    """
    meta = response.meta
    product = meta['product']
    body = response.body_as_unicode()
    seller_id = meta.get('seller_id')
    product_id = meta.get('product_id')

    try:
        fragment = is_empty(
            re.findall(r'parentItem":"{0}"(.*)?'.format(seller_id), body)
        ).replace('\\', '')
        marketplace = Selector(text=fragment)
    except Exception:
        # Best effort: without a usable fragment there is nothing to add.
        # (Was a bare ``except:``, which also swallowed KeyboardInterrupt
        # and SystemExit.)
        return product

    # NOTE(review): ``featured`` is unquoted in this XPath, so it is read as
    # a child-element test rather than the string 'featured' -- confirm
    # whether 'featured' (quoted) was intended before changing it.
    sellers_noline = list(
        set(
            marketplace.xpath(
                "//tr[contains(@class, featured)]/td/img/@alt").extract()))
    sellers_line = marketplace.xpath(
        "//tr/td[@class='seller']/a[1]/@title").extract()
    sellers = sellers_noline + self.remove_duplicate(sellers_line)

    price_int = marketplace.xpath(
        "//ul[contains(@class, 'price')]/li[@class='price-current ']/strong/text()"
    ).extract()
    price_sup = marketplace.xpath(
        "//ul[contains(@class, 'price')]/li[@class='price-current ']/sup/text()"
    ).extract()

    marketplaces = []
    for index, seller in enumerate(sellers):
        try:
            amount = price_int[index] + price_sup[index]
        except IndexError:
            # Fewer price entries than sellers.
            amount = 0.0
        marketplaces.append({
            "price": Price(price=amount or 0.0, priceCurrency="USD"),
            "name": seller,
        })

    if marketplaces:
        product["marketplace"] = marketplaces

    reqs = meta.get('reqs', [])
    reqs.append(
        Request(url=self.RELATED_PRODUCTS.format(product_id=product_id,
                                                 seller_id=seller_id),
                dont_filter=True,
                callback=self.parse_related_product,
                meta=meta))
    # ``reqs`` always holds at least the request appended above, so the old
    # ``if reqs: ... return product`` fallback could never be reached.
    return self.send_next_request(reqs, response)
def parse_price(self, response):
    """Return the price from ``<meta itemprop=...>`` tags.

    The first ``price`` content and the first ``priceCurrency`` content are
    combined into a ``Price``.  A zero USD ``Price`` is returned when either
    one is missing.
    """
    # ``contains(@itemprop, "price")`` on its own also matches the
    # ``priceCurrency`` meta tag, so the currency code could end up as the
    # first "price" value.  Exclude those nodes explicitly.
    price = response.xpath(
        '//meta[contains(@itemprop, "price")'
        ' and not(contains(@itemprop, "priceCurrency"))]/@content'
    ).extract()
    currency = response.xpath(
        '//meta[contains(@itemprop, "priceCurrency")]/@content').extract()

    if price and currency:
        return Price(price=price[0], priceCurrency=currency[0])
    return Price(price=0.00, priceCurrency="USD")
def _parse_price(self, response):
    """Return the price described by the page's ``itemprop`` meta tags.

    Both the ``price`` and ``priceCurrency`` content attributes are reduced
    with ``is_empty``; when both are truthy they form the returned ``Price``,
    otherwise a zero GBP ``Price`` is returned.
    """
    amount = is_empty(
        response.xpath('//meta[@itemprop="price"]/@content').extract())
    currency = is_empty(
        response.xpath(
            '//meta[@itemprop="priceCurrency"]/@content').extract())

    if not (amount and currency):
        return Price(price=0.00, priceCurrency="GBP")
    return Price(price=amount, priceCurrency=currency)
def parse_price(self, response):
    """Return the current ('now') price of ``self.product_id``.

    The amount comes from
    ``self.js_data['colorid'][self.product_id]['price']``: the ``amount`` of
    the entry whose ``il8n`` value is ``'now'`` (the last such entry wins).
    The currency code is read from the response body with a
    ``currency":"<code>"`` regex.

    A zero USD ``Price`` is returned when ``self.js_data`` is empty, no
    'now' entry exists, or no currency is found.
    """
    # Start from "no price" so that an empty ``self.js_data`` no longer
    # leaves the name unbound, and a missing 'now' entry no longer passes
    # the whole entry list on as the price.
    amount = None
    if self.js_data:
        entries = self.js_data['colorid'][self.product_id]['price']
        for entry in entries:
            if entry['il8n'] == 'now':
                amount = entry['amount']

    currency = is_empty(
        re.findall(r'currency":"(\w+)"', response.body_as_unicode()))

    if amount and currency:
        return Price(price=amount, priceCurrency=currency)
    return Price(price=0.00, priceCurrency="USD")
def _populate_from_html(self, response, product):
    """Fill ``product`` from the product page HTML.

    Sets, in order: Open Graph fields (via
    ``_populate_from_open_graph_product``), title (sidebar ``h1`` first,
    ``h1.prod_name`` as fallback), ``reseller_id`` (URL segment after
    ``/_/``), price in USD, model (text after 'Store Style #:'), then
    delegates to the related-products and hardcoded-fields helpers.
    """
    _populate_from_open_graph_product(response, product)

    # Title: sidebar heading first, generic product-name heading second.
    sidebar = '#productDetailsLeftSidebar .inner-container '
    cond_set(product, 'title',
             response.css(sidebar + 'h1::text').extract(), unicode.strip)
    if not product.get("title"):
        fallback_titles = response.xpath(
            "//h1[contains(@class, 'prod_name')]/text()").extract()
        if fallback_titles:
            cond_set(product, 'title', fallback_titles, unicode.strip)

    # Reseller id: first "/_/<id>" segment of the URL, if any.
    id_match = re.search(r"\/_\/([^?$\s]+)", response.url)
    cond_set_value(product, "reseller_id",
                   id_match.group(1) if id_match else None)

    price_texts = response.xpath(
        '//div[@id="productPrice"]'
        '/div[contains(@class, "display_price")]/input/@value |'
        '//div[@id="productPrice"]/span[last()]/text()'
    ).extract()
    if price_texts:
        amount = price_texts[0].replace("$", "").strip()
        product["price"] = Price(priceCurrency='USD', price=amount)

    style_texts = response.css('#storeStyleNumber::text').extract()
    if style_texts:
        style_match = re.search(r'Store Style #:\xa0(.+)', style_texts[0])
        # The converter is only meaningful for a successful match; how a
        # ``None`` value is treated is up to ``cond_set_value``.
        cond_set_value(product, 'model', style_match,
                       lambda model: model.group(1))

    self._populate_related_products(response, product)
    self._populate_hardcoded_fields(product)
def parse_price(self, response):
    """Store the EUR price of the page on ``response.meta['product']``.

    The price text is read from the regular or special-price ``itemprop``
    spans and the first ``<digits>,<digits>`` amount in it is used, with the
    decimal comma replaced by a dot.

    :return: the string ``'price'`` when a price was parsed; ``None`` when
        the fallback zero EUR price was stored instead.
    """
    product = response.meta['product']
    price_sel = response.xpath('//span[@itemprop="price"]/'
                               'span[@class="price"]/text() | '
                               '//p[@class="special-price"]/'
                               'span[@itemprop="price"]/text()')
    if price_sel:
        text = (is_empty(price_sel.extract()) or '').strip()
        amount = is_empty(re.findall(r'\d+,\d+', text))
        # Text without a "<digits>,<digits>" amount used to fail with
        # AttributeError on ``None.replace``; it now falls through to the
        # zero-price default below.
        if amount:
            product['price'] = Price(priceCurrency="EUR",
                                     price=amount.replace(',', '.'))
            return 'price'

    product['price'] = Price(priceCurrency="EUR", price=float(0))
def parse_marketplace(self, response):
    """Attach marketplace offers from a JSON response to the product.

    The body is parsed as JSON and its ``html`` entry is scanned for ``unit``
    blocks; each yields ``{"price": Price(AED), "name": seller name}``.
    The list is stored in ``product['marketplace']`` only when non-empty.
    The product is returned unchanged if the body is not valid JSON.
    """
    product = response.meta['product']
    try:
        payload = json.loads(response.body)
    except ValueError:
        return product

    def _offer(unit):
        # Price first, then name -- one dict per seller block.
        amount = is_empty(
            unit.xpath(
                "div/div/div[contains(@class, 'larg-price')]/text()").re(
                    FLOATING_POINT_RGEX))
        seller_name = is_empty(
            unit.xpath("div/div/span/a/text()").extract())
        return {
            "price": Price(price=amount, priceCurrency="AED"),
            "name": seller_name,
        }

    html = Selector(text=payload.get("html", ""))
    offers = [_offer(unit)
              for unit in html.xpath("//div[contains(@class, 'unit')]")]

    if offers:
        product["marketplace"] = offers
    return product
def _parse_store_status(self, response):
    """Check availability in stores and add the store price to the product.

    The currency is recovered from the string form of the product's current
    price (``priceCurrency=<code>,``), defaulting to 'CAD'.  The first store
    in ``data['products'][0]['results']`` whose ``availability`` is not '70'
    and which has a truthy ``minCurrentPrice`` supplies the new price.  When
    such a store exists, ``is_in_store_only`` mirrors ``is_out_of_stock``.

    :return: the result of ``self.send_next_request`` when
        ``response.meta['reqs']`` is non-empty, otherwise the product.
    """
    reqs = response.meta['reqs']
    product = response.meta['product']

    try:
        currency = re.findall('priceCurrency=(.*?),',
                              str(product['price']))[0]
    except Exception:
        # No price yet, or its string form carries no currency.
        currency = 'CAD'

    data = json.loads(response.body_as_unicode())

    # Initialised up front: with an empty results list the name used to be
    # unbound at the ``if price`` check below.
    price = None
    for store in data['products'][0]['results']:
        try:
            in_store = store['availability'] != '70'  # '70': not in store
            price = store['minCurrentPrice'] if in_store else None
        except KeyError:
            price = None
            continue
        if price:
            product['price'] = Price(priceCurrency=currency,
                                     price=str(price))
            break

    if price:
        product['is_in_store_only'] = bool(product['is_out_of_stock'])

    if reqs:
        return self.send_next_request(reqs, response)
    return product
def _populate_from_html(self, response, product):
    """Fill ``product`` from the product page HTML.

    Sets reseller id (digits after ``/sku`` in the URL), title, brand and
    price through ``cond_set``; converts a textual 'Rp' price into an IDR
    ``Price``; replaces the image URL; appends the first ``.spesifications``
    block to the description; finally derives the model from the title.
    """
    sku_ids = re.findall(r'\/sku(\d+)', response.url)
    cond_set(product, 'reseller_id', sku_ids)

    cond_set(product, 'title',
             response.css('[itemprop=name]::text').extract())
    cond_set(product, 'brand',
             response.css('#ctl00_content_lnkBrand::text').extract())
    cond_set(product, 'price',
             response.css('[itemprop=price]::text').extract())

    # Only raw (not yet converted) prices need normalising.
    if product.get('price', '') and not isinstance(product['price'], Price):
        if 'Rp' in product['price']:
            digits = product['price'].lower()
            digits = digits.replace('rp', '').replace(',', '').strip()
            product['price'] = Price(price=digits, priceCurrency='IDR')
        else:
            self.log('Unrecognized currency at %s' % response.url)

    cond_replace(product, 'image_url',
                 response.css('#prodMedia img::attr(src)').extract())

    spec_blocks = response.css('.spesifications').extract()
    first_spec = spec_blocks[0] if spec_blocks else ''
    cond_replace_value(product, 'description',
                       product.get('description', '') + first_spec.strip())

    self._get_model_from_title(product)
def _parse_price(self, data): price = data.get('currentPrice') if price: price = Price(priceCurrency="GBP", price=price) return price
def _parse_single_product(self, response):
    """Build a ``SiteProductItem`` from the page's ``productdata`` meta tag.

    The tag content has its last character dropped, its ``|`` separators
    turned into commas and is wrapped in ``[...]`` before being parsed as
    JSON; ``is_empty`` then reduces the parsed list to one record.  When a
    record is available, title, stock flag, URL, reseller id, price (GBP),
    image, brand and site are copied from it.

    :return: the item, with ``is_single_result`` set when
        ``self.product_url`` is truthy.
    """
    raw = is_empty(
        response.xpath('//meta[@name="productdata"]/@content').extract(),
        "")
    productdata = is_empty(
        json.loads("[" + raw[:-1].replace("|", ",") + "]"))

    product = SiteProductItem()
    if productdata:
        product["title"] = productdata["name"]
        product["is_out_of_stock"] = not productdata["available"]
        product["url"] = "http://www.tesco.com/groceries/product/details/"\
            "?id=" + str(productdata["productId"])

        id_match = re.search(r"id=([A-Z0-9\-]+)", product.get('url', ''))
        cond_set_value(product, "reseller_id",
                       id_match.group(1) if id_match else None)

        try:
            product["price"] = Price(price=productdata["price"],
                                     priceCurrency="GBP")
        except Exception as exc:
            # Still best effort (the item is returned without a price), but
            # no longer silent, and no longer catching KeyboardInterrupt /
            # SystemExit as the bare ``except: pass`` did.
            self.log("Could not set price for %s: %r"
                     % (product["url"], exc))

        product["image_url"] = productdata["mediumImage"]
        product["search_term"] = ""
        product["brand"] = is_empty(self.brand_from_title(
            product["title"]))
        product["site"] = is_empty(self.allowed_domains)

    if self.product_url:
        product['is_single_result'] = True
        # NOTE(review): ``search_term`` is only ever set to "" above, so
        # this deletion does not fire for it -- confirm the intent.
        if product.get("search_term"):
            del product['search_term']
    return product
def _scrape_product_links(self, response):
    """Yield ``(url, SiteProductItem)`` for every product in the listing data.

    Each entry of ``data['products']`` is mapped onto an item through
    ``self._PRODUCT_TO_DATA_KEYS`` (the literal value ``'null'`` marks a
    missing field).  The image URL is given a scheme, the textual price is
    normalised into a GBP ``Price`` and the product URL is made absolute
    against http://www.waitrose.com.

    Price text is either pounds ('£1.50') or pence ('45p'); pence amounts
    are converted to pounds.
    """
    data = WaitroseProductsSpider._get_data(response)
    for product_data in data['products']:
        product = SiteProductItem()
        for product_key, data_key in self._PRODUCT_TO_DATA_KEYS.items():
            value = product_data.get(data_key, 'null')
            if value != 'null':
                product[product_key] = value

        # Only touch an image URL that is actually present.  The previous
        # default of the string 'None' was truthy and produced a bogus
        # image URL for products without an image.
        image_url = product.get('image_url')
        if image_url:
            product['image_url'] = urlparse.urljoin('http://', image_url)

        # 'upc' is not in the mapping since it requires transformation.

        if product.get('price', None):
            # Mark pounds with a leading 'p' so both notations can be
            # recognised by the same pattern.
            price = product['price'].replace('£', 'p')
            found = re.findall(r'(p? *[\d ,.]+ *p?) *', price)
            price = found[0] if found else ''
            amount = re.sub('[p ,]', '', price)

            if price.endswith('p'):
                # Pence.  Plain "0." + digits turned '5p' into 0.5 instead
                # of 0.05, so whole-pence amounts are shifted two decimal
                # places instead.
                if amount.isdigit():
                    amount = amount.zfill(3)
                    amount = amount[:-2] + '.' + amount[-2:]
                else:
                    amount = '0.' + amount

            if 'p' in price:
                product['price'] = Price(priceCurrency='GBP', price=amount)
            else:
                self.log('Unknown price format at %s' % response)

        if not product.get('url', '').startswith('http'):
            product['url'] = urlparse.urljoin('http://www.waitrose.com',
                                              product['url'])

        yield product['url'], product
def _get_price(self, response, product):
    """
    Parses and sets the product price, with all possible variations

    The raw price text is looked up with a set of CSS selectors first and a
    set of XPath expressions second.  Text containing '$' is reduced to its
    first number (spaces and thousands separators removed) and stored as a
    USD ``Price``.  Text without '$' becomes a 0.00 USD price when it
    contains 'FREE' or a space; anything else is logged as an error and left
    untouched.

    :param response: Scrapy's Response obj
    :param product: Scrapy's Item (dict, basically)
    :return: None
    """
    cond_set(
        product,
        'price',
        response.css('#priceblock_ourprice ::text'
                     ', #unqualifiedBuyBox .a-color-price ::text'
                     ', #priceblock_saleprice ::text'
                     ', #actualPriceValue ::text'
                     ', #buyNewSection .offer-price ::text').extract(),
    )
    if not product.get('price', None):
        cond_set(
            product,
            'price',
            response.xpath(
                '//td/b[@class="priceLarge"]/text() |'
                '//span[@class="olp-padding-right"]'
                '/span[@class="a-color-price"]/text() |'
                '//div[contains(@data-reftag,"atv_dp_bb_est_hd_movie")]'
                '/button/text() |'
                '//span[@id="priceblock_saleprice"]/text() |'
                '//li[@class="swatchElement selected"]'
                '//span[@class="a-color-price"]/text() |'
                '//div[contains(@data-reftag,"atv_dp_bb_est_sd_movie")]'
                '/button/text() |'
                '//div[@id="mocaBBRegularPrice"]'
                '/div/text()[normalize-space()]').extract())

    raw = product.get('price', None)
    if not raw:
        return

    if '$' not in raw:
        if 'FREE' in raw or ' ' in raw:
            product['price'] = Price(priceCurrency='USD', price='0.00')
        else:
            self.log('Currency symbol not recognized: %s' % response.url,
                     level=ERROR)
        return

    numbers = re.findall(r'[\d ,.]+\d', raw)
    if not numbers:
        # A '$' with no digits (e.g. the symbol in its own text node) used
        # to raise IndexError here.
        self.log('No numeric price found: %s' % response.url, level=ERROR)
        return

    # The match holds only digits, spaces, commas and dots, so removing
    # spaces and commas is all the clean-up that is needed.
    product['price'] = Price(priceCurrency='USD',
                             price=re.sub('[, ]', '', numbers[0]))
def _unify_price(self, product): price = product.get('price') if price is None: return is_usd = not price.find('$') price = price[1:].replace(',', '') if is_usd and price.replace('.', '').isdigit(): product['price'] = Price('USD', price)
def parse_product(self, response):
    """Fill the item in ``response.meta['product']`` from the product page.

    Sets title (last two text nodes of the product heading), price (GBP,
    from the ``itemprop="price"`` meta tag), image URL, description, locale,
    reseller id (first ``/<digits>`` in the URL) and brand, then returns the
    item.
    """
    product = response.meta['product']

    heading_texts = response.xpath(
        "//h1[@class='productTitle'][1]//text()").extract()
    if len(heading_texts) >= 2:
        cond_set_value(product, 'title',
                       self.clear_desc(heading_texts[-2:]))

    cond_set(
        product,
        'price',
        response.xpath(
            "//div[@id='bopRight']//meta[@itemprop='price']/@content"
        ).extract())
    if product.get('price', None):
        if isinstance(product['price'], str):
            product['price'] = product['price'].decode('utf8')
        if u'£' in product['price']:
            amount = product['price'].replace(u'£', '')
            amount = amount.replace(' ', '').replace(',', '').strip()
            product['price'] = Price(priceCurrency='GBP', price=amount)
        else:
            self.log('Unknown currency at %s' % response.url, level=ERROR)

    gallery_links = response.xpath(
        "//ul[@id='galleryImages']/li[1]/a/@href").extract()
    if gallery_links:
        cond_set_value(product, 'image_url',
                       urlparse.urljoin(response.url, gallery_links[0]))

    description_texts = response.xpath(
        "//div[@id='bopBottom']"
        "//h2[@class='bopSectionHeader' and text()[1]='Product Description'][1]"
        "/following-sibling::*[@class='bopSection']"
        "//text()").extract()
    cond_set_value(product, 'description',
                   self.clear_desc(description_texts))

    cond_set_value(product, 'locale', "en_GB")

    id_match = re.search(r"\/(\d+)", response.url)
    cond_set_value(product, "reseller_id",
                   id_match.group(1) if id_match else None)

    cond_set(
        product,
        'brand',
        response.xpath(
            "string(//div[@id='bopBottom']//*[@itemprop='brand'])"
        ).extract(),
        string.strip,
    )

    return product
def _scrape_product_links(self, response):
    """Yield ``(url, SiteProductItem)`` pairs for a search results page.

    * For user agents other than "desktop"/"default", only the product links
      are yielded, each with an empty item.
    * Otherwise the ``productdata`` meta tag (``|``-separated JSON records)
      is zipped with the product links and each pair becomes a pre-filled
      item (url, price in GBP, image, brand, title), yielded with ``None``
      as the URL.

    :raises AssertionError: when a JSON record has no ``name``.
    """
    if self.user_agent_key not in ["desktop", "default"]:
        links = response.xpath(
            '//section[contains(@class,"product_listed")]'
            '//div[contains(@class,"product_info")]//a/@href').extract()
        if not links:
            self.log(
                "[Mobile] Found no product data on: %s" % response.url,
                ERROR)
        for link in links:
            yield urlparse.urljoin(response.url, link), SiteProductItem()
        return

    url = response.url

    # This will contain everything except for the URL and description.
    product_jsons = response.xpath(
        '//meta[@name="productdata"]/@content').extract()
    if not product_jsons:
        self.log("Found no product data on: %s" % url, ERROR)

    product_links = response.css(
        ".product > .desc > h2 > a ::attr('href')").extract()
    if not product_links:
        self.log("Found no product links on: %s" % url, ERROR)

    # Without the meta tag there is nothing to pair the links with;
    # indexing the empty list used to raise IndexError right after the
    # error had been logged.
    if not product_jsons:
        return

    for product_json, product_link in zip(product_jsons[0].split('|'),
                                          product_links):
        prod = SiteProductItem()
        cond_set_value(prod, 'url', urlparse.urljoin(url, product_link))

        product_data = json.loads(product_json)

        cond_set_value(prod, 'price', product_data.get('price'))
        cond_set_value(prod, 'image_url', product_data.get('mediumImage'))

        if prod.get('price', None):
            prod['price'] = Price(
                price=str(prod['price']).replace(',', '').strip(),
                priceCurrency='GBP')

        try:
            brand, title = self.brand_from_title(product_data['name'])
            cond_set_value(prod, 'brand', brand)
            cond_set_value(prod, 'title', title)
        except KeyError:
            raise AssertionError(
                "Did not find title or brand from JS for product: %s"
                % product_link)

        yield None, prod
def _unify_price(self, product): price = product.get('price') if not price: return price_match = re.search('\$ *([, 0-9]+(?:\.[, 0-9]+)?)', price) if price_match: price = price_match.group(1) price = ''.join(re.split('[ ,]+', price)) cond_replace_value(product, 'price', Price('USD', price))
def _parse_price(self, response):
    """Return the ``itemprop`` price of the page, or ``None`` if absent.

    The amount is the first run of digits/dots in the ``price`` text; the
    currency is the first 2-3 word characters of the ``priceCurrency``
    content, defaulting to 'USD'.
    """
    amounts = response.xpath('//*[@itemprop="price"]/text()').re(r'[\d\.]+')
    currencies = response.xpath(
        '//*[@itemprop="priceCurrency"]/@content').re(r'\w{2,3}')
    if not currencies:
        currencies = ['USD']

    if amounts:
        return Price(price=amounts[0], priceCurrency=currencies[0])
    return None
def _parse_price(product_info):
    """Return the USD ``Price`` described by ``product_info``.

    ``CurrentPrice`` is used unless it holds no number or contains the word
    'for', in which case ``RegularPrice`` is used instead.  The first number
    found (per ``FLOATING_POINT_RGEX``) is converted to ``float``; 0.0 is
    used when there is none.
    """
    text = product_info.get('CurrentPrice', '')
    numbers = FLOATING_POINT_RGEX.findall(text)

    if not numbers or 'for' in text:
        text = product_info.get('RegularPrice', '')
        numbers = FLOATING_POINT_RGEX.findall(text)

    if numbers:
        amount = float(numbers[0])
    else:
        amount = 0.0
    return Price(price=amount, priceCurrency='USD')
def _parse_price(self, response):
    """Return the special or regular ``itemprop`` price as a GBP ``Price``.

    The price text is reduced with ``is_empty`` (default 0.00).  When it is
    truthy, the result of ``is_empty`` over its ``<digits>.<digits>`` matches
    becomes the amount; otherwise the default is passed through.
    """
    texts = response.xpath(
        '//p[@class="special-price"]/span[@itemprop="price"]/text()'
        ' |//span[@class="regular-price"]/span[@itemprop="price"]/text()'
    ).extract()

    amount = is_empty(texts, 0.00)
    if amount:
        decimals = re.findall(r'(\d+\.\d+)', amount)
        amount = is_empty(decimals)

    return Price(price=amount, priceCurrency='GBP')
def _parse_price(self, response):
    """Return the page price as a USD ``Price``, or ``None`` if not found.

    Elements with class ``price`` are tried first, then the ``content`` of
    ``itemprop="price"`` elements.  Commas are removed from the first match.
    """
    number_pattern = r'[\d\.\,]+'
    matches = (
        response.xpath('//*[@class="price"]/text()').re(number_pattern)
        or response.xpath(
            './/*[@itemprop="price"]/@content').re(number_pattern)
    )
    if not matches:
        return None

    return Price(price=matches[0].replace(',', ''), priceCurrency='USD')
def _parse_price(response):
    """Return the product price as a USD ``Price``.

    Lookup order:
    1. a ``$<amount>`` in the text around the "Dell Price" label,
    2. the sale-price element,
    3. the retail-price element.

    When none of them yields anything, the empty extraction result (an empty
    list) is returned rather than a ``Price``.
    """
    label = response.xpath('//*[contains(text(), "Dell Price")]')
    label_context = ''.join(label.xpath('./..//text()').extract())
    dell_match = re.search(r'\$([\d,]+\.\d+)', label_context)
    if dell_match:
        return Price(price=dell_match.group(1), priceCurrency='USD')

    texts = response.xpath('//*[contains(@name, "pricing_sale_price")]'
                           '[contains(text(), "$")]//text()').extract()
    if not texts:
        texts = response.xpath(
            '//*[contains(@name, "pricing_retail_price")]'
            '[contains(text(), "$")]//text()').extract()

    if not texts:
        return texts
    return Price(price=texts[0].strip().replace('$', ''),
                 priceCurrency='USD')
def _populate_variants(self, response, product, variants):
    """Expand ``product`` into one item per colour/size variant.

    The ``variants`` argument is ignored; the mapping is always taken from
    ``response.meta['variants']``.

    :return: one of
        * ``product`` -- no variants, or a single variant with no sizes;
        * a ``Request`` for the first variant that has no ``sizes`` entry
          yet (callback ``self._parse_variants_cb``);
        * a list of product copies, one per colour (when the colour has no
          sizes) or per colour/size combination.
    """
    variants = response.meta.get('variants', {})
    if not variants:
        return product

    # Every variant needs its 'sizes' before items can be built; fetch the
    # first one that is still missing them.
    for details in variants.values():
        if 'sizes' not in details:
            return Request(details['href'],
                           callback=self._parse_variants_cb,
                           meta=response.meta.copy())

    if len(variants) == 1:
        only_key = list(variants)[0]
        if len(variants[only_key]['sizes']) == 0:
            return product

    items = []
    for color, details in variants.items():
        image_url = details['image_url']

        if len(details['sizes']) == 0:
            # Colour-only variant: price is textual and converted to GBP.
            item = product.copy()
            item['model'] = color
            item['price'] = details['price']
            if '£' in item['price']:
                amount = item['price'].replace(',', '')
                amount = amount.replace('£', '').strip()
                item['price'] = Price(price=amount, priceCurrency='GBP')
            else:
                self.log('Unknown currency at %s' % response.url)
            item['image_url'] = image_url
            if details['stock'] == '0':
                item['is_out_of_stock'] = True
            items.append(item)
            continue

        for size, size_attrs in details['sizes'].items():
            size_price = size_attrs['price']
            size_stock = size_attrs['stock']
            item = product.copy()
            item['model'] = color + ":" + size
            item['price'] = size_price
            item['image_url'] = image_url
            if size_stock == '0':
                item['is_out_of_stock'] = True
            if 'code' in size_attrs:
                try:
                    item['upc'] = int(size_attrs['code'])
                except ValueError:
                    pass
            items.append(item)

    return items
def _parse_price(self, response): try: price = min( map((float), re.findall('"finalPrice":"([\d\.]+)"', response.body))) return Price(price=price, priceCurrency='USD') except: import traceback print traceback.print_exc() return None
def _parse_price(self, response, item):
    """Store the price found in ``item`` on ``response.meta['product']``.

    The ``.Price`` element of ``item`` is extracted, stripped, and the result
    of ``is_empty`` over its digit runs becomes the amount of a GBP
    ``Price``.  When ``item`` has no ``.Price`` element the product's price
    is set to ``None``.
    """
    product = response.meta['product']
    price_nodes = item.css('.Price')

    if not price_nodes:
        product['price'] = None
        return

    text = is_empty(price_nodes.extract()).strip()
    amount = is_empty(re.findall(r'(\d+)', text))
    product['price'] = Price(priceCurrency="GBP", price=amount)
def parse_product(self, response):
    """Fill the product from its page and request the product JS data.

    Nothing is returned for 'OutOfStockNoResults' URLs.  Otherwise price
    (last ``<digits>.<digits>`` in the price text, USD), title, brand
    (the part of the title before the trademark sign, else ``self.BRAND``),
    description and locale are set, and a ``Request`` for
    ``self.PRODUCT_URL_JS`` is returned with ``self._parse_product_js`` as
    callback.
    """
    product = response.meta['product']
    vid = response.meta['vid'] if "vid" in response.meta else 1

    if 'OutOfStockNoResults' in response.url:
        self.log("Product OutOfStock %s %s" % (response.url, product), DEBUG)
        return

    if not product.get("price"):
        price_text = is_empty(
            response.xpath(
                "//span[@id='priceText']/text() |"
                "//div[@id='tabWindow']/noscript"
            ).extract(), ""
        )
        # Reversed so that ``is_empty`` sees the last amount first.
        amount = is_empty(re.findall(r"\d+\.\d+", price_text)[::-1])
        if amount:
            product["price"] = Price(price=amount, priceCurrency="USD")

    title = product.get('title')
    if isinstance(title, str):
        product['title'] = title.decode('utf-8', 'ignore')
        title = product.get('title')
    else:
        title = is_empty(
            response.xpath(
                "//div[@id='productNameText']/h1/text()").extract())

    if title:
        product["title"] = title
        trademark_at = title.find("™")
        if trademark_at > 1:
            cond_set_value(product, 'brand', title[:trademark_at])
    cond_set_value(product, 'brand', self.BRAND)

    cond_set(product, 'description',
             response.xpath("//div[@id='tabWindow']").extract())
    product['locale'] = "en-US"

    pid = product.get('upc')
    if not pid:
        url_pids = re.findall(r"pid=(\d+)", response.url)
        pid = url_pids[0] if url_pids else url_pids

    return Request(self.PRODUCT_URL_JS.format(pid=pid, vid=vid),
                   callback=self._parse_product_js,
                   meta=response.meta.copy(),
                   priority=100)