def parse_product(self, response):
    """Yield one product for every entry of the ``products-list`` listing.

    Non-HTML responses are ignored.  The SKU is only attached when it is one
    of ``self.skus``; a "Pack Size" value, when present, is appended to the
    product name.

    Entries lacking a link, a name or a price are logged and skipped.
    Previously such an entry raised ``IndexError`` inside the loop, which
    aborted the callback and lost every remaining product on the page.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    # The base URL is the same for every entry, so resolve it only once.
    base_url = get_base_url(response)

    products = hxs.select(u'//ol[@id="products-list" and @class="products-list"]//li[contains(@class,"item")]')
    for product in products:
        url = product.select(u'.//h2[@class="product-name"]/a/@href').extract()
        name = product.select(u'.//h2[@class="product-name"]/a/text()').extract()
        price = product.select(u'.//div[@class="price-box"]/span[contains(@class,"regular-price")]/span[@class="price"]/text()').re(u'[\d\.,]+')
        if not (url and name and price):
            self.log('WARNING: incomplete product entry skipped => %s' % response.url)
            continue

        product_loader = ProductLoader(item=Product(), selector=product)
        product_loader.add_value('url', urljoin_rfc(base_url, url[0]))

        sku = product.select(u'.//small[child::b[contains(text(),"Product Code:")]]/text()').extract()
        if sku:
            # The first three characters of the stripped text are dropped
            # before the value is compared with the known SKUs.
            sku = sku[0].strip()[3:]
            if sku in self.skus:
                product_loader.add_value('sku', sku)

        name = name[0].strip()
        pack_size = product.select(u'.//small[child::b[contains(text(),"Pack Size:")]]/text()').extract()
        if pack_size:
            name += u' x' + pack_size[0].strip() + u'u.'
        product_loader.add_value('name', name)

        # Thousands separators are removed from the matched price text.
        product_loader.add_value('price', re.sub(',', '', price[0]))
        yield product_loader.load_item()
def parse_products(self, response):
    """Follow the "next page" arrow and yield every product in the listing.

    A product without a visible price is emitted with the price "0.0".
    """
    hxs = HtmlXPathSelector(response)

    next_links = hxs.select('//div[@id="center-main"]//a[@class="right-arrow"]/@href')
    if next_links:
        next_url = self._get_products_url(response, next_links[0].extract())
        yield Request(next_url, callback=self.parse_products)

    for details in hxs.select('//div[@id="center-main"]//div[@class="details"]'):
        loader = ProductLoader(item=Product(), selector=details)
        loader.add_xpath("name", "a/text()")
        loader.add_xpath("sku", 'div[@class="sku"]/span/text()')

        # few prices were under div class desc
        price_nodes = details.select('.//div[@class="price-row"]/span[@class="price-value"]/span/text()')
        loader.add_value("price", price_nodes[0].extract() if price_nodes else "0.0")

        href = details.select("a/@href")[0].extract()
        loader.add_value("url", urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()
def parse_product(self, response):
    """Yield one product per ``div.Item`` found on a product page.

    Every item shares the page URL and the page title; the item's own text
    is appended to the title to form its name.  Non-HTML responses are
    ignored.

    When the title is missing, the name now falls back to an empty string.
    Previously ``name`` stayed an empty *list* in that case, so the
    ``+=`` below extended that shared list character by character and the
    garbage accumulated from one item to the next.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)

    title = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
    name = title[0].strip() if title else ''

    url = urljoin_rfc(get_base_url(response), response.url)

    for item in hxs.select('//div[@class="Item"]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_value('url', url)

        sku = ''.join(item.select('./text()').extract())
        item_name = name
        if sku:
            # Avoid a leading blank when the page has no title.
            item_name = (name + ' ' + sku.strip()) if name else sku.strip()
        loader.add_value('name', item_name)

        # Two alternative price locations; both are offered to the loader.
        loader.add_xpath('price', './/span[@class="price"]/text()')
        loader.add_xpath('price', './div[@class="price"]/span/text()')
        yield loader.load_item()
def parse_products(self, hxs, response):
    """Yield products from an "orderinfo" table located by column header.

    The positions of the "Model", "Description" and "Price" columns are
    computed with XPath ``count()`` and then used to read the matching cells
    of every data row.  Rows without a price or with a blank name are
    skipped.

    Changes from the previous version: the leftover debug ``print`` of the
    response encoding is gone, and a row whose name is missing no longer
    raises ``AttributeError`` on ``None.strip()``.
    """
    def column_position(header):
        # count(preceding-sibling) + 1 comes back as a float string such as
        # "3.0"; only the integer part is usable inside position()=...
        found = hxs.select(
            'count(//td[starts-with(@class, "orderinfo")'
            ' and text()="%s"]/preceding-sibling::*) + 1' % header).extract()
        return found[0].split('.')[0] if found else None

    model_pos = column_position('Model')
    description_pos = column_position('Description')
    price_pos = column_position('Price')
    if not (model_pos and description_pos and price_pos):
        return

    products = hxs.select(
        '//td[starts-with(@class, "orderinfo") and position()=%s '
        'and not(text()="Model")]/..' % model_pos)
    for product in products:
        loader = ProductLoader(selector=product, item=Product())

        # Use the model link when the row has one, else the listing URL.
        url = response.url
        model_url = product.select(
            './/td[starts-with(@class, "orderinfo") '
            'and position()=%s]//a/@href' % model_pos).extract()
        if model_url:
            url = urljoin_rfc(get_base_url(response), model_url[0])
        loader.add_value('url', url)

        loader.add_xpath('name', './/td[starts-with(@class, "orderinfo") and position()=%s]/text()' % description_pos)
        loader.add_xpath('price', './/td[starts-with(@class, "orderinfo") and position()=%s]//text()' % price_pos)

        name = loader.get_output_value('name')
        if not loader.get_output_value('price') or not (name and name.strip()):
            continue
        yield loader.load_item()
def parse_product(self, response):
    """Emit the single product described by the ``ProductDetails`` block."""
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value("url", response.url)
    # All scraped fields live under the same container element.
    for field, xpath in (
            ("name", '//div[@id="ProductDetails"]//h2/text()'),
            ("price", '//div[@id="ProductDetails"]//em[contains(@class,"ProductPrice")]/text()'),
            ("sku", '//div[@id="ProductDetails"]//span[contains(@class,"VariationProductSKU")]/text()')):
        loader.add_xpath(field, xpath)
    yield loader.load_item()
def parse_product(self, response):
    """Yield the product when its on-page code matches the searched SKU.

    The name comes from the ``productname`` font element; if that is missing
    or blank it is derived from the last path segment of the URL plus the
    searched SKU.  A "PURCHASE OPTIONS" text, when present, is appended.
    The item is emitted only if the page's product code (dashes removed)
    starts with ``response.meta['sku']``.

    When the URL did not match the ``.htm`` pattern, ``main_name`` used to
    become ``None`` and appending the purchase options raised ``TypeError``.
    The name is now assembled from whichever parts are available.
    """
    search_sku = response.meta['sku']
    hxs = HtmlXPathSelector(response)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    name_xpaths = [u'//font[contains(@class,"productname")]/big/text()',
                   u'//font[contains(@class,"productname")]/text()']
    main_name = u''
    for name_xpath in name_xpaths:
        found = hxs.select(name_xpath).extract()
        if found:
            main_name = found[0].strip()
            break

    if not main_name:
        # Fall back to "<file name> (<sku>)" taken from the page URL.
        match = re.search(u'.*/(.*)\.htm', response.url)
        if match:
            main_name = match.groups()[0] + u' (%s)' % search_sku

    options = hxs.select(u'//td//text()').re(u'PURCHASE OPTIONS: (.*)')
    if options:
        option_text = options[0].strip()
        main_name = (main_name + u' %s' % option_text) if main_name else option_text

    if main_name:
        loader.add_value('name', main_name)
    # Prices inside "related products" tables are excluded by the XPath.
    loader.add_xpath('price', u'//td//font[contains(@class,"pricecolor") and not(ancestor::table[contains(@id,"related")])]/text()')
    loader.add_value('sku', search_sku)

    sku = hxs.select(u'//span[@class="product_code"]/text()').extract()
    if sku:
        sku = re.sub('-', '', sku[0])
        if sku.startswith(search_sku):
            yield loader.load_item()
def parse_product(self, response):
    """Emit the product, using its "Part number" as identifier and name suffix."""
    hxs = HtmlXPathSelector(response)

    part_number_texts = hxs.select('//*[@id="product-information"]/table/tr[th/text()="Part number"]/td/span/text()').extract()
    mpn = ''.join(part_number_texts).strip()
    # The base name is supplied by the request that led here.
    full_name = ' '.join((response.meta['name'].strip(), mpn))

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', mpn)
    loader.add_value('name', full_name)
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//*[@id="product-price"]/p[@class="no-vat"]/text()')
    yield loader.load_item()
def parse_product(self, response):
    """Emit the product with name/SKU from the request meta and the page price.

    The price text has its dots removed and its comma turned into a dot
    before it is handed to the loader.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('name', response.meta['name'])

    raw_price = hxs.select(u'//div[@id="purchaseProc"]//span/text()').extract()[0]
    normalised = raw_price.replace('.', '').replace(',', '.')
    loader.add_value('price', normalised)

    loader.add_value('sku', response.meta['sku'])
    loader.add_value('url', response.url)
    yield loader.load_item()
def parse_product(self, response):
    """Emit the product; the SKU is read from the "Manufacturer ID" spec row."""
    sku_xpath = ('//div[starts-with(@class, "specificationContent")]' +
                 '//td[contains(text(), "Manufacturer ID")]/following-sibling::td/text()')

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@class="productName fn"]/text()')
    loader.add_xpath('price', '//li[@class="price"]//text()')
    loader.add_xpath('sku', sku_xpath)
    yield loader.load_item()
def parse_product(self, response):
    """Emit the product using the "Cena" row of the ``produktdetails`` table.

    Name and SKU are taken from the request meta; the decimal comma of the
    price is converted to a dot.
    """
    hxs = HtmlXPathSelector(response)
    price_cells = hxs.select(u'//table[@class="produktdetails"]//tr/td[preceding-sibling::td[contains(text(),"Cena")]]/text()').extract()
    price = price_cells[0]

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('name', response.meta['name'])
    loader.add_value('price', price.replace(',', '.'))
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('url', response.url)
    yield loader.load_item()
def parse_product(self, response):
    """Emit the product, identified by its "Manufacturers Part No:" value.

    Name and price come from the request meta; the part number is appended
    to the name.
    """
    hxs = HtmlXPathSelector(response)
    part_no_texts = hxs.select('//div[@class="span-4 productcolumn productleftcol"]/h4[text()="Manufacturers Part No:"]/span/text()').extract()
    mpn = ''.join(part_no_texts)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', mpn)
    loader.add_value('url', response.url)
    loader.add_value('name', ' '.join((response.meta['name'], mpn)))
    loader.add_value('price', response.meta['price'])
    yield loader.load_item()
def parse_product(self, response):
    """Emit the product; the price is the first decimal number in the price tag."""
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_xpath('name', '//h1[@id="product_description"]/text()')

    # Only numbers with a fractional part match this pattern.
    price_matches = hxs.select('//p[@id="product_price"]/span/text()').re('(\d+(?:\.\d+))')
    product_loader.add_value('price', price_matches[0])

    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('url', response.url)
    yield product_loader.load_item()
def parse_product(self, response):
    """Emit the product; the SKU is the text next to the "Model #:" label."""
    hxs = HtmlXPathSelector(response)
    # The label is a <b>; the value is the text of its parent element.
    model_texts = hxs.select('//b[contains(text(), "Model #:")]/../text()').extract()
    sku = ''.join(model_texts).strip()

    loader = ProductLoader(response=response, item=Product())
    loader.add_xpath('name', '//h1[@id="partNameId"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//font[@class="txt-purchaseprice20blue"]/text()')
    loader.add_value('sku', sku)
    yield loader.load_item()
def parse(self, response):
    """Return the product for a URL listed in ``self.products``.

    ``self.products`` maps page URLs to SKUs.  Pages whose URL is not in the
    mapping produce nothing (``None`` is returned).

    Uses the ``in`` operator instead of the deprecated ``dict.has_key`` and
    no longer builds a selector that was never used.
    """
    if response.url not in self.products:
        return None

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('sku', self.products[response.url])
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//*[@id="feature_content_info"]/h1/text()')
    loader.add_xpath('price', '//*[@id="productBuy"]/p/span/text()')
    return loader.load_item()
def parse_product(self, response):
    """Emit the product read from the ``prodTITLE`` / ``prodDETAILS`` blocks.

    The SKU is taken from ``response.meta['sku']``.  The selector and base
    URL that were computed but never used have been removed.
    """
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//div[@id="prodTITLE"]//h1/text()')
    product_loader.add_xpath('price', '//div[@id="prodDETAILS"]//span[@class="price"]/text()')
    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('url', response.url)
    yield product_loader.load_item()
def parse_product(self, response):
    """Emit the product from the page heading, price span and code cell.

    The base URL and selector that were computed but never used have been
    removed.
    """
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath("name", "//h1/text()")
    loader.add_value("url", response.url)
    loader.add_xpath("price", '//span[@id="product_price"]/text()')
    loader.add_xpath("sku", '//td[@id="product_code"]/text()')
    yield loader.load_item()
def parse(self, response):
    """Emit a product when the page carries a description label.

    Pages without the ``ProductDetail1_lblDescription`` text yield nothing.
    """
    hxs = HtmlXPathSelector(response)
    description = hxs.select('//span[@id="ProductDetail1_lblDescription"]//text()').extract()
    if not description:
        return

    loader = ProductLoader(item=Product(), response=response)
    # The whole list of text nodes is handed to the loader as the name.
    loader.add_value('name', description)
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//*[@class="yourPriceText"]//text()')
    loader.add_value('sku', response.meta['sku'])
    yield loader.load_item()
def parse(self, response):
    """Emit one product per row of the local ``bosch_uk_diy.csv`` file.

    The response itself is not inspected; every field comes from the CSV
    columns ``name``, ``price``, ``sku`` and ``bosch`` (used as the URL).
    """
    hxs = HtmlXPathSelector()
    csv_path = os.path.join(HERE, 'bosch_uk_diy.csv')
    with open(csv_path) as f:
        for row in csv.DictReader(f):
            loader = ProductLoader(item=Product(), selector=hxs)
            # Names are stored as UTF-8 bytes in the file.
            loader.add_value('name', unicode(row['name'], 'utf-8'))
            loader.add_value('price', row['price'])
            loader.add_value('sku', row['sku'])
            loader.add_value('url', row['bosch'])
            yield loader.load_item()
def parse_product(self, response):
    """Emit the product described by the page's ``itemprop`` microdata.

    The decimal comma of the price is converted to a dot; the SKU comes from
    the request meta.
    """
    hxs = HtmlXPathSelector(response)
    price_texts = hxs.select(u'//span[@itemprop="price"]/text()').extract()
    price = price_texts[0].replace(',', '.')

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//h1[@itemprop="name"]/text()')
    loader.add_value('price', price)
    loader.add_value('sku', response.meta['sku'])
    yield loader.load_item()
def parse_product(self, response):
    """Emit the product from the ``buying`` block (title and large price).

    The decimal comma of the price is converted to a dot; the SKU comes from
    the request meta.
    """
    hxs = HtmlXPathSelector(response)
    price_texts = hxs.select(u'//div[@class="buying"]/table[@class="product"]//b[@class="priceLarge"]/text()').extract()
    raw_price = price_texts[0]

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//div[@class="buying"]/h1[@class="parseasinTitle"]/span[@id="btAsinTitle"]/text()')
    loader.add_value('price', raw_price.replace(',', '.'))
    loader.add_value('sku', response.meta['sku'])
    yield loader.load_item()
def parse_product(self, response):
    """Yield one product per purchasable row of the ``pricecart`` table.

    The price is read from the "Our Price" text of the price cell and the
    SKU from the ``pid`` parameter of the add-to-cart link.

    Rows that lack either value are skipped.  Previously the first such row
    raised ``IndexError`` and no further rows of the table were processed.
    """
    hxs = HtmlXPathSelector(response)
    for row in hxs.select('//table[@class="pricecart"]//tr'):
        price = row.select('.//td[@class="cellPrice"]/text()').re('Our Price\s+.?(\d+(?:\.\d+))')
        sku = row.select('.//td[@class="cellAddToCart"]/a/@href').re('pid=([0-9a-f]+)')
        if not price or not sku:
            continue

        loader = ProductLoader(item=Product(), selector=row)
        loader.add_xpath('name', './/span[@class="spanDescription"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('price', price[0])
        loader.add_value('sku', sku[0])
        yield loader.load_item()
def parse_products(self, hxs, response):
    """Yield one product per linked row of ``productCategoriesTable``.

    A row without a highlighted (``span.red``) price is emitted with the
    price '0'.

    Rows without a product link are skipped.  Previously such a row raised
    ``IndexError`` and the remaining rows were lost.
    """
    base_url = get_base_url(response)
    for product in hxs.select('//table[@id="productCategoriesTable"]//tbody//tr'):
        link = product.select('.//a/strong/../@href').extract()
        if not link:
            continue

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/a/strong/text()')
        loader.add_value('url', urljoin_rfc(base_url, link[0]))
        if product.select('.//span[@class="red"]/strong/text()'):
            loader.add_xpath('price', './/span[@class="red"]/strong/text()')
        else:
            loader.add_value('price', '0')
        yield loader.load_item()
def parse_course(self, response):
    # Builds a product for a course page: the name is the category path from
    # response.meta['path'] plus the page's <h1> texts, and the price is the
    # sum of a main fee (price1) and an optional material fee (price2) read
    # from the "kostenspecificatie" block.
    # NOTE(review): the visible definition ends inside the except handler and
    # never uses `price` or `product_loader` afterwards -- presumably the tail
    # of the function is missing from this view; confirm against the file.
    hxs = HtmlXPathSelector(response)
    # Copy the inherited path so the list stored in the request meta is not mutated.
    path = response.meta['path'][:]
    path.extend(hxs.select(u'//h1/text()').extract())
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('name', u' / '.join((p.strip() for p in path)))
    product_loader.add_value('url', response.url)
    costs = hxs.select(u'//div[@id="kostenspecificatie"]')
    # First attempt: table rows labelled "Cursusgeld" / "Studiemateriaal",
    # taking the second cell of the row.
    price1 = costs.select(u'.//tr/td[contains(text(),"Cursusgeld")]/../td[position()=2]/text()').extract()
    price2 = costs.select(u'.//tr/td[contains(text(),"Studiemateriaal")]/../td[position()=2]/text()').extract()
    pricetxt = costs.select(u'./p/text()').extract()
    # Just because one course has price in DIVs not single P
    pricetxt.extend(costs.select(u'./div/text()').extract())
    # Second attempt: scan the free-text lines for known "label: amount" patterns.
    if not price1 or not price2:
        for line in pricetxt:
            if 'Cursusgeld' in line:
                price1 = [line.split(':')[1].split('(')[0]]
            # Just because one course specifies price with multiple P tags
            elif 'Module C1 en C2' in line:
                price1 = [re.search(u'([\d.,]+)', line.split(u'\u20ac')[1]).group(1)]
            elif 'Inschrijfgeld' in line:
                price1 = [line.split(':')[1].split('(')[0]]
            elif 'Trainingskosten' in line:
                price1 = [line.split(':')[1].split('(')[0]]
            elif 'Studiemateriaal' in line:
                price2 = [line.split(':')[1].split('(')[0]]
            elif 'Trainingsmateriaal' in line:
                price2 = [line.split(':')[1].split('(')[0]]
    # Third attempt for the main fee: lines starting with the euro sign, with
    # "20" followed by a colon at index 4, or with "Opleiding".
    if not price1:
        for line in pricetxt:
            line = line.strip()
            if line.startswith(u'\u20ac'):
                price1 = [re.search(u'([\d.,]+)', line.split(u'\u20ac')[1]).group(1)]
            elif line.startswith('20') and line[4] == ':':
                price1 = [line.split(':')[1]]
            elif line.startswith('Opleiding'):
                price1 = [re.search(u'([\d.,]+)', line.split(u'\u20ac')[1]).group(1)]
    try:
        # This seems to be optional
        if not price2:
            price2 = ['0']
        # Strip the euro sign and dashes, drop '.' and turn ',' into '.'
        # before converting each amount to float.
        price = float(price1[0].replace(u'\u20ac', '').replace('-', '').replace('.', '').replace(',', '.')) \
            + float(price2[0].replace(u'\u20ac', '').replace('-', '').replace('.', '').replace(',', '.'))
    except Exception, e:
        # Any failure while building the price is logged together with the raw text lines.
        logging.error('Bad price [%s] 
(%s)' % (pricetxt, e))
def parse_product(self, response):
    """Emit the product, preferring the special price over the regular one.

    The SKU is the value of the hidden ``products_id`` input.
    """
    hxs = HtmlXPathSelector(response)
    # The regular-price heading is only queried when no special price exists.
    price = (hxs.select('//h1/span[@class="productSpecialPrice"]/text()').extract()
             or hxs.select('//td[@align="right"]/h1/text()').extract())

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//td[@valign="top" and not(@align="right")]/h1/text()')
    loader.add_value('url', response.url)
    loader.add_value('price', price[0])
    loader.add_xpath('sku', '//input[@name="products_id"]/@value')
    yield loader.load_item()
def parse_product(self, response):
    """Emit the product, identified by its manufacturer part number.

    The part number is read from the product heading, falling back to the
    ``itemprop="identifier"`` span, and is appended to the name passed in
    the request meta.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)

    mpn = (hxs.select('//div[@class="prod_info_container"]/h1/i/text()').extract()
           or hxs.select('//li/span[@itemprop="identifier"]/text()').extract())
    part_number = mpn[0]
    name = ' '.join((response.meta['name'], part_number))

    loader.add_value('identifier', part_number)
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_xpath('price', '//td[@class="radioPadding" and @width="90" and @bgcolor="#f2f2f2" and @align="center"]/text()')
    yield loader.load_item()
def parse_node(self, response, node):
    """Turn one XML feed node into a product.

    Returns ``None`` for non-XML responses, an empty ``Product`` when the
    loader produced no price, and the loaded item otherwise.  The name and
    raw price of every node are logged as JSON.
    """
    if not isinstance(response, XmlResponse):
        return

    loader = ProductLoader(item=Product(), selector=node)
    loader.add_xpath('url', u'./product-url/text()')
    loader.add_xpath('name', u'./title/text()')

    price_texts = node.select(u'./price/text()').extract()
    price = price_texts[0].replace(',', '.')
    loader.add_value('price', price)

    summary = {'name': loader.get_output_value('name'), 'price': price}
    log.msg(json.dumps(summary))

    if not loader.get_output_value('price'):
        return Product()
    return loader.load_item()
def parse(self, response):
    """Yield products, sub-product requests, subcategories and the next page.

    Listing rows with a quantity field are emitted as products; rows
    without one are followed with ``self.parse_sub``.  Subcategory links
    are taken from the menu at the nesting depth stored in
    ``response.meta['level']`` (default 1), and the "next page" link keeps
    the current level.

    Rows without a link or (for direct products) without a price are now
    logged and skipped.  Previously they raised ``IndexError``, which also
    prevented the subcategory and pagination requests below from being
    issued.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for item in hxs.select(u'//tr[contains(@class,"product-item")]'):
        link = item.select(u'.//td[@class="productListingNewName"]/b/a/@href').extract()
        if not link:
            self.log('WARNING: product row without link => %s' % response.url)
            continue
        url = urljoin_rfc(base_url, link[0])

        # If quantity field is not present on page, there are subproducts
        qty = item.select(u'.//input[@name="products_qty"]').extract()
        if not qty:
            yield Request(url, callback=self.parse_sub)
            continue

        price = item.select(u'.//span[@class="js_price_tax"]/text()').extract()
        if not price:
            self.log('WARNING: product row without price => %s' % url)
            continue

        product_loader = ProductLoader(item=Product(), selector=item)
        product_loader.add_xpath('name', u'.//td[@class="productListingNewName"]/b/a/text()')
        # Drop dots and convert the decimal comma to a dot.
        product_loader.add_value('price', price[0].strip().replace('.', '').replace(',', '.'))
        product_loader.add_value('url', url)
        yield product_loader.load_item()

    level = response.meta.get('level', 1)
    # One "ul/li" step per nesting level selects only links at that depth.
    sub_url = u'//div[@class="box-content"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
    for subcategory in hxs.select(sub_url).extract():
        yield Request(urljoin_rfc(base_url, subcategory), meta={'level': level + 1})

    next_url = hxs.select(u'//li[@class="page-next"]/a/@href').extract()
    if next_url:
        yield Request(urljoin_rfc(base_url, next_url[0]), meta={'level': level})
def parse_designer(self, response):
    """Yield one product per table row that links to a product page.

    The SKU is the numeric ``id`` parameter of the product link.

    The row selector matches any row containing an anchor, so rows whose
    anchors are not product links are now skipped instead of raising
    ``IndexError`` and aborting the page.  A link without an ``id``
    parameter yields the product without a SKU.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for product in hxs.select('//table//tr[descendant::a]'):
        links = product.select('.//td[@valign="Middle"]/a[contains(@href,"Product")]/@href')
        if not links:
            continue
        link = links[0]

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('url', urljoin_rfc(base_url, link.extract()))
        loader.add_xpath('name', './/td[@valign="Middle"]/a/span/text()')
        loader.add_xpath('price', './/td/p/b/text()')

        sku = link.re('id=(\d+)')
        if sku:
            loader.add_value('sku', sku[0])
        yield loader.load_item()
def parse_cat(self, response):
    """Yield one product per ``product-info`` block of a category page.

    Blocks without a title link are skipped.  Previously such a block
    raised ``IndexError`` and the rest of the page was lost.  A stray
    statement-terminating semicolon was also removed.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    for product in hxs.select('//div[@class="product-info"]'):
        link = product.select('.//a[@class="product-title"]/@href').extract()
        if not link:
            continue

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/a[@class="product-title"]/text()')
        loader.add_value('url', urljoin_rfc(base_url, link[0]))
        loader.add_xpath('price', './/span[@class="price"]/span[@id]/text()')
        loader.add_xpath('sku', './/p[@class="sku"]//span[contains(@id,"product_code")]/text()')
        yield loader.load_item()
def parse_products(self, response):
    """Yield one product per row of the search-result table.

    A row missing its name, link or price is reported through
    ``logging.error`` and skipped.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select("//div[@class='box-caracteristic-search']/div[@class='table-wrap']/form/table/tbody/tr")
    for row in rows:
        names = row.select("td[@class='prd-details']/h3/a/text()").extract()
        if not names:
            logging.error("ERROR! No name! %s" % response.url)
            continue
        name = names[0]

        links = row.select("td[@class='prd-details']/h3/a/@href").extract()
        if not links:
            logging.error("ERROR! NOT FOUND URL! URL: %s. NAME: %s" % (response.url, name))
            continue
        url = self._urljoin(response, links[0])

        prices = row.select("td[@class='prd-amount-details']/div/p[@class='prd-amount']/strong/text()").extract()
        if not prices:
            logging.error("ERROR! NOT FOUND PRICE! URL: %s. NAME: %s" % (response.url, name))
            continue

        l = ProductLoader(item=Product(), response=response)
        l.add_value('name', name)
        l.add_value('url', url)
        l.add_value('price', prices[0])
        yield l.load_item()
def parse_product(self, response):
    """Emit the product described by the page's microdata.

    The fifth '/'-separated component of the URL serves as both identifier
    and SKU.  A shipping cost of 2.5 is added when the price is below 50.
    Categories are the breadcrumb texts without the first and last entry.

    The lookup of ``response.meta['categories']`` was removed: its result
    was overwritten by the breadcrumbs on the very next statement and never
    used.
    """
    base_url = get_base_url(response)
    loader = ProductLoader(response=response, item=Product())

    url = response.url
    loader.add_value('url', urljoin_rfc(base_url, url))

    identifier = url.split('/')[4]
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)

    image_url = response.xpath(
        '//div[@class="prodImg-main"]//img[@class="prodImg"]/@src'
    ).extract_first()
    loader.add_value('image_url', image_url)

    name = response.xpath('//h1[@itemprop="name"]/text()').extract()[0]
    loader.add_value('name', name)

    price = extract_price(
        response.xpath('//meta[@itemprop="price"]/@content').extract()[0])
    loader.add_value('price', price)
    if price < 50:
        loader.add_value('shipping_cost', 2.5)

    categories = response.css('ul.breadcrumbs span::text').extract()[1:-1]
    loader.add_value('category', categories)

    yield loader.load_item()
def parse_product(self, response):
    """Build the product from the embedded variant JSON and request reviews.

    The product data is the first element of the ``walPP.variantDataRawArr``
    array found in a script tag.  When it carries no store price the page is
    rendered in a PhantomJS browser and the price is searched for in several
    places of the rendered markup.  The product is stored in
    ``response.meta['product']`` and a Bazaarvoice reviews request keyed by
    the SKU is issued; without a SKU the product is yielded directly.

    Problems are recorded in ``self.errors``.

    Changes from the previous version: the browser is always closed, even
    when loading the page fails (it used to leak in that case); bare
    ``except:`` clauses were narrowed to ``Exception``; the unused base URL
    was dropped.
    """
    hxs = HtmlXPathSelector(response)

    try:
        product_data = json.loads(
            hxs.select(
                '//script[contains(text(), "walPP.variantDataRawArr")]/text()'
            ).re(r'walPP.variantDataRawArr = (\[.*\])')[0])[0]
    except Exception:
        self.errors.append('WARNING: No product data in %s' % response.url)
        return

    price = product_data.get(u'price_store_price', None)
    if not price:
        browser = PhantomJS.create_browser()
        try:
            self.log('>>> BROWSER: GET => %s' % response.url)
            browser.get(response.url)
            self.log('>>> BROWSER: OK')
            # Fixed pause before the rendered source is read.
            time.sleep(5)
            hxs = HtmlXPathSelector(text=browser.page_source)
        finally:
            browser.quit()

        # Monitor all products even without a price (as requested in #248)
        price = '.'.join(
            hxs.select(
                '//div[@id="pricing"]/div[@class="price-main"]//text()').
            re(r'(\d+)')).strip()
        if not price:
            price_elem = hxs.select(
                '//span[@id="store-price"][1]/text()').extract()
            if price_elem:
                price = price_elem[0]
        if not price:
            store_prices = hxs.select(
                '//div[contains(@id, "store-")]//div[@class="price"]//text()'
            ).extract()
            try:
                price = '.'.join(
                    re.findall(r'(\d+)', '.'.join(store_prices[:3])))
            except Exception:
                price = '0.00'
    else:
        # The JSON value is a sequence; its first element is the price.
        price = price[0]

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('category', product_data[u'Category'])
    product_loader.add_value('name', product_data[u'prod_name_en'])
    product_loader.add_value('sku', product_data[u'P_RollupKey'])
    product_loader.add_value('price', price.replace(',', ''))
    product_loader.add_value('identifier', product_data[u'P_UniqueKey'])
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand', response.meta['brand'].strip().lower())
    product = product_loader.load_item()

    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    # the same as canadiantire.ca
    # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
    # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
    # (script source: http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js)
    part2 = product.get('sku')
    if not part2:
        self.errors.append('WARNING: No sku in %s' % response.url)
        yield product
        return

    reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=e6wzzmz844l2kk3v6v7igfl6i&apiversion=5.4&displaycode=2036-en_ca&resource.q2=reviews&filter.q2=isratingsonly%3Aeq%3Afalse&filter.q2=productid%3Aeq%3A' + part2
    yield Request(reviews_url, meta=response.meta, callback=self.parse_reviews)
def parse_product(self, response):
    """Build the product and issue add-to-cart requests to learn its stock.

    If the page is not showing tax-exclusive prices, the page is re-requested
    through the non-selected tax option and nothing else happens.  Otherwise
    the base product is loaded and one ``FormRequest`` (handled by
    ``self.parse_stock``) is issued for the product itself or, when attribute
    drop-downs exist, for every combination of their options.  Each request
    uses its own cookie jar number.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    tax_excluded = hxs.select('//select[@id="customerTaxType"]/option[@selected="selected"]').re('Excl')
    if not tax_excluded:
        other_tax_urls = hxs.select('//select[@id="customerTaxType"]/option[not (@selected)]/@value').extract()
        yield Request(urljoin(base_url, other_tax_urls[0]), callback=self.parse_product, dont_filter=True, meta=response.meta)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('category', response.meta['category'])

    titles = hxs.select('//h1[@itemprop="name"]/text()').extract()
    name = titles[0].strip() if titles else ''
    loader.add_value('name', name)

    images = hxs.select('//div[@class="gallery"]//a[1]/@href').extract()
    if images:
        loader.add_value('image_url', images[0])

    # Find brand: the first known brand contained in the name wins.
    for brand in self.brands:
        if brand.lower() in name.lower():
            loader.add_value('brand', brand)
            break

    product_ids = hxs.select('//input[contains(@id,"add-to-cart-button-")]/@data-productid').extract()
    if product_ids:
        loader.add_value('identifier', product_ids[0])

    codes = hxs.select('//p/span[strong="Product Code:"]/text()').extract()
    if codes:
        loader.add_value('sku', codes[0].strip())

    prices = hxs.select('//span[@itemprop="price"]/text()').extract()
    if prices:
        # Only the first whitespace-separated token of the price text is parsed.
        loader.add_value('price', extract_price(prices[0].strip().split()[0]))

    product = loader.load_item()
    url_post = 'http://www.northseaworkwear.com/addproducttocart/details/%s/1' % product['identifier']

    quantities = hxs.select('//input[contains(@class,"qty-input")]/@value').extract()
    qty = quantities[0] if quantities else '1'

    selections = hxs.select('//div[@class="attributes"]//select')
    if not selections:
        formdata = {'addtocart_%s.EnteredQuantity' % product['identifier']: qty}
        self.cookie_num += 1
        yield FormRequest(url_post, formdata=formdata, meta={'item': product, 'cookiejar': self.cookie_num},
                          dont_filter=True, callback=self.parse_stock)
        return

    # One list of (field name, option value, option label) per drop-down.
    attrs = []
    for sel in selections:
        field_names = sel.select('@name').extract()
        attr_name = field_names[0] if field_names else ''

        attr_values = []
        for option in sel.select('option'):
            values = option.select('@value').extract()
            value = values[0] if values else ''
            labels = option.select('text()').extract()
            txt = labels[0].strip() if labels else ''
            # Empty and '0' values are not appended to the choices.
            if value not in ('', '0'):
                attr_values.append((attr_name, value, txt))
        attrs.append(attr_values)

    for combination in itertools.product(*attrs):
        item = copy.deepcopy(product)
        item['name'] += ' - ' + '-'.join([label for _, _, label in combination])
        item['identifier'] += '-' + '-'.join([value for _, value, _ in combination])

        formdata = {'addtocart_%s.EnteredQuantity' % product['identifier']: qty}
        for field_name, value, _ in combination:
            formdata[field_name] = value

        self.cookie_num += 1
        yield FormRequest(url_post, formdata=formdata, meta={'item': item, 'cookiejar': self.cookie_num}, dont_filter=True, callback=self.parse_stock)
def parse_product(self, response):
    """Emit the product; the brand is the listed brand that prefixes the name.

    Candidate brands come from the "Via m..." link list on the page.  The
    ``ProductID`` query parameter is used as both SKU and identifier.
    """
    hxs = HtmlXPathSelector(response)
    brand_names = hxs.select(
        '//div[p[contains(span/text(), "Via m")]]/ul/li/a/text()').extract()
    brands = set(brand_names)

    loader = ProductLoader(item=Product(), response=response)

    price_texts = hxs.select(
        '//p[contains(@class, "final-price")]/span[@class="bold"]/text()'
    ).extract()
    loader.add_value('price', extract_price(price_texts[0]))
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/span/text()')
    loader.add_value('category', response.meta['category'])

    # Case-insensitive prefix match; empty string when no brand matches.
    brand = next(
        (b for b in brands
         if loader.get_output_value('name').upper().startswith(b.upper())),
        '')
    loader.add_value('brand', brand)

    identifier = url_query_parameter(response.url, "ProductID")
    loader.add_value('sku', identifier)
    loader.add_value('identifier', identifier)

    image_url = hxs.select('//a[@id="Zoomer"]//img/@src').extract()
    if image_url:
        image_url = urlparse.urljoin(get_base_url(response), image_url[0])
    else:
        image_url = ''
    loader.add_value('image_url', image_url)
    yield loader.load_item()
def parse_product(self, response):
    """Yield one product per variant of the JSON product document.

    The response body is JSON; the product page URL is passed in through
    ``response.meta['url']``.  Each variant's id serves as identifier and
    SKU, and its title is appended to the product title.
    """
    data = json.loads(response.body)
    page_url = response.meta['url']
    category = data['category']
    brand = data['brand']
    title = data['title']

    # The variant keys (descriptions) are not needed, only the values.
    for variant in data['variants'].itervalues():
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('identifier', variant['id'])
        product_loader.add_value('sku', variant['id'])
        product_loader.add_value('image_url', variant['imageUrl'])
        product_loader.add_value('name', title + ' ' + variant['title'])
        product_loader.add_value('url', page_url)
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)
        product_loader.add_value('price', variant['salesPrice'])
        product_loader.add_value('stock', variant['stock'])
        yield product_loader.load_item()
def parse_product(self, response):
    """Emit the product, or one product per option link when options exist.

    The special price is preferred; the generic price span is used only if
    the loader has no price after that.  Options are anchors carrying a
    ``data-productid`` attribute: each yields a copy of the base product
    with the option title appended to the name and the option id as
    identifier.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')
    loader.add_xpath(
        'sku',
        'normalize-space(substring-after(//div[@class="sku"]/text(),":"))')
    loader.add_value('category', response.meta.get('category'))

    special_text = ''.join(hxs.select(
        '//p[@class="special-price"]//span[@class="price"]/text()').extract())
    loader.add_value('price', extract_price_eu(special_text))
    if not loader.get_output_value('price'):
        regular_text = ''.join(
            hxs.select('//span[@class="price"]/text()').extract())
        loader.add_value('price', extract_price_eu(regular_text))

    loader.add_value('stock', 1)

    images = hxs.select(
        '//div[@class="product-image-gallery"]//img/@src').extract()
    if images:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), images[0]))

    product = loader.load_item()

    options = hxs.select('//a[@data-productid]')
    if not options:
        product['identifier'] = hxs.select(
            '//*[@data-product-id]/@data-product-id').extract()[0]
        yield product
        return

    for option in options:
        variant = Product(product)
        variant['name'] += ' ' + option.select('./@title').extract()[0]
        variant['identifier'] = option.select('./@data-productid').extract()[0]
        yield variant
def parse(self, response):
    """Yield one product per row of the CSV feed in the response body.

    Category, product name and brand are decoded from ISO-8859-1.  A stock
    of 0 is recorded for rows whose availability is anything other than
    "IN STOCK" (compared case-insensitively).
    """
    def latin1(raw):
        return unicode(raw.decode('ISO-8859-1'))

    for row in csv.DictReader(StringIO(response.body)):
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('identifier', row["Unique Product ID"])
        loader.add_value('sku', row["Product code"])
        loader.add_value('category', latin1(row["Category"]))
        loader.add_value('name', latin1(row["Product name"]))
        loader.add_value('price', row["Price"])
        loader.add_value('url', row["Product page URL"])
        loader.add_value('brand', latin1(row["Brand"]))
        loader.add_value('image_url', row['Image URL'])
        if row['Stock availability'].upper() != 'IN STOCK':
            loader.add_value('stock', 0)
        yield loader.load_item()
def parse_product(self, response):
    """Parse a product page and yield the product, or one item per swatch.

    The product id and price are taken from the embedded
    ``new Product.OptionsPrice(...)`` JSON when present, otherwise from
    the page markup.  A page without any price now falls through to the
    "price 0.0 / stock 0" branch instead of raising IndexError.

    Items whose identifier is empty or already present in
    ``self.id_seen`` are logged and skipped; accepted identifiers are
    appended to ``self.id_seen``.
    """
    def accept(candidate):
        # Single place for the "has id / not yet seen" gate used by both
        # the simple product and every swatch option.
        identifier = candidate.get('identifier', None)
        if not identifier:
            self.log('WARNING: No product ID => %s' % response.url)
            return False
        if identifier in self.id_seen:
            self.log('WARNING: Duplicate product ID => %s' % response.url)
            return False
        self.id_seen.append(identifier)
        return True

    def tooltip_texts(selector, xpath):
        # Stripped, non-empty text nodes for the given relative XPath.
        stripped = [text.strip() for text in selector.xpath(xpath).extract()]
        return [text for text in stripped if text != '']

    json_data = None
    if 'new Product.OptionsPrice(' in response.body:
        raw = response.body.split(
            'new Product.OptionsPrice(', 1)[1].split(');', 1)[0]
        json_data = json.loads(raw)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    sku = response.xpath('//input[@name="product"]/@value').extract()
    if sku:
        sku = sku[0]
    # The id from the embedded JSON wins over the form input when present.
    if json_data and json_data.get('productId', None):
        sku = json_data['productId']
    if not sku:
        self.log('WARNING: No product ID => %s' % response.url)
        return
    loader.add_value('identifier', sku)
    loader.add_value('sku', sku)
    loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')

    if json_data:
        price = str(json_data.get('productPrice', ''))
    else:
        # Guarded lookup: a missing price span must reach the zero-price
        # branch below rather than abort the callback.
        price_texts = response.xpath('//span[@class="price"]/text()').extract()
        price = price_texts[0] if price_texts else ''

    if price:
        loader.add_value('price', price)
        loader.add_value('stock', 1)
    else:
        loader.add_value('price', '0.0')
        loader.add_value('stock', 0)

    image_url = response.xpath('//img[@id="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url[0]))

    brand = response.xpath(
        '//div[@class="product-name"]/h2/a/text()').extract()
    if brand:
        loader.add_value('brand', brand[0])

    categories = response.xpath(
        '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()
    if len(categories) > 1:
        # The first breadcrumb entry is dropped.
        loader.add_value('category', categories[1:])

    product = loader.load_item()

    options = response.xpath('//ul[@id="color-swatch-attribute-92"]/li')
    if not options:
        if accept(product):
            yield product
        return

    # One item per colour swatch.
    for sel in options:
        item = Product(product)
        opt_id = sel.xpath('@class').extract()
        if opt_id:
            # Suffix: last '-'-separated chunk of the first CSS class.
            item['identifier'] += '-' + opt_id[0].split()[0].split('-')[-1]

        opt_desc = tooltip_texts(
            sel, 'div[@class="tool-tip-description"]/text()')
        if not opt_desc:
            opt_desc = tooltip_texts(
                sel, 'div[@class="tool-tip-description"]/strong/text()')
        if opt_desc:
            item['name'] = product['name'] + ' - ' + ''.join(opt_desc)

        if accept(item):
            yield item
def parse(self, response):
    """Crawl category listings and request every product page found.

    Yields a Request for each category and sub-category link, then one
    Request per product tile (callback ``parse_product``) carrying the
    partially filled item in ``meta['item']``, and finally a Request for
    the next listing page when a ``a.next`` link exists.

    The pagination link is read from the anchor's ``href`` attribute;
    extracting the element itself would feed HTML markup to the URL join.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for href in hxs.select(
            '//div[@id="listaSezioni"]/div/a/@href').extract():
        yield Request(urljoin_rfc(base_url, href))

    for href in hxs.select(
            '//div[@class="contentGruppi"]/div/div[@class="nome"]/a/@href'
    ).extract():
        yield Request(urljoin_rfc(base_url, href))

    products = hxs.select('//div[@class="articolo"]')
    if products:
        # The category heading is page-wide: read it once, not per tile.
        category = hxs.select(
            '//div[@class="gruppo"]/text()').extract()[0].strip()

    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        # No name is set here; presumably parse_product fills it in from
        # the product page -- confirm against that callback.
        url = product.select('.//h2/a/@href').extract()
        url = urljoin_rfc(base_url, url[0])
        loader.add_value('url', url)
        loader.add_value('identifier',
                         re.search(r'art/(\d+)_', url).group(1))
        loader.add_xpath('sku', 'p[@class="codfor"]/strong/text()')
        loader.add_xpath('brand', 'p[@class="marca"]/img/@alt')

        image_url = product.select('div[@class="img"]/a/img/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
        loader.add_value('image_url', image_url)
        loader.add_value('category', category)

        price = product.select('p[@class="prezzo"]/text()').extract()
        price = extract_price_eu(price[-1]) if price else 0
        loader.add_value('price', price)
        if price <= 0:
            loader.add_value('stock', 0)

        item = loader.load_item()
        yield Request(item['url'], callback=self.parse_product,
                      meta={'item': item})

    next_page = hxs.select('//a[@class="next"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[-1]))
def parse_product(self, response):
    """Parse a product page, retrying the request once if it looks incomplete.

    A page is treated as incomplete when the name, the price, or (after
    loading) the sku/name fields are missing.  In that case a single
    retry Request flagged with ``meta['retried']`` is yielded and the
    callback stops; a page that is still incomplete on the retry yields
    nothing.  Previously a missing price left ``None`` in place and
    raised AttributeError, and one response could emit several identical
    retry requests.

    Items without a truthy price get ``stock`` set to 0.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    retried = response.meta.get('retried', False)

    def retry_request():
        # dont_filter: the same URL must pass the duplicate filter again.
        return Request(response.url, dont_filter=True,
                       meta={'retried': True}, callback=self.parse_product)

    product_name = hxs.select(
        '//div[@class="boxbody"]/h1/text()[normalize-space()]').extract()
    if not product_name:
        if not retried:
            yield retry_request()
        return

    product_price = hxs.select('//div[@class="price"]/ins/b/text()').extract()
    product_price = product_price[0] if product_price else None
    if not product_price:
        # Fall back to the "Price=..." value embedded in the raw body.
        price_match = re.search('Price=(.*)', response.body)
        if price_match:
            product_price = price_match.group(1).replace('.', '')
        else:
            if not retried:
                yield retry_request()
            return

    image_url = hxs.select('//a[@class="img"]/@href').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name',
                     '//div[@class="boxbody"]/h1/text()[normalize-space()]')
    loader.add_value('url', response.url)
    loader.add_xpath('sku', '//*', re=r'ProductNo=(.*)')
    loader.add_xpath('identifier', '//*', re=r'ProductID=(.*)')
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_xpath('category', '//li[@class="current"]/a/text()',
                     lambda e: e[0] if e else '')
    # Convert "1.234,56" style text to "1234.56" before parsing.
    product_price = extract_price(
        product_price.replace('.', '').replace(',', '.'))
    loader.add_value('price', product_price)
    loader.add_xpath('brand', '//*', lambda e: e[0] if e else '',
                     re=r'Trademark=(.*)')

    item = loader.load_item()
    if not item.get('sku') or not item.get('name'):
        if not retried:
            yield retry_request()
        return

    if not item.get('price'):
        item['stock'] = 0
    yield item
def parse_product(self, response):
    """Build a single Product from a product page plus its feed row.

    The sku comes from ``response.meta['row']['SKU']``; everything else is
    read from the page.  The colour option text, when present, is appended
    to the name.  Image URLs are prefixed with ``http:``.
    """
    feed_row = response.meta['row']

    title = response.xpath(
        '//h2[@itemprop="name"]/text()').extract()[0].strip()
    colour_texts = response.xpath(
        '//p[@class="common-option variant-ctrl"]/text()').extract()
    if colour_texts:
        title += ' ' + colour_texts[0].strip()

    image_sources = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_sources:
        image = 'http:' + image_sources[0]
    else:
        image = ''

    price_text = ''.join(response.xpath(
        '//div[contains(@class, "product-price")]/span[contains(@class, "current")]//text()'
    ).extract())
    if price_text:
        price_value = extract_price(price_text)
    else:
        price_value = ''

    loader = ProductLoader(response=response, item=Product())
    loader.add_xpath('identifier', '//div[@id="pid"]/@data-product-id')
    loader.add_value('sku', feed_row['SKU'])
    loader.add_value('url', response.url)
    loader.add_value('image_url', image)
    loader.add_xpath('brand', '//h2[@itemprop="brand"]/a/text()')
    breadcrumb = response.xpath(
        '//ul[@id="breadcrumbs"]//span/text()').extract()
    loader.add_value('category', breadcrumb)
    loader.add_value('name', title)
    loader.add_value('price', price_value)
    yield loader.load_item()
def parse_product(self, response):
    """Parse a product page and request the first page of its reviews.

    Non-HTML responses are ignored.  A missing name, image or sku is only
    logged; a missing price is logged and ends the callback without
    output.  The loaded product, with a ``ToyMonitorMeta`` attached
    (empty reviews list plus the first promotion text, if any), travels in
    the request meta to ``parse_review`` together with a URL template for
    further review pages.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    titles = hxs.select('//div[@class="product-title"]/h1/text()').extract()
    if titles:
        loader.add_value('name', titles[0].strip())
    else:
        self.log('ERROR: no product NAME found! URL:{}'.format(
            response.url))

    prod_id = hxs.select('//input[@id="productId"]/@value').extract()[0]
    loader.add_value('identifier', prod_id)
    loader.add_value('url', response.url)

    price = hxs.select(
        '//div[@class="price clearfix"]/div[@class="floatleft block"]/span/text()'
    ).extract()
    if not price:
        # Fall back to the price embedded in an inline script.
        price = hxs.select(
            '//script[contains(text(), "product_base_price")]').re(
                'product_base_price:\["(.*)"\]')
        if not price:
            self.log('ERROR: no product PRICE found! URL:{}'.format(
                response.url))
            return
    loader.add_value('price', price[0])

    images = hxs.select('//a[@id="mainImage"]/img/@src').extract()
    if images:
        loader.add_value(
            'image_url',
            urljoin_rfc(get_base_url(response), images[0].strip()))
    else:
        self.log('ERROR: no product Image found!')

    loader.add_value('category', response.meta.get('category', ''))

    sku_values = hxs.select('//input[@name="skuId"]/@value').extract()
    if sku_values:
        loader.add_value('sku', sku_values[0].strip())
    else:
        self.log('ERROR: no SKU found! URL:{}'.format(response.url))

    brand_match = re.search('product_brand:\[\"(.*)\"\],', response.body)
    if brand_match:
        loader.add_value('brand', brand_match.group(1).strip())

    promo = response.xpath(
        '//div[contains(@class,"pdp_add-cart")]/div[@class="truuk-offer-box"]'
        '//span[@class="truuk-special-offer-body"]/text()').extract()
    if not promo:
        promo = response.xpath(
            '//div[contains(@class,"pdp_add-cart")]//span[@class="was-2 block"]/text()'
        ).extract()

    product = loader.load_item()

    first_reviews_page = (
        u'http://www.toysrus.co.uk/assets/pwr/content/%s/%s-en_GB-1-reviews.js'
        % (self.calculate_url(prod_id), prod_id))

    metadata = ToyMonitorMeta()
    metadata['reviews'] = []
    if promo:
        metadata['promotions'] = promo[0]
    product['metadata'] = metadata

    # Template with a remaining %s placeholder for the review page number.
    reviews_template = (
        u'http://www.toysrus.co.uk/assets/pwr/content/'
        + u'%s/%s' % (self.calculate_url(prod_id), prod_id)
        + u'-en_GB-%s-reviews.js')

    request_meta = {
        'dont_retry': True,
        'handle_httpstatus_list': [404, 302],
        'cur_page': 1,
        'product': product,
        'dont_redirect': True,
        'reviews_url': reviews_template,
    }
    yield Request(first_reviews_page, meta=request_meta,
                  callback=self.parse_review)
def parse_product(self, response):
    """Parse a product page and yield the product or one item per variant.

    Pages without an article number yield nothing; the lookup is now
    guarded so an absent node no longer raises IndexError before that
    check runs.  When a "before" price exists and is higher than the
    current price, the current price is stored in
    ``metadata['SalesPrice']``; the comparison is skipped when the current
    price is missing.

    For pages with ``div.WrapVar`` variants, each variant is a deep copy
    of the base item with its own identifier suffix, price, name suffix,
    image and stock.  The ``var rubrikartikel`` JSON is parsed once per
    page rather than once per variant.
    """
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)

    sku_texts = response.xpath(
        '//div[@id="productInfo"]//dt[@id="About"]/i/text()').extract()
    sku = sku_texts[-1].strip() if sku_texts else ''
    if not sku:
        return
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)

    brand = response.xpath('//span[@id="varum"]/text()').extract_first()
    if not brand:
        brand = response.xpath('//span[@class="brand"]/text()').extract()
    loader.add_value('brand', brand)

    name = response.xpath('//b[@itemprop="name"]/text()').extract_first()
    loader.add_value('name', name)
    loader.add_value('url', response.url)

    price = response.xpath(
        '//span[@id="PrisFalt"]/meta[@itemprop="price"]/@content'
    ).extract_first()
    price_before = response.css(
        '.price-rek span#rekPris::text').extract_first()
    # Decimal(None) raises TypeError, so require both values first.
    if price and price_before and Decimal(price_before) > Decimal(price):
        sales_price = price
    else:
        sales_price = None
    loader.add_value('price', price)

    image_url = response.css('img#produktbild::attr(src)').extract_first()
    if not image_url:
        image_url = response.xpath(
            '//div[@class="product-image"]/img/@src').extract_first()
    image_url = response.urljoin(image_url) if image_url else ''
    loader.add_value('image_url', image_url)

    # Only the last three breadcrumb entries are kept.
    categories = response.css('span.breadcrumb a::text').extract()[-3:]
    loader.add_value('category', categories)

    out_stock = response.xpath(
        u'//div[@class="artikel_i_lager"]//span[contains(text(), "Slutt på lager")]'
    )
    if out_stock:
        loader.add_value('stock', 0)

    item = loader.load_item()
    if sales_price:
        item['metadata'] = {'SalesPrice': extract_price(sales_price)}

    options = response.css('div.WrapVar')
    if not options:
        yield item
        return

    if sales_price:
        self.logger.warning('Sales price and options on the %s' % response.url)

    # Map article number -> stock once; the first entry for an article
    # number wins, matching a first-match linear search.
    stock_by_artnr = {}
    stock_data = re.findall('var rubrikartikel = (.*);', response.body)
    if stock_data:
        for stock in json.loads(stock_data[0])['varianter']:
            stock_by_artnr.setdefault(stock['artnr'], stock['saldo'])

    for option in options:
        option_item = deepcopy(item)
        identifier = option.xpath('.//@id').re('VarList(.*)')[0]
        option_item['identifier'] += '-' + identifier

        option_price = option.css('div.PT_Pris::text').extract()
        if option_price:
            option_item['price'] = extract_price(option_price[0])

        variant_name = option.xpath('@variant-name').extract_first()
        if variant_name:
            option_item['name'] += ' ' + variant_name

        option_image = response.xpath(
            '//img[contains(@src, "' + identifier + '")]/@src').extract()
        if option_image:
            # Keep only what follows 'img=' when the src is a resizer URL.
            option_item['image_url'] = urljoin_rfc(
                base_url, option_image[0].split('img=')[-1])

        if identifier in stock_by_artnr:
            option_item['stock'] = stock_by_artnr[identifier]

        yield option_item
def parse_product(self, response):
    """Parse a product page with either configurable or dropdown options.

    Size-box links are followed with this same callback.  When the page
    embeds a ``Product.Config`` JSON blob, every child product becomes a
    Request to the options endpoint (callback ``parse_options``) carrying
    the loaded item in ``meta['item']``.  Otherwise the custom-option
    dropdowns are combined (cartesian product) and one item is yielded per
    combination, or a single item when there are no dropdowns.

    Items yielded directly carry ``metadata['net_price']``: the price
    divided by 1.2, as a string.
    """
    def attach_net_price(target, net_price):
        # Store the already-computed net price on a fresh Meta object.
        meta_ = Meta()
        meta_['net_price'] = str(net_price)
        target['metadata'] = meta_
        return target

    hxs = HtmlXPathSelector(response)

    size_links = hxs.select(
        '//a[contains(@class,"size-boxes")]/@href').extract()
    for link in size_links:
        yield Request(urljoin_rfc(get_base_url(response), link),
                      callback=self.parse_product)

    product_category = hxs.select(
        '//div[contains(@class,"breadcrumbs")]/ul/li/a/text()'
    ).extract()[-1].strip()
    product_name = hxs.select('//h1/text()').extract()[0]

    product_image = hxs.select('//a[@id="zoom-btn"]/@href').extract()
    if product_image:
        product_image = urljoin_rfc(get_base_url(response), product_image[0])

    brands = hxs.select('//img[@class="man-logo"]/@alt').extract()
    product_brand = brands[0] if brands else ''

    skus = hxs.select('//tr[th/text()="SKU"]/td/text()').extract()
    product_sku = skus[0] if skus else ''

    config_match = re.search(
        'var spConfig = new Product.Config\((\{.*\})\);', response.body)
    product_identifier = hxs.select(
        '//input[@name="product"]/@value').extract()[0]

    if config_match:
        config = json.loads(config_match.group(1))
        for child_id, child in config['childProducts'].items():
            child_loader = ProductLoader(item=Product(), response=response)
            if child_id:
                child_loader.add_value(
                    'identifier', product_identifier + '-' + child_id)
            else:
                child_loader.add_value('identifier', product_sku)
            child_loader.add_value('price', child[u'finalPrice'])

            # Append the label of every attribute option that lists this
            # child product.
            option_name = product_name
            for attr_id, attribute in config[u'attributes'].items():
                for choice in attribute['options']:
                    if child_id in choice['products']:
                        option_name += ' ' + choice['label']

            child_loader.add_value(
                'name', re.sub(r' \((.+?)\)', r'', option_name))
            child_loader.add_value('sku', product_sku)
            child_loader.add_value('url', response.url)
            child_loader.add_value('brand', product_brand)
            child_loader.add_value('category', product_category)
            child_loader.add_value('image_url', product_image)

            if child_id:
                options_url = ('http://www.bedworld.net/oi/ajax/co/?id='
                               + child_id + '&pid=' + product_identifier)
                yield Request(options_url,
                              meta={'item': child_loader.load_item()},
                              callback=self.parse_options)
            else:
                gross = child_loader.get_output_value('price')
                net = gross / Decimal('1.2')
                yield attach_net_price(child_loader.load_item(), net)
        return

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value(
        'name', re.sub(r' \((.+?)\)', r'', product_name))
    product_loader.add_value('sku', product_sku)
    product_loader.add_value('brand', product_brand)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', product_category)
    product_loader.add_value('image_url', product_image)

    price_found = hxs.select(
        '//span[@id="product-price-' + product_identifier + '"]//text()'
    ).re(r'([\d.,]+)')
    product_loader.add_value('price', price_found[0] if price_found else 0)

    # One list of option dicts per dropdown.
    option_elements = []
    for dropdown in hxs.select(
            '//select[contains(@class, "product-custom-options")]'):
        choices = []
        for entry in dropdown.select('option[@value!=""]'):
            choices.append({
                'identifier': entry.select('@value').extract()[0],
                'desc': entry.select('.//text()').extract()[0].split('+')[0],
                'price': entry.select('@price').extract()[0],
            })
        option_elements.append(choices)

    # Merge each combination (one choice per dropdown) into one record.
    final_options = []
    if option_elements:
        for combination in itertools.product(*option_elements):
            merged = {}
            for choice in combination:
                merged['desc'] = merged.get('desc', '') + choice['desc']
                merged['price'] = (merged.get('price', Decimal(0))
                                   + extract_price(choice['price']))
                merged['identifier'] = (merged.get('identifier', '')
                                        + '-' + choice['identifier'])
            final_options.append(merged)

    if not final_options:
        gross = product_loader.get_output_value('price')
        net = gross / Decimal('1.2')
        yield attach_net_price(product_loader.load_item(), net)
        return

    for merged in final_options:
        opt_product = product_loader.load_item()
        opt_product['name'] += ' ' + normalize_space(merged['desc'])
        opt_product['price'] += merged['price']
        opt_product['identifier'] += merged['identifier']
        net = Decimal(opt_product['price']) / Decimal('1.2')
        yield attach_net_price(opt_product, net)
def parse_product(response):
    """Yield one Product per entry of the size/colour dropdown.

    Page-level fields (name, sku, EAN, brand, image, category, price) are
    read once.  Each ``talla_color`` option becomes an item whose
    identifier is ``<page sku>_<option value>`` and whose name gets the
    option text appended, unless that text is ``'- '``.  The EAN (or
    ``None``) is stored in a ``ZyroMeta`` attached as metadata.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    base_name = hxs.select('//h1[@class="name"]/text()').extract()[0]
    base_identifier = hxs.select(
        '//meta[@itemprop="sku"]/@content').extract()[0]

    sku_texts = hxs.select('//div[@class="detalleMarcaProducto2"]/strong[contains(text(), "Item model number:")]/following-sibling::text()[1]').extract()
    sku = sku_texts[0] if sku_texts else ''

    ean_texts = hxs.select('//div[@class="detalleMarcaProducto2"]/strong[contains(text(), "EAN retail barcodes:")]/following-sibling::text()[1]').extract()
    ean = ean_texts[0].strip() if ean_texts else None

    brand_alts = hxs.select('//*[@id="brandProduct"]/p/a/img/@alt').extract()
    brand = brand_alts[0] if brand_alts else ''

    image_url = hxs.select('//*[@id="zoom_01"]/@src').extract()
    # Only the last three breadcrumb entries are kept.
    category = hxs.select('//*[@id="wayProd"]//a/span/text()').extract()[-3:]
    price = extract_price(
        hxs.select('//*[@id="total_dinamic"]/span/text()').extract()[0])

    dropdown_entries = hxs.select(
        '//*[@id="datesBuy"]//select[@name="talla_color"]/option')
    for entry in dropdown_entries:
        loader = ProductLoader(item=Product(), selector=entry)

        label = entry.select('./text()').extract()[0]
        if label == '- ':
            full_name = base_name
        else:
            full_name = base_name + ' ' + label
        option_value = entry.select('./@value').extract()[0]

        loader.add_value('identifier', base_identifier + '_' + option_value)
        loader.add_value('name', full_name)
        loader.add_value('sku', sku)
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)

        item = loader.load_item()
        metadata = ZyroMeta()
        metadata['ean'] = ean
        item['metadata'] = metadata
        yield item
def parse_product(self, response):
    """Parse a product page using its structured data, then fetch reviews.

    Name, sku and price come from ``SpiderSchema(response).get_product()``.
    Pages without a ``productId`` form input are logged and skipped.  The
    item gets a ``ToyMonitorMeta`` with an empty reviews list, the EAN when
    one is found, and the promotion texts joined with ``' * '``.  When the
    structured data has an ``aggregateRating`` a reviews API Request is
    yielded (callback ``parse_reviews``); otherwise the item itself is.
    """
    pdata = SpiderSchema(response).get_product()
    hxs = HtmlXPathSelector(response)
    page_url = response.url

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', pdata['name'])
    loader.add_value('sku', pdata['sku'])
    loader.add_value('category', SpiderSchema(response).get_category())

    image_src = response.css('li.active a img::attr(src)').extract_first()
    if image_src:
        loader.add_value('image_url', response.urljoin(image_src))

    page_brand = response.css('.pdp-view-brand-main ::text').extract_first()
    loader.add_value('url', page_url)
    loader.add_value('price', pdata['offers']['properties']['price'])
    # A brand passed through the request meta overrides the page brand.
    loader.add_value('brand', response.meta.get('brand', page_brand))

    identifier = response.xpath(
        '//form/input[@name="productId"]/@value').extract_first()
    if not identifier:
        self.log('No identifier found on %s' % response.url)
        return
    loader.add_value('identifier', identifier)
    item = loader.load_item()

    promo_texts = response.xpath('//li[@class="pricesale"]/text()').extract()
    promo_texts += response.xpath(
        '//div[@class="special-offers"]/p/text()').extract()
    promotions = u' * '.join([text.strip() for text in promo_texts])

    metadata = ToyMonitorMeta()
    ean = hxs.select('//li[contains(text(), "EAN")]/text()').re(
        "EAN: ([0-9]+)")
    if ean:
        metadata['ean'] = ean[0]
    metadata['reviews'] = []
    item['metadata'] = metadata
    item['metadata']['promotions'] = promotions

    part_number = response.xpath(
        '//form/input[@name="partNumber"]/@value').extract_first()

    if not pdata.get('aggregateRating'):
        yield item
        return

    review_url = (
        "http://api.bazaarvoice.com/data/reviews.json?Callback=jQuery111206106209812916942_1465931826753"
        "&apiversion=5.4&passkey=q3mz09yipfffc2yhguids3abz&locale=en_GB&Filter=ProductId:%s"
        "&Filter=IsRatingsOnly:false&Include=Products&Stats=Reviews&Limit=100&Offset=0&Sort=SubmissionTime:Desc"
        "&_=1465931826756") % (part_number)
    yield Request(review_url,
                  meta={'item': item, 'offset': 0},
                  callback=self.parse_reviews)
def parse_products(self, response):
    """Yield one Product per listing row and follow the "Next" page link.

    Brand and category come from ``response.meta``.  The image URL is
    taken from between the single quotes in the image anchor's inline
    ``style`` attribute; the quotes themselves are no longer part of the
    stored value, and a style without a quoted section yields an empty
    image URL instead of raising AttributeError.

    The "Was ..." price text (whitespace-normalised) is stored as
    ``metadata['promotion']``.  Items priced below 30 get a shipping cost
    of 4.95.
    """
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    base_url = get_base_url(response)

    for product in hxs.select('//tr[@class="under_best_match"]'):
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', 'td/dl[@class="hproduct"]/dt/a/text()')

        sku = product.select(
            'td/dl/dd[@class="reference_number"]/text()'
        ).extract()[0].strip()
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)

        url = urljoin_rfc(
            base_url,
            product.select('td/dl[@class="hproduct"]/dt/a/@href').extract()[0])
        loader.add_value('url', url)
        loader.add_value('brand', meta['brand'])
        loader.add_value('category', meta['category'])

        image_url = ''
        style = product.select(
            'td/dl/dd[@class="product_image"]/a/@style').extract()
        if style:
            # Capture only what sits between the quotes, not the quotes.
            quoted = re.search(r"'(.*)'", style[0])
            if quoted:
                image_url = quoted.group(1)
        loader.add_value('image_url', image_url)

        loader.add_xpath(
            'price',
            'td[@class="price_bucket"]/ul/li[@class="total_price"]/text()')

        price_was = product.select(
            'td//li[@class="old_price"]/strong[contains(text(), "Was")]/text()'
        ).extract()
        price_was = ' '.join(price_was[0].split()) if price_was else ''

        metadata = JohnLewisMeta()
        metadata['promotion'] = price_was

        item = loader.load_item()
        item['metadata'] = metadata
        if item['price'] < 30:
            item['shipping_cost'] = 4.95
        yield item

    next_page = hxs.select('//a[@title="Next"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[0]),
                      callback=self.parse_products)
def parse_product(self, response):
    """Accept a search-result product only if its part number matches.

    The page's "Manufacturer Part Number" is compared case-insensitively
    with ``response.meta['partn']`` and ``response.meta['sku']``.  On a
    match, and unless the seller is ``Towequipe``, the item is yielded
    with the sku/identifier taken from the meta.  Otherwise the next URL
    in ``meta['next_results']`` is requested, or ``meta['next_page']``
    when the result list is exhausted.

    A page with no part number at all is treated as a non-match so the
    crawl moves on to the next candidate instead of raising IndexError.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # Second attempt at the image using BeautifulSoup on the raw body.
        soup = BeautifulSoup(response.body)
        image_tag = soup.find(
            lambda tag: tag.name == u'img'
            and tag.findParent(u'tr', id=u'prodImageContainer'))
        if image_tag:
            loader.add_value('image_url', image_tag.get(u'src'))

    loader.add_xpath('brand', u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')

    # Progressively broader price selectors.
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')

    partn = hxs.select(u'//span[@class="tsLabel" and contains(text(),"Manufacturer Part Number")]/following-sibling::span/text()').extract()
    if not partn:
        partn = hxs.select(u'//tr/td[contains(text(),"Manufacturer Part Number")]/following-sibling::td/text()').extract()
    partn = partn[0].strip() if partn else u''

    wanted_partn = response.meta['partn'].lower()
    wanted_sku = response.meta['sku'].lower()
    log.msg('PARTN: [%s == %s]' % (partn.lower(), wanted_partn))
    log.msg('SKU: [%s == %s]' % (partn.lower(), wanted_sku))

    sold_by = hxs.select(u'//div[contains(text(),"Sold by")]/b/text()').extract()
    sold_by = sold_by[0].strip() if sold_by else u''
    log.msg(u'Sold by: %s' % sold_by)

    # An empty part number must never count as a match.
    matches = bool(partn) and partn.lower() in (wanted_partn, wanted_sku)

    if matches and sold_by != u'Towequipe':
        loader.add_value('sku', response.meta['partn'])
        loader.add_value('identifier', response.meta['partn'].lower())
        yield loader.load_item()
        return

    meta = response.meta
    next_result = meta['next_results']
    if next_result:
        next_result = next_result[0]
        # Drop the URL about to be requested from the pending list.
        meta['next_results'] = meta['next_results'][1:]
        yield Request(next_result, callback=self.parse_product,
                      meta=response.meta)
    elif meta.get('next_page'):
        yield Request(meta['next_page'], meta=response.meta)
def parse_car_details(self, response):
    """Yield a single Product for a car detail page.

    The heading following the ``h1`` serves as both identifier and name.
    The price is whatever numeric fragments are found in the "Cash Price"
    cell.  The last responsive-image placeholder, if any, is used as the
    image URL.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    headings = hxs.select('//h1/following-sibling::h2/text()').extract()
    cash_price = hxs.select(
        './/td[contains(text(), "Cash Price")]//text()').re(r'[\d,.]+')
    image = hxs.select(
        '//source[@class="responsive-image"]/@data-placeholder').extract()
    if image:
        image = urljoin_rfc(base_url, image[-1])

    loader = ProductLoader(item=Product(), response=response)
    field_values = (
        ('identifier', headings),
        ('name', headings),
        ('price', cash_price),
        ('url', response.url),
        ('image_url', image),
    )
    for field, value in field_values:
        loader.add_value(field, value)
    yield loader.load_item()
def parse(self, response):
    """Crawl category pages and request each listed product's own page.

    Yields Requests for the "Audio, vision & technology" menu links, for
    every sub-category link, for each product tile (callback
    ``parse_product`` with the partially filled item in ``meta['item']``)
    and for every pagination link.

    Tiles without a price get ``stock`` 0 and an empty price.  A tile
    without a brand node gets an empty brand rather than aborting the
    whole page.  The category is intentionally not added to the item
    here, so it is no longer looked up per tile.
    """
    base_url = get_base_url(response)

    menu_links = response.xpath(
        '//li[div[contains(text(), "Audio, vision & technology")]]//a/@href'
    ).extract()
    for link in menu_links:
        yield Request(response.urljoin(link))

    categories = response.xpath(
        '//div[@id="subCategorycategories"]/ul/li/a/@href').extract()
    categories += response.xpath(
        '//li[@id="categories"]/ul/li/a/@href').extract()
    categories += response.xpath(
        '//div[@class="cat_detail"]/div/a/@href').extract()
    for category in categories:
        yield Request(urljoin_rfc(base_url, category))

    products = response.xpath('//div[contains(@id, "PSPProductList")]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)

        name = "".join(product.xpath(
            ".//div[contains(@class, 'product_name')]//text()"
        ).extract()).strip()

        brand = product.xpath(
            'div/a/div[@class="brand_name"]/text()').extract()
        brand = brand[0].strip() if brand else ''

        url = product.xpath(".//a/@href").extract()
        url = urljoin_rfc(base_url, url[0])

        sku = product.xpath(".//div[contains(@id, 'psp')]/@id").re(
            "psp_(.+)")[0]

        # Prefer the discounted "Now" price, then the regular price.
        price = product.xpath(".//span[@class='price_now']/text()").re(
            u'Now\xa0\xa3(.*)')
        if not price:
            price = product.xpath(
                ".//span[@class='price-actual' and @itemprop='price']/text()"
            ).extract()
        if price:
            price = price[0]
        else:
            price = ''
            loader.add_value('stock', 0)

        loader.add_value('name', name)
        loader.add_value('brand', brand)
        loader.add_value('url', url)
        loader.add_xpath('image_url', 'div//img[@class="proImg"]/@src')
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value('price', price)

        metadata = DemoRMeta()
        metadata['reviews'] = []
        metadata['promotion'] = ''.join(product.xpath(
            './/span[@class="discount_savings"]/text()').extract())

        item = loader.load_item()
        item['metadata'] = metadata
        yield Request(item['url'], meta={'item': item},
                      callback=self.parse_product)

    for page in response.xpath('//div[@id="pagination"]/a/@href').extract():
        yield Request(urljoin_rfc(base_url, page))
def parse_product(self, response):
    """Parse a product page and yield it, or one item per select option.

    Pages without an add-to-cart product id yield nothing.  The URL is
    reduced to its ``content`` and ``product`` parameters (``;`` as the
    separator).  Stock is set to 0 when the availability label contains
    "unknown".

    Each ``<option>`` (value other than ``-2``) of every option select
    becomes a copy of the base item with an extended identifier and name.
    An item is skipped when a "simple run" restricts output to
    ``self.matched_identifiers`` and it is not among them, or when its
    identifier is already in ``self.id_seen``.
    """
    identifier = response.xpath(
        '//form[@id="pdAddToCart"]//input[@name="product"]/@value'
    ).extract()
    if not identifier:
        return

    def excluded_by_simple_run(item_id):
        # Truthy only when simple_run is on, a matched list exists and
        # this identifier is not part of it.
        return (getattr(self, 'simple_run', False)
                and hasattr(self, 'matched_identifiers')
                and item_id not in self.matched_identifiers)

    def first_sighting(item_id):
        # Record the identifier; report whether it was new.
        if item_id in self.id_seen:
            return False
        self.id_seen.append(item_id)
        return True

    datasheet = ('//table[@class="table-bordered table-striped '
                 'table-product-datasheet"]')

    loader = ProductLoader(item=Product(), response=response)
    # Normalize URL
    clean_url = url_query_cleaner(response.url,
                                  parameterlist=('content', 'product'),
                                  sep=';')
    loader.add_value('url', clean_url)
    loader.add_value('identifier', identifier)

    item_codes = response.xpath(
        datasheet
        + '//td[text()="Item Code:"]/following-sibling::td[1]/text()'
    ).extract()
    if item_codes:
        loader.add_value('sku', item_codes[0])

    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

    price_texts = response.xpath(
        '//div[@class="box-price js-price"]/span[@itemprop="price"]/text()'
    ).extract()
    if price_texts:
        # "1.234,56" -> "1234.56" before parsing.
        normalised = price_texts[0].strip().replace('.', '').replace(',', '.')
        loader.add_value('price', extract_price(normalised))
    else:
        loader.add_value('price', '0.0')

    images = response.xpath('//img[@itemprop="image"]/@src').extract()
    if images:
        loader.add_value('image_url', images[0])

    manufacturers = response.xpath(
        datasheet
        + '//td[text()="Manufacturer:"]/following-sibling::td[1]/text()'
    ).extract()
    if manufacturers:
        loader.add_value('brand', manufacturers[0])

    active_categories = response.xpath(
        '//ul[@class="nav"]//li[contains(@class,"item-active")]/a/text()'
    ).extract()
    if active_categories:
        loader.add_value('category', active_categories)

    availability = response.xpath(
        '//*[@id="js-availability-label"]/text()').extract()
    if availability and 'unknown' in availability[0].lower():
        loader.add_value('stock', 0)

    product = loader.load_item()

    selects = response.xpath(
        '//div[@class="input-group input-group-select"]/select')
    if not selects:
        if not excluded_by_simple_run(product['identifier']):
            if first_sighting(product['identifier']):
                yield product
        return

    for select in selects:
        select_names = select.xpath('@name').extract()
        group = select_names[0].replace('opt_', '') if select_names else ''

        for option in select.xpath('option[@value!="-2"]'):
            item = Product(product)

            values = option.xpath('@value').extract()
            if values:
                item['identifier'] += '-' + group + '-' + values[0]

            # In stock unless the option's data-av attribute is '100'.
            item['stock'] = 1
            availability_codes = option.xpath('@data-av').extract()
            if availability_codes and availability_codes[0] == '100':
                item['stock'] = 0

            labels = option.xpath('text()').extract()
            if labels:
                item['name'] += ' - ' + labels[0]

            if excluded_by_simple_run(item['identifier']):
                continue
            if first_sighting(item['identifier']):
                yield item
def parse_node(self, response, selector):
    """Build a Product from one XML feed node.

    Identifier, name, price, category, sku and brand are passed through
    as the extracted text lists.  For the product URL and image URL only
    the first text value is used, with ``http://`` rewritten to
    ``https://``.
    """
    def texts(xpath):
        return selector.select(xpath).extract()

    def first_as_https(xpath):
        return texts(xpath)[0].replace('http://', 'https://')

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('identifier', texts('./id/text()'))
    loader.add_value('name', texts('./name/text()'))
    loader.add_value('price', texts('./price/text()'))
    loader.add_value('category', texts('./category/text()'))
    loader.add_value('sku', texts('./sku/text()'))
    loader.add_value('url', first_as_https('./url/text()'))
    loader.add_value('image_url', first_as_https('./imageurl/text()'))
    loader.add_value('brand', texts('./brand/text()'))
    return loader.load_item()
def parse(self, response):
    """Yield products built from two XML feeds fetched over SFTP.

    The newest ``CRC_PRICEFEED_Germany`` file supplies a ``skuId`` to
    ``listPrice`` mapping (price list ``GermanyRP``); the newest
    ``PriceMonitorHandler`` file supplies the product details.  One item
    is yielded per priced SKU that also appears in the product feed; SKUs
    missing from the product feed are logged and skipped.  A warning is
    appended to ``self.errors`` when the price feed file is 72 hours old
    or older.

    The SFTP transport is always closed once both feeds are downloaded
    and parsed, including when any step raises.
    """
    # NOTE(review): credentials are hard-coded in the callback; consider
    # moving them to settings.
    password = "******"
    username = "******"

    transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
    try:
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        files = sftp.listdir_attr()

        last = get_last_file("CRC_PRICEFEED_Germany", files)

        # Check file updates
        date_file = datetime.fromtimestamp(last.st_mtime)
        hours_diff = (datetime.now() - date_file).total_seconds() / 3600
        if hours_diff >= 72:
            self.errors.append('WARNING: No Update for 3 days')

        zip_path = HERE + '/CRC_PRICEFEED_Germany.zip'
        xml_path = HERE + '/CRC_PRICEFEED_Germany.xml'
        sftp.get(last.filename, zip_path)
        unzip(zip_path, xml_path)
        with open(xml_path) as f:
            xmlfeed_sku = f.read()

        sku_prices = {}
        tree = et.fromstring(xmlfeed_sku)
        price_nodes = tree.find(
            'priceList[@id="GermanyRP"]').find('prices').findall('price')
        for item in price_nodes:
            sku_prices[item.find('skuId').text] = item.find('listPrice').text

        last = get_last_file("PriceMonitorHandler", files)
        zip_path = HERE + '/PriceMonitorHandler.zip'
        xml_path = HERE + '/PriceMonitorHandler.xml'
        sftp.get(last.filename, zip_path)
        unzip(zip_path, xml_path)
        with open(xml_path) as f:
            xmlfeed_products = f.read()

        sku_products = {}
        tree = et.fromstring(xmlfeed_products)
        for item in tree.find('skus').findall('sku'):
            sku_id = item.find('skuID').text
            sku_products[sku_id] = {
                'identifier': sku_id,
                'category': item.find('CategoryDescription').text,
                'brand': item.find('BrandDescription').text,
                'image_url': item.find('ImageURL').text,
                'url': item.find('ProductURL').text,
                'name': item.find('SkuDescription').text,
                'sku': sku_id,
                'stock': item.find('SkuQuantity').text,
            }
    finally:
        # Closing the transport also closes the SFTP channel opened on it.
        transport.close()

    for sku, price in sku_prices.iteritems():
        try:
            product = sku_products[sku]
        except KeyError:
            log.msg('SKU not found:' + sku)
            continue

        product['price'] = price
        product = Product(product)
        loader = ProductLoader(response=response, item=product)
        yield loader.load_item()
def parse(self, response):
    """Yield a single product item scraped from a product page.

    Identifier and SKU both come from the same ``div.sku`` value; stock is
    1 when a price was extracted and 0 otherwise; the brand is always
    ``'Rituals'`` and the shipping cost is always ``Decimal(0)``.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)

    loader.add_value('url', response.url)

    # The same value feeds both the identifier and the sku.
    for field in ('identifier', 'sku'):
        loader.add_xpath(
            field, "//div[@class='sku']//span[@class='value']//text()")

    name_parts = hxs.select("//h1[@itemprop='name']/text()").extract()
    loader.add_value('name', ''.join(name_parts).strip())

    price_parts = hxs.select('//span[@itemprop="price"]/text()').extract()
    price = extract_price(''.join(price_parts))
    loader.add_value('price', price)

    # A falsy price is treated as "no stock".
    loader.add_value('stock', 1 if price else 0)

    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')

    # The brand is fixed rather than read from the page.
    loader.add_value('brand', 'Rituals')

    categories = hxs.select(
        '//*[@id="add-to-cart"]/@data-category').extract()
    loader.add_value('category', categories[0] if categories else '')

    loader.add_value('shipping_cost', Decimal(0))

    yield loader.load_item()
def parse_product(self, response):
    """Yield the product shown on the page, then a request per option.

    The price is taken from the first ``<font>`` cell starting with ``$``,
    falling back to the ``Price:`` cell.  When an option is pre-selected,
    the part of its label after the first ``~`` (or, failing that, after
    the first space) is appended to the product name.  A product without a
    matching image is still yielded, just without ``image_url``.  Every
    ``<option>`` value is then requested through ``cart_item_review.asp``
    and parsed by this same callback.
    """
    hxs = HtmlXPathSelector(response)

    name = hxs.select(u'//h1/text()').extract()[-1].strip()

    price = hxs.select(
        u'//tr/td//font[starts-with(text(),"$")]/text()').extract()
    if price:
        price = price[0].split()[0]
    else:
        price = hxs.select(
            u'//tr/td[starts-with(text(),"Price:")]/text()'
        ).extract()[0].split('$')[-1]

    category = hxs.select(
        u'//a[@class="linkHeading"]/text()'
    ).extract()[1].split(' - ')[0].strip()

    # For some products name does not change by selecting different options
    name_selected = hxs.select(
        u'//tr/td/select/option[@selected]/text()').extract()
    if name_selected:
        # Prefer the text after '~'; labels such as the one on
        # http://www.patrollersupply.com/equipment/item_703.asp carry only
        # a price, so fall back to the text after the first space.
        for separator in (u'~', u' '):
            _, found, tail = name_selected[0].partition(separator)
            if found:
                name += tail.strip()
                break

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_value('name', name)
    product_loader.add_value('price', price)
    product_loader.add_xpath(
        'sku',
        u'//tr/td[contains(text(),"SKU") or contains(text(),"Part #")]/../td[last()]/text()'
    )
    product_loader.add_value('category', category)

    img = hxs.select(
        '//tr/td/img[contains(@src, "products")]/@src').extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    product_loader.add_xpath(
        'brand',
        u'//tr/td[contains(text(),"Manufacturer")]/../td[last()]/a/text()')
    product_loader.add_value('shipping_cost', '')

    yield product_loader.load_item()

    for opt in hxs.select(u'//tr/td/select/option/@value').extract():
        yield Request(
            'http://www.patrollersupply.com/store/cart_item_review.asp?ID='
            + opt,
            callback=self.parse_product)
def parse_previous_crawl(self, response):
    """Yield items from a CSV body for identifiers not yet seen.

    Each row whose ``identifier`` is absent from ``self.id_seen`` is
    recorded there and turned into an item; text columns are decoded from
    UTF-8, ``price`` is passed through unchanged and ``stock`` is added as
    an int only when the column is non-empty.
    """
    for row in csv.DictReader(StringIO(response.body)):
        identifier = row['identifier']
        if identifier in self.id_seen:
            continue
        self.id_seen.append(identifier)

        loader = ProductLoader(response=response, item=Product())

        for field in ('identifier', 'sku', 'name'):
            loader.add_value(field, row[field].decode('utf-8'))

        # The price is the one column added without decoding.
        loader.add_value('price', row['price'])

        for field in ('url', 'category', 'brand', 'image_url'):
            loader.add_value(field, row[field].decode('utf-8'))

        stock = row['stock']
        if stock:
            loader.add_value('stock', int(stock))

        yield loader.load_item()
def parse(self, response):
    """Yield one product item per row of the CSV response body.

    The ``Luminox Rerence`` column (header spelled as in the feed) is used
    lower-cased as the identifier and unchanged as the SKU; the image
    column is prefixed with ``http://``.
    """
    rows = csv.DictReader(StringIO(response.body))
    for row in rows:
        loader = ProductLoader(response=response, item=Product())
        reference = row['Luminox Rerence']

        field_values = (
            ('identifier', reference.lower()),
            ('sku', reference),
            ('brand', row['Brand']),
            ('image_url', 'http://' + row['Image']),
            ('name', row['Series name'].decode('utf8')),
            ('price', row['SRP in USD']),
        )
        for field, value in field_values:
            loader.add_value(field, value)

        yield loader.load_item()
def load_item_(self, item, browser=None, use_adurl=True):
    """Build a product item from a scraped result dict.

    :param item: mapping with ``name``, ``url`` and ``price``; may carry
        ``shipping_cost`` and ``dealer``; must carry ``identifier`` and
        ``sku`` when no browser is given
    :param browser: optional mapping with ``webdriver`` and ``meta``; when
        given, the response is built from the driver's current page and
        identifier/sku come from ``browser['meta']``
    :param use_adurl: when true, an ``adurl`` query parameter found in the
        item URL replaces the URL itself
    :returns: the loaded item
    """
    if not browser:
        response = HtmlResponse(url='http://www.google.co.uk/shopping',
                                body='<html></html>',
                                encoding='utf-8')
    else:
        response = HtmlResponse(url=browser['webdriver'].current_url,
                                body=browser['webdriver'].page_source,
                                encoding='utf-8')

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', self._try_encoding(item['name']))

    # Item URL
    url = self._try_encoding(item['url'])
    adurl = url_query_parameter(url, 'adurl')
    loader.add_value('url', adurl if (adurl and use_adurl) else url)

    loader.add_value('price', item['price'])
    loader.add_value('shipping_cost', item.get('shipping_cost', 0))
    loader.add_value('dealer', item.get('dealer', ''))

    # Identifier and sku share a source: browser meta if present, else item.
    source = browser['meta'] if browser else item
    loader.add_value('identifier', source['identifier'])
    loader.add_value('sku', source['sku'])

    return loader.load_item()
def parse_product_list(self, response):
    """Follow category links and yield one item per product form.

    Every link under ``#list_by_category`` is requested with this same
    callback and the current meta.  If the page has a ``#product_page``
    element, a base product is built (name from ``h2#longname`` or, when
    that yields nothing, ``h1``) and one copy of it is yielded for each
    form under ``#product_container``.
    """
    hxs = HtmlXPathSelector(response)

    links = hxs.select(u'//div[@id="list_by_category"]//a/@href').extract()
    for href in links:
        yield Request(urljoin_rfc(get_base_url(response), href),
                      callback=self.parse_product_list,
                      meta=response.meta)

    if not hxs.select(u'//div[@id="product_page"]'):
        return

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)

    product_loader.add_xpath('name', u'//h2[@id="longname"]/text()')
    if not product_loader.get_output_value('name'):
        product_loader.add_xpath('name', u'//h1/text()')

    product_loader.add_value('category',
                             response.meta.get('category', 'spices'))

    # Prefer the inline image; otherwise use the link around it.
    img = hxs.select(
        u'//div[contains(@class,"image") and contains(@class,"db_content")]/img/@src'
    ).extract()
    if not img:
        img = hxs.select(
            u'//div[contains(@class,"image") and contains(@class,"db_content")]/a/@href'
        ).extract()
    if img:
        product_loader.add_value(
            'image_url', urljoin_rfc(get_base_url(response), img[0]))

    product = product_loader.load_item()

    for form in hxs.select(u'//div[@id="product_container"]//form'):
        variant = Product(product)

        variant['sku'] = form.select(
            u'.//input[starts-with(@name,"m")]/@name').extract()[0]
        variant['identifier'] = form.select(
            u'.//input[starts-with(@name,"m") and @type="text"]/@name'
        ).extract()[0]

        base_name = variant['name']
        label = form.select(
            u'.//li[@class="product"]/text()').extract()[0].strip()
        variant['name'] = base_name + ' ' + label

        price_text = form.select(
            u'.//li[@class="price"]/text()').extract()[0]
        variant['price'] = extract_price(price_text)

        yield variant
def parse_product(self, response):
    """Yield the product on the page, or one item per configured option.

    The brand is the first ``brands/`` link text that occurs
    (case-insensitively) in the product name.  Pages with no identifier
    are logged and skipped.  When the body contains a
    ``Product.Config(...)`` JSON blob, one item per child product is
    yielded, using ``finalPrice`` and falling back to ``price`` when that
    cannot be read; otherwise a single item is yielded with the special
    price or, if that is falsy, the regular price.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    brands = hxs.select(
        '//a[contains(@href, "brands/")]/span/text()').extract()

    loader = ProductLoader(response=response, item=Product())
    loader.add_xpath('sku', '//input[@name="product"]/@value')
    loader.add_value('category', '')
    loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')

    brand = ''
    if brands:
        # Upper-case the name once instead of once per candidate brand.
        name_upper = loader.get_output_value('name').upper()
        for candidate in brands:
            if candidate.upper().strip() in name_upper:
                brand = candidate
                break
    loader.add_value('brand', brand)

    img = hxs.select('//ul[@id="product-page-slider"]//img/@src').extract()
    img = urljoin_rfc(base_url, img[0]) if img else ''
    loader.add_value('image_url', img)

    loader.add_value('url', response.url)
    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    item = loader.load_item()

    if not item.get('identifier', None):
        log.msg('Product without identifier, URL: ' + response.url)
        return

    # Raw string: '\(' is not a valid escape in a plain literal.
    config = re.search(r'Product.Config\((.*)\);', response.body)
    if config:
        data = json.loads(config.group(1))

        # Map each child product id to the joined labels of its options.
        product_options = {}
        for attr in data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    product_options[product] = ' - '.join(
                        (product_options.get(product, ''), option['label']))

        for option_id, option_name in product_options.iteritems():
            option_item = deepcopy(item)
            child = data['childProducts'][option_id]
            try:
                option_item['price'] = extract_price(child['finalPrice'])
            except Exception:
                # Best-effort fallback when finalPrice is missing or
                # unparsable; process-exit signals are no longer swallowed.
                option_item['price'] = extract_price(child['price'])
            option_item['name'] = option_item['name'] + ' ' + option_name
            option_item['identifier'] = (
                option_item['identifier'] + '-' + option_id)
            yield option_item
    else:
        item['price'] = extract_price(''.join(
            hxs.select(
                '//form//p[@class="special-price"]//span[@class="price"]/text()'
            ).extract()))
        if not item['price']:
            item['price'] = extract_price(''.join(
                hxs.select(
                    '//div[@class="product-right"]//span[@class="price"]/text()'
                ).extract()))
        yield item