def parse(self, response):
    """Follow category/pagination links and request every listed product.

    Yields a ``Request`` for each main category, sub-category, category
    link and the next results page. For every product tile that has a
    link, a partially filled ``Product`` is built and passed through
    ``meta['product']`` to ``self.parse_product``.

    :param response: listing or category page response.
    """
    # Main categories
    for cat_url in response.xpath(
            '//ul[@id="main-nav"]/li/a/@href').extract():
        yield Request(response.urljoin(cat_url))

    sub_categories = response.xpath(
        '//div[contains(@class, "sub-categories")]'
        '/div/div//p/a/@href').extract()
    for sub_cat in sub_categories:
        yield Request(
            add_or_replace_parameter(
                response.urljoin(sub_cat), 'sort', 'lowest'))

    categories = response.xpath(
        '//ul[@class="category"]/li/a/@href').extract()
    categories += response.xpath(
        '//a[contains(@class, "shop-all-button")]/@href').extract()
    categories += response.css('.subcat-panel ::attr(href)').extract()
    for url in categories:
        yield Request(
            add_or_replace_parameter(
                response.urljoin(url), 'sort', 'lowest'))

    next_page = response.xpath(
        '//ul[@class="pagination"]/li/a[@class="next"]/@href').extract()
    if next_page:
        yield Request(url=response.urljoin(next_page[0]))

    products = response.xpath('//div[contains(@class, "product")]')
    for product_xs in products:
        url = product_xs.xpath('a/@href').extract()
        if not url:
            # The class test above is broad; nodes without a direct link
            # are not product tiles.
            continue

        product_loader = ProductLoader(item=Product(), selector=product_xs)
        product_loader.add_value('url', url)

        # An explicit emptiness check replaces the former bare ``except:``,
        # which hid every error, not just "no SKU text on this tile".
        sku_matches = product_xs.xpath(
            'p[@class="product-sku"]/text()').re('KaTom #: (.*)')
        sku = sku_matches[0] if sku_matches else None

        product_loader.add_value('sku', sku)
        product_loader.add_value('identifier', sku)
        product_loader.add_xpath('name', 'a/@title')
        product_loader.add_css('image_url', '.img ::attr(src)')
        product_loader.add_xpath('category', '//h1[@class="title"]/text()')
        product = product_loader.load_item()

        # Drop everything up to the first dash from the stored SKU; the
        # identifier keeps the full value.
        sku_parts = product.get('sku', '').split('-')
        if len(sku_parts) > 1:
            product['sku'] = '-'.join(sku_parts[1:])

        yield Request(url=product_loader.get_output_value('url'),
                      meta={"product": product},
                      callback=self.parse_product)
def parse_product(self, response):
    """Build a ``Product`` with ``TigerChefMeta`` from a product page.

    The item number is mandatory: without it the page is logged and
    nothing is yielded. Price, SKU and the "Sold As" unit are optional and
    fall back to ``'0.00'``, no SKU and ``'1 ea'`` respectively.

    :param response: product detail page response.
    """
    itemno = response.xpath(
        '//div[@id="product-main-info"]//a[contains(@id, '
        '"wishlist_link_")]/@id').re(r'(\d+)')
    if not itemno:
        self.log('ERROR: itemno not found => %s' % response.url)
        return
    itemno = itemno[0]

    # Only the last two numeric fragments of the price node are joined.
    price = ''.join(
        response.xpath('//span[@id="the-price"]//text()')
        .re(r'[\d\.,]+')[-2:])
    if not price:
        self.log('WARNING: price not found => %s' % response.url)
        price = '0.00'

    sku_values = response.xpath('//li[@itemprop="sku"]/text()').extract()
    if sku_values:
        sku = sku_values[0].replace('Model #:', '').strip()
        identifier = itemno + ' ' + sku
    else:
        # Previously ``sku`` stayed an empty list here and building the
        # identifier raised TypeError; fall back to the item number alone.
        self.log('WARNING: SKU not found => %s' % response.url)
        sku = ''
        identifier = itemno

    brand = response.xpath('//li[@itemprop="name"]/text()').extract()
    image_url = response.xpath(
        '//div[@id="zoom-div"]//img[@itemprop="image"]/@src').extract()
    category = response.xpath(
        '//span[@class="breadcrumb-element"]'
        '//*[@itemprop="name"]/text()').extract()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@itemprop="name"]//text()')
    loader.add_value('price', price)
    if sku:
        loader.add_value('sku', sku)
    if image_url:
        loader.add_value('image_url', image_url)
    if brand:
        loader.add_value('brand', brand)
    loader.add_value('identifier', identifier)
    if category:
        loader.add_value('category', category[0].strip())
    product = loader.load_item()

    sold_as = response.xpath(
        '//li[contains(text(),"Sold As:")]/../li[2]/text()').extract()
    if not sold_as:
        # A missing node used to raise IndexError and lose the product.
        self.log('WARNING: sold_as not found => %s' % response.url)

    metadata = TigerChefMeta()
    # '1 ea' is the same fallback unit the other callbacks in this file use.
    metadata['sold_as'] = sold_as[0].strip() if sold_as else '1 ea'
    product['metadata'] = metadata
    yield product
def parse_product(self, response):
    """Complete the product carried in ``response.meta['product']``.

    Adds price, image, category and brand from the detail page, overrides
    the name when the page provides one, attaches ``TigerChefMeta`` with
    the selling unit (``'1 ea'`` when none is shown) and yields the item.

    :param response: product detail page; ``meta['product']`` must be set.
    """
    loader = ProductLoader(Product(response.meta['product']),
                           response=response)
    loader.add_xpath(
        'price', '//meta[@property="og:price:amount"]/@content')
    # Second price value, added after the page price.
    loader.add_value('price', 0)

    title = response.xpath(
        '//div[@class="product-info"]/p[@class="h1"]/text()').extract()

    images = response.xpath('//img[@class="mainImgFix"]/@src').extract()
    if images:
        loader.add_value('image_url', images[0])
    else:
        self.log("ERROR img not found")

    crumbs = response.xpath(
        '//ol[contains(@class, "breadcrumb")]/li/a/text()').extract()
    if crumbs:
        # The last breadcrumb link is used as the category.
        loader.add_value('category', crumbs[-1])
    else:
        self.log("ERROR category not found")

    # Prefer the logo title; fall back to the "Manufacturer" table cell.
    brands = (
        response.xpath('//div[@class="logo-area"]/a/@title').extract()
        or response.xpath(
            '//td[contains(text(), "Manufacturer")]'
            '/following-sibling::td/text()').extract()
    )
    if brands:
        loader.add_value('brand', brands[0])
    else:
        self.log("ERROR brand not found")

    item = loader.load_item()
    if title:
        item['name'] = title[0].strip()

    unit = response.xpath(
        '//strong[@class="price"]/span/text()').extract()
    metadata = TigerChefMeta()
    if unit:
        # Keep the text after the last '/ ' separator.
        metadata['sold_as'] = unit[0].split('/ ')[-1]
    else:
        metadata['sold_as'] = '1 ea'
    item['metadata'] = metadata
    yield item
def parse_products(self, response):
    """Request the detail page of every product on a listing page.

    Two layouts are handled: ``productResult`` blocks (name, price and
    identifier are passed in the request meta) and the table-based
    layout (identifier only). When neither layout yields anything and no
    featured-products table is present, the page is re-requested up to 10
    times with a ``pagesize`` cookie.

    :param response: listing page response; ``meta['retry']`` holds the
        retry count when this is a retry.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    products = hxs.select('//div[contains(@class, "productResult")]')
    log.msg(">>>>>>>> FOUND %s ITEMS >>>" % len(products))
    for product in products:
        names = product.select(
            './/h2[@class="productResultName"]/a/text()').extract()
        if not names:
            self.log('Cannot find name %s' % response.url)

        links = product.select(
            './/h2[@class="productResultName"]/a/@href').extract()
        if not links:
            # One malformed entry must not abort the rest of the page.
            self.log('Cannot find url %s' % response.url)
            continue
        url = canonicalize_url(urljoin_rfc(base_url, links[0]))

        price = ' '.join(product.select(
            './/span[@class="variantprice"]//text()').extract())

        match = identifier_regex.search(url)
        if not match:
            self.log('Cannot find identifier %s' % url)
            continue

        meta = {'price': price, 'identifier': match.group(1)}
        if names:
            # Previously a missing name left ``name`` unbound, or stale
            # from the previous product; now the key is just omitted, as
            # for the table layout below.
            meta['name'] = names[0]
        yield Request(url, callback=self.parse_product, meta=meta)

    products2 = hxs.select(
        '//div[contains(@id, "ageContent_pnlContent")]'
        '/table/tr/td/table/tr[2]/td/a/@href').extract()
    for url in products2:
        match = identifier_regex.search(url)
        if not match:
            self.log('Cannot find identifier %s' % url)
            continue
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_product,
                      meta={'identifier': match.group(1)})

    if (not products and not products2
            and not hxs.select('//td[@id="featuredProductsTable"]')):
        retry = int(response.meta.get('retry', 0))
        if retry < 10:
            self.log('WARNING: No products and no subcategories, Retry => %s'
                     % response.url)
            new_meta = response.meta.copy()
            new_meta['retry'] = retry + 1
            # dont_filter: the same URL is requested again on purpose.
            yield Request(
                response.url,
                meta=new_meta,
                cookies={'pagesize': 10000},
                callback=self.parse_products,
                dont_filter=True)
        else:
            self.log('ERROR - NO PRODUCTS FOUND, retry limit reached, '
                     'giving up, url: {}'.format(response.url))
def parse(self, response):
    """Yield one ``Product`` per entry of the page's embedded product JSON.

    The JSON array is taken from an inline script (the text following
    ``products', ``). Pages without it yield nothing. URL, image and
    category come from the page itself and are shared by all entries.

    :param response: product page response.
    """
    payload = response.xpath('//script/text()').re(
        r"products', (\[{.+}\])")
    if not payload:
        return

    # Page-level value: looked up once rather than per entry.
    breadcrumbs = response.css('.breadcrumb a::text').extract()

    for entry in json.loads(payload[0]):
        loader = ProductLoader(item=Product(),
                               response=response,
                               spider_name=self.name)
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_value('sku', entry['sku'])
        loader.add_value(
            'identifier',
            str(entry['sqlProductID']) + '_' + entry['sku'])
        loader.add_value('name', entry['name'])
        loader.add_value('price', entry['price'])
        if breadcrumbs:
            # A page without breadcrumbs used to raise IndexError here and
            # lose every product on it.
            loader.add_value('category', breadcrumbs[-1])
        loader.add_value('brand', entry['manufacturer'])
        loader.add_xpath('image_url',
                         '//meta[@property="og:image"]/@content')
        # Stock is 0 when inventoryStatus equals 3, otherwise 1.
        loader.add_value('stock', int(entry['inventoryStatus'] != 3))
        yield loader.load_item()
def parse_products(self, response):
    """Yield the products of a listing page, then follow pagination.

    Each ``li`` whose ``itemtype`` contains "Product" becomes a
    ``Product`` with ``TigerChefMeta``. Category and brand come from the
    request meta, which is also forwarded to the next page.

    :param response: listing page response.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    meta = response.meta

    for product in hxs.select('//li[contains(@itemtype, "Product")]'):
        element_id = product.select('@id').extract()
        if not element_id:
            # Skip instead of raising IndexError, which would also lose
            # the next-page request below.
            self.log('WARNING: product without id skipped => %s'
                     % response.url)
            continue

        product_loader = ProductLoader(Product(), product,
                                       spider_name=self.name)
        product_loader.add_xpath('name', './/a[@itemprop="name"]/text()')
        product_loader.add_xpath('url', './/a[@itemprop="name"]/@href')
        product_loader.add_xpath('price',
                                 './/span[@itemprop="price"]/text()')
        product_loader.add_xpath('image_url', 'div/a/img/@src')
        # The identifier is the part of the element id after "product_".
        product_loader.add_value('identifier',
                                 element_id[0].split('product_')[-1])
        product_loader.add_value('category', meta.get('category'))
        product_loader.add_value('brand', meta.get('brand'))

        sku = product.select('.//span[@itemprop="model"]/text()').extract()
        if sku:
            product_loader.add_value('sku', sku[0])

        sold_as = product.select(
            'div/div/div/div/span[contains(text(), "Sold As")]/text()'
        ).extract()

        item = product_loader.load_item()
        metadata = TigerChefMeta()
        metadata['sold_as'] = (
            sold_as[0].split('Sold As: ')[-1].strip() if sold_as else '1 ea')
        item['metadata'] = metadata
        yield item

    next_page = hxs.select(
        '//td[@class="next"]/a[@class="pagerlink"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(base_url, next_page[0]),
                      callback=self.parse_products,
                      meta=meta)
def parse(self, response):
    """Parse a search results page.

    Requests the next page when there is one, otherwise sets
    ``self._search_done``. Products with a visible price are yielded.
    Products showing the "see price" marker instead are appended to
    ``self._add_to_cart_products`` as ``(item_id, url, product)``.

    :param response: search results page response.
    """
    next_page = response.xpath(
        '//*[@class="pagelinks"]/following-sibling::td'
        '//a[contains(text(), "Next")]/@href').extract()
    if next_page:
        yield Request(response.urljoin(next_page[0]),
                      meta={'dont_merge_cookies': True},
                      dont_filter=True)
    else:
        self._search_done = True

    for product_xs in response.xpath(
            '//td[contains(@class, "search-prod")]'):
        # The id is the third dot-separated token of the link's file name.
        product_id = product_xs.xpath(
            './/*[@class="search-item-title"]/a/@href'
        ).extract()[0].split('/')[-1].split('.')[2]

        # Exactly two links mean (brand, sku); otherwise the first link,
        # if any, is the brand. This replaces unpacking inside nested
        # try/except blocks, one of them a bare ``except:``.
        brand_links = product_xs.xpath(
            './/*[@class="search-item-title"]'
            '/following-sibling::div/a/text()').extract()
        sku = None
        if len(brand_links) == 2:
            brand, sku = brand_links
        else:
            brand = brand_links[0] if brand_links else None

        # A real list: ``map`` returns an iterator on Python 3, which is
        # always truthy and cannot be indexed.
        image_urls = [response.urljoin(src)
                      for src in product_xs.xpath('.//img/@src').extract()]

        price = product_xs.xpath('.//*[@class="search-item-price"]').re(
            r'[\d\.,]+')
        add_to_cart = bool(product_xs.xpath(
            './/*[@class="search-item-price"]'
            '/span[@class="see-price-sprite"]'))

        loader = ProductLoader(item=Product(), selector=product_xs)
        identifier = product_id
        if sku:
            identifier = identifier + ' ' + sku.lower()
        loader.add_value('identifier', identifier)
        if sku:
            loader.add_value('sku', sku)
        loader.add_xpath('url', './/*[@class="search-item-title"]/a/@href')
        if image_urls:
            # Swap the small thumbnail path and prefix for the medium ones.
            loader.add_value(
                'image_url',
                image_urls[0].replace('/pics/sm/', '/pics/md/')
                .replace('sm_', 'md_'))
        if brand:
            loader.add_value('brand', brand)
        loader.add_xpath(
            'name', './/*[@class="search-item-title"]/a/strong/text()')

        if price:
            loader.add_value('price', price[0])
            yield loader.load_item()
        elif add_to_cart:
            product = loader.load_item()
            url = response.urljoin(
                product_xs.xpath(
                    './/a[@class="atc-primary"]/@href').extract()[0])
            item_id = url_query_parameter(url, 'ItemID')
            self._add_to_cart_products.append((item_id, url, product))
def parse_product(self, response):
    """Build a ``Product`` from a detail page's schema data and markup.

    Yields the product directly, unless the page shows a struck-out
    price. In that case the product form is posted to the cart URL and
    the product travels in ``meta['product']`` to ``self.parse_price``.

    :param response: product detail page response.
    """
    schema = SpiderSchema(response)
    data = schema.get_product()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', data['Name'])
    loader.add_xpath('category',
                     u'//div[@class="breadcrumbs"]/ul/li[2]/a/text()')

    price = response.xpath(
        '//form[@id="productform"]/input[@name="price"]/@value').extract()
    if price:
        loader.add_value('price', price[0])
    else:
        loader.add_value(
            'price',
            data.get('offers', {}).get('properties', {}).get('price', '0.0'))

    # List comprehension instead of ``map(unicode.strip, ...)``: same list
    # on Python 2, and no dependency on the Python 2-only ``unicode`` name.
    sku = [text.strip() for text in response.xpath(
        '//span[contains(@class, "mfr-number")]/text()').extract()]
    loader.add_value('identifier', data['productID'])
    if sku:
        loader.add_value('sku', sku)
    else:
        loader.add_value('sku', data['productID'].replace('#', ''))

    image_url = data.get('image', '').replace(
        'www.example.com', 'www.webstaurantstore.com')
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))

    brand = data.get('brand', '')
    if not brand:
        brand = response.xpath(
            '//tr[@class="highlight" and '
            './/b[contains(text(), "Manufacturer Name")]]'
            '/td[not(b)]/text()').extract()
        brand = brand[0].strip() if brand else ''
    if brand:
        loader.add_value('brand', brand)

    sold_as = response.xpath(
        '//div[@id="subject"]/div/div/p/span[@class="each"]/text()'
    ).extract()

    product = loader.load_item()
    if product.get('identifier', '').strip():
        metadata = TigerChefMeta()
        metadata['sold_as'] = sold_as[0].replace('/', '') if sold_as else ''
        product['metadata'] = metadata

    # Add to cart to see the price
    if response.xpath(
            '//*[@itemprop="price" and '
            'contains(@class, "strikeOutPrice")][1]'):
        cart_url = 'http://www.webstaurantstore.com/viewcart.html'

        # Read name and value from the same <input>. Zipping two separately
        # extracted lists shifted every later pair when one input lacked
        # either attribute.
        formdata = {}
        for form_input in response.xpath(
                '//form[@id="productform"]/input'):
            input_name = form_input.xpath('./@name').extract()
            if not input_name:
                continue
            input_value = form_input.xpath('./@value').extract()
            formdata[input_name[0]] = input_value[0] if input_value else u''
        # quantity
        formdata[u'qty'] = '1'

        yield FormRequest(url=cart_url,
                          method='POST',
                          formdata=formdata,
                          callback=self.parse_price,
                          meta={
                              'product': product,
                              'dont_merge_cookies': True
                          },
                          dont_filter=True)
    else:
        yield product
def parse_product(self, response):
    """Extract one product, with its selling quantity, from a detail page.

    A quantity other than "each" is added as a second ``name`` value and
    is also stored as ``sold_as`` in ``TigerChefMeta`` (``'1 ea'`` when
    the page gives no quantity). Missing name, brand, image or category
    are logged and the product is still yielded.

    :param response: product detail page response.
    """
    hxs = HtmlXPathSelector(response)

    name = hxs.select('//h1[@id="partNameId"]/text()').extract()

    quantity = hxs.select(
        '//label[@class="productdetail-qtytxt"]/../text()[last()]'
    ).extract()
    if quantity:
        # Turn newlines and tabs into spaces, trim, collapse space runs.
        text = quantity[0]
        for whitespace in ('\n', '\r', '\t'):
            text = text.replace(whitespace, ' ')
        quantity = re.sub(' +', ' ', text.strip())

    loader = ProductLoader(response=response,
                           item=Product(),
                           spider_name=self.name)

    if name:
        loader.add_value('name', name[0].strip())
    else:
        self.log("ERROR name not found")

    brand = hxs.select(
        '//div[@class="productdetail-contentarea-wrapper"]/table/tr/td[.//b[contains(text(),"Manufacturer:")]]/a/text()'
    ).extract()
    if brand:
        loader.add_value("brand", brand[0].strip())
    else:
        self.log("ERROR brand not found")

    img_url = hxs.select(
        '//div[@class="productdetail-productimage"]/a/img/@src').extract()
    if img_url:
        loader.add_value("image_url", img_url[0])
    else:
        self.log("ERROR img_url not found")

    category = hxs.select(
        '(//div[@id="productdetail-crumbcategory"]/ul/li/a)[last()]/text()'
    ).extract()
    if category:
        loader.add_value("category", category[0].strip())
    else:
        self.log("ERROR category not found")

    if quantity and quantity.lower() != 'each':
        loader.add_value('name', quantity)

    loader.add_value('url', response.url)
    loader.add_xpath('price',
                     '//font[@class="txt-purchaseprice20blue"]/text()')

    model_text = ''.join(
        hxs.select('//b[contains(text(), "Model #:")]/../text()').extract())
    sku = model_text.strip()
    tokens = sku.split()
    if len(tokens) == 2 and tokens[0] == tokens[1]:
        # The same value appears twice; keep one copy.
        sku = tokens[0]
    loader.add_value('sku', sku)

    loader.add_xpath('identifier', '//form//input[@name="productId"]/@value')

    item = loader.load_item()
    metadata = TigerChefMeta()
    metadata['sold_as'] = quantity or '1 ea'
    item['metadata'] = metadata
    yield item