def parse(self, response):
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//ul[@id="search-results"]/li/span[@class="wrapper"]')
    for product in products:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value(
            'name',
            product.select(
                './/span[@class="product-title"]/a/text()').extract()[0])
        url = product.select(
            './/span[@class="product-title"]/a/@href').extract()[0]
        loader.add_value('url', url)
        try:
            loader.add_value(
                'price',
                product.select(
                    './/span[@class="product-ourprice"]/text()').extract()[0])
        except IndexError:
            loader.add_value('price', 0)
        yield Request(url, callback=self.parse_product,
                      meta={'loader': loader})

    pages = hxs.select(
        '//div[contains(@class, "nav-pages")][1]//a/@href').extract()
    if pages:
        url = urljoin_rfc(get_base_url(response), pages[-1])
        yield Request(url, callback=self.parse)
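# NOTE: the ProductLoader / AxeMusicProductLoader classes used throughout this
# file are defined elsewhere and are not shown here.  The class below is only
# an illustrative sketch (an assumption, not the project's actual loader) of
# the behaviour the calling code relies on: a TakeFirst output processor plus
# a price input processor that strips currency formatting, so that checks such
# as loader.get_output_value('price') > 0 and item['price'] > 0 compare numbers.
from decimal import Decimal

from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import MapCompose, TakeFirst


def _clean_price(value):
    # '$1,299.00' -> Decimal('1299.00'); '' / 0 / None -> Decimal('0')
    value = unicode(value or '').replace('$', '').replace(',', '').strip()
    return Decimal(value) if value else Decimal('0')


class SketchProductLoader(XPathItemLoader):
    default_output_processor = TakeFirst()
    price_in = MapCompose(_clean_price)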
def parse_products(self, response):
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="productListing"]//h5[a[contains(@class, "product-name")]]/..'
    )
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/a/text()')
        url = product.select('.//a/@href').extract()[0]
        loader.add_value('url', url)
        price = product.select(
            '..//div[@class="product-buttons"]//span[@class="sellPrice"]/text()'
        ).extract()
        if not price:
            price = product.select(
                '..//div[@class="product-buttons"]//div[@class="productSpecialPrice"]/span/text()'
            ).extract()
        loader.add_value('price', price[0])
        yield Request(url, callback=self.parse_product,
                      meta={'loader': loader})
        # loader.load_item()

    next = hxs.select('//a[@title=" Next Page "]/@href').extract()
    if next:
        url = urljoin_rfc(get_base_url(response), next[0])
        yield Request(url, callback=self.parse_products)
def parse_full(self, response):
    meta = response.meta.copy()
    meta['dont_redirect'] = True
    meta['dont_merge_cookies'] = True

    items_number = response.xpath(
        '//div[contains(@class, "pagination")]//span[contains(@class, "bold")]/text()'
    ).re(r'\d+')
    if items_number:
        # Compare as integers: the regex returns strings and a lexicographic
        # comparison would be wrong (e.g. '9' > '10').
        if int(items_number[0]) > int(items_number[1]):
            return

    need_retry = False

    brands = response.xpath('//dl[@class="brandsList"]//a/@href').extract()
    for brand in brands:
        yield Request(brand, callback=self.parse_full)

    cats = response.xpath('//li[@data-selenium="category"]//@href').extract()
    if cats:
        for cat in cats:
            meta['try'] = 0
            yield Request(url=canonicalize_url(cat),
                          callback=self.parse_full,
                          meta=meta,
                          errback=lambda failure, url=canonicalize_url(cat),
                          metadata=meta: self.bsm_retry_download(
                              failure, url, metadata, self.parse_full))

    products = response.xpath(
        '//div[contains(@class, "item") and contains(@class, "clearfix")]')
    if products:
        for product in products:
            try:
                brand = product.xpath(
                    './/span[@itemprop="brand"]/text()').extract()[0]
            except IndexError:
                brand = ''
            try:
                title = product.xpath(
                    './/span[@itemprop="name"]/text()').extract()[0]
            except IndexError:
                continue
            name = ' '.join((brand, title))
            url = product.xpath('.//a[@itemprop="url"]/@href').extract()[0]
            price = ''.join(
                product.xpath('.//*[contains(@class, "price")]/text()')
                .extract()).strip()
            identifier = product.xpath(
                './/input[@name="sku"]/@value').extract()
            if identifier:
                identifier = identifier[0]
                id_part = product.xpath(
                    './/input[@name="is"]/@value').extract()
                if id_part:
                    identifier = identifier + '-' + id_part[0]
            else:
                self.log('No identifier found for %s on %s' %
                         (name, response.url))
                continue
            if not price:
                for data in response.xpath('//div/@data-itemdata').extract():
                    json_data = json.loads(data)
                    if json_data['sku'] in identifier.split('-'):
                        price = json_data['price']
                        break
            sku = product.xpath(
                './/p[contains(@class, "skus")]//span[@class="sku"]/text()'
            ).extract()
            if sku:
                sku = sku[-1]
            else:
                sku = ''
            image_url = product.xpath(
                'div/a[@name="image"]/img/@src').extract()
            if not image_url:
                image_url = product.xpath(
                    'div[@class="img-zone zone"]//img/@data-src').extract()
            if not image_url:
                image_url = product.xpath(
                    'div[@class="img-zone zone"]//img/@src').extract()
            if image_url:
                image_url = response.urljoin(image_url[0])
            else:
                image_url = ''
            category = response.xpath(
                '//ul[@id="breadcrumbs"]/li/a/text()').extract()[-1].strip()
            if category.lower() == "home":
                category = response.xpath(
                    '//ul[@id="breadcrumbs"]/li[@class="last"]/text()'
                ).extract()[-1].strip()
            if identifier:
                if not price:
                    price = '0.0'
                loader = AxeMusicProductLoader(item=Product(),
                                               selector=product)
                loader.add_value('url', url)
                loader.add_value('identifier', identifier)
                loader.add_value('sku', sku)
                loader.add_value('image_url', image_url)
                if brand:
                    loader.add_value('brand', brand)
                loader.add_value('category', category)
                loader.add_value('name', name)
                loader.add_value('price', price)
                if (url not in self.product_pages
                        and loader.get_output_value('price') > 0):
                    item = loader.load_item()
                    if item['identifier'].endswith('-REG'):
                        item['identifier'] = item['identifier'].replace(
                            '-REG', '')
                    yield item
                    self.product_pages.add(url)
    elif not cats:
        need_retry = True

    pages = response.xpath(
        '//div[contains(@class, "pagination-zone")]//a/@href').extract()
    for page_url in pages:
        meta['try'] = 0
        yield Request(callback=self.parse_full,
                      url=canonicalize_url(page_url),
                      meta=meta)

    if need_retry:
        retry = response.meta.get('try', 0)
        if retry < 15:
            meta = response.meta.copy()
            meta['try'] = retry + 1
            self.log("Try %d. Retrying to download %s" %
                     (meta['try'], response.url))
            yield Request(url=response.url,
                          callback=self.parse_full,
                          dont_filter=True,
                          meta=meta)
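# parse_full() above (and parse() further down) hand download failures to
# retry helpers such as bsm_retry_download / retry_download that are defined
# outside this section.  The method below is only a sketch of how such an
# errback could look, assuming the same retry budget as the inline logic
# (meta['try'] < 15); it is not the project's actual implementation.
def bsm_retry_download(self, failure, url, metadata, callback):
    """Hypothetical errback: re-issue a failed request a limited number of times."""
    retry = metadata.get('try', 0)
    if retry >= 15:
        self.log('Giving up on %s after %d tries' % (url, retry))
        return
    meta = dict(metadata)
    meta['try'] = retry + 1
    self.log('Retry %d for %s' % (meta['try'], url))
    # Requests returned from an errback are scheduled like callback output.
    return Request(url, callback=callback, meta=meta, dont_filter=True)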
def parse_product(self, response):
    meta = response.meta
    url = response.url

    price = ''
    for line in response.body.split('\n'):
        if "MAIN:No^Refrnce" in line:
            price = line.split('");')[0].split(', "')[-1]
    if not price:
        try:
            price = response.xpath(
                '//span[@itemprop="price"]/text()').extract()[0].replace(
                    ',', '')
        except IndexError:
            pass

    identifier = meta.get('identifier')
    if not identifier:
        identifier = response.xpath(
            '//form[@name="addItemToCart"]//input[@name="sku"]/@value'
        ).extract()
        if not identifier:
            identifier = response.xpath(
                '//input[@name="useMainItemSku"]/@value').extract()
        id_part = response.xpath('//form/input[@name="is"]/@value').extract()
        if identifier:
            identifier = identifier[0]
            if id_part:
                identifier = identifier + '-' + id_part[0]
        else:
            self.log('Product without identifier: ' + response.url)
            return

    if not price:
        for data in response.xpath('//div/@data-itemdata').extract():
            json_data = json.loads(data)
            if json_data['sku'] in identifier.split('-'):
                price = json_data['price']
                break

    image_url = meta.get('image_url')
    if not image_url:
        image_url = response.xpath('//img[@id="mainImage"]/@src').extract()

    brand = meta.get('brand')
    if not brand:
        brand = response.xpath(
            '//div[@id="tMain"]//div[@class="mfrLogo"]//img[1]/@alt'
        ).extract()

    category = meta.get('category')
    if not category:
        try:
            category = response.xpath(
                '//ul[@id="breadcrumbs"]/li/a/text()').extract()[-1].strip()
        except IndexError:
            pass

    sku = meta.get('sku')
    if not sku:
        sku = map(
            lambda s: s.replace(' ', '').lower(),
            response.xpath(
                '//meta[@itemprop="productID" and contains(@content, "mpn:")]/@content'
            ).re(r'mpn:([\w\s\.-]+)'))

    name = meta.get('name')
    if not name:
        name = ''.join(
            response.xpath('//*[@itemprop="name"]//text()').extract()).strip()

    if identifier:
        loader = AxeMusicProductLoader(item=Product(), response=response)
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('url', url)
        loader.add_value('sku', sku)
        loader.add_value('name', name)
        loader.add_value('price', price)
        product = loader.load_item()
        # BSM simple run duplicates fix
        if isinstance(self, BigSiteMethodSpider) and self.simple_run and (
                product['identifier'] not in self.matched_identifiers):
            self.matched_identifiers.add(product['identifier'])
            if product['price'] > 0:
                if product['identifier'].endswith('-REG'):
                    product['identifier'] = product['identifier'].replace(
                        '-REG', '')
                yield product
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)

    redirected_urls = response.meta.get('redirect_urls', None)
    if redirected_urls:
        log.msg('Skips product, redirected url: ' + str(redirected_urls[0]))
        return

    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_value(
        'identifier',
        re.search('p-(\d+)\.html', response.url).group(1))
    name = hxs.select(
        u'//td[@class="pageHeading" and @valign="top" and not(@align)]/text()'
    ).extract()[0]
    product_loader.add_value('name', name)
    price = ''.join(
        hxs.select(
            u'//td[@class="pageHeading" and @valign="top" and @align="right"]/text()'
        ).extract()).strip()
    if not price:
        price = ''.join(
            hxs.select(
                u'//td[@class="pageHeading" and @valign="top" and @align="right"]/span[@class="productSpecialPrice"]/text()'
            ).extract())
    product_loader.add_value('price', price)
    product_loader.add_xpath(
        'sku',
        u'//td[@class="pageHeading" and @valign="top" and not(@align)]/span[@class="smallText"]/text()',
        re='\[(.*)\]')
    product_loader.add_value('category', response.meta.get('category'))
    image_url = hxs.select(
        u'//a[contains(@href,"images") and child::img]/@href').extract()
    if image_url:
        image_url = urljoin_rfc(get_base_url(response), image_url[0])
        product_loader.add_value('image_url', image_url)
    # product_loader.add_xpath('brand', u'')
    brand = ''
    brands = hxs.select(
        '//form[@name="manufacturers"]/select/option/text()').extract()
    for brand in brands:
        if '..' in brand:
            incomplete_brand = ' '.join(brand.split()[:-1])
            if incomplete_brand.lower() in name.lower():
                product_loader.add_value('brand', brand.replace('..', ''))
        else:
            if brand.lower() in name.lower():
                product_loader.add_value('brand', brand.replace('..', ''))
                break
    yield product_loader.load_item()
def parse_products(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    category = response.meta.get('category', '')

    for url in hxs.select('//div[@class="pages"]//a/@href').extract():
        yield Request(url, callback=self.parse_products, meta=response.meta)

    products = hxs.select('//li[contains(@class, "item")]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        try:
            model = map(
                unicode.strip,
                product.select('.//p[contains(text(), "model: ")]/text()')
                .re(r'model: (.*)'))[0]
        except:
            model = ''
        name = product.select(
            './/h2[@class="product-name"]/a/text()').extract()
        if name:
            name = name[0].strip()
        else:
            name = ''
        loader.add_value('name', ' '.join((name, model)))
        url = product.select(
            './/h2[@class="product-name"]/a/@href').extract()[0].strip()
        identifier = product.select(
            './/span[contains(@id, "product-price-")]/@id').re(
                r'product-price-(\d+)')
        if not identifier:
            identifier = product.select(
                './/ul[@class="add-to-links"]/li/a[@class="link-compare" or @class="link-wishlist"]/@href'
            ).re('product/(.*?)/')
        if identifier:
            prod_id = identifier[0]
            loader.add_value('identifier', prod_id)
        loader.add_value('url', url.split('?')[0])
        try:
            brand = map(
                unicode.strip,
                product.select(
                    './/p[contains(text(), "manufacturer: ")]/text()').re(
                        r'manufacturer: (.*)'))[0]
        except:
            brand = product.select('td[3]//text()').extract()
        loader.add_value('brand', brand)
        if model:
            loader.add_value('sku', model)
        image_url = product.select(
            './/a[@class="product-image"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        try:
            price = product.select(
                './/span[contains(@id, "product-price-")]/span[@class="price"]/text()'
            ).extract()[0].strip()
        except:
            try:
                price = product.select(
                    './/span[contains(@id, "product-price-") and contains(@class, "price")]/text()'
                ).extract()[0].strip()
            except:
                price = '0.0'
        loader.add_value('price', price)
        loader.add_value('category', category)

        if loader.get_collected_values('identifier') and \
                loader.get_collected_values('identifier')[0]:
            product = loader.load_item()
            if product['price'] > 0:
                yield product
        else:
            self.log('IDENTIFIER NOT FOUND!!! {}'.format(
                loader.get_output_value('url')))
def parse(self, response):
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    meta = response.meta.copy()

    cats = hxs.select('//*[@id="tContent"]/div/div/div[@class="column"]'
                      '/ul/li/a/@href').extract()

    pages = hxs.select(
        '//div[contains(@class, "pagination-zone")]//a/@href').extract()
    for page_url in pages:
        yield Request(callback=self.parse,
                      url=canonicalize_url(page_url),
                      errback=lambda failure, url=canonicalize_url(page_url),
                      metadata=meta: self.retry_download(
                          failure, url, metadata, self.parse))

    products = hxs.select(
        '//div[contains(@class, "item") and contains(@class, "clearfix")]')
    if products:
        for product in products:
            try:
                brand = product.select(
                    './/span[@itemprop="brand"]/text()').extract()[0]
            except IndexError:
                brand = ''
            title = product.select(
                './/span[@itemprop="name"]/text()').extract()[0]
            name = ' '.join((brand, title))
            url = product.select('.//a[@itemprop="url"]/@href').extract()[0]
            identifier = product.select(
                './/input[@name="sku"]/@value').extract().pop()
            price = 0
            for data in hxs.select('//div/@data-itemdata').extract():
                json_data = json.loads(data)
                if json_data['sku'] == identifier:
                    price = json_data['price']
                    break
            if not price:
                price = product.select(
                    './/div[@class="price-zone"]/div[@class="atc-price"]'
                    '//strong[contains(@class, "price")]/text()').extract()
            try:
                sku = product.select(
                    './/p[contains(@data-selenium, "skus")]//span[@class="sku"]/text()'
                ).extract()[-1]
            except:
                sku = ''
            image_url = product.select(
                './/a[@class="itemImg"]/img/@data-src').extract(
                ) or product.select(
                    './/a[@class="itemImg"]/img/@src').extract()
            if image_url:
                image_url = urljoin_rfc(base_url, image_url[0])
            else:
                image_url = ''
            category = hxs.select(
                '//ul[@id="breadcrumbs"]/li/a/text()').extract()[-1].strip()
            if category.lower() == "home":
                category = hxs.select(
                    '//ul[@id="breadcrumbs"]/li[@class="last"]/text()'
                ).extract()[-1].strip()
            bushnell_product = self.bushnell_products.get(
                sku.upper().strip(), None)
            if bushnell_product:
                category = bushnell_product['Class']
                log.msg('Extracts category "%s" from bushnell file, URL: %s'
                        % (category, response.url))
            if url not in self.urls_list:
                if price:
                    self.urls_list.append(url)
                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('url', url)
                    loader.add_value('identifier', identifier)
                    loader.add_value('sku', sku)
                    loader.add_value('image_url', image_url)
                    loader.add_value('brand', brand)
                    loader.add_value('category', category)
                    loader.add_value('name', name)
                    loader.add_value('price', price)
                    product = loader.load_item()
                    yield self._get_reviews_url(product)
                else:
                    # parse product page if price not found
                    meta = {
                        'name': name,
                        'brand': brand,
                        'category': category,
                        'identifier': identifier,
                        'image_url': image_url,
                        'sku': sku
                    }
                    yield Request(
                        url=url,
                        callback=self.parse_product,
                        meta=meta,
                        errback=lambda failure, url=url, metadata=meta:
                        self.retry_download(failure, url, metadata,
                                            self.parse_product))
    elif not cats:
        retry = response.meta.get('try', 0)
        if retry < 15:
            meta = response.meta.copy()
            meta['try'] = retry + 1
            yield Request(
                url=response.url,
                dont_filter=True,
                callback=self.parse,
                errback=lambda failure, url=response.url, metadata=meta:
                self.retry_download(failure, url, metadata, self.parse))
def parse_product(self, response):
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    url = response.url

    price = ''
    for line in hxs.extract().split('\n'):
        if "MAIN:No^Refrnce" in line:
            price = line.split('");')[0].split(', "')[-1]
    if not price:
        try:
            price = hxs.select(
                '//span[@itemprop="price"]/text()').extract()[0].replace(
                    ',', '')
        except:
            pass

    identifier = meta.get('identifier')
    if not identifier:
        identifier = hxs.select(
            '//form[@name="addItemToCart"]//input[@name="sku"]/@value'
        ).extract()[0]

    image_url = meta.get('image_url')
    if not image_url:
        image_url = hxs.select('//img[@id="mainImage"]/@src').extract()

    brand = meta.get('brand')
    if not brand:
        brand = hxs.select(
            '//div[@id="tMain"]//div[@class="mfrLogo"]//img[1]/@alt'
        ).extract()

    category = meta.get('category')
    if not category:
        try:
            category = hxs.select(
                '//ul[@id="breadcrumbs"]/li/a/text()').extract()[-1].strip()
        except:
            pass

    sku = meta.get('sku')
    if not sku:
        sku = hxs.select(
            '//meta[@itemprop="productID" and contains(@content, "mpn:")]/@content'
        ).re(r'mpn:(\w+)')
        if sku:
            bushnell_product = self.bushnell_products.get(
                sku[0].upper().strip(), None)
            if bushnell_product:
                category = bushnell_product['Class']
                log.msg('Extracts category "%s" from bushnell file, URL: %s'
                        % (category, response.url))

    name = meta.get('name')
    if not name:
        name = ''.join(
            hxs.select('//h1[@itemprop="name"]//text()').extract()).strip()

    if url not in self.urls_list:
        self.urls_list.append(url)
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('url', url)
        loader.add_value('sku', sku)
        loader.add_value('name', name)
        loader.add_value('price', price)
        product = loader.load_item()
        yield self._get_reviews_url(product)
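# _get_reviews_url() is called above but defined elsewhere.  The sketch below
# is only an assumption about its shape: it wraps the loaded product in a
# request for a reviews endpoint so a separate callback (parse_reviews, also
# hypothetical here) can attach review data before the item is finally
# yielded.  The URL pattern is a placeholder, not the site's real endpoint.
def _get_reviews_url(self, product):
    """Hypothetical helper: request the product's reviews page, carrying the item in meta."""
    reviews_url = 'http://example.com/reviews?product_id=%s' % product['identifier']
    return Request(reviews_url, callback=self.parse_reviews,
                   meta={'product': product}, dont_filter=True)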
def parse_product(self, response):
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    # Fill up the Product model fields
    url = response.url
    brand = ''.join(
        response.xpath(
            '//span[@id="product-brand"]/text()').extract()).strip()
    name = ''.join(
        response.xpath(
            '//span[@id="product-header-name"]/text()').extract()).strip()
    full_name = brand + ' - ' + name
    # The price can be tagged in either <b> or <span>, or None
    price = response.xpath(
        '//span[@id="product-regular-price"]/text()').extract()
    if not price:
        price = response.xpath(
            '//span[@id="product-sale-price"]/text()').extract()
    if not price:
        price = 0  # Call for pricing
    sku = response.xpath('//h2[@id="product-model"]/text()').extract()
    identifier = response.xpath(
        '//span[@id="product-sku"]/text()').extract()
    category = response.xpath(
        '//div[@class="products-bredcrumbs"]/a/text()').extract()
    if len(category) > 1:
        category = category[1]
    else:
        category = ""
    image_url = response.xpath('//img[@id="product-image"]/@src').extract()
    if image_url:
        image_url = urljoin_rfc(base_url, image_url.pop())

    l = ProductLoader(response=response, item=Product())
    l.add_value('url', url)
    l.add_value('name', name)
    l.add_value('price', price)
    l.add_value('sku', sku)
    l.add_value('identifier', identifier)
    l.add_value('category', category)
    if image_url:
        l.add_value('image_url', image_url)
    l.add_value('brand', brand)

    item = l.load_item()
    if item['identifier'] not in self.identifiers and item['price'] > 0:
        self.identifiers.add(item['identifier'])
        yield item
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//div[@itemprop="name"]/text()')
    price = hxs.select(u'//span[@itemprop="price"]/text()').extract()
    price = price[0] if price else '0'
    product_loader.add_value('price', price)
    product_id = hxs.select(
        u'//form//input[@type="hidden" and @name="products_id"]/@value'
    ).extract()
    if not product_id:
        product_id = hxs.select(
            '//div[@id="productTellFriendLink"]/a/@href').re(
                'products_id=(.*)')
    if not product_id:
        product_id = re.findall(r'products_id=(.*)" class', response.body)
    if not product_id:
        log.msg('Product without identifier: ' + response.url)
        return
    product_loader.add_value('identifier', product_id[0])
    sku = hxs.select(u'//span[@itemprop="identifier"]/text()').extract()
    if sku:
        product_loader.add_value('sku', sku[0])
    product_loader.add_xpath('category',
                             u'//div[@id="navBreadCrumb"]/a[2]/text()')
    img = hxs.select(u'//div[@id="productMainImage"]//img/@src').extract()
    if img:
        img = urljoin_rfc(get_base_url(response), img[0])
        product_loader.add_value('image_url', img)
    brand = hxs.select('//li[@itemprop="brand"]/text()').extract()
    if brand:
        brand = brand[0].replace('Manufactured by: ', '')
        product_loader.add_value('brand', brand)
    product = product_loader.load_item()
    if product['price'] > 0:
        yield product
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    product_loader = AxeMusicProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name',
                             u'//div[@class="product-name"]/h1/text()')
    price = hxs.select(
        u'//div[@class="price-box"]//span[@class="price"]/text()').extract()
    if price:
        price = price[0].strip()
        product_loader.add_value('price', price)
    else:
        return
    product_loader.add_xpath(
        'sku', u'//div[@class="sku"]/span[@class="value"]/text()')
    product_loader.add_xpath(
        'category', u'//div[@class="breadcrumbs"]/ul/li[2]/a/span/text()')
    img = hxs.select('//img[@id="image-main"]/@src').extract()
    if img:
        img = urljoin_rfc(get_base_url(response), img[0])
        product_loader.add_value('image_url', img)
    identifier = hxs.select('//meta[@itemprop="productID"]/@content').re(
        'sku:(.*)')[0]
    product_loader.add_value('identifier', identifier)
    product_loader.add_value(
        'brand',
        self._get_brand_from_name(product_loader.get_output_value('name')))
    # stock_status = ''.join(hxs.select('//p[@class="availability in-stock"]/h10/text()').extract()).strip()
    # if stock_status:
    #     if 'OUT OF STOCK' in stock_status.upper():
    #         product_loader.add_value('stock', 0)
    yield product_loader.load_item()
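# _get_brand_from_name() is used above but not defined in this section.  A
# minimal sketch of the idea is given below, assuming the spider keeps a list
# of known brand names in self.brands (an assumption): return the first known
# brand that appears in the product name.
def _get_brand_from_name(self, name):
    """Hypothetical helper: match a product name against a known-brands list."""
    if not name:
        return ''
    lowered = name.lower()
    for brand in getattr(self, 'brands', []):
        if brand.lower() in lowered:
            return brand
    return ''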
def parse(self, response):
    if ('temporarilyUnavailable' in response.url
            and 'redirect_urls' in response.meta
            and response.meta['redirect_urls']):
        url = response.meta['redirect_urls'][0]
        yield Request(url, dont_filter=True)
        return

    urls = response.xpath(
        '//nav[@id="category-navigation"]//a/@href').extract()
    for url in map(lambda u: response.urljoin(u), urls):
        yield Request(url)

    urls_all = response.xpath(
        '//span[@class="class-type-more"]/a/@href').extract()
    for url in map(lambda u: response.urljoin(u), urls_all):
        yield Request(url)

    if not urls_all:
        pages = set(
            response.xpath(
                '//div[@class="pagination-container"]//li[@class="next"]/a/@href'
            ).extract())
        for url in map(lambda u: response.urljoin(u), pages):
            yield Request(url)

    products = response.xpath(
        '//div[@class="products"]/div[@class="product" and div[contains(@class, "cart")]]'
    )
    for product in products:
        product_name = ' '.join(
            product.xpath(
                './/h3[contains(@class, "name")]//text()').extract())
        product_url = map(
            lambda u: response.urljoin(u),
            product.xpath(
                './/h3[contains(@class, "name")]/a/@href').extract())[0]
        product_price = product.xpath(
            './/div[@class="product-price"]/span[@class="price"]/text()'
        ).extract()[0]
        product_image = map(
            lambda u: response.urljoin(u),
            product.xpath(
                './/div[@class="product-image"]//img/@data-echo').extract())[0]
        product_sku = filter(lambda l: l.strip(), product_url.split('/'))[-1]
        product_brand = product.xpath(
            './/h3[contains(@class, "name")]//text()').extract()[0]
        product_identifier = '%s-%s' % (product_brand.strip(),
                                        product_sku.strip())
        product_category = response.xpath(
            '//div[@class="breadcrumb-inner"]//li//span[@itemprop="title"]/text()'
        ).extract()

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('name', product_name)
        loader.add_value('url', product_url)
        loader.add_value('price', product_price)
        loader.add_value('image_url', product_image)
        loader.add_value('sku', product_sku)
        loader.add_value('brand', product_brand)
        loader.add_value('identifier', product_identifier)
        loader.add_value('category', product_category)
        item = loader.load_item()
        if item['identifier'] not in self._identifier_name:
            self._identifier_name[item['identifier']] = item['name']
        else:
            item['name'] = self._identifier_name[item['identifier']]
        yield item

    if not products:
        urls = response.xpath('//h3[@class="name"]/a/@href').extract()
        for url in map(lambda u: response.urljoin(u), urls):
            yield Request(url)
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//span[@itemprop="name"]/text()')
    price = hxs.select(
        u'//form[@id="vCSS_mainform"]//span[@itemprop="price"]/text()'
    ).extract()
    price = price[0] if price else u'0'
    product_loader.add_value('price', price)
    product_loader.add_xpath('sku', u'//span[@class="product_code"]/text()')
    product_loader.add_xpath('identifier',
                             u'//span[@class="product_code"]/text()')
    product_loader.add_xpath(
        'category',
        u'//td[@class="vCSS_breadcrumb_td"]//a[position()=2]/@title')
    product_loader.add_xpath(
        'image_url',
        u'concat("http://lamusic.ca",//img[@id="product_photo"]/@src)')
    product_loader.add_xpath('brand',
                             u'//meta[@itemprop="manufacturer"]/@content')
    availability_label = ''.join(
        hxs.select(
            '//b[contains(text(), "Availability:")]/text()').extract()).strip()
    # in_stock = 'IN STOCK' in ''.join(hxs.select('//div[@itemprop="offers"]/text()').extract()).strip().upper()
    # if availability_label and not in_stock:
    #     product_loader.add_value('stock', 0)
    if hxs.select(u'//img[@class="vCSS_img_icon_free_shipping"]'):
        product_loader.add_value('shipping_cost', '0')
    product = product_loader.load_item()
    if hxs.select(u'//tr[@class="Multi-Child_Background"]'):
        for opt in hxs.select(u'//tr[@class="Multi-Child_Background"]'):
            p = Product(product)
            p['sku'] = opt.select(u'./td[1]/text()').extract()[0].strip()
            p['identifier'] = opt.select(
                u'./td[1]/text()').extract()[0].strip()
            p['name'] = opt.select(u'./td[2]/text()').extract()[0].strip()
            try:
                p['price'] = opt.select(
                    u'./td[4]//span[@itemprop="price"]/text()').extract(
                    )[0].strip().replace('$', '').replace(',', '')
            except:
                price = opt.select(u'./td[4]//span/text()').extract()
                if not price:
                    price = opt.select(
                        u'./td[3]//span[contains(text(), "$")]/text()'
                    ).extract()
                p['price'] = price[0].strip().replace('$', '').replace(
                    ',', '')
            if p.get('identifier') and p.get('price') > 0:
                yield p
    elif product.get('identifier') and product.get('price') > 0:
        yield product
def parse_products(self, response):
    hxs = HtmlXPathSelector(response)
    brands = map(
        strip,
        hxs.select(
            '//*[@id="facets"]//div[contains(label/text(), "Search Brands")]/ul/li/a/text()'
        ).re(r'(.*) \(\d+\)'))

    products = hxs.select(
        '//div[@class="productGrid"]//div[@class="product"]')
    for product in products:
        try:
            name = product.select(
                './/div/strong/a/text()').extract()[0].strip()
        except:
            continue
        image_url = product.select(
            './/div[@class="thumb "]/span/img/@data-original').extract()
        category = hxs.select(
            '//ol[@class="breadcrumbs"]/li/a/text()').extract()[-2]
        brand = filter(lambda b: b in name, brands)
        url = urljoin_rfc(
            get_base_url(response),
            product.select('.//div/strong/a/@href').extract()[0].strip())
        price = ' '.join(''.join(
            product.select(
                'div/span[@class="productPrice"]/text()').extract()).split())
        if not price:
            price = ' '.join(''.join(
                product.select(
                    'div/dl[@class="productUsedPrice"]//dd/text()')
                .extract()).split())
        sku = product.select(
            'var[contains(@class, "productId")]/text()').extract()[0]

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('sku', sku)
        loader.add_value('category', category)
        if image_url:
            loader.add_value(
                'image_url',
                urljoin_rfc(get_base_url(response), image_url[0]))
        loader.add_value('brand', brand)
        loader.add_value('identifier', sku)
        if 'Email for Price' in price:
            log.msg('Email for price')
            loader.add_value('price', 0)
            metadata = AxeMeta()
            metadata['price'] = 'Email for Price'
            prod = loader.load_item()
            prod['metadata'] = metadata
        else:
            loader.add_value('price', price)
            prod = loader.load_item()
        yield Request(url, callback=self.parse_product,
                      meta={'product': prod})

    next_page = hxs.select('//a[@class="next_link"]/@href').extract()
    if next_page:
        url = urljoin_rfc(get_base_url(response), next_page[0])
        yield Request(url, callback=self.parse_products)
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    meta_url = hxs.select(
        '//meta[@property="og:url"]/@content').extract()[0]
    parsed = urlparse.urlparse(meta_url)
    params = urlparse.parse_qs(parsed.query)

    name = hxs.select('//td[@class="text11 bold"]//h1/text()').extract()
    sku = hxs.select('//div[@class="grey text12"]/text()').re(
        r'Model: ([\w-]+)')
    price = hxs.select(
        '//table[@class="bold text11"]//tr[@class="bold darkBlue"]/td[2]/text()'
    ).extract()
    category = hxs.select('//div[@id="breadcrums"]/a[1]/text()').extract()
    img_url = hxs.select('//img[@id="itemImage"]/@src').extract()[0]
    base_url = get_base_url(response)
    img_url = urljoin_rfc(base_url, img_url)
    brand = hxs.select(
        '//div[@class="grey text12"]/following-sibling::img[1]/@src'
    ).extract()
    if brand:
        brand = brand[0]
        brand = re.search('([\w]+)\.+', brand).group(1)

    if not price:
        # If product has sub-products
        prod_list = hxs.select(
            '//div[@class="grey text12"]/following-sibling::table[1]//select/option/@value'
        ).extract()
        for prod in prod_list:
            item_id = str('itemID=' + prod)
            url = re.sub('itemID=([\d]+)', item_id, response.url)
            yield Request(url, callback=self.parse_product)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', name)
    if price:
        loader.add_value('price', price)
    else:
        loader.add_value('price', [u'$0.0'])
    loader.add_value('identifier', params['itemID'])
    loader.add_value('url', response.url)
    loader.add_value('sku', sku)
    loader.add_value('category', category)
    loader.add_value('image_url', img_url)
    if brand:
        loader.add_value('brand', brand)
    # Not Found - Shipping cost
    yield loader.load_item()