def parse(self, response):
    """Yield the best-priced competitor offer from a JSON search response.

    If the first result carries an MPN, the search is restricted to items
    sharing that MPN (case-insensitive) and the cheapest one passing
    valid_price() is yielded.  Otherwise the first item in the full result
    list with a valid price is yielded.  Offers hosted on FILTER_DOMAINS
    are skipped in both modes.
    """
    data = json.loads(response.body)
    index = 0
    # MPN of the very first product, if any.
    mpns = data['items'][0]['product'].get('mpns', [''])[0]
    if mpns:
        # Keep only items whose first MPN matches, case-insensitively,
        # and track the cheapest valid offer among them.
        lowest = None
        data_mpns = {
            'items': [
                entry for entry in data['items']
                if entry['product'].get('mpns', [''])[0].lower() == mpns.lower()
            ]
        }
        while True:
            res = self._get_item(data_mpns, index, response)
            if not res:
                break
            pr, item = res[0], res[1]
            # Offers hosted on blacklisted domains are skipped outright.
            if any(self._check_domain(domain, pr['url'])
                   for domain in FILTER_DOMAINS):
                index += 1
                continue
            if valid_price(response.meta['price'], pr['price']) and \
                    (lowest is None or lowest['price'] > pr['price']):
                lowest = pr
            index += 1
        if lowest:
            yield lowest
    else:
        # No MPN available: fall back to the first result whose price
        # lies inside the accepted range.
        first_valid = None
        while True:
            res = self._get_item(data, index, response)
            if not res:
                break
            pr, item = res[0], res[1]
            if any(self._check_domain(domain, pr['url'])
                   for domain in FILTER_DOMAINS):
                index += 1
                continue
            if valid_price(response.meta['price'], pr['price']):
                first_valid = pr
                break
            index += 1
        if first_valid:
            yield first_valid
def parse(self, response):
    """Yield the cheapest valid-priced Amazon search result, if any."""
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_xpath(
            'name',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()')
        loader.add_xpath(
            'url',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href')
        loader.add_xpath('price', './/*[@class="newPrice"]//span/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        price = loader.get_output_value('price')
        # Keep the cheapest result whose price is inside the allowed range.
        if price and (best is None or best.get_output_value('price') > price):
            if valid_price(response.meta['price'], price):
                best = loader
    if best:
        yield best.load_item()
def parse(self, response):
    """Yield the cheapest result matching every search term, otherwise
    the cheapest valid-priced result overall."""
    hxs = HtmlXPathSelector(response)
    candidates = []
    for box in hxs.select(u'//div[@class="resultBox"]'):
        loader = ProductLoader(item=Product(), selector=box)
        rel_url = box.select(u'./h2/a/@href')[0].extract()
        loader.add_value('url', urljoin_rfc(get_base_url(response), rel_url))
        loader.add_value('name',
                         box.select(u'./h2/a/text()')[0].extract().strip())
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        # Continental decimal comma -> dot before loading the price.
        raw_price = box.select(
            u'./ul/li[@class="price"]/h3[@class="mainPrice"]/text()'
        )[0].extract().replace(',', '.')
        loader.add_value('price', raw_price)
        if valid_price(response.meta['price'],
                       loader.get_output_value('price')):
            candidates.append(loader)
    candidates.sort(key=lambda entry: entry.get_output_value('price'))
    # Prefer the cheapest result whose name contains every search term.
    terms = response.meta['model'].lower().split(' ')
    for candidate in candidates:
        name = candidate.get_output_value('name')
        if all(term in name.lower() for term in terms):
            yield candidate.load_item()
            return
    if candidates:
        yield candidates[0].load_item()
def parse(self, response):
    """Parse a single search result; re-issue the request up to three
    times when the expected result markup is missing."""
    hxs = HtmlXPathSelector(response)
    product = hxs.select('//td[@r="1"]')
    if not product:
        product = hxs.select('//table[@r="1"]')
    if not product:
        attempts = response.meta.get('_retries', 0)
        if attempts >= 3:
            # Give up silently after three failed retries.
            return
        yield Request(response.url,
                      meta={'sku': response.meta['sku'],
                            '_retries': attempts + 1},
                      dont_filter=True)
        return
    loader = ProductLoader(item=Product(), selector=product)
    loader.add_xpath('name', './/div[@class="ittl"]//a[@class="vip"]/text()')
    loader.add_xpath('url', './/div[@class="ittl"]//a[@class="vip"]/@href')
    # Several price layouts exist; probe them all, the loader keeps matches.
    loader.add_xpath('price',
                     './/div[@class="prices"]//span[@class="amt"]/text()')
    loader.add_xpath('price',
                     './/div[@class="prices"]//span[@class="g-b amt"]/text()')
    loader.add_xpath('price', './/td[@class="prc"]//div[@class="g-b"]/text()')
    loader.add_xpath('price', './/*[@itemprop="price"]/text()')
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'])
    # Skip the "apparelsave" reseller and out-of-range prices.
    if 'apparelsave' not in loader.get_output_value('name').lower() \
            and valid_price(response.meta['price'],
                            loader.get_output_value('price')):
        yield loader.load_item()
def parse(self, response):
    """Parse Amazon results with BeautifulSoup and yield the cheapest
    product whose price passes valid_price()."""
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    cheapest = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        # The result markup is easier to traverse with BeautifulSoup here.
        soup = BeautifulSoup(result.extract())
        header = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', header.findAll('span')[0].string)
        loader.add_value('url', header.findAll('a')[0]['href'])
        loader.add_value(
            'price',
            soup.find('ul', attrs={'class': 'rsltL'}).findAll('span')[0].string)
        current_price = loader.get_output_value('price')
        if not current_price:
            continue
        if cheapest is not None and \
                cheapest.get_output_value('price') <= current_price:
            continue
        if valid_price(response.meta['price'], current_price):
            cheapest = loader
    if cheapest:
        yield cheapest.load_item()
def parse(self, response):
    """Yield the lowest-priced search result that passes the price filter."""
    hxs = HtmlXPathSelector(response)
    listing = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    winner = None
    for entry in listing:
        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_xpath(
            'name',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()'
        )
        loader.add_xpath(
            'url',
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href'
        )
        loader.add_xpath('price', './/*[@class="newPrice"]//span/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        entry_price = loader.get_output_value('price')
        if not entry_price:
            continue
        is_cheaper = winner is None or \
            winner.get_output_value('price') > entry_price
        if is_cheaper and valid_price(response.meta['price'], entry_price):
            winner = loader
    if winner:
        yield winner.load_item()
def parse(self, response):
    """Yield the best competitor offer from a JSON search response.

    When the first product carries an MPN, the cheapest valid-priced
    offer sharing that exact MPN is yielded; otherwise the first offer
    with a valid price is yielded.  Offers from FILTER_DOMAINS are
    ignored in both modes.

    Bug fix: ``first_valid`` is now initialised before the fallback
    loop -- previously a search where no offer passed valid_price()
    raised NameError at ``if first_valid:``.
    """
    data = json.loads(response.body)
    i = 0
    # MPN of the first product, if present.
    mpns = data['items'][0]['product'].get('mpns', [''])[0]
    if mpns:
        # Cheapest valid offer among items with the same (exact) MPN.
        lowest = None
        data_mpns = {'items': [item for item in data['items']
                               if item['product'].get('mpns', [''])[0] == mpns]}
        while True:
            res = self._get_item(data_mpns, i, response)
            if not res:
                break
            pr = res[0]
            item = res[1]
            invalid_domain = any(self._check_domain(domain, pr['url'])
                                 for domain in FILTER_DOMAINS)
            if not invalid_domain:
                if valid_price(response.meta['price'], pr['price']) and \
                        (lowest is None or lowest['price'] > pr['price']):
                    lowest = pr
            i += 1
        if lowest:
            yield lowest
    else:
        # Fallback: first offer whose price is inside the accepted range.
        first_valid = None  # was unbound when the loop found nothing
        while True:
            res = self._get_item(data, i, response)
            if not res:
                break
            pr = res[0]
            item = res[1]
            invalid_domain = any(self._check_domain(domain, pr['url'])
                                 for domain in FILTER_DOMAINS)
            if not invalid_domain:
                if valid_price(response.meta['price'], pr['price']):
                    first_valid = pr
                    break
            i += 1
        if first_valid:
            yield first_valid
def parse(self, response):
    """Collect all valid-priced search results and yield the cheapest one.

    Results come from both the "atf" and "btf" result containers; a
    series of fallback XPaths copes with the many price layouts.

    Improvements over the previous version: the stray Python-2 debug
    ``print price`` now goes through ``self.log``, the duplicated
    ``rsltL … bld lrg red`` price XPath was removed, the fallback chain
    is driven by one tuple instead of nine nested ``if not price``
    blocks, and the unused ``pr``/``next_prods`` locals are gone.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    products += hxs.select(
        '//div[@id="btfResults"]//div[starts-with(@id, "result_")]')
    # Price layouts vary between grid/list views; the first match wins.
    price_xpaths = (
        './/*[@class="newPrice"]//span[contains(@class,"price")]/text()',
        './/div[@class="usedNewPrice"]//span[@class="price"]/text()',
        './/div[@class="usedPrice"]//span//text()',
        './/ul[@class="rsltGridList"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltGridList grey"]/li[1]/a/span[@class="price bld"]//text()',
        './/ul[@class="rsltGridList grey"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltL"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltL"]/li[1]/a/span[@class="price bld"]//text()',
    )
    search_results = []
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select(
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()'
        ).extract()
        if not name:
            name = product.select(
                'h3[@class="newaps"]/a/span/text()').extract()
        loader.add_value('name', name)
        url = product.select(
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href'
        ).extract()
        if not url:
            url = product.select('h3[@class="newaps"]/a/@href').extract()
        loader.add_value('url', url)
        price = None
        for xpath in price_xpaths:
            price = product.select(xpath).extract()
            if price:
                break
        if not price:
            self.log("No price found")
            continue
        self.log("Price found: %s" % price)
        # Continental decimal comma -> dot before loading.
        loader.add_value('price', price[0].replace(',', '.'))
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        if valid_price(response.meta['price'],
                       loader.get_output_value('price')):
            search_results.append(loader)
    # Cheapest valid result wins.
    search_results.sort(key=lambda x: x.get_output_value('price'))
    if search_results:
        yield search_results[0].load_item()
def parse(self, response):
    """Yield the lowest valid-priced product from the results page,
    logging any slot where name, price or url extraction failed."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for position, product in enumerate(products, start=1):
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select('.//h3[@class="newaps"]/a/span/text()').extract()
        if not name:
            # Only the first slot is expected to always carry a name.
            if position == 1:
                self.log("ERROR name not found")
            continue
        loader.add_value('name', name[0])
        price = product.select('.//ul[@class="rsltL"]//span[1]/text()').extract()
        if not price:
            price = product.select(
                './/ul[contains(@class,"rsltGridList grey")]//span[1]/text()'
            ).extract()
        if not price:
            self.log("ERROR price not found2")
            continue
        loader.add_value('price', price[0])
        url = product.select('.//h3[@class="newaps"]/a/@href').extract()
        if not url:
            self.log("ERROR url not found")
        else:
            loader.add_value('url', url[0])
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        out_price = loader.get_output_value('price')
        if out_price and \
                (best is None or best.get_output_value('price') > out_price) and \
                valid_price(response.meta['price'], out_price):
            best = loader
    if best:
        yield best.load_item()
def parse(self, response):
    """Yield the first valid-priced offer, skipping apparelsave/shoemetro."""
    data = json.loads(response.body)
    idx = 0
    while True:
        res = self._get_item(data, idx, response)
        if not res:
            return
        offer, item = res[0], res[1]
        seller_name = item["product"]["author"]["name"].lower()
        # These two resellers are always skipped.
        skip = "apparelsave" in seller_name or "shoemetro.com" in offer["url"]
        if not skip and valid_price(response.meta["price"], offer["price"]):
            yield offer
            return
        idx += 1
def parse(self, response):
    """Yield the cheapest valid-priced offer not hosted on a filtered domain."""
    data = json.loads(response.body)
    index = 0
    cheapest = None
    while True:
        res = self._get_item(data, index, response)
        if not res:
            break
        offer = res[0]
        item = res[1]
        blacklisted = any(self._check_domain(domain, offer['url'])
                          for domain in FILTER_DOMAINS)
        if not blacklisted:
            if valid_price(response.meta['price'], offer['price']) and \
                    (cheapest is None or cheapest['price'] > offer['price']):
                cheapest = offer
        index += 1
    if cheapest:
        yield cheapest
def parse(self, response):
    """Walk the JSON results in order and yield the first offer that has
    a valid price and is not sold by a blacklisted reseller."""
    data = json.loads(response.body)
    position = 0
    while True:
        entry = self._get_item(data, position, response)
        if not entry:
            return
        offer = entry[0]
        item = entry[1]
        author = item['product']['author']['name'].lower()
        blacklisted = ('apparelsave' in author
                       or 'shoemetro.com' in offer['url'])
        if not blacklisted and valid_price(response.meta['price'],
                                           offer['price']):
            yield offer
            return
        position += 1
def parse(self, response):
    """Yield the cheapest valid-priced result, parsed with BeautifulSoup."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        soup = BeautifulSoup(product.extract())
        title = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', title.findAll('span')[0].string)
        loader.add_value('url', title.findAll('a')[0]['href'])
        loader.add_value(
            'price',
            soup.find('ul', attrs={'class': 'rsltL'}).findAll('span')[0].string)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        price = loader.get_output_value('price')
        # Keep only the cheapest result inside the accepted price range.
        if price and (best is None or best.get_output_value('price') > price) \
                and valid_price(response.meta['price'], price):
            best = loader
    if best:
        yield best.load_item()
def parse(self, response):
    """Parse one product listing; if the expected markup is absent the
    request is re-issued, giving up after the third retry."""
    hxs = HtmlXPathSelector(response)
    container = hxs.select('//td[@r="1"]') or hxs.select('//table[@r="1"]')
    if not container:
        attempts = response.meta.get('_retries', 0)
        if attempts < 3:
            yield Request(response.url,
                          meta={'sku': response.meta['sku'],
                                '_retries': attempts + 1},
                          dont_filter=True)
        # After three failed attempts the product is silently dropped.
        return
    loader = ProductLoader(item=Product(), selector=container)
    loader.add_xpath('name', './/div[@class="ittl"]//a[@class="vip"]/text()')
    loader.add_xpath('url', './/div[@class="ittl"]//a[@class="vip"]/@href')
    # Probe each known price markup; the loader collects whichever matches.
    for price_xpath in (
            './/div[@class="prices"]//span[@class="amt"]/text()',
            './/div[@class="prices"]//span[@class="g-b amt"]/text()',
            './/td[@class="prc"]//div[@class="g-b"]/text()',
            './/*[@itemprop="price"]/text()'):
        loader.add_xpath('price', price_xpath)
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'])
    name_ok = 'apparelsave' not in loader.get_output_value('name').lower()
    if name_ok and valid_price(response.meta['price'],
                               loader.get_output_value('price')):
        yield loader.load_item()
def parse(self, response):
    """Scan every offer in the JSON payload and yield the single
    cheapest one with an acceptable price, ignoring filtered domains."""
    data = json.loads(response.body)
    best_offer = None
    idx = 0
    while True:
        found = self._get_item(data, idx, response)
        if not found:
            break
        offer, item = found[0], found[1]
        idx += 1
        # Offers hosted on any FILTER_DOMAINS entry are discarded.
        if any(self._check_domain(d, offer['url']) for d in FILTER_DOMAINS):
            continue
        if not valid_price(response.meta['price'], offer['price']):
            continue
        if best_offer is None or best_offer['price'] > offer['price']:
            best_offer = offer
    if best_offer:
        yield best_offer
def parse(self, response):
    """Log the result count and yield the cheapest acceptable product."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@id="atfResults"]'
                          '//div[starts-with(@id, "result_")]')
    log.msg(">>>>>>> FOUND %s ITEMS >>>" % len(products))
    best = None
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', './/h3/a/span/text()')
        # Drop products that fail the name filter.
        if not accept_product(loader.get_output_value('name')):
            continue
        loader.add_xpath('url', './/h3/a/@href')
        loader.add_xpath('price', './/*[@class="newp"]//span/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        price = loader.get_output_value('price')
        if not price:
            continue
        if (best is None or best.get_output_value('price') > price) and \
                valid_price(response.meta['price'], price):
            best = loader
    if best:
        yield best.load_item()
def parse(self, response):
    """BeautifulSoup-based result parser; yields the cheapest result
    whose price passes valid_price()."""
    hxs = HtmlXPathSelector(response)
    rows = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    lowest = None
    for row in rows:
        loader = ProductLoader(item=Product(), selector=row)
        soup = BeautifulSoup(row.extract())
        heading = soup.find("h3", attrs={"class": "newaps"})
        price_list = soup.find("ul", attrs={"class": "rsltL"})
        loader.add_value("name", heading.findAll("span")[0].string)
        loader.add_value("url", heading.findAll("a")[0]["href"])
        loader.add_value("price", price_list.findAll("span")[0].string)
        row_price = loader.get_output_value("price")
        cheaper = lowest is None or \
            lowest.get_output_value("price") > row_price
        if row_price and cheaper and \
                valid_price(response.meta["price"], row_price):
            lowest = loader
    if lowest:
        yield lowest.load_item()
def parse_product(self, response):
    """Validate a product page against the CSV row and yield it, or fall
    through to the next candidate result / next page / next search URL.

    A page is accepted when its SKU matches the CSV SKU (either
    direction of ``match_skus``) AND at least one CSV name word appears
    in the page title; it is yielded only if its price also passes
    valid_price().
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # XPath missed: fall back to BeautifulSoup for the product image.
        soup = BeautifulSoup(response.body)
        image_tag = soup.find(
            lambda tag: tag.name == u'img'
            and tag.findParent(u'tr', id=u'prodImageContainer'))
        if image_tag:
            loader.add_value('image_url', image_tag.get(u'src'))
    loader.add_xpath(
        'brand',
        u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')
    # The price appears under several different markups; try each in turn.
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')
    sku = hxs.select(
        u'//li/b[contains(text(),"Item model number")]/../text()').extract()
    if sku:
        sku = sku[0].strip()
    else:
        log.msg('No sku.')
    csv_sku = response.meta['sku'].strip()
    log.msg('SKU: [%s == %s]' % (sku.lower() if sku else u'None', csv_sku))
    csv_name = response.meta['name'].lower().split(u' ')
    site_name = loader.get_output_value('name').lower().split(u' ')
    log.msg(u'NAME: [%s == %s]' % (csv_name, site_name))
    # A single shared word between CSV name and page title counts as a match.
    name_match = any(map(lambda elem: elem in site_name, csv_name))
    sku_match = sku and (self.match_skus(sku, csv_sku)
                         or self.match_skus(csv_sku, sku))
    if sku_match and name_match:
        if valid_price(response.meta['price'],
                       loader.get_output_value('price')):
            loader.add_value('sku', response.meta['sku'])
            loader.add_value('identifier', response.meta['sku'].lower())
            yield loader.load_item()
    else:
        meta = response.meta
        next_result = meta['next_results']
        if next_result:
            # Try the next search result.
            next_result = next_result[0]
            meta['next_results'] = meta['next_results'][1:]
            yield Request(next_result, callback=self.parse_product,
                          meta=response.meta)
        elif meta.get('next_page'):
            yield Request(meta['next_page'], meta=response.meta)
        elif meta.get('search_urls'):
            meta = response.meta
            search_url = meta['search_urls'][0]
            meta['search_urls'] = meta['search_urls'][1:]
            yield Request(search_url % {'q': meta['sku']}, meta=meta)
class GoogleSpider(BaseSpider): name = 'ldmountaincentre-google-shopping.com' allowed_domains = ['google.com'] start_urls = ['http://www.google.com'] errors = [] F_LAST_RESULTS = 'gshopping_last_results.csv' SHOPPING_URL = 'http://www.google.co.uk/shopping?hl=en' def __init__(self, *args, **kwargs): super(GoogleSpider, self).__init__(*args, **kwargs) dispatcher.connect(self.spider_closed, signals.spider_closed) self._browsers = [] browser_profiles = ({ 'proxy': '77.75.105.70:22955', 'proxy-type': 'http', 'proxy-auth': 'pp-dampssno:poekensi' }, { 'proxy': '80.83.124.85:48008', 'proxy-type': 'http', 'proxy-auth': 'pp-nobfizze:hathapic' }, { 'proxy': '194.242.113.229:30230', 'proxy-type': 'http', 'proxy-auth': 'pp-dawnyrou:dupradin' }, { 'proxy': '118.127.29.47:10858', 'proxy-type': 'http', 'proxy-auth': 'pp-eyakarpe:rmsaingr' }) for profile in browser_profiles: if profile['proxy']: proxy = {} proxy['host'] = profile['proxy'] proxy['type'] = profile['proxy-type'] if profile['proxy-auth']: proxy['auth'] = profile['proxy-auth'] else: proxy = None browser = PhantomJS.create_browser(proxy=proxy) user_agent = browser.desired_capabilities[ u'phantomjs.page.settings.userAgent'] self._browsers.append({ 'webdriver': PhantomJS.create_browser(proxy=proxy), 'useragent': user_agent, 'proxy': profile['proxy'] }) self._today_result_ids = {} file_last_results = os.path.join(HERE, self.F_LAST_RESULTS) if os.path.exists(file_last_results): today = time.gmtime().tm_yday last_day = time.gmtime(os.path.getctime(file_last_results)).tm_yday if last_day == today: shutil.copy(file_last_results, '%s.bak' % file_last_results) with open(file_last_results) as f_today: reader = csv.DictReader(f_today) for row in reader: self._today_result_ids[row['identifier']] = row def spider_closed(self, spider): for browser in self._browsers: browser['webdriver'].quit() shutil.copy('data/%s_products.csv' % spider.crawl_id, os.path.join(HERE, self.F_LAST_RESULTS)) def parse(self, response): f = 
open(os.path.join(HERE, 'product_skus.csv')) reader = csv.DictReader(f) url = self.SHOPPING_URL # GET Google Shopping website for browser in self._browsers: self.log('\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> GET: %s\n' % (browser['proxy'], browser['useragent'], url)) browser['webdriver'].get(url) self.log('>>> BROWSER => OK') browsers_free = len(self._browsers) row = next(reader, None) # Search items while row is not None: # If exists today's results then it loads them if row['identifier'] in self._today_result_ids: yield self.load_item_( self._today_result_ids[row['identifier']], adurl=False) row = next(reader, None) # Next row continue if browsers_free: browsers_free -= 1 if row['sku']: search = row['sku'] self.log('>>> Search by SKU: ' + search) else: search = row['name'] self.log('>>> Search by NAME: ' + search) meta = { 'sku': row['sku'], 'price': row['price'], 'identifier': row['identifier'] } self._browsers[browsers_free]['search'] = search self._browsers[browsers_free]['meta'] = meta row = next(reader, None) # Next row if browsers_free: if row: continue else: browsers_free = len(self._browsers) for browser in self._browsers: browser['webdriver'].delete_all_cookies() time.sleep(random.choice(range(5, 25))) for browser in self._browsers: if not browser['search']: continue try: self.log( '\n' '>>> BROWSER: Clear current search and send new...\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> SEARCH: %s\n' % (browser['proxy'], browser['useragent'], browser['search'])) try: browser['search_input'] = browser[ 'webdriver'].find_element_by_id('gbqfq') except: browser['search_input'] = browser[ 'webdriver'].find_element_by_name('q') try: browser['search_button'] = browser[ 'webdriver'].find_element_by_id('gbqfb') except: browser['search_button'] = browser[ 'webdriver'].find_element_by_xpath( '//button[@value="Search"]') browser['search_input'].clear() browser['search_input'].send_keys(browser['search']) except Exception, e: if browser['search']: self.log('\n>>> ERROR: 
Failed to search %s\n' % browser['search']) browser['search'] = None # This should be a change in the website style, to save the screenshot and source and not continue browser['webdriver'].save_screenshot( os.path.join(HERE, 'browser_error.png')) with open(os.path.join(HERE, 'browser_error.html'), 'w') as f: f.write( browser['webdriver'].page_source.encode('utf-8')) raise e time.sleep(random.choice(range(5, 10))) for browser in self._browsers: if not browser['search']: continue try: self.log('\n' '>>> BROWSER: Click search button...\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> SEARCH: %s\n' % (browser['proxy'], browser['useragent'], browser['search'])) browser['search_button'].click() self.log('>>> BROWSER => OK') except Exception, e: self.log(e) if browser['search']: self.log('\n>>> ERROR: Failed to search %s\n' % browser['search']) browser['search'] = None time.sleep(random.choice(range(5, 10))) browsers_get_more = [] for i, browser in enumerate(self._browsers): if not browser['search']: continue browser['item'] = None try: products = browser['webdriver'].find_elements_by_xpath( '//div[@id="search"]//li[contains(@class, "g")]') link = None item_url = '' item_found = False for product in products: link = product.find_element_by_xpath( './/h3[contains(@class, "r")]/a') item_url = link.get_attribute('href') if 'ldmountaincentre' not in item_url: item_found = True break # First valid if not item_found: continue if not link: self.log('Not link') continue name = link.text try: price = product.find_element_by_xpath( './/div[@class="psliprice"]//b').text except: try: price = product.find_element_by_xpath( './/div[contains(@class, "psrpcont")]/span[@class="psrp"]' ).text except: try: price = product.find_element_by_xpath( './/div[@class="psliprice"]').text except Exception, e: self.errors.append( 'WARNING: No price searching %s' % browser['search']) # Go to shopping again browser['webdriver'].get(self.SHOPPING_URL) time.sleep(random.choice(range(5, 10))) raise e try: 
more_stores = re.findall( r'from \d+\+ stores', product.find_element_by_xpath( './/div[contains(@class, "psrpcont")]').text) except: try: more_stores = re.findall(r'from \d+\+ stores', product.text) except: more_stores = None item = {'name': name, 'url': item_url} if more_stores: browser['item'] = item browsers_get_more.append(i) self.log('\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> ITEM FOUND: %s\n' '>>> MORE STORES: %s\n' % (browser['proxy'], browser['useragent'], item['name'], item['url'])) else: item['price'] = extract_price(price) if valid_price(browser['meta']['price'], item['price']): self.log('\n' '>>> PROXY: %s\n' '>>> UA: %s\n' '>>> ITEM FOUND: %s\n' '>>> ITEM PRICE: %s\n' % (browser['proxy'], browser['useragent'], item['name'], item['price'])) yield self.load_item_(item, browser) except Exception, e: self.log('>>>> ERROR IN %s' % browser['webdriver'].current_url) self.log('>>>> %s' % e) if browser['search']: self.log('\n>>> ERROR: Failed to search %s\n' % browser['search']) browser['search'] = None
'a').get_attribute('href') except Exception, e: if browser['search']: self.log('\n>>> ERROR: Failed to search %s\n' % browser['search']) browser['search'] = None # This should be a change in the website style, to save the screenshot and source and not continue browser['webdriver'].save_screenshot( os.path.join(HERE, 'browser_error.png')) with open(os.path.join(HERE, 'browser_error.html'), 'w') as f: f.write(browser['webdriver'].page_source.encode( 'utf-8')) raise e if valid_price(browser['meta']['price'], price): item = browser['item'] item['price'] = price item['url'] = item_url yield self.load_item_(item, browser) # Set search to None for browser in self._browsers: browser['search'] = None def load_item_(self, item, browser=None, adurl=True): if browser: response = HtmlResponse(url=browser['webdriver'].current_url, body=browser['webdriver'].page_source, encoding='utf-8') else:
def parse(self, response):
    """Collect valid-priced results (listed price has 20% VAT stripped:
    ``listed / 1.2``), then request the cheapest product page for
    further parsing; otherwise fall back to a queued description request."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    # Many different price layouts exist; probed in this exact order.
    price_xpaths = (
        './/*[@class="newPrice"]//span[contains(@class,"price")]/text()',
        './/div[@class="usedNewPrice"]//span[@class="price"]/text()',
        './/ul[@class="rsltGridList"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltGridList"]/li[1]/a/span[@class="price bld"]//text()',
        './/ul[@class="rsltL"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/ul[@class="rsltL"]/li[1]/a/span[@class="price bld"]//text()',
        './/ul[@class="rsltGridList grey"]/li[1]/a/span[@class="price bld"]//text()',
        './/ul[@class="rsltGridList grey"]/li[1]/a/span[@class="bld lrg red"]//text()',
        './/*[@class="newPrice"]//span/text()',
        './/span[@class="bld lrg red"]//text()',
    )
    search_results = []
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select(
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/text()'
        ).extract()
        if not name:
            name = product.select(
                'h3[@class="newaps"]/a/span/text()').extract()
        loader.add_value('name', name)
        url = product.select(
            './/*[contains(@class, "Title") or contains(@class, "title")]//a/@href'
        ).extract()
        if not url:
            url = product.select('h3[@class="newaps"]/a/@href').extract()
        loader.add_value('url', url)
        price = None
        for xpath in price_xpaths:
            price = product.select(xpath).extract()
            if price:
                break
        if price:
            # Price ranges like "£10 - £20" use the lower bound.
            raw = price[0].split('-')[0] if '-' in price[0] else price[0]
            raw = re.sub(u'[^\d\.,]', u'', raw)
            # Strip thousands separators, then remove 20% VAT.
            net = Decimal(raw.replace(',', '')) / Decimal(1.2)
            net = round(net, 2)
            loader.add_value('price', str(net))
            loader.add_value('sku', response.meta['sku'])
            loader.add_value('identifier', response.meta['sku'])
            if net and valid_price(response.meta['price'],
                                   loader.get_output_value('price')):
                search_results.append(loader)
    if search_results:
        search_results.sort(key=lambda elem: elem.get_output_value('price'))
        cheapest = search_results[0]
        remaining = search_results[1:]
        meta = response.meta
        meta['cur_prod'] = cheapest
        meta['next_prods'] = remaining
        yield Request(cheapest.get_output_value('url'),
                      callback=self.parse_product,
                      meta=meta, dont_filter=True)
    elif response.meta.get('desc_req'):
        yield response.meta.get('desc_req')