def __init__(self, *args, **kwargs):
    super(GoogleSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self._browsers = []
    # One PhantomJS instance per authenticated proxy profile.
    browser_profiles = (
        {'proxy': '77.75.105.70:22955', 'proxy-type': 'http',
         'proxy-auth': 'pp-dampssno:poekensi'},
        {'proxy': '80.83.124.85:48008', 'proxy-type': 'http',
         'proxy-auth': 'pp-nobfizze:hathapic'},
        {'proxy': '194.242.113.229:30230', 'proxy-type': 'http',
         'proxy-auth': 'pp-dawnyrou:dupradin'},
        {'proxy': '118.127.29.47:10858', 'proxy-type': 'http',
         'proxy-auth': 'pp-eyakarpe:rmsaingr'},
    )
    for profile in browser_profiles:
        if profile['proxy']:
            proxy = {
                'host': profile['proxy'],
                'type': profile['proxy-type'],
            }
            if profile['proxy-auth']:
                proxy['auth'] = profile['proxy-auth']
        else:
            proxy = None
        browser = PhantomJS.create_browser(proxy=proxy)
        user_agent = browser.desired_capabilities[
            u'phantomjs.page.settings.userAgent']
        # Reuse the browser created above rather than spawning a second
        # PhantomJS process for the same profile.
        self._browsers.append({
            'webdriver': browser,
            'useragent': user_agent,
            'proxy': profile['proxy'],
        })
    # Restore the identifiers already collected today so they can be skipped.
    self._today_result_ids = {}
    file_last_results = os.path.join(HERE, self.F_LAST_RESULTS)
    if os.path.exists(file_last_results):
        today = time.gmtime().tm_yday
        last_day = time.gmtime(os.path.getctime(file_last_results)).tm_yday
        if last_day == today:
            shutil.copy(file_last_results, '%s.bak' % file_last_results)
            with open(file_last_results) as f_today:
                reader = csv.DictReader(f_today)
                for row in reader:
                    self._today_result_ids[row['identifier']] = row
def __init__(self, *args, **kwargs):
    super(GoogleSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self._browsers = []
    proxies = [
        '23.19.154.246:3128',
        '23.19.154.247:3128',
        '23.19.154.248:3128',
        '23.19.154.249:3128',
        '23.19.154.250:3128',
        '23.19.188.246:3128',
        '23.19.188.247:3128',
        '23.19.188.248:3128',
        '23.19.188.249:3128',
        '23.19.188.250:3128',
    ]
    # Rotate a fixed pool of desktop user agents across the proxy pool.
    user_agents = cycle([
        'Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
    ])
    browser_profiles = []
    for proxy_ip_port in proxies:
        browser_profiles.append({
            'proxy': proxy_ip_port,
            'proxy-type': 'http',
            'proxy-auth': None,
        })
    for profile in browser_profiles:
        if profile['proxy']:
            proxy = {
                'host': profile['proxy'],
                'type': profile['proxy-type'],
            }
            if profile['proxy-auth']:
                proxy['auth'] = profile['proxy-auth']
        else:
            proxy = None
        browser = PhantomJS.create_browser(proxy=proxy,
                                           user_agent=next(user_agents))
        user_agent = browser.desired_capabilities[
            u'phantomjs.page.settings.userAgent']
        self._browsers.append({
            'webdriver': browser,
            'useragent': user_agent,
            'proxy': profile['proxy'],
        })
    # Restore the identifiers already collected today so they can be skipped.
    self._today_result_ids = {}
    file_last_results = os.path.join(HERE, self.F_LAST_RESULTS)
    if os.path.exists(file_last_results):
        today = time.gmtime().tm_yday
        last_day = time.gmtime(os.path.getctime(file_last_results)).tm_yday
        if last_day == today:
            shutil.copy(file_last_results, '%s.bak' % file_last_results)
            with open(file_last_results) as f_today:
                reader = csv.DictReader(f_today)
                for row in reader:
                    self._today_result_ids[row['identifier']] = row
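# The PhantomJS.create_browser() helper used by the spiders above is
# project-local and not part of this section. Below is a minimal sketch of
# what it plausibly looks like, inferred from the call sites: a proxy dict
# with 'host'/'type'/'auth' keys, an optional user_agent, and a user agent
# readable back from desired_capabilities. The class layout and the default
# user agent are assumptions, not the project's actual code.
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Assumed default; the real helper clearly sets *some* UA, since the spiders
# read it back from desired_capabilities even when they pass none in.
DEFAULT_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) '
                      'Gecko/20100101 Firefox/24.0')


class PhantomJS(object):

    @staticmethod
    def create_browser(proxy=None, user_agent=None):
        # PhantomJS accepts --proxy, --proxy-type and --proxy-auth on its
        # command line; Selenium forwards these via service_args.
        service_args = []
        if proxy:
            service_args.append('--proxy=%s' % proxy['host'])
            service_args.append('--proxy-type=%s' % proxy.get('type', 'http'))
            if proxy.get('auth'):
                service_args.append('--proxy-auth=%s' % proxy['auth'])
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps['phantomjs.page.settings.userAgent'] = (
            user_agent or DEFAULT_USER_AGENT)
        return webdriver.PhantomJS(service_args=service_args,
                                   desired_capabilities=caps)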
def __init__(self, *args, **kwargs):
    super(RubbermaidSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self._browser = PhantomJS.create_browser()
    # Give slow pages and scripts up to a minute before the driver times out.
    max_wait = 60
    self._browser.set_page_load_timeout(max_wait)
    self._browser.set_script_timeout(max_wait)
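# Each __init__ above connects self.spider_closed to the spider_closed
# signal, but the handler itself does not appear in this section. A minimal
# sketch of what such a handler typically does here (an assumption, not the
# project's actual code): quit every PhantomJS instance so no orphan
# phantomjs processes survive the crawl.
def spider_closed(self, spider):
    for entry in getattr(self, '_browsers', []):
        entry['webdriver'].quit()
    browser = getattr(self, '_browser', None)
    if browser is not None:
        browser.quit()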
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    categories = hxs.select(
        '//div[@id="breadcrumb_container"]//a/text()').extract()
    if categories:
        category = categories[-1]
    else:
        category = None
    name = hxs.select('//div[@id="overview_tab_content"]/h2/text()').extract()
    if name:
        name = name[0].strip()
    if not name:
        # Fall back to the page title, dropping the trailing site name.
        name = hxs.select('//title/text()').extract()[0].split('-')
        name = name[0:-1] if len(name) > 1 else name
        name = '-'.join(name).strip()
    sku = hxs.select('//span[@id="product_reference"]/text()').extract()
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('identifier', '//span[@id="product_reference"]/text()')
    loader.add_value('url', response.url)
    loader.add_value('name', name.strip())
    if sku:
        loader.add_value('sku', sku[0].replace(' ', ''))
    price = hxs.select(
        '//div[@id="product_price"]//span[@id="product_price_sale"]'
        '//span[@class="price"]//span[@class="ex"]//span[@class="GBP"]/text()'
    ).extract()
    price = re.sub(u'[^\d\.]', u'', price[0].strip())
    # loader.add_value('price', str(round(Decimal(price) / Decimal(1.2), 2)))
    loader.add_value('price', price)
    if category:
        loader.add_value('category', category)
    img = hxs.select('//img[@id="product_medium_image"]/@src').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))
    brand = hxs.select('//div[@id="product_page_brand"]/a/@title').extract()
    brand = brand[0] if brand else ''
    loader.add_value('brand', brand)
    item = loader.load_item()
    if not item.get('identifier') or item['identifier'].strip() == 'n/a':
        # Identifier missing from the static HTML: render the page with
        # PhantomJS and extract it from the rendered source.
        browser = PhantomJS.create_browser()
        self.log('>>> BROWSER: GET => %s' % response.url)
        browser.get(response.url)
        self.log('BROWSER: OK!')
        hxs = HtmlXPathSelector(text=browser.page_source)
        browser.quit()
        item['identifier'] = hxs.select(
            '//span[@id="product_reference"]/text()').extract()[0].strip()
    yield item
def __init__(self, *args, **kwargs):
    super(UkFlooringDirectSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self._browser = PhantomJS.create_browser()
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    try:
        product_data = json.loads(
            hxs.select(
                '//script[contains(text(), "walPP.variantDataRawArr")]/text()'
            ).re(r'walPP.variantDataRawArr = (\[.*\])')[0])[0]
    except:
        self.errors.append('WARNING: No product data in %s' % response.url)
        return
    price = product_data.get(u'price_store_price', None)
    if not price:
        browser = PhantomJS.create_browser()
        self.log('>>> BROWSER: GET => %s' % response.url)
        browser.get(response.url)
        self.log('>>> BROWSER: OK')
        time.sleep(5)
        hxs = HtmlXPathSelector(text=browser.page_source)
        browser.quit()
        # Monitor all products even without a price (as requested in #248)
        price = '.'.join(
            hxs.select(
                '//div[@id="pricing"]/div[@class="price-main"]//text()'
            ).re(r'(\d+)')).strip()
        if not price:
            price_elem = hxs.select(
                '//span[@id="store-price"][1]/text()').extract()
            if price_elem:
                price = price_elem[0]
        if not price:
            store_prices = hxs.select(
                '//div[contains(@id, "store-")]//div[@class="price"]//text()'
            ).extract()
            try:
                price = '.'.join(
                    re.findall(r'(\d+)', '.'.join(store_prices[:3])))
            except:
                price = '0.00'
    else:
        price = price[0]
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('category', product_data[u'Category'])
    product_loader.add_value('name', product_data[u'prod_name_en'])
    product_loader.add_value('sku', product_data[u'P_RollupKey'])
    product_loader.add_value('price', price.replace(',', ''))
    product_loader.add_value('identifier', product_data[u'P_UniqueKey'])
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand', response.meta['brand'].strip().lower())
    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product
    # Reviews come from Bazaarvoice, the same setup as canadiantire.ca:
    # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
    # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
    # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
    try:
        part2 = product['sku']
    except:
        self.errors.append('WARNING: No sku in %s' % response.url)
        yield product
    else:
        if not part2:
            self.errors.append('WARNING: No sku in %s' % response.url)
            yield product
        else:
            reviews_url = (
                'http://api.bazaarvoice.com/data/batch.json'
                '?passkey=e6wzzmz844l2kk3v6v7igfl6i'
                '&apiversion=5.4'
                '&displaycode=2036-en_ca'
                '&resource.q2=reviews'
                '&filter.q2=isratingsonly%3Aeq%3Afalse'
                '&filter.q2=productid%3Aeq%3A' + part2)
            yield Request(reviews_url, meta=response.meta,
                          callback=self.parse_reviews)
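# parse_reviews, the callback for the Bazaarvoice request above, is not
# included in this section. A rough sketch under two assumptions: the
# batch.json response keeps the q2 resource's results under
# BatchedResults -> q2 -> Results (the usual layout for a resource.q2 batch
# query), and the review fields appended to the metadata are illustrative
# only, not the project's actual schema.
def parse_reviews(self, response):
    product = response.meta['product']
    try:
        results = json.loads(response.body)['BatchedResults']['q2']['Results']
    except (ValueError, KeyError):
        results = []
    for review in results:
        product['metadata']['reviews'].append({
            'rating': review.get('Rating'),
            'title': review.get('Title'),
            'text': review.get('ReviewText'),
            'date': review.get('SubmissionTime'),
        })
    yield product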