def __init__(self, *args, **kwargs):
    """Initialise the spider and open its PhantomJS browser session."""
    super(VodafoneSpider, self).__init__(*args, **kwargs)
    # Hook spider_closed so the browser is shut down when the crawl ends.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.log('>>> BROWSER: Open browser')
    self._browser = PhantomJS()
    self.log('>>> BROWSER: OK')
def __init__(self, *args, **kwargs):
    """Initialise the spider with one PhantomJS browser per proxy profile.

    Also reloads today's previously collected results (when the
    last-results file was created today) so already-seen identifiers
    can be reused across runs.
    """
    super(GoogleSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    self._browsers = []
    # NOTE(review): proxy credentials embedded in source — consider moving
    # them to settings/secrets storage.
    browser_profiles = (
        {'proxy': '77.75.105.70:22955', 'proxy-type': 'http',
         'proxy-auth': 'pp-dampssno:poekensi'},
        {'proxy': '80.83.124.85:48008', 'proxy-type': 'http',
         'proxy-auth': 'pp-nobfizze:hathapic'},
        {'proxy': '194.242.113.229:30230', 'proxy-type': 'http',
         'proxy-auth': 'pp-dawnyrou:dupradin'},
        {'proxy': '118.127.29.47:10858', 'proxy-type': 'http',
         'proxy-auth': 'pp-eyakarpe:rmsaingr'},
    )
    for profile in browser_profiles:
        if profile['proxy']:
            proxy = {'host': profile['proxy'],
                     'type': profile['proxy-type']}
            if profile['proxy-auth']:
                proxy['auth'] = profile['proxy-auth']
        else:
            proxy = None
        browser = PhantomJS.create_browser(proxy=proxy)
        user_agent = browser.desired_capabilities[
            u'phantomjs.page.settings.userAgent']
        # BUGFIX: reuse the browser created above. The original called
        # PhantomJS.create_browser() a second time here, spawning two
        # PhantomJS processes per profile and leaking the first one.
        self._browsers.append({
            'webdriver': browser,
            'useragent': user_agent,
            'proxy': profile['proxy'],
        })

    # Reload today's results so duplicate work can be skipped; keep a
    # backup copy of the file before it is consumed.
    self._today_result_ids = {}
    file_last_results = os.path.join(HERE, self.F_LAST_RESULTS)
    if os.path.exists(file_last_results):
        today = time.gmtime().tm_yday
        last_day = time.gmtime(os.path.getctime(file_last_results)).tm_yday
        if last_day == today:
            shutil.copy(file_last_results, '%s.bak' % file_last_results)
            with open(file_last_results) as f_today:
                reader = csv.DictReader(f_today)
                for row in reader:
                    self._today_result_ids[row['identifier']] = row
def __init__(self, *args, **kwargs):
    """Initialise the spider with a pool of proxied PhantomJS browsers.

    One browser is created per proxy, with user agents rotated across
    the pool; today's previously collected results are reloaded when
    the last-results file was created today.
    """
    super(GoogleSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    self._browsers = []
    proxies = [
        '23.19.154.246:3128', '23.19.154.247:3128', '23.19.154.248:3128',
        '23.19.154.249:3128', '23.19.154.250:3128', '23.19.188.246:3128',
        '23.19.188.247:3128', '23.19.188.248:3128', '23.19.188.249:3128',
        '23.19.188.250:3128',
    ]
    user_agents = cycle([
        'Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
    ])
    browser_profiles = [
        {'proxy': ip_port, 'proxy-type': 'http', 'proxy-auth': None}
        for ip_port in proxies
    ]
    for profile in browser_profiles:
        proxy = None
        if profile['proxy']:
            proxy = {'host': profile['proxy'],
                     'type': profile['proxy-type']}
            if profile['proxy-auth']:
                proxy['auth'] = profile['proxy-auth']
        browser = PhantomJS.create_browser(proxy=proxy,
                                           user_agent=user_agents.next())
        user_agent = browser.desired_capabilities[
            u'phantomjs.page.settings.userAgent']
        self._browsers.append({
            'webdriver': browser,
            'useragent': user_agent,
            'proxy': profile['proxy'],
        })

    # Reload today's result rows (backing the file up first) so already
    # seen identifiers can be reused.
    self._today_result_ids = {}
    file_last_results = os.path.join(HERE, self.F_LAST_RESULTS)
    if os.path.exists(file_last_results):
        today = time.gmtime().tm_yday
        last_day = time.gmtime(os.path.getctime(file_last_results)).tm_yday
        if last_day == today:
            shutil.copy(file_last_results, '%s.bak' % file_last_results)
            with open(file_last_results) as f_today:
                reader = csv.DictReader(f_today)
                for row in reader:
                    self._today_result_ids[row['identifier']] = row
def __init__(self, *args, **kwargs):
    """Set up the spider with a PhantomJS browser using 60s timeouts."""
    super(RubbermaidSpider, self).__init__(*args, **kwargs)
    # Make sure spider_closed fires so the browser can be cleaned up.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self._browser = PhantomJS.create_browser()
    timeout_seconds = 60
    self._browser.set_page_load_timeout(timeout_seconds)
    self._browser.set_script_timeout(timeout_seconds)
def parse_product(self, response):
    """Parse a product page into a Product item.

    Falls back to rendering the page with PhantomJS when the product
    identifier is missing (or 'n/a') in the static HTML.
    """
    hxs = HtmlXPathSelector(response)
    categories = hxs.select('//div[@id="breadcrumb_container"]//a/text()').extract()
    category = categories[-1] if categories else None

    name = hxs.select('//div[@id="overview_tab_content"]/h2/text()').extract()
    if name:
        name = name[0].strip()
    if not name:
        # Fall back to the <title>, dropping the trailing site-name part.
        name = hxs.select('//title/text()').extract()[0].split('-')
        name = name[0:-1] if len(name) > 1 else name
        name = '-'.join(name).strip()

    sku = hxs.select('//span[@id="product_reference"]/text()').extract()
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('identifier', '//span[@id="product_reference"]/text()')
    loader.add_value('url', response.url)
    loader.add_value('name', name.strip())
    if sku:
        loader.add_value('sku', sku[0].replace(' ', ''))

    # Ex-VAT GBP sale price; strip everything but digits and the dot.
    price = hxs.select('//div[@id="product_price"]//span[@id="product_price_sale"]'
                       '//span[@class="price"]//span[@class="ex"]//span[@class="GBP"]/text()').extract()
    price = re.sub(u'[^\d\.]', u'', price[0].strip())
    loader.add_value('price', price)

    if category:
        loader.add_value('category', category)
    img = hxs.select('//img[@id="product_medium_image"]/@src').extract()
    if img:
        loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))
    brand = hxs.select('//div[@id="product_page_brand"]/a/@title').extract()
    loader.add_value('brand', brand[0] if brand else '')

    item = loader.load_item()
    if not item['identifier'] or item['identifier'].strip() == 'n/a':
        # Identifier missing from the static HTML: render with PhantomJS.
        browser = PhantomJS.create_browser()
        # BUGFIX: interpolate the URL — the original logged the literal
        # text 'response.url'.
        self.log('>>> BROWSER: GET => %s' % response.url)
        browser.get(response.url)
        self.log('BROWSER: OK!')
        # BUGFIX: page_source is raw HTML, so pass it as text=; the
        # original passed it positionally where a Response is expected.
        hxs = HtmlXPathSelector(text=browser.page_source)
        browser.quit()
        item['identifier'] = hxs.select('//span[@id="product_reference"]/text()').extract()[0].strip()
    yield item
def parse_product(self, response):
    """Scrape one product page, rendering it first with PhantomJS.

    Builds a Product with KeterMeta metadata, stores it in response.meta,
    and yields a Bazaarvoice reviews.djs request handled by
    parse_review_js.
    """
    # Render in PhantomJS so JS-populated fields (e.g. price) are present.
    browser = PhantomJS()
    self.log('>>> BROWSER: GET => %s' % response.url)
    browser.get(response.url)
    self.log('>>> BROWSER: OK!')
    hxs = HtmlXPathSelector(text=browser.driver.page_source)
    browser.close()
    self.log('>>> BROWSER: Closed')

    sku = hxs.select(u'//*[@class="displaySkuCode"]//text()').extract()
    sku = sku[0].replace('#', '')

    # NOTE(review): the loader is built on the original (non-rendered)
    # response, so the add_xpath() calls below run against the raw HTML
    # while the price is read from the browser-rendered page via `hxs`.
    # Confirm this split is intended.
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath(
        'name', u'//div[contains(@class,"title")]//h1/text()')
    product_loader.add_value('sku', sku)
    product_loader.add_xpath(
        'category', u'//ul[contains(@class, "pd-breadcrumbs")]/li[2]/a/text()')
    product_loader.add_value('identifier', sku)
    price = hxs.select(
        u'//div[contains(@class, "product-price__reg-price")]/text()'
    ).extract()
    product_loader.add_value('price', price[0].replace('Reg.', ''))
    product_loader.add_value('brand', response.meta['brand'].lower())
    product_loader.add_value('url', response.url)
    image_url = hxs.select(
        u'/html/head/link[@rel="image_src"]/@href').extract()
    if image_url:
        product_loader.add_value('image_url', image_url[0])
    product = product_loader.load_item()

    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    # Skip products that mention the brand neither in the name nor body.
    brand = response.meta['brand'].lower()
    if brand not in product['name'] and brand not in response.body.lower():
        return

    # Reviews are served by Bazaarvoice, e.g.:
    # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
    # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
    # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
    # part1 is the numeric site id from the bvapi.js script URL; part2 is
    # the product code.
    part1 = hxs.select(
        u'//script[starts-with(@src,"http://canadiantire.ugc.bazaarvoice.com/static/")]/@src'
    ).extract()[0].split('/')[-2]
    part2 = hxs.select(
        '//div[@id="bazaarVoiceConfig"]/@data-product-code').extract()[0]
    yield Request(
        'http://canadiantire.ugc.bazaarvoice.com/%s/%s/reviews.djs?format=embeddedhtml'
        % (part1, part2),
        meta=response.meta, callback=self.parse_review_js)
def _get_new_browser(self):
    """Return a fresh PhantomJS browser with a random user agent.

    A proxy is requested from the proxy service; when none is available
    the browser is created without one.
    """
    api = ProxyServiceAPI(host=PROXY_SERVICE_HOST,
                          user=PROXY_SERVICE_USER,
                          password=PROXY_SERVICE_PSWD)
    candidates = api.get_proxy_list(self.proxy_target_id, types='https',
                                    log=self.log, length=1)
    proxy = None
    if candidates:
        scheme, host = candidates[0]['url'].split('://')
        proxy = {'host': host, 'type': scheme}
    agent = random.choice(self._all_user_agents)
    return PhantomJS(load_images=True, proxy=proxy, user_agent=agent)
def renew_browser(self, browser_profile=None, browser_blocked=False):
    """Replace (or create) a PhantomJS browser backed by a service proxy.

    When an existing profile is given its browser is quit first; if it
    was blocked, its proxy id is reported back to the proxy service so a
    different proxy is returned.  New profiles (no existing one passed)
    are appended to self._browsers.
    """
    proxy_service_api = ProxyServiceAPI(host=PROXY_SERVICE_HOST,
                                        user=PROXY_SERVICE_USER,
                                        password=PROXY_SERVICE_PSWD)
    blocked = []
    if browser_profile:
        # Shut down the old browser before replacing it in-place.
        browser_profile['webdriver'].quit()
        if browser_blocked:
            blocked.append(browser_profile['proxy_id'])
    else:
        browser_profile = {}
    proxy = None
    proxy_data = {'id': '', 'url': ''}
    proxy_list = proxy_service_api.get_proxy_list(
        self.proxy_service_target_id,
        locations=self.proxy_service_location,
        types='https', blocked=blocked, log=self.log, length=1)
    if proxy_list:
        proxy_data = proxy_list[0]
        proxy_type, proxy_host = proxy_data['url'].split('://')
        proxy = {
            'host': proxy_host,
            'type': proxy_type,
        }
    user_agent = self.user_agents.next()
    browser = PhantomJS(proxy=proxy, user_agent=user_agent,
                        load_images=False)
    # NOTE(review): stores the raw webdriver (browser.driver) rather than
    # the PhantomJS wrapper — confirm callers expect the driver here.
    browser_profile.update(
        {'webdriver': browser.driver, 'useragent': user_agent,
         'proxy': proxy_data['url'], 'proxy_id': proxy_data['id']})
    # Track whether this renewal is a retry and how many retries so far.
    browser_profile['retry'] = browser_blocked
    if browser_blocked:
        browser_profile['retry_no'] = int(browser_profile.get('retry_no', 0)) + 1
    else:
        browser_profile['retry_no'] = 0
    if not browser_blocked:
        # Add new browser
        self._browsers.append(browser_profile)
def start_requests(self):
    """Fetch the homepage with PhantomJS and delegate to self.parse."""
    browser = PhantomJS()
    homepage = 'http://www.nisbets.co.uk/Homepage.action'
    self.log('>>> BROWSER: GET => %s' % homepage)
    browser.get(homepage)
    self.log('>>> BROWSER: OK')
    # Give the JS-heavy page plenty of time to finish rendering.
    time.sleep(120)
    html = browser.driver.page_source
    browser.close()
    for request in self.parse(homepage, html):
        yield request
def parse(self, response):
    """Render the page with PhantomJS and yield one request per nav category.

    BUGFIX: each request now gets its own copy of the meta dict.
    Previously every request shared the same response.meta object and
    'category' was overwritten on each iteration, so all requests ended
    up carrying the last category.
    """
    base_url = get_base_url(response)
    browser = PhantomJS()
    browser.get(response.url)
    hxs = HtmlXPathSelector(text=browser.driver.page_source)
    browser.close()
    categories = hxs.select('//div[@id="nav-full"]//a')
    for category in categories:
        url = category.select('./@href').extract()
        if url:
            meta = dict(response.meta)
            category_name = category.select('./span/text()').extract()
            meta['category'] = category_name[0] if category_name else ''
            yield Request(urljoin_rfc(base_url, url[0]), meta=meta,
                          callback=self.parse_pagination)
def parse(self, response):
    """Open the start URL in PhantomJS, click through the splash prompt,
    then yield one request per top-level category link."""
    browser = PhantomJS()
    start_url = self.start_urls[0]
    self.log('>>> BROWSER: GET => %s' % start_url)
    browser.get(start_url)
    self.log('>>> BROWSER: OK')
    # Wait for the JS-heavy page to render before interacting with it.
    time.sleep(120)
    browser.driver.find_element_by_xpath(
        '//p[@class="style-inc"]//input').click()
    time.sleep(30)
    html = browser.driver.page_source
    browser.close()
    hxs = HtmlXPathSelector(text=html)
    for link in hxs.select('//ul[@class="clear-after"]/li/ul/li/a'):
        category_url = urljoin_rfc(
            start_url, link.select('./@href').extract()[0])
        yield Request(
            category_url, callback=self.parse_cat,
            meta={'category': link.select('./text()').extract()[0]})
def parse(self, response):
    """Find the newest feed-download link in the mailbox and fetch the feed.

    Scans the inbox newest-first for a 'Nouveau message' mail, extracts
    the DownloadToken URL from its body, renders that page with PhantomJS
    and yields a request for the actual download link.
    """
    # SECURITY NOTE(review): hard-coded mailbox credentials in source —
    # move these to settings/secrets storage.
    client = imaplib.IMAP4_SSL('imap.gmail.com', 993)
    client.login('totalfeedcompetitormonitor', 'uyWTStB6')
    client.select('INBOX')
    # UIDs newest-first.
    mails = client.uid('search', 'ALL')[1][0].split()[::-1]
    url = None
    for mail_uid in mails:
        mail = client.uid('fetch', mail_uid, '(RFC822)')
        mail = email.message_from_string(mail[1][0][1])
        subject = email.header.decode_header(mail['Subject'])[0][0]
        if 'Nouveau message' not in subject:
            continue
        body = ' '.join([m.get_payload() for m in mail.get_payload()])
        url = re.search('(http.*?DownloadToken.*)',
                        body).group(1).replace('\r', '')
        break
    if url is None:
        # BUGFIX: previously `url` was a NameError when no matching mail
        # existed; fail explicitly instead.
        self.log('No DownloadToken email found in INBOX')
        return
    browser = PhantomJS()
    self.log('>>> BROWSER: GET => %s' % url)
    browser.get(url)
    self.log('>>> BROWSER: OK')
    # The download page prepares the file asynchronously; wait for it.
    time.sleep(180)
    page_source = browser.driver.page_source
    browser.close()
    token = urlparse.parse_qs(urlparse.urlparse(url).query)['token'][0]
    hxs = HtmlXPathSelector(text=page_source)
    link_id = hxs.select('//h3[@class="unit-name"]/a/@id').re('file_(.*)')
    download_link = 'https://poseidon.hubtotal.net/zephyr/MFTWebAppDownloadToken/Download?file={}&token={}'.format(
        link_id[0], token)
    yield Request(download_link, callback=self.parse_feed)
class JDSportsSpider(BaseSpider):
    """Spider for Nike products on jdsports.co.uk.

    Footwear listings (start_urls1) are paginated client-side, so they
    are crawled with a shared PhantomJS browser; the remaining listings
    (start_urls2) are plain HTTP requests.
    """
    name = u'jdsports.co.uk'
    allowed_domains = ['www.jdsports.co.uk']
    # Browser-driven, paginated listings.
    start_urls1 = [
        'http://www.jdsports.co.uk/men/mens-footwear/brand/nike/',
        'http://www.jdsports.co.uk/women/womens-footwear/brand/nike/'
    ]
    # Plain listing pages (pageSize=9999 disables pagination).
    start_urls2 = [
        'http://www.jdsports.co.uk/featured/kids+nike+footwear?pageSize=9999',
        'http://www.jdsports.co.uk/search/nike-skateboarding?pageSize=9999'
    ]

    def __init__(self, *args, **kwargs):
        """Open the shared PhantomJS browser and hook spider shutdown."""
        super(JDSportsSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Browser
        self.log('>>> BROWSER: Open browser')
        self._browser = PhantomJS()
        self.log('>>> BROWSER: OK')

    def spider_closed(self, spider):
        """Close the shared browser when the crawl finishes."""
        # Browser close
        self.log('>>> BROWSER: close')
        self._browser.close()
        self.log('>>> BROWSER: OK')

    def start_requests(self):
        """Collect product URLs via the browser, then yield all requests."""
        product_urls = []
        for url in self.start_urls1:
            self.log('>>> BROWSER: GET => %s' % url)
            self._browser.get(url)
            self.log('>>> BROWSER: OK')
            # Click through paginated results until no "next" link exists.
            find_more = True
            while find_more:
                hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
                product_urls += hxs.select(
                    '//a[@data-perf-id="product"]/@href').extract()
                try:
                    self.log('>>> BROWSER: CLICK NEXT PAGE LINK')
                    self._browser.driver.find_element_by_xpath(
                        '//ul[@data-component-name="pagination"]/li[contains(@class, "next")]/a'
                    ).click()
                    self.log('>>> BROWSER: OK')
                except NoSuchElementException:
                    self.log('>>> BROWSER: NEXT PAGE NOT FOUND')
                    find_more = False
                else:
                    # Give the next page time to load before scraping it.
                    time.sleep(5)
        for url in product_urls:
            yield Request(url, callback=self.parse_product)
        for url in self.start_urls2:
            yield Request(url, callback=self.parse_products_list,
                          meta={'category': ''})

    def parse_categories(self, response):
        """Yield one products-list request per Sport/Activity category."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        urls = hxs.select('//*[@id="Sport/Activity"]/li/a/@href').extract()
        categories = hxs.select('//*[@id="Sport/Activity"]/li/a/@id').extract()
        for url, category in zip(urls, categories):
            # Bump the page size so the whole category fits on one page.
            url = url.replace('fh_view_size%3d20', 'fh_view_size%3d9999')
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_products_list,
                          meta={'category': category})

    def parse_products_list(self, response):
        """Yield a parse_product request for each product on a listing."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        for url in hxs.select('//a[@data-perf-id="product"]/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product, meta=response.meta)

    def parse_product(self, response):
        """Build a Product item from a product page.

        Skips 404 pages and products with no price; shipping is free at
        or above 60 (GBP), 3.99 below.
        """
        hxs = HtmlXPathSelector(response)
        # page 404
        if hxs.select("//img[@class='image-404']"):
            self.log("[WARNING] Product not found on page: %s" % response.url)
            return
        base_url = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        name = hxs.select(
            '//*[@id="infoPanel"]/h1/text()').extract()[0].strip()
        url = response.url
        loader.add_value('url', urljoin_rfc(base_url, url))
        loader.add_value('name', name)
        image_url = hxs.select('//*[@id="main"]/noscript/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        price = hxs.select(
            '//*[@id="productSummaryPrice"]/text()').extract()[0]
        if price == 'No price available.':
            return
        price = extract_price(price.replace(u'\xa3', ''))
        loader.add_value('price', price)
        if 'category' in response.meta:
            loader.add_value('category', response.meta.get('category'))
        else:
            # Fall back to the breadcrumb trail for the category.
            categories = hxs.select(
                '//div[@class="breadcrumbs"]/a[not(contains(@class, "current"))]/text()'
            ).extract()
            if categories:
                loader.add_value('category', categories[-1])
        identifier = hxs.select(
            '//div[@id="productPage"]/@data-plu').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('brand', 'Nike')
        if price < 60:
            loader.add_value('shipping_cost', 3.99)
        else:
            loader.add_value('shipping_cost', 0)
        yield loader.load_item()
def __init__(self, *args, **kwargs):
    """Initialise the spider and create its PhantomJS browser."""
    super(UkFlooringDirectSpider, self).__init__(*args, **kwargs)
    # Register spider_closed so the browser is shut down with the spider.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self._browser = PhantomJS.create_browser()
class VodafoneSpider(VodafoneBaseSpider):
    """Spider for handset/tariff combinations on t-mobile.de.

    Each device page exposes a colour/variant dropdown; the spider
    re-requests the page once per dropdown option and emits one product
    per (device, option, tariff) combination.
    """
    name = 'vodafone-t-mobile.de'
    # NOTE(review): allowed_domains does not match the t-mobile.de start
    # URLs — requests yielded from parse() may be filtered by the offsite
    # middleware; confirm this is intended.
    allowed_domains = ['vodafone.co.uk']
    start_urls = (
        'https://www.t-mobile.de/apple-iphone/iphone-6/0,26907,28800-_,00.html',
        'https://www.t-mobile.de/apple-iphone/iphone-6-plus/0,26908,28801-_,00.html?WT.svl=100',
        'https://www.t-mobile.de/samsung-galaxy/samsung-galaxy-s5-lte/0,27026,28852-_,00.html?WT.svl=100'
    )

    def __init__(self, *args, **kwargs):
        """Open the shared PhantomJS browser and hook spider shutdown."""
        super(VodafoneSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Browser
        self.log('>>> BROWSER: Open browser')
        self._browser = PhantomJS()
        self.log('>>> BROWSER: OK')

    def spider_closed(self, spider):
        """Close the shared browser when the crawl finishes."""
        self._browser.close()

    def parse(self, response):
        """Open the device page, iterate dropdown options, emit products.

        On the first visit (no option_id in meta) the page is re-queued
        once per dropdown option; on subsequent visits the option is
        clicked in the browser and each tariff row becomes a product.
        """
        base_url = get_base_url(response)
        selected_option_id = response.meta.get('option_id', None)
        self._browser.get(response.url)
        # Open the custom "chosen" dropdown so its options render.
        container = self._browser.driver.find_element_by_xpath(
            '//div[@class="chosen-container chosen-container-single chosen-container-single-nosearch"]'
        )
        container.click()
        hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
        if not selected_option_id:
            # First pass: fan out one request per dropdown option.
            options = hxs.select(
                '//ul[@class="chosen-results"]/li/@data-option-array-index'
            ).extract()
            for option_id in options:
                yield Request(response.url, dont_filter=True,
                              meta={'option_id': option_id})
            return
        # Select the requested option and re-read the rendered page.
        option = self._browser.driver.find_element_by_xpath(
            '//ul[@class="chosen-results"]/li[@data-option-array-index="' +
            selected_option_id + '"]')
        option.click()
        hxs = HtmlXPathSelector(text=self._browser.driver.page_source)
        tariffs = hxs.select('//li[contains(@class, "rate-element")]')
        # Device id is embedded in the URL as 0,<id>-_
        device_identifier = re.search('0,(.*?)-_', response.url).group(1)
        for tariff in tariffs:
            loader = ProductLoader(item=Product(), response=response)
            duration = '24'
            identifier = tariff.select('@data-shop-id').extract()
            # Identifier combines device, dropdown option and tariff.
            loader.add_value(
                'identifier', device_identifier + '-' + selected_option_id +
                '-' + identifier[0])
            phone_name = ' '.join(
                tariff.select(
                    './/div[@class="configuration-output"]//p[not(span)]//text()'
                ).extract())
            tariff_name = ' '.join(
                tariff.select(
                    './/div[@class="heading-2"]/span[@class="title-1" or @class="title-2"]//text()'
                ).extract())
            # German decimal commas are normalised to dots.
            phone_price = ''.join(
                tariff.select(
                    './/div[@class="configuration-output"]//p/span//text()').
                extract()).replace(',', '.')
            image_url = hxs.select(
                '//div[@id="device-image-slider"]//li/img/@src').extract()
            if image_url:
                image_url = urljoin_rfc(base_url, image_url[0])
            monthly_cost = ''.join(
                tariff.select('.//p[@class="price monthly-price"]/span//text()'
                              ).extract()).replace(',', '.')
            normalized_name = self.get_normalized_name(phone_name)
            loader.add_value('name', normalized_name + ' - ' + tariff_name)
            loader.add_value('url', response.url)
            loader.add_value('brand', phone_name.split()[0])
            loader.add_value('price', phone_price)
            loader.add_value('image_url', image_url)
            product = loader.load_item()
            metadata = VodafoneMeta()
            metadata['device_name'] = phone_name
            metadata['monthly_cost'] = re.search('(\d+.\d+)',
                                                 monthly_cost).group(1)
            metadata['tariff_name'] = tariff_name
            metadata['contract_duration'] = duration
            # NOTE(review): `operator` and `channel` are not defined in
            # this class — presumably module-level constants; verify they
            # exist, otherwise this raises NameError at runtime.
            metadata['operator'] = operator
            metadata['channel'] = channel
            metadata['promotional_text'] = ''
            metadata['network_generation'] = '4G'
            product['metadata'] = metadata
            yield product
def parse_product(self, response):
    """Build a product from the embedded walPP variant data, then request
    its Bazaarvoice reviews.

    The price comes from the variant JSON when present; otherwise the
    page is rendered in PhantomJS and a series of DOM fallbacks is tried,
    ending with '0.00' so unpriced products are still monitored (#248).
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Product data is embedded as a JS array literal: take the first entry.
    try:
        product_data = json.loads(
            hxs.select(
                '//script[contains(text(), "walPP.variantDataRawArr")]/text()'
            ).re(r'walPP.variantDataRawArr = (\[.*\])')[0])[0]
    except:
        self.errors.append('WARNING: No product data in %s' % response.url)
        return
    price = product_data.get(u'price_store_price', None)
    if not price:
        # Price absent from the JSON: render the page and scrape the DOM.
        browser = PhantomJS.create_browser()
        self.log('>>> BROWSER: GET => %s' % response.url)
        browser.get(response.url)
        self.log('>>> BROWSER: OK')
        time.sleep(5)
        hxs = HtmlXPathSelector(text=browser.page_source)
        browser.quit()
        # Monitor all products even without a price (as requested in #248)
        price = '.'.join(
            hxs.select(
                '//div[@id="pricing"]/div[@class="price-main"]//text()').
            re(r'(\d+)')).strip()
        if not price:
            price_elem = hxs.select(
                '//span[@id="store-price"][1]/text()').extract()
            if price_elem:
                price = price_elem[0]
            if not price:
                # Last resort: join digits from the per-store price blocks.
                store_prices = hxs.select(
                    '//div[contains(@id, "store-")]//div[@class="price"]//text()'
                ).extract()
                try:
                    price = '.'.join(
                        re.findall(r'(\d+)', '.'.join(store_prices[:3])))
                except:
                    price = '0.00'
        else:
            price = price[0]
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('category', product_data[u'Category'])
    product_loader.add_value('name', product_data[u'prod_name_en'])
    product_loader.add_value('sku', product_data[u'P_RollupKey'])
    product_loader.add_value('price', price.replace(',', ''))
    product_loader.add_value('identifier', product_data[u'P_UniqueKey'])
    product_loader.add_value('url', response.url)
    product_loader.add_value('brand',
                             response.meta['brand'].strip().lower())
    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product
    # the same as canadiantire.ca
    # http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
    # http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
    # <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
    try:
        part2 = product['sku']
    except:
        self.errors.append('WARNING: No sku in %s' % response.url)
        yield product
    else:
        if not part2:
            self.errors.append('WARNING: No sku in %s' % response.url)
            yield product
        else:
            # Reviews keyed by product sku via the Bazaarvoice batch API.
            reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=e6wzzmz844l2kk3v6v7igfl6i&apiversion=5.4&displaycode=2036-en_ca&resource.q2=reviews&filter.q2=isratingsonly%3Aeq%3Afalse&filter.q2=productid%3Aeq%3A' + part2
            yield Request(reviews_url, meta=response.meta,
                          callback=self.parse_reviews)