def __init__(self, *args, **kwargs):
    """Initialise the spider and open the PhantomJS browser it keeps.

    All positional and keyword arguments are forwarded unchanged to the
    parent class constructor.
    """
    super(VodafoneSpider, self).__init__(*args, **kwargs)

    # Register self.spider_closed as the receiver of the spider_closed signal.
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    # The browser is created once here and stored on the instance as
    # self._browser.
    self.log('>>> BROWSER: Open browser')
    self._browser = PhantomJS()
    self.log('>>> BROWSER: OK')
def parse_product(self, response):
    """Build a product from a browser-rendered page and request its reviews.

    The page at ``response.url`` is loaded again in a throw-away PhantomJS
    browser and the SKU, price, image and Bazaarvoice identifiers are read
    from that rendered source. Name and category are read from the plain
    ``response`` through the product loader.

    ``response.meta`` must contain a ``'brand'`` entry. The loaded product is
    stored in ``response.meta['product']`` and the whole meta dict is passed
    on to ``self.parse_review_js``.

    Yields a single ``Request`` for the Bazaarvoice reviews script. Yields
    nothing when the SKU, price or Bazaarvoice identifiers cannot be found,
    or when the brand appears neither in the product name nor in the page
    body.
    """
    browser = PhantomJS()
    try:
        self.log('>>> BROWSER: GET => %s' % response.url)
        browser.get(response.url)
        self.log('>>> BROWSER: OK!')
        hxs = HtmlXPathSelector(text=browser.driver.page_source)
    finally:
        # Release the browser even when loading or reading the page fails.
        browser.close()
        self.log('>>> BROWSER: Closed')

    sku = hxs.select(u'//*[@class="displaySkuCode"]//text()').extract()
    if not sku:
        self.log('>>> SKU not found, skipping => %s' % response.url)
        return
    sku = sku[0].replace('#', '')

    price = hxs.select(
        u'//div[contains(@class, "product-price__reg-price")]/text()'
    ).extract()
    if not price:
        self.log('>>> Price not found, skipping => %s' % response.url)
        return

    brand = response.meta['brand'].lower()

    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath(
        'name', u'//div[contains(@class,"title")]//h1/text()')
    product_loader.add_value('sku', sku)
    product_loader.add_xpath(
        'category',
        u'//ul[contains(@class, "pd-breadcrumbs")]/li[2]/a/text()')
    product_loader.add_value('identifier', sku)
    product_loader.add_value('price', price[0].replace('Reg.', ''))
    product_loader.add_value('brand', brand)
    product_loader.add_value('url', response.url)

    image_url = hxs.select(
        u'/html/head/link[@rel="image_src"]/@href').extract()
    if image_url:
        product_loader.add_value('image_url', image_url[0])

    product = product_loader.load_item()

    metadata = KeterMeta()
    metadata['brand'] = response.meta['brand']
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    # Skip pages that mention the brand neither in the name nor in the body.
    if brand not in product['name'] and brand not in response.body.lower():
        return

    # Example of how the reviews URL relates to the product page:
    #   product: http://www.canadiantire.ca/AST/browse/2/OutdoorLiving/3/OutdoorStorage/Sheds/PRD~0600292P/Keter+Rattan+Vertical+Shed.jsp?locale=en
    #   reviews: http://canadiantire.ugc.bazaarvoice.com/9045/0600292P/reviews.djs?format=embeddedhtml
    #   script:  <script language="JavaScript" src="http://canadiantire.ugc.bazaarvoice.com/static/9045/bvapi.js" type="text/javascript"></script>
    script_src = hxs.select(
        u'//script[starts-with(@src,"http://canadiantire.ugc.bazaarvoice.com/static/")]/@src'
    ).extract()
    product_code = hxs.select(
        '//div[@id="bazaarVoiceConfig"]/@data-product-code').extract()
    if not script_src or not product_code:
        self.log('>>> Bazaarvoice identifiers not found, skipping => %s'
                 % response.url)
        return

    # The second-to-last path segment of the script URL is the first part
    # of the reviews URL (``9045`` in the example above).
    part1 = script_src[0].split('/')[-2]
    part2 = product_code[0]

    yield Request(
        'http://canadiantire.ugc.bazaarvoice.com/%s/%s/reviews.djs?format=embeddedhtml'
        % (part1, part2),
        meta=response.meta,
        callback=self.parse_review_js)
def start_requests(self):
    """Load the home page in PhantomJS and yield the requests parsed from it.

    The page is fetched with a throw-away browser, its rendered source is
    captured after a fixed wait, and that source is handed together with the
    URL to ``self.parse``. Every request produced by ``self.parse`` is
    yielded.
    """
    url = 'http://www.nisbets.co.uk/Homepage.action'

    browser = PhantomJS()
    try:
        self.log('>>> BROWSER: GET => %s' % url)
        browser.get(url)
        self.log('>>> BROWSER: OK')
        # Fixed two-minute wait before reading the page source
        # (presumably to let the page finish rendering -- TODO confirm).
        time.sleep(120)
        page_source = browser.driver.page_source
    finally:
        # Release the browser even when loading or reading the page fails.
        browser.close()

    for req in self.parse(url, page_source):
        yield req
def parse(self, response):
    """Yield one pagination request per category link in the navigation.

    The page at ``response.url`` is rendered in a throw-away PhantomJS
    browser and every anchor under ``div#nav-full`` that has an ``href`` is
    followed with ``self.parse_pagination`` as callback. Each request
    carries a copy of ``response.meta`` extended with a ``'category'``
    entry: the text of the anchor's ``span``, or ``''`` when it has none.
    """
    base_url = get_base_url(response)

    browser = PhantomJS()
    try:
        browser.get(response.url)
        hxs = HtmlXPathSelector(text=browser.driver.page_source)
    finally:
        # Release the browser even when loading or reading the page fails.
        browser.close()

    for category in hxs.select('//div[@id="nav-full"]//a'):
        url = category.select('./@href').extract()
        if not url:
            continue
        category_name = category.select('./span/text()').extract()
        # Work on a copy so the incoming response's meta is left untouched.
        meta = dict(response.meta)
        meta['category'] = category_name[0] if category_name else ''
        yield Request(urljoin_rfc(base_url, url[0]),
                      meta=meta,
                      callback=self.parse_pagination)
def _get_new_browser(self):
    """Return a new PhantomJS browser with a random user agent.

    One ``https`` proxy is requested from the proxy service for
    ``self.proxy_target_id``. When the service returns one, its URL is split
    at ``'://'`` into type and host and passed to the browser; otherwise the
    browser is created with ``proxy=None``. Image loading is enabled.
    """
    api = ProxyServiceAPI(host=PROXY_SERVICE_HOST,
                          user=PROXY_SERVICE_USER,
                          password=PROXY_SERVICE_PSWD)
    candidates = api.get_proxy_list(self.proxy_target_id,
                                    types='https',
                                    log=self.log,
                                    length=1)

    proxy_settings = None
    if candidates:
        scheme, address = candidates[0]['url'].split('://')
        proxy_settings = {'host': address, 'type': scheme}

    agent = random.choice(self._all_user_agents)
    return PhantomJS(load_images=True,
                     proxy=proxy_settings,
                     user_agent=agent)
def renew_browser(self, browser_profile=None, browser_blocked=False):
    """Start a fresh PhantomJS browser and record it in a browser profile.

    :param browser_profile: existing profile dict to renew, or a false value
        to create a new profile. When given, its current ``'webdriver'`` is
        quit before the new browser is started.
    :param browser_blocked: whether the renewed profile was blocked. When
        true (and a profile was given) the profile's ``'proxy_id'`` is sent
        to the proxy service as blocked, and ``'retry_no'`` is incremented
        instead of being reset.

    The profile ends up with the keys ``'webdriver'``, ``'useragent'``,
    ``'proxy'``, ``'proxy_id'``, ``'retry'`` and ``'retry_no'``. Returns
    ``None``.
    """
    api = ProxyServiceAPI(host=PROXY_SERVICE_HOST,
                          user=PROXY_SERVICE_USER,
                          password=PROXY_SERVICE_PSWD)

    blocked_ids = []
    if not browser_profile:
        browser_profile = {}
    else:
        # Shut down the old driver before replacing it.
        browser_profile['webdriver'].quit()
        if browser_blocked:
            blocked_ids.append(browser_profile['proxy_id'])

    candidates = api.get_proxy_list(self.proxy_service_target_id,
                                    locations=self.proxy_service_location,
                                    types='https',
                                    blocked=blocked_ids,
                                    log=self.log,
                                    length=1)

    # Without a proxy from the service the browser runs unproxied and the
    # profile records empty proxy details.
    proxy_settings = None
    proxy_info = {'id': '', 'url': ''}
    if candidates:
        proxy_info = candidates[0]
        scheme, address = proxy_info['url'].split('://')
        proxy_settings = {'host': address, 'type': scheme}

    agent = self.user_agents.next()
    new_browser = PhantomJS(proxy=proxy_settings,
                            user_agent=agent,
                            load_images=False)

    browser_profile['webdriver'] = new_browser.driver
    browser_profile['useragent'] = agent
    browser_profile['proxy'] = proxy_info['url']
    browser_profile['proxy_id'] = proxy_info['id']
    browser_profile['retry'] = browser_blocked

    if browser_blocked:
        browser_profile['retry_no'] = int(browser_profile.get('retry_no', 0)) + 1
    else:
        browser_profile['retry_no'] = 0
        # Only non-blocked renewals are added to the pool.
        # NOTE(review): an existing profile renewed with browser_blocked=False
        # is appended again here -- confirm this is intended.
        self._browsers.append(browser_profile)
def parse(self, response):
    """Yield one request per category link found on the rendered start page.

    ``self.start_urls[0]`` is loaded in a throw-away PhantomJS browser, the
    input under ``p.style-inc`` is clicked, and the rendered source is then
    scanned for category links. Each link is requested with
    ``self.parse_cat`` as callback and its link text as ``meta['category']``.
    """
    url = self.start_urls[0]

    browser = PhantomJS()
    try:
        self.log('>>> BROWSER: GET => %s' % url)
        browser.get(url)
        self.log('>>> BROWSER: OK')
        # Fixed waits around the click (presumably to let the page load and
        # then react to the click -- TODO confirm the durations).
        time.sleep(120)
        browser.driver.find_element_by_xpath(
            '//p[@class="style-inc"]//input').click()
        time.sleep(30)
        page_source = browser.driver.page_source
    finally:
        # Release the browser even when the element is missing or the page
        # fails to load.
        browser.close()

    hxs = HtmlXPathSelector(text=page_source)
    for cat in hxs.select('//ul[@class="clear-after"]/li/ul/li/a'):
        yield Request(
            urljoin_rfc(url, cat.select('./@href').extract()[0]),
            callback=self.parse_cat,
            meta={'category': cat.select('./text()').extract()[0]})
def parse(self, response):
    """Find the download link in the mailbox and request the feed file.

    The IMAP inbox is scanned in reverse UID order for the first message
    whose subject contains ``'Nouveau message'`` and whose body contains a
    ``DownloadToken`` URL. That URL is opened in a throw-away PhantomJS
    browser, the file id is read from the rendered page, and a request for
    the file is yielded with ``self.parse_feed`` as callback.

    Yields nothing (after logging the reason) when no such mail exists or
    the rendered page contains no file link.
    """
    # NOTE(review): the mailbox credentials are hard-coded below; they should
    # be moved out of the source (e.g. into settings) and rotated.
    url = None
    client = imaplib.IMAP4_SSL('imap.gmail.com', 993)
    try:
        client.login('totalfeedcompetitormonitor', 'uyWTStB6')
        client.select('INBOX')
        # Reverse the UID list so the scan starts from the end of the
        # search result (the latest link is wanted).
        mails = client.uid('search', 'ALL')[1][0].split()[::-1]
        for mail_uid in mails:
            mail = client.uid('fetch', mail_uid, '(RFC822)')
            mail = email.message_from_string(mail[1][0][1])
            subject = email.header.decode_header(mail['Subject'])[0][0]
            if 'Nouveau message' not in subject:
                continue
            body = ' '.join([m.get_payload() for m in mail.get_payload()])
            match = re.search('(http.*?DownloadToken.*)', body)
            if match is None:
                # Matching subject but no link: keep looking at other mails.
                continue
            url = match.group(1).replace('\r', '')
            break
    finally:
        # Always end the IMAP session, even when the scan fails.
        try:
            client.logout()
        except imaplib.IMAP4.error:
            self.log('>>> IMAP: logout failed')

    if url is None:
        self.log('>>> No mail with a download link found')
        return

    browser = PhantomJS()
    try:
        self.log('>>> BROWSER: GET => %s' % url)
        browser.get(url)
        self.log('>>> BROWSER: OK')
        # Fixed three-minute wait before reading the page source
        # (presumably to let the page finish rendering -- TODO confirm).
        time.sleep(180)
        page_source = browser.driver.page_source
    finally:
        # Release the browser even when loading or reading the page fails.
        browser.close()

    token = urlparse.parse_qs(urlparse.urlparse(url).query)['token'][0]

    hxs = HtmlXPathSelector(text=page_source)
    link_id = hxs.select('//h3[@class="unit-name"]/a/@id').re('file_(.*)')
    if not link_id:
        self.log('>>> No file link found on => %s' % url)
        return

    download_link = 'https://poseidon.hubtotal.net/zephyr/MFTWebAppDownloadToken/Download?file={}&token={}'.format(
        link_id[0], token)
    yield Request(download_link, callback=self.parse_feed)