def __init__(self):
    """Launch the PhantomJS binary bundled next to this module.

    The service log is discarded and an implicit wait of 3s is set.
    """
    app_root = os.path.dirname(os.path.abspath(__file__))
    print(app_root)  # debug: show where the phantomjs binary is expected
    self.req = 0  # request counter
    self.driver = PhantomJS(
        app_root + "/phantomjs",
        service_log_path=os.path.devnull,
    )
    self.driver.implicitly_wait(3)
def __init__(self):
    """Initialise the crawler for cnstock.com (中国证券网)."""
    super().__init__(init=False)
    browser = PhantomJS()
    browser.maximize_window()
    self.driver = browser
    # Shared explicit-wait helper for page interactions.
    self.wait = WebDriverWait(browser, 15)
    self.url = 'http://www.cnstock.com/'
    self.name = '中国证券网'
class Parser(object):
    """Thin wrapper owning a PhantomJS browser instance."""

    def __init__(self):
        self.browser = PhantomJS()

    def cleanup(self):
        """Shut the browser down and release its resources."""
        self.browser.quit()
def init_phantom(self):
    """Start the PhantomJS browser used for JS-rendered crawling.

    Builds service arguments (proxy, per-job cookie file, SSL and image
    settings) and capabilities (user agent, no screenshots, no JS window
    control), then creates the driver with generous timeouts.
    """
    # Per-job file prefix for the cookie file and the service log.
    self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'), HYPHE_PROJECT, self.name, self.crawler.settings['JOBID'])
    self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
    phantom_args = []
    # A PROXY value starting with ':' means "no host configured" — skip it.
    if PROXY and not PROXY.startswith(':'):
        phantom_args.append('--proxy=%s' % PROXY)
    phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
    phantom_args.append('--ignore-ssl-errors=true')
    phantom_args.append('--load-images=false')  # skip images for speed
    self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
    self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
    self.capabilities['takesScreenshot'] = False
    self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
    self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
    self.phantom = PhantomJS(
        executable_path=PHANTOM['PATH'],
        service_args=phantom_args,
        desired_capabilities=self.capabilities,
        service_log_path="%s-phantomjs.log" % self.prefixfiles)
    self.phantom.implicitly_wait(10)
    self.phantom.set_page_load_timeout(60)
    # Script timeout must exceed the crawl's own phantom timeout budget.
    self.phantom.set_script_timeout(self.ph_timeout + 15)
def test_plotly(remove_build):
    """Tests plotly: build a Bowtie app, serve it and check the title."""
    viz = Plotly()
    ctrl = Nouislider()
    ctrl2 = Button()
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'build')
    layout = Layout(directory=path)
    layout.add(viz)
    layout.add_sidebar(ctrl)
    layout.add_sidebar(ctrl2)
    layout.subscribe(callback, ctrl.on_change)
    layout.subscribe(callback, ctrl2.on_click)
    layout.build()
    env = os.environ
    env['PYTHONPATH'] = '{}:{}'.format(os.getcwd(), os.environ.get('PYTHONPATH', ''))
    server = subprocess.Popen(os.path.join(path, 'src/server.py'), env=env)
    driver = None
    try:
        time.sleep(5)  # give the server time to come up
        driver = PhantomJS()
        driver.get('http://localhost:9991')
        assert driver.title == 'Bowtie App'
    finally:
        # Fix: previously a failing assert (or a driver error) leaked both
        # the browser and the child server process.
        if driver is not None:
            driver.quit()
        server.kill()
def export(plot, filename, width=800, height=600):
    """
    Export plot to file.

    Args:
        plot (quorra.Plot): Quorra plot object to export.
        width (int): Width for plot (pixels).
        height (int): Height for plot (pixels).
        filename (str): Filename to export to ('.png' appended if missing).
    """
    global _phantom, __templates__, __cwd__
    if _phantom is None:
        # Lazily start a single shared PhantomJS instance.
        from selenium.webdriver import PhantomJS
        _phantom = PhantomJS(service_log_path=os.path.devnull)
    tmpl = os.path.join(__templates__, 'export.html')
    exp = os.path.join(__cwd__, '.' + str(uuid.uuid1()) + '.html')
    try:
        # Render the plot into a temporary HTML page from the template.
        with open(tmpl, 'r') as fi, open(exp, 'w') as fo:
            dat = fi.read()
            dat = dat.replace('var plot = undefined;', 'var plot = {};'.format(str(plot)))
            dat = dat.replace('width: 800px;', 'width: {}px;'.format(width))
            dat = dat.replace('height: 500px;', 'height: {}px;'.format(height))
            fo.write(dat)
        _phantom.get('file://' + exp)
        # Fix: the old `filename.replace('.png', '')` removed the substring
        # anywhere in the path (mangling e.g. 'out.png.d/plot'); strip the
        # suffix only when it is actually a suffix.
        base = filename[:-4] if filename.endswith('.png') else filename
        _phantom.save_screenshot(base + '.png')
    finally:
        if os.path.exists(exp):
            os.remove(exp)
    return
def chrome(self, chromedriver_path=None, disable_log=True, strip_ua4headless=True):
    """Create a headless Chrome driver on ``self.driver``.

    Better to place chromedriver and chrome/chromium binaries in the PATH;
    in that case chromedriver_path can be omitted and left as None.
    Otherwise place them under the same directory and set chromedriver_path.
    If chromedriver and chrome/chromium live in different paths, beyond
    chromedriver_path also set ``options.binary_location = '/path'``.

    On a WebDriverException self.driver is set to None and the error logged.
    """
    options = ChromeOptions()
    options.add_argument('headless')
    options.add_argument('no-sandbox')
    if disable_log:
        # Silence both the browser and the chromedriver service logging.
        options.add_argument('log-level=3')
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
    try:
        if chromedriver_path:
            self.driver = Chrome(options=options, executable_path=chromedriver_path)
        else:
            self.driver = Chrome(options=options)
    except WebDriverException as e:
        logger.error(e.msg)
        self.driver = None
        return
    # self.driver.set_page_load_timeout(20)
    if strip_ua4headless:
        # Remove the telltale "HeadlessChrome" token from the UA string so
        # sites cannot trivially detect headless mode.
        import re
        ua = re.sub('(?i)headless', '', self.ua())
        self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": ua})
def main():
    """Read the config, fetch the user's Steam library and write sale info."""
    steam_id, api, return_amount, user_categories = read_config_values()
    print("SteamID:", steam_id)
    print("API key:", api)
    print("Return amount:", return_amount)
    if user_categories:  # non-empty category list
        check_user_categories_validity(user_categories)
        print("Categories:", "; ".join(user_categories))
    print()
    print("Fetching your Steam library..")
    user_library = fetch_user_library(api, steam_id)
    print("Found {} in your library.".format(len(user_library)))
    print("Opening PhantomJS..")
    driver = PhantomJS(
        cwd + r"\dependencies\phantomJS\phantomjs.exe",
        service_log_path=cwd + r"\dependencies\phantomJS\ghostdriver.log",
    )
    print("Opening SteamDB..")
    output = fetch_sales(driver, user_library, return_amount, user_categories)
    driver.quit()
    with open("games.txt", 'w', encoding='utf-8') as file:
        file.write(output)
    input("\nDone. I also wrote the games to a text file.")
def scrape_statuses(self):
    """Fetch the MTA status page and record the status of every line."""
    headless = PhantomJS()
    headless.get(MTA_URL)
    soup = BeautifulSoup(headless.page_source, "html.parser")
    self.lines.extend(self.get_line(soup, name) for name in LINES)
def __init__(self, url, phantomjs=None, resolution=None, ya_class=None,
             screen_path=None, screen_pattern=None, csv_path=None):
    """Configure a PhantomJS-based screenshot/statistics collector.

    All optional arguments fall back to module-level defaults.
    """
    self.url = url
    self.phantomjs = phantomjs or DEFAULT_PHANTOMJS
    assert os.path.isfile(self.phantomjs), "phantomjs не найден"
    resolution = resolution or FULLHD
    assert isinstance(resolution, (list, tuple))
    assert len(resolution) == 2
    self.ya_class = ya_class or DEFAULT_YA_CLASS
    self.screen_path = screen_path or PATH
    # Pattern must contain a %s slot for the screenshot name.
    self.screen_pattern = screen_pattern or '%s.png'
    assert '%s' in self.screen_pattern
    self.csv_path = csv_path or os_join(PATH, 'statistic.csv')
    self.driver = PhantomJS(self.phantomjs)
    self.driver.set_window_size(*resolution)
def create_driver(self):
    """Create self.driver: Firefox with the account's own profile.

    The ``if 1:`` toggle keeps an alternative PhantomJS+proxy setup (the
    ``else`` branch) around as switchable dead code.
    """
    if 1:
        caps = DesiredCapabilities().FIREFOX.copy()
        # Profile directory is named after the account.
        profile_path = path.expanduser(
            '~') + '/.mozilla/firefox/' + self.account['name']
        # caps['proxy'] = {
        caps['moz:firefoxOptions'] = {
            "args": ["-profile", profile_path],  # geckodriver 0.18+
        }
        profile = FirefoxProfile(profile_path)
        #profile.set_preference("general.useragent.override", 'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0')
        self.driver = Firefox(profile, capabilities=caps)
        #self.driver = Firefox(profile)
    else:
        # PhantomJS
        # https://github.com/detro/ghostdriver
        caps = DesiredCapabilities().PHANTOMJS
        caps["phantomjs.page.settings.userAgent"] = \
            'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0'
        # Proxy is host:port taken from the account record.
        service_args = [
            '--proxy={}'.format(':'.join(
                self.account['Proxy'].split(':')[:2])),
            '--proxy-type=http',
        ]
        print(service_args)
        self.driver = PhantomJS(service_args=service_args, capabilities=caps)
        # NOTE(review): placement of the resize relative to the if/else is
        # ambiguous in the original formatting — confirm against history.
        self.driver.set_window_size(1120, 550)
def is_logged_in(browser: PhantomJS):
    """Open the order-history page and report whether a profile name shows."""
    browser.get('https://tiki.vn/sales/order/history/')
    full_name = browser.find_element_by_css_selector('.profiles > h6:nth-child(3)')
    if not full_name.text:
        return False
    logger.info("You has been login with name : {}".format(full_name.text))
    return True
def __init__(self):
    """Start PhantomJS presenting a desktop Firefox user agent."""
    caps = dict(DesiredCapabilities.PHANTOMJS)
    # Spoof the UA so the target site serves its desktop page.
    caps["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 "
    )
    self.driver = PhantomJS(
        executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe',
        desired_capabilities=caps,
    )
def payment(browser: PhantomJS, item_name):
    """Pick the cash-on-delivery option, place the order and screenshot it."""
    logger.info('Payment : {}'.format(item_name))
    waiter = WebDriverWait(driver=browser, timeout=10)
    waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#btn-placeorder')))
    # Tick the COD radio control, then submit the order.
    cod = browser.find_element_by_css_selector(
        '.method_payment_cod > div:nth-child(1) > label:nth-child(1) > div:nth-child(1) > ins:nth-child(2)')
    cod.click()
    browser.find_element_by_css_selector('#btn-placeorder').click()
    screen_shot(browser, 'payment.png', item_name=item_name)
def getHtmlSource(url, time=10):
    """Load *url* in PhantomJS (ignoring SSL problems) and return the HTML.

    Args:
        url: Page to fetch.
        time: Wait budget in seconds (name kept for backward compatibility,
              although it shadows the stdlib ``time`` module).
    """
    driver = PhantomJS(service_args=[
        '--ignore-ssl-errors=true', '--ssl-protocol=any',
        '--web-security=false'
    ])
    try:
        driver.get(url)
        # Fix: the original built WebDriverWait(driver, timeout=time) but
        # never called .until(), so it waited for nothing; an implicit wait
        # actually applies the `time` budget to element lookups.
        driver.implicitly_wait(time)
        source = driver.page_source
        #driver.save_screenshot('a.png')
        return source
    finally:
        # Fix: the PhantomJS process was leaked on every call.
        driver.quit()
def __init__(self):
    """Set the page range and pick a PhantomJS binary from config if given."""
    self.start_page = START_PAGE
    self.end_page = END_PAGE
    self.weixin_url = REFER_FIRST
    # self.driver = Firefox()
    if not hasattr(config, 'PHANTOMJS_PATH'):
        self.driver = PhantomJS()
    else:
        # Use the explicitly configured binary path.
        self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
class Crawler:
    """PhantomJS crawler that routes its traffic through a proxy pool.

    Usable as a context manager; the driver is quit on exit. On startup
    the apparent IP is checked and the process aborts if the proxy did
    not take effect.
    """

    def __init__(self, timeout=20, phantomjs_cfg_file='python-utils/config/phantomjs_cfg.json', use_cfg_file=False, proxy_pool_server='http://127.0.0.1:15110'):
        # Either load all PhantomJS settings from a config file, or fetch
        # a proxy from the pool server and pass it on the command line.
        self.timeout = timeout
        if use_cfg_file:
            phantomjs_service_args = ['--config={}'.format(phantomjs_cfg_file)]
        else:
            _, proxy_type, proxy, proxy_auth = get_proxy(proxy_pool_server)
            phantomjs_service_args = [
                '--proxy-type={}'.format(proxy_type),
                '--proxy={}'.format(proxy),
                '--proxy-auth={}'.format(proxy_auth),
            ]
        self.driver = PhantomJS(
            desired_capabilities=self.new_desired_capabilities(),
            service_args=phantomjs_service_args)
        self.check_client_info()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self.driver.quit()

    @contextmanager
    def wait_for_page_load(self, old_element):
        # Yield to the caller, then block until *old_element* goes stale,
        # i.e. the new page has replaced the old DOM.
        yield
        WebDriverWait(self.driver, self.timeout).until(EC.staleness_of(old_element))

    def new_desired_capabilities(self, user_agent=default_ua):
        # Capabilities with the given UA; a falsy UA means "pick a random one".
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        if not user_agent:
            user_agent = ua.random
        desired_capabilities["phantomjs.page.settings.userAgent"] = user_agent
        return desired_capabilities

    def check_client_info(self):
        # Ask an echo service for our apparent IP and UA, and abort the
        # whole process if we are exposing a direct (non-proxy) address.
        url = 'http://www.whoishostingthis.com/tools/user-agent/'
        self.driver.get(url)
        ip_addr = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[2]/span').text.strip()
        user_agent = get_xpath_element(self.driver, '//*[@id="user-agent"]/div[1]').text.strip()
        logger.info('IP: {}, User-Agent: {}'.format(ip_addr, user_agent))
        if self.wrong_ip(ip_addr):
            logger.error('Proxy not set correctly!')
            sys.exit(-1)

    def wrong_ip(self, ip_addr):
        # Address ranges that indicate the proxy is NOT in use.
        if ip_addr.startswith('166.111.') or ip_addr.startswith('59.66.') or ip_addr.startswith('101.5.') or ip_addr.startswith('101.6.'):
            return True
        else:
            return False
def main():
    """Scrape NL subsidy data for 2014 into the CSV file named on argv."""
    driver = PhantomJS()
    scraper = NLScraper(driver, year=2014)
    print(sys.argv[1])
    fieldnames = ('amount', 'scheme', 'year', 'country', 'currency',
                  'recipient_name', 'recipient_postcode', 'recipient_id',
                  'recipient_location')
    writer = unicodecsv.DictWriter(open(sys.argv[1], 'w'), fieldnames)
    writer.writeheader()
    try:
        scraper.start(writer)
    finally:
        driver.quit()
def __init__(self):
    """Default constructor.

    ARGS: None
    RETURNS: None
    """
    # Binary lives under ./drivers; a free port avoids clashes when several
    # scrapers run at once. If not specified PhantomJS would search PATH.
    self.browser = PhantomJS(executable_path='./drivers/phantomjs',
                             port=free_port())
    self.timeout = 5  # seconds
def setUp(self):
    """Create a browser, a provider user, and 20 adverts for the tests."""
    self.driver = PhantomJS()
    self.user = User.objects.create_user('admin', '*****@*****.**', 'pass')
    self.user.save()
    provider = Provider(name='provider', user=self.user)
    provider.save()
    self.provider = provider
    # Bulk fixture: 20 adverts owned by the provider.
    self.provider_adverts = mommy.make(Advertisement, _quantity=20,
                                       provider=provider)
def onegoogolePR(self, url):
    """Query pr.chinaz.com for the Google PageRank of *url*.

    Returns the PR digit as a string, or '暂无数据' when it cannot be read.
    """
    prUrl = 'http://pr.chinaz.com'  # 谷歌PR查询地址 (Google PR lookup service)
    driver = PhantomJS()
    try:
        driver.get(prUrl)
        driver.find_element_by_id('PRAddress').send_keys(url)
        driver.find_element_by_class_name('search-write-btn').click()
        try:
            imgsrc = driver.find_element_by_css_selector('span#pr>img').get_attribute('src')
            pr = search(r'\d', imgsrc).group()
        except Exception:
            # Fix: was a bare `except:` that also swallowed SystemExit /
            # KeyboardInterrupt.
            pr = '暂无数据'
    finally:
        # Fix: quit in `finally` so a failure in get()/send_keys no longer
        # leaks the PhantomJS process.
        driver.quit()
    return pr
def __init__(self):
    """Set up the page range, the browser and the MongoDB collection."""
    self.start_page = START_PAGE
    self.end_page = END_PAGE
    self.weixin_url = REFER_FIRST
    # self.driver = Firefox()
    if not hasattr(config, 'PHANTOMJS_PATH'):
        self.driver = PhantomJS()
    else:
        # Use the explicitly configured binary path.
        self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
    self.client = MongoClient(HOST, PORT)
    self.collection = self.client[DB][COLLECTION]
    self.all_uids = self.uids
def run(self):
    """Starts the crawling process the listed websites.

    The results queue will start filling up with image URLs.
    """
    self.__running = True
    # Open up all browser windows
    for i in range(self.__browser_instance_cnt):
        if not self.__running:
            break  # End prematurely
        browser = Driver(executable_path=paths.driver)
        # Set up the browser to be closable
        self.__browser_close_methods.append(browser.quit)
        # Set the page timeout
        browser.set_page_load_timeout(self.__load_timeout)
        self.__browser_pool.put(browser)
    crawl_threads = []

    # Starts crawling the page and returns the given browser to the pool
    # when finished
    def crawl_and_return_to_pool(url, browser):
        # Each site contributes an equal share to overall progress (%).
        progress_weight = (1 / len(self.__website_list)) * 100
        self._crawl_page(url, browser, progress_weight)
        self.__browser_pool.put(browser)

    # Start crawling each URL
    for url in self.__website_list:
        if not self.__running:
            break  # End prematurely
        # Wait for an unused browser instance
        browser = self.__browser_pool.get()
        # Start crawling
        thread = Thread(target=crawl_and_return_to_pool, args=(url, browser))
        thread.start()
        crawl_threads.append(thread)
    # Wait for crawling to finish
    for thread in crawl_threads:
        thread.join()
    self._close_browsers()
    self.__running = False
    self.__is_finished = True
class KeywordTool(object):
    """Scrape keyword suggestions from keywordtool.io.

    ``source`` selects the site tab (google, youtube, ...); invalid values
    silently fall back to 'google'. ``base_url`` and ``current_url`` are
    derived properties whose setters deliberately ignore assignment.
    """

    sources = {'google', 'youtube', 'bing', 'amazon', 'ebay', 'app-store'}

    def __init__(self, source='google', timeout=5):
        self.source = source   # property: validates; no page load yet (no driver)
        self.base_url = None   # no-op setter; kept so __init__ reads naturally
        self.timeout = timeout
        self.driver = PhantomJS()
        self.driver.get(self.base_url)

    def search(self, search_term):
        """Submit *search_term* and wait for suggestions to render."""
        if self.current_url != self.base_url:
            self.source = self.source  # forces page load
        self.driver.find_element_by_xpath(
            '//input[@id="edit-keyword"]').send_keys(search_term)
        self.driver.find_element_by_xpath(
            '//button[@id="edit-submit"]').click()
        """Wait for at least one element to load. In practice, most of them
        load. You can't get them all without scrolling."""
        element_not_present = EC.invisibility_of_element_located(
            (By.XPATH, '//td[@class="col-keywords"]//div'))
        WebDriverWait(self.driver, self.timeout).until(element_not_present)

    def parse(self):
        """Return the non-empty keyword strings from the results column."""
        tree = html.fromstring(self.driver.page_source)
        L = tree.xpath('//td[@class="col-keywords"]//text()')
        L = map(lambda s: s.strip(), ''.join(L).split('\n'))
        return [s for s in L if s]

    def get_keywords(self, search_term, source='google'):
        """Search *search_term* under *source* and return parsed keywords."""
        if self.source != source:
            self.source = source
        self.search(search_term)
        return self.parse()

    @property
    def source(self):
        return self._source

    @source.setter
    def source(self, val):
        # Fall back to 'google' on invalid input; once a driver exists,
        # changing the source navigates to the matching page.
        self._source = val if val in self.sources else 'google'
        if 'driver' in self.__dict__:
            self.driver.get(self.base_url)

    @property
    def base_url(self):
        return ''.join(['https://keywordtool.io/', self.source])

    @base_url.setter
    def base_url(self, val):
        # Derived from `source`; assignment is deliberately ignored.
        pass

    @property
    def current_url(self):
        return self.driver.current_url

    @current_url.setter
    def current_url(self, val):
        # Read-only in practice; assignment is deliberately ignored.
        pass
def __init__(self, login, password, userAgent=LINUX_USER_AGENT):
    """Store the credentials and start PhantomJS with the given user agent.

    :param login: account login
    :param password: account password
    :param userAgent: UA string presented to the site
    """
    self.login = login
    self.password = password
    caps = dict(DesiredCapabilities.PHANTOMJS)
    caps["phantomjs.page.settings.userAgent"] = userAgent
    self.driver = PhantomJS(desired_capabilities=caps)
    self.driver.set_window_size(1366, 768)
def init_phantom(self):
    """Create the PhantomJS driver used when JS rendering is required.

    Configures a per-job cookie file and log, an optional proxy, and
    capabilities that disable screenshots and JS-driven window control.
    """
    self.prefixfiles = os.path.join(
        scrapyd_config().get('logs_dir'),
        HYPHE_PROJECT,
        self.name,
        self.crawler.settings['JOBID']
    )
    self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
    phantom_args = []
    # A PROXY starting with ':' carries no host — treat as unset.
    if PROXY and not PROXY.startswith(':'):
        phantom_args.append('--proxy=%s' % PROXY)
    phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
    phantom_args.append('--ignore-ssl-errors=true')
    phantom_args.append('--load-images=false')  # faster page loads
    self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
    self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
    self.capabilities['takesScreenshot'] = False
    self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
    self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
    self.phantom = PhantomJS(
        executable_path=PHANTOM['PATH'],
        service_args=phantom_args,
        desired_capabilities=self.capabilities,
        service_log_path="%s-phantomjs.log" % self.prefixfiles
    )
    self.phantom.implicitly_wait(10)
    self.phantom.set_page_load_timeout(60)
    # Allow a margin over the crawl's own phantom timeout.
    self.phantom.set_script_timeout(self.ph_timeout + 15)
def get_selenium(**kwargs):
    """Build a WebDriver according to settings.TEST_SELENIUM_DRIVER.

    Extra keyword arguments become Chrome command-line options
    (``key=value``, or a bare ``key`` flag when the value is None).
    """
    driver = getattr(settings, 'TEST_SELENIUM_DRIVER', 'firefox')
    if driver in ('chrome', 'chrome_headless'):
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
        options = Options()
        if driver == 'chrome_headless':
            options.add_argument('headless')
            options.add_argument('disable-gpu')
        for key, val in kwargs.items():
            if val is None:
                options.add_argument('{key}'.format(key=key))
            else:
                options.add_argument('{key}={val}'.format(key=key, val=val))
        driver_path = os.environ.get('CHROME_DRIVER_PATH', None)
        if driver_path is not None:
            return ChromeDriver(driver_path, options=options)
        return ChromeDriver(options=options)
    elif driver == 'phantomjs':
        from selenium.webdriver import PhantomJS
        return PhantomJS()
    else:
        from selenium.webdriver.firefox.webdriver import WebDriver as FirefoxDriver
        return FirefoxDriver()
def get_crawler(self, dynamic):
    """Return a PhantomJS crawler for dynamic pages, or None otherwise."""
    crawler = None
    if dynamic:
        crawler = PhantomJS(
            "/Users/mac/Desktop/Web scraping/phantomjs-2.1.1-macosx/bin/phantomjs"
        )
    return crawler
def get_driver(self):
    """Start and return the WebDriver named in the [selenium] config."""
    browser_name = self.config.get('selenium', 'browser')
    if browser_name == 'phantomjs':
        from selenium.webdriver import PhantomJS
        return PhantomJS()
    if browser_name == 'chrome':
        from selenium.webdriver import Chrome
        return Chrome()
    if browser_name == 'firefox':
        from selenium.webdriver import Firefox
        driver = Firefox(firefox_binary=FirefoxBinary())
        # Workaround for http://stackoverflow.com/a/42770761/489916
        driver._is_remote = False
        return driver
    # @TODO: Add chrome
    raise RuntimeError('Unsupported/unknown browser')
def get_driver(self):
    """Start and return the WebDriver named in the [selenium] config."""
    browser_name = self.config.get('selenium', 'browser')
    if browser_name == 'phantomjs':
        from selenium.webdriver import PhantomJS
        return PhantomJS()
    if browser_name == 'chrome':
        from selenium.webdriver import Chrome
        return Chrome()
    if browser_name == 'firefox':
        from selenium.webdriver import Firefox
        binary = FirefoxBinary(self.config.get('selenium', 'firefox_path'))
        return Firefox(firefox_binary=binary)
    # @TODO: Add chrome
    raise RuntimeError('Unsupported/unknown browser')
def run_get_logic(driver: PhantomJS, command_id, token):
    """Authenticate via the token cookie and pull the flag from /cabinet.

    Returns the flag HTML, an error marker string, or an error dict when
    no token is available.
    """
    if not token:
        return {"code": 103, "public": "Session troubles!"}
    cookie_domain = "." + command_id.split(":")[0]
    driver.add_cookie({
        'name': 'token',
        'value': token,
        'domain': cookie_domain,
        'path': '/',
    })
    driver.get("http://{}/cabinet".format(command_id))
    try:
        holder = driver.find_element_by_xpath('//html//body//div//h5//i')
        return holder.get_attribute('innerHTML')
    except NoSuchElementException:
        return "error_no_flag_in_cabinet"
def phantomjs_driver(request, capabilities, driver_path):
    """Return a WebDriver using a PhantomJS instance."""
    options = {}
    if capabilities:
        options['desired_capabilities'] = capabilities
    if driver_path is not None:
        options['executable_path'] = driver_path
    return PhantomJS(**options)
def selenium(self, webdriverOption=0):
    """Download the page with a real browser (works for any page).

    webdriverOption: 0 = PhantomJS, 1 = Chrome, 2 = Firefox.
    Returns the page source (also stored on self.pageSource), or None on a
    non-http URL or when no driver could be created.
    """
    if not self.url[:4] == "http":
        return None
    driver = None
    if webdriverOption == 0:
        from selenium.webdriver import PhantomJS
        driver = PhantomJS()
    elif webdriverOption == 1:
        from selenium.webdriver import Chrome
        driver = Chrome()
    elif webdriverOption == 2:
        from selenium.webdriver import Firefox
        driver = Firefox()
    if not driver:
        print(u"-->DownLoader->Selenium driver初始化出错,请检查运行环境或webdriverOption选项")
        # Fix: previously fell through and crashed with AttributeError on
        # driver.get(None...); bail out explicitly instead.
        return None
    try:
        driver.get(self.url)
        src = driver.page_source
    finally:
        # Fix: quit even when get()/page_source raises, so the browser
        # process is never leaked.
        driver.quit()
    self.pageSource = src
    return src
def run_get_logic(driver: PhantomJS, comand_id, post, flag, cookies):
    """Replay the checker's session cookie and verify *flag* is on the page."""
    if 'sessions' not in cookies:
        return {"code": MUMBLE, "public": "Session troubles!"}
    driver.add_cookie({
        'name': 'sessions',
        'value': cookies['sessions'],
        'domain': "." + comand_id.split(":")[0],
        'path': '/',
    })
    driver.get("http://{}/{}".format(comand_id, post))
    try:
        holder = driver.find_element_by_xpath('//li/a[@href="#"]')
        content = holder.get_attribute('innerHTML')
    except NoSuchElementException:
        return {"code": CORRUPT, "public": "Can't find my private data!"}
    if flag in content:
        return {"code": OK}
    return {"code": CORRUPT, "public": "Can't find my private data!"}
class Client:
    """Minimal Instagram scraping client backed by PhantomJS."""

    def __init__(self, ig_id):
        # Open the public profile page for *ig_id*.
        self.b = PhantomJS()
        self.ig_id = ig_id
        self.b.get('https://instagram.com/%s' % ig_id)

    def close(self):
        self.b.close()

    def get_media(self) -> list:
        """Return media dicts (id/url/caption) for the currently open post.

        Only media whose accessibility caption mentions 'person' or
        'people' are kept.
        NOTE(review): assumes window._sharedData carries entry_data.PostPage;
        `medias` would be unbound for a __typename other than GraphSidecar
        or GraphImage — confirm those are the only possible values here.
        """
        js = self.b.execute_script('return window._sharedData;')
        ed = js['entry_data']
        pp = ed['PostPage'][0]
        g = pp['graphql']
        sc = g['shortcode_media']
        if sc['__typename'] == 'GraphSidecar':
            # Multi-image post: one media dict per carousel child.
            edges = sc['edge_sidecar_to_children']['edges']
            medias = list(
                map(
                    lambda x: {
                        'id': x['node']['id'],
                        'url': x['node']['display_url'],
                        'caption': x['node']['accessibility_caption']
                    }, edges))
        elif sc['__typename'] == 'GraphImage':
            # Single-image post.
            medias = [{
                'id': sc['id'],
                'url': sc['display_url'],
                'caption': sc['accessibility_caption']
            }]
        return list(
            filter(
                lambda x: 'person' in x['caption'] or 'people' in x['caption'],
                medias))

    def get_user(self) -> dict:
        """Return the profile's user dict from window._sharedData."""
        js = self.b.execute_script('return window._sharedData;')
        ed = js['entry_data']
        pp = ed['ProfilePage'][0]
        g = pp['graphql']
        return g['user']

    def get_posts(self) -> set:
        """Return the set of post URLs currently present in the DOM."""
        ps = self.b.find_elements_by_css_selector('a[href^="/p/"]')
        return set(map(lambda x: x.get_attribute('href'), ps))

    def scroll(self):
        # Trigger infinite-scroll loading of more posts.
        self.b.execute_script('window.scroll(0, document.body.scrollHeight);')
class SeleniumMiddleware:
    """Scrapy downloader middleware that renders requests with PhantomJS.

    NOTE(review): the driver is created at class-definition (import) time
    and is shared by every spider using this middleware — confirm that is
    intended rather than lazy per-spider creation.
    """

    driver = PhantomJS()

    def process_request(self, request, spider):
        # Expose the shared driver to the spider, render the URL, and hand
        # the rendered DOM back to Scrapy as an HtmlResponse.
        spider.driver = self.driver
        self.driver.get(request.url)
        return HtmlResponse(self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)
def get_selenium():
    """Return a WebDriver chosen by settings.TEST_SELENIUM_DRIVER."""
    name = getattr(settings, 'TEST_SELENIUM_DRIVER', 'firefox')
    if name == 'chrome':
        from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
        return ChromeDriver()
    if name == 'phantomjs':
        from selenium.webdriver import PhantomJS
        return PhantomJS()
    # Default: Firefox.
    from selenium.webdriver.firefox.webdriver import WebDriver as FirefoxDriver
    return FirefoxDriver()
def catalog_url(url='http://www.meitun.com/'):
    """Hover the first nav entry (its menu loads via AJAX) and return the
    category URLs found in the revealed panel."""
    driver = PhantomJS()
    driver.get(url)
    driver.maximize_window()
    # Hover over the lazily-populated menu item so it renders.
    nav_item = driver.find_element_by_css_selector('.nav>ul>li:nth-child(1)')
    ActionChains(driver).move_to_element(nav_item).perform()
    time.sleep(3)
    html = driver.page_source
    driver.quit()
    # pyquery parses the captured source quickly.
    doc = pq(html)
    return map(lambda a: 'http:' + pq(a).attr('href'), doc.find('.cg-pdts a'))
def on_start_again(self, url):
    """Page through the result list, collecting each page's source and
    feeding it to self.step_first."""
    driver = PhantomJS()
    driver.get(url)
    time.sleep(2)
    driver.maximize_window()
    t = driver.find_element_by_css_selector('.page-txt').text
    res_t = []
    if t:
        # Total page count; we need t-1 page turns.
        t = int(t.split('/')[1][:-1]) - 1
        while t:
            t -= 1
            move_ele = driver.find_element_by_css_selector('#next')
            # Fix: ActionChains only queues actions — without .perform()
            # the click never ran, so every iteration captured the same
            # first page (catalog_url in this file calls .perform()).
            ActionChains(driver).move_to_element(move_ele).click().perform()
            time.sleep(1)
            res_t.append(driver.page_source)
    driver.quit()
    for item in res_t:
        self.step_first(item)
def render(gist_id, commit):
    """Render a bl.ocks.org block in PhantomJS and store its screenshots.

    Saves a full-page PNG and the cropped iframe PNG (base64-encoded) into
    the d3_block table. On failure the error is recorded and the process
    exits with a distinct status (10: page load, 11: iframe crop).
    """
    block_url = 'http://bl.ocks.org/' + gist_id
    d3_block_rec = {'gist_id': gist_id}
    try:
        driver = PhantomJS()
        driver.get(block_url)
        time.sleep(RENDER_DELAY)  # let it render
        fullpage_im = Image.open(BytesIO(driver.get_screenshot_as_png()))
        fimb = BytesIO()
        fullpage_im.save(fimb, 'png')
        d3_block_rec['fullpage_base64'] = base64.b64encode(fimb.getvalue())
        d3_block_rec['block_url'] = driver.current_url
    except Exception as e:
        # we got nothing — record the error and bail out.
        with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
            d3_block_rec['error'] = str(e)
            pg.insert('d3_block', values=d3_block_rec)
        exit(10)
    try:
        # The block itself renders inside an iframe; crop it out of the
        # full-page screenshot using the iframe's location and size.
        f = driver.find_element_by_xpath('//iframe')
        x, y = int(f.location['x']), int(f.location['y'])
        w, h = x + int(f.size['width']), y + int(f.size['height'])
        block_im = fullpage_im.crop((x, y, w, h))
        bimb = BytesIO()
        block_im.save(bimb, 'png')
        d3_block_rec['block_base64'] = base64.b64encode(bimb.getvalue())
        d3_block_rec['block_size'] = list(block_im.size)
    except Exception as e:
        # at least we got the fullpage im, save it
        with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
            d3_block_rec['error'] = str(e)
            pg.insert('d3_block', values=d3_block_rec)
        exit(11)
    # all good, save everything
    with LittlePGer('dbname=' + DB_NAME, commit=commit) as pg:
        pg.insert('d3_block', values=d3_block_rec)
def __init__(self, timeout=20,
             phantomjs_cfg_file='python-utils/config/phantomjs_cfg.json',
             use_cfg_file=False,
             proxy_pool_server='http://127.0.0.1:15110'):
    """Start PhantomJS behind a proxy from the pool (or a config file)."""
    self.timeout = timeout
    if use_cfg_file:
        # All settings come from the JSON config file.
        service_args = ['--config={}'.format(phantomjs_cfg_file)]
    else:
        # Pull a fresh proxy from the pool and pass it on the command line.
        _, proxy_type, proxy, proxy_auth = get_proxy(proxy_pool_server)
        service_args = [
            '--proxy-type={}'.format(proxy_type),
            '--proxy={}'.format(proxy),
            '--proxy-auth={}'.format(proxy_auth),
        ]
    self.driver = PhantomJS(
        desired_capabilities=self.new_desired_capabilities(),
        service_args=service_args)
    self.check_client_info()
def __init__(self, login, password, userAgent=LINUX_USER_AGENT):
    """Remember credentials and launch PhantomJS at 1366x768.

    :param login: account login
    :param password: account password
    :param userAgent: UA string presented to the site
    """
    self.login = login
    self.password = password
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities["phantomjs.page.settings.userAgent"] = userAgent
    self.driver = PhantomJS(desired_capabilities=capabilities)
    self.driver.set_window_size(1366, 768)
class Premiumgeneratorlink(object):
    """Resolve a premium download link via premiumgeneratorlink.com."""

    def __init__(self, url):
        self.url = url
        self.browser = PhantomJS()

    def get_link(self):
        """Drive the site's generate flow; return the link or False."""
        wdw = WebDriverWait(self.browser, 10)
        try:
            self.browser.get('http://premiumgeneratorlink.com/')
            self.browser.find_element_by_name('link').send_keys(self.url)
            self.browser.find_element_by_xpath('//a[@class="input"]').click()
            wdw.until(EC.element_to_be_clickable((By.ID, 'check'))).click()
            wdw.until(EC.element_to_be_clickable((By.ID, 'generate'))).click()
            form = wdw.until(EC.visibility_of_element_located(
                (By.XPATH, '//form[@class="center"]')))
            link = form.get_attribute('action')
        except (WebDriverException, NoSuchElementException, TimeoutException):
            return False
        finally:
            # The browser is closed on every path.
            self.browser.quit()
        return link
def main(argv=sys.argv[1:]):
    """Open the demo page and print the text of every <li> element."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--url', default='http://127.0.0.1:8000/static/index.html')
    args = parser.parse_args(argv)
    browser = WebDriver()
    browser.get(args.url)
    for tag in browser.find_elements_by_css_selector('li'):
        print(tag.text)
    browser.close()
def post(self):
    """Snapshot a page's HTML and screenshot and append the snap to it.

    Returns JSON with the id of the newly stored snap; on a rendering
    error the snap stores the error text instead of HTML.
    """
    id = request.values['page']
    page = Page.objects.get_or_404(id=id)
    # html = requests.get(page.baseurl).text
    screenshot = None
    phantom = None
    try:
        phantom = PhantomJS(
            desired_capabilities={'acceptSslCerts': True},
            service_args=['--web-security=false',
                          '--ssl-protocol=any',
                          '--ignore-ssl-errors=true'],
            port=8888)
        phantom.set_window_size(1024, 768)
        phantom.get(page.baseurl)
        html = phantom.page_source
        screenshot = phantom.get_screenshot_as_png()
    except Exception as ex:
        html = "error when i snap your page ... %s" % ex
    finally:
        # Fix: the browser was leaked whenever rendering raised.
        if phantom is not None:
            phantom.close()
    # Fix: the snap was previously built, saved and pushed TWICE (copy-paste
    # duplication), storing two identical snapshots per request.
    snap = Snap(html, datetime.datetime.now(), screenshot).save()
    page.update(push__snaps=snap)
    return jsonify({'id': "%s" % snap.id})
class Leecherus(object):
    """Resolve a premium download link via leecher.us."""

    def __init__(self, url):
        self.url = url
        self.browser = PhantomJS()

    def get_link(self):
        """Walk the leecher.us flow; return the direct URL or False."""
        try:
            self.browser.get('http://leecher.us')
            wdw = WebDriverWait(self.browser, 10)
            wdw.until(EC.visibility_of_element_located(
                (By.NAME, 'link'))).send_keys(self.url)
            wdw.until(EC.element_to_be_clickable(
                (By.XPATH, '//button[@class="subscribe"]'))).click()
            wdw.until(EC.element_to_be_clickable(
                (By.XPATH, '//input[@class="subscribe"]'))).click()
            # The generated link opens in a second window.
            self.browser.switch_to_window(self.browser.window_handles[1])
            onclick = wdw.until(EC.element_to_be_clickable(
                (By.ID, 'get_link'))).get_attribute('onclick')
        except (WebDriverException, NoSuchElementException, TimeoutException, IndexError):
            return False
        finally:
            self.browser.quit()
        m = re.search("'(http://[^']+)'", onclick)
        return m.group(1) if m else False
def get_applications_in_page(self, scroll_script):
    """Scrape application "card" elements from the current store page.

    Runs *scroll_script* to trigger lazy loading, polls the page until it
    reports loading complete several times in a row, then extracts data
    from each card. On any error it retries (up to self.retries) with a
    fresh random proxy and user agent.
    NOTE: Python 2 code (uses print statements).
    """
    applications = []
    driver = None
    try:
        desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
        # Randomise UA and proxy on every attempt to avoid blocking.
        desired_capabilities["phantomjs.page.settings.userAgent"] = useragent.get_random_agent(google_prop.user_agent_list_url)
        service_args = ['--load-images=no', '--proxy=%s' % (proxy.get_random_proxy(google_prop.proxy_list_url))]
        driver = PhantomJS(desired_capabilities=desired_capabilities, service_args=service_args)
        # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy)
        if self.proxy_test:
            # Proxy smoke test: just echo our apparent IP.
            driver.get('http://curlmyip.com/')
            ip = driver.find_element_by_xpath('//body//pre').text
            print('ip : [ ' + ip + ' ]')
            pass
        else:
            driver.get(self.url)
            driver.execute_script(scroll_script)
            acknowledge = 0
            done = False
            # Require self.acknowledgements consecutive "completed" polls
            # before trusting that lazy loading has settled.
            while not done:
                scroll_finished = driver.execute_script("return scraperLoadCompleted")
                if scroll_finished:
                    if acknowledge == self.acknowledgements:
                        done = driver.execute_script("return scraperLoadCompleted")
                        pass
                    else:
                        acknowledge += 1
                        pass
                    pass
                else:
                    # Page went back to loading; restart the count.
                    acknowledge = 0
                    pass
                time.sleep(5)  # Wait before retry
                pass
            product_matrix = driver.find_elements_by_class_name("card")
            for application in product_matrix:
                extracted_application = self.extract_application_data(application)
                # if extracted_application['app_price'] != -1:
                applications.append(extracted_application)
                #pass
                pass
            pass
        driver.quit()
        pass
    except Exception as e:
        if driver is not None:
            driver.quit()
            pass
        if self.attempt < self.retries:
            # Recursive retry with a new driver/proxy/UA.
            self.attempt += 1
            time.sleep(10)
            print 'retry : url [ ' + self.url + ' ] + | attempt [ ' + str(self.attempt) + ' ] | error [ ' + str(e) + ' ]'
            applications = self.get_applications_in_page(scroll_script)
            pass
        else:
            print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]')
            pass
        pass
    return applications
    pass
def __init__(self):
    """Start a PhantomJS browser sized to 1120x550."""
    driver = PhantomJS()
    driver.set_window_size(1120, 550)
    self.driver = driver
def __init__(self, url):
    """Keep the target URL and spin up a PhantomJS browser."""
    self.browser = PhantomJS()
    self.url = url
inputfile.seek(0) setUrlDefined = set() try: outputfile = open(cityName + '-instagram-output.csv', 'r') for line in outputfile: linesplited = line.replace('\n', '').split(',') setUrlDefined.add(linesplited[0]) outputfile.close() except IOError: pass outputfile = open(cityName + '-instagram-output.csv', 'a', 0) print colorama.Back.RED+colorama.Fore.YELLOW+str(len(setUrlDefined))+' URLs already defined! Lets Rock more now...'+colorama.Back.RESET+colorama.Fore.RESET driver = PhantomJS('./phantomjs') # in case of PhantomJS not available, we can use Firefox for line in tqdm(inputfile, total=numLines, desc='Crawling Instagram', leave=True): try: idtweet, url = line.replace('\n', '').split(',') if idtweet in setUrlDefined: continue except IndexError: print colorama.Fore.RED, 'Corrupted Line', colorama.Fore.RESET continue try: driver.get(url) placetag = driver.find_element_by_class_name('_kul9p') placeurl = placetag.get_attribute('href').encode('utf-8') placename = placetag.get_attribute('title').encode('utf-8') usernametag = driver.find_element_by_class_name('_4zhc5')
class PagesCrawler(BaseSpider):
    """Scrapy spider crawling pages for a Hyphe corpus.

    Follows links up to `maxdepth`, normalizing URLs to LRUs, with an
    optional PhantomJS mode that renders pages (iframes included) and
    scrolls/unfolds dynamic content before link extraction.
    """

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        # Merge caller kwargs over the project's default crawl arguments.
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        # Expand each discover prefix into its http and https LRU variants.
        self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        # Any non-empty value other than the string "false" enables PhantomJS.
        self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        # Hook spider lifecycle signals for cleanup and crash accounting.
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        """Kick off the crawl: init PhantomJS if enabled, then seed requests."""
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO)
        self.log("ARGUMENTS : "+str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        """Configure and launch the PhantomJS driver used for rendering."""
        # Per-job file prefix for PhantomJS cookie and log files.
        self.prefixfiles = os.path.join(
            scrapyd_config().get('logs_dir'),
            HYPHE_PROJECT,
            self.name,
            self.crawler.settings['JOBID']
        )
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        # Prevent pages from opening/closing windows under the crawler.
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(
            executable_path=PHANTOM['PATH'],
            service_args=phantom_args,
            desired_capabilities=self.capabilities,
            service_log_path="%s-phantomjs.log" % self.prefixfiles
        )
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        """Signal handler: count the crash and run normal shutdown."""
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        """Signal handler: quit PhantomJS and remove its files on a clean run."""
        if self.errors:
            self.log("%s error%s encountered during the crawl." % (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        """Dispatch a downloaded response to HTML parsing or raw-page storage.

        In PhantomJS mode the page is re-rendered in the browser first so the
        body includes iframe content and dynamically unfolded elements.
        """
        lru = url_to_lru_clean(response.url)
        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    # SIGALRM is a hard backstop in case the async script hangs
                    # beyond Selenium's own timeout.
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    # Unknown crash: give up on parsing and store the raw page.
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            # Re-capture the DOM after scrolling/unfolding.
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))
        # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except:
                pass
        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        """Record a download failure; retry once with a www. prefix if absent."""
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        """Extract links from an HTML page, follow eligible ones, emit the page.

        Redirects (3xx) are treated as a single pseudo-link to the Location
        target at the same depth (hence the temporary depth decrement).
        """
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                # Host-relative redirect.
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                # Path-relative redirect.
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                # Redirect pseudo-links above are plain dicts.
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError, e:
                self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        # Restore the true depth before emitting the page item.
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
def _init_robot(self, id):
    """Start a Selenium robot, log it into Facebook, and drain the shared
    id queue, resolving a name for each id.

    Successful ids go to self.done_ids, failures to self.error_ids.
    Intended to run in one of several worker threads sharing
    self.remain_ids under self.lock.

    :param id: worker identifier used as key in self.robots.
    """
    robot = WDriver()
    logging.debug("initialize")
    self.robots.update({str(id): robot})
    logging.debug("get facebook.com")
    robot.get('http://fb.com')
    logging.debug("login")
    # SECURITY: hard-coded credentials committed to source — move them to
    # environment variables or a secret store (left unchanged here).
    robot.find_element_by_name('email').send_keys('*****@*****.**')
    robot.find_element_by_name('pass').send_keys('2855930022040')
    robot.find_element_by_name('pass').send_keys(Keys.RETURN)
    # NOTE(review): len() is sampled once while other workers also pop from
    # remain_ids, so the pop may eventually fail on an empty list — confirm
    # the intended sharing semantics.
    for _ in range(len(self.remain_ids)):
        # `with` guarantees the lock is released even if pop() raises.
        with self.lock:
            user_id = self.remain_ids.pop()
        try:
            self.get_name_for_id(robot, user_id)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; narrow it so shutdown signals propagate.
            logging.debug("error while updating record with id=%s" % str(user_id))
            self.error_ids.add(user_id)
        else:
            self.done_ids.add(user_id)
    robot.close()
    return
def get_ratio_data(): import socket import re import time import dryscrape import webkit_server from random import randint from fake_useragent import UserAgent from bs4 import BeautifulSoup from selenium.webdriver import PhantomJS from app.models import Company, Indicators from app.utils import cash_to_float, depercentize # Dict item with list: element attribute, attribute value to look for, optional transform function indicators = {'roe': { 'attribute': 'data-reactid', 'value': re.compile(".*RETURN_ON_EQUITY\.1$"), 'transform': depercentize, }, 'fcf': { 'attribute': 'data-reactid', 'value': re.compile(".*LEVERED_FREE_CASH_FLOW\.1$"), 'transform': cash_to_float, }, } ua = UserAgent() #with open("10.csv", "r") as f: #with open("sp500-2.csv", "r") as f: with open("10_stocks", "r") as f: data = f.read() symbols = [] for i in data.split("\n"): if i: symbols.append(i.split(",")[0]) print("Iterate through symbols") ## dryscrape #session = dryscrape.Session() #session.set_header('User-Agent', ua.random) #session.set_timeout(5) for symbol in symbols: print("{} Fetching {} :".format(time.strftime("%H:%M:%S"), symbol)) import pdb; pdb.set_trace() #driver = MyWebDriver() driver = PhantomJS() driver.set_window_size(1120, 550) driver.get("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol)) ##try: ## session = dryscrape.Session() ##except socket.error as e: ## print("Failed to configure session {}".format(e)) ## continue ##session.set_header('User-Agent', ua.random) ##session.set_timeout(30) #try: # #session.visit("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol)) #except Exception as e: # print e, "try once more......" # session.reset() # time.sleep(5) # session = dryscrape.Session() # #session.set_header('User-Agent', ua.random) # try: # session.set_timeout(5) # session.visit("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol)) # except Exception as e: # print e, "done trying..." 
# session.reset() # time.sleep(2) # session = dryscrape.Session() # continue #except socket.error as e: # print("Failed to get {}, {} (1)".format(symbol, e)) # continue #except webkit_server.EndOfStreamError as e: # print("Failed to get {}, {}, breaking (2)".format(symbol, e)) # continue #except webkit_server.InvalidResponseError as e: # print("Failed to get {}, {}, breaking (3)".format(symbol, e)) # continue #response = session.body() #soup = BeautifulSoup(response, "lxml") with open("{}.out".format(symbol), "w") as f: f.write(driver.page_source.encode('utf-8')) soup = BeautifulSoup(driver.page_source, "lxml") d = {'symbol': symbol} for indicator in indicators.keys(): curr_ind = indicators[indicator] s = soup.find_all(attrs={curr_ind['attribute']: curr_ind['value']}) print indicator, s for element in s: if curr_ind.has_key('transform'): f = curr_ind['transform'] #print(f(element.text)) d[indicator] = f(element.text) else: #print(element.text) d[indicator] = element.text try: db.session.add(Indicators.from_json(d)) db.session.commit() except (IntegrityError, UnmappedInstanceError) as e: print "Caught", e db.session.rollback() print "indicators", d
class Client(object):
    """HTTP client for functional testing of Strass.

    Adapter around the Selenium driver, with an interface inspired by
    Nightwatch.js and a few Strass-specific parameters."""

    def __init__(self):
        # Fixed viewport so layout-dependent selectors behave consistently.
        self.driver = PhantomJS()
        self.driver.set_window_size(1120, 550)

    def __del__(self):
        self.driver.quit()

    def get(self, query=None):
        """Navigate to `query` (path) on the configured test server; fluent."""
        server = os.environ.get('STRASS_TEST_SERVER', 'http://localhost:8000')
        url = server + (query or '/')
        self.driver.get(url)
        return self

    def find(self, selector):
        """Return the first element matching the CSS `selector`."""
        return self.driver.find_element_by_css_selector(selector)

    def click(self, selector):
        self.find(selector).click()
        return self

    def fill(self, selector, value):
        """Fill a form control; dates are split into day/month/year inputs."""
        if isinstance(value, datetime.date):
            self.fill(selector + ' input.day', str(value.day))
            self.fill(selector + ' input.month', str(value.month))
            self.fill(selector + ' input.year', str(value.year))
        else:
            control = self.find(selector)
            try:
                control.clear()
            except selexc.InvalidElementStateException:
                # Probably trying to clear an input[type=file]; skip it.
                pass
            control.send_keys(value)
        return self

    def select(self, selector, value):
        """Pick `value` in the <select> matched by `selector`; fluent."""
        Select(self.find(selector)).select_by_value(value)
        return self

    def submit(self, selector='#document button[type=submit]'):
        return self.click(selector)

    def close(self):
        """Close the current window and refocus the first remaining one."""
        self.driver.close()
        if self.driver.window_handles:
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.driver.set_window_size(1120, 550)
        return self

    def screenshot(self, filename):
        """Save a screenshot to `filename` (message is user-facing, in French)."""
        self.driver.get_screenshot_as_file(filename)
        sys.stderr.write("Capture d'écran enregistrée dans %r\n" % (filename,))
        return self

    def save(self, filename):
        """Dump the current page source to `filename`."""
        with open(filename, 'w') as fo:
            fo.write(self.driver.page_source)
        sys.stderr.write("HTML enregistré dans %r\n" % (filename,))
        return self

    def __getattr__(self, name):
        # Fall through to the underlying Selenium driver for anything
        # not wrapped above.
        return getattr(self.driver, name)
def __init__(self):
    """Create this instance's headless PhantomJS browser."""
    self.browser = PhantomJS()
def __init__(self, browser: webdriver.PhantomJS, click_to_display_id: str):
    """Bind the browser and the id of the click-to-display trigger,
    resolving the trigger element once up front."""
    self._browser = browser
    self.click_to_display_id = click_to_display_id
    # Locate the trigger now so later interactions need no re-query.
    trigger = browser.find_element_by_id(click_to_display_id)
    self.click_to_display_element = trigger