def startBrowser(headlessness=True):
    """Launch and return a Firefox webdriver.

    :param headlessness: when True (default), run Firefox without a
        visible window.
    :return: a ready ``webdriver.Firefox`` instance.
    """
    firefox_options = webdriver.FirefoxOptions()
    # `FirefoxOptions.headless = ...` is deprecated and removed in recent
    # Selenium releases; passing the -headless flag works across versions.
    if headlessness:
        firefox_options.add_argument("-headless")
    browser = webdriver.Firefox(options=firefox_options)
    return browser
from selenium import webdriver import time, re print('输入QQ账号:', end='') username = str(input()) #QQ账号 print('输入登录密码:', end='') password = str(input()) #QQ密码 print('代码执行中,请稍后') opt = webdriver.FirefoxOptions() # 调用火狐 opt.add_argument('--headless') #后台启动火狐 browser = webdriver.Firefox( options=opt, executable_path="C:\爬虫驱动\geckodriver.exe") # 加载驱动 && 创建Firefox无界面对象 # browser = webdriver.Chrome(executable_path="C:\爬虫驱动\chromedriver.exe")#谷歌浏览器 browser.implicitly_wait(2) browser.get('https://qzone.qq.com/') browser.switch_to.frame('login_frame') browser.find_element_by_css_selector("#switcher_plogin").click() browser.find_element_by_css_selector("#u").send_keys(username) browser.find_element_by_css_selector("#p").send_keys(password) browser.find_element_by_css_selector("#login_button").click() cookies = {} time.sleep(3) browser.get_cookies() print(len(browser.get_cookies())) for i in browser.get_cookies(): cookies[i.get('name')] = i.get('value') print(cookies) browser.switch_to.frame(None) qzonetoken = re.findall(
def get_local_driver(
        browser_name, headless, servername,
        proxy_string, proxy_auth, proxy_user, proxy_pass,
        user_agent, disable_csp, enable_sync, user_data_dir,
        extension_zip, extension_dir, mobile_emulator,
        device_width, device_height, device_pixel_ratio):
    '''
    Spins up a new web browser and returns the driver.
    Can also be used to spin up additional browsers for the same test.

    browser_name selects the branch below (constants.Browser.*); headless
    controls windowless mode where the browser supports it; the remaining
    arguments are forwarded to the profile/options helpers.
    '''
    # Per-run downloads folder is (re)created before any browser starts.
    downloads_path = download_helper.get_downloads_folder()
    download_helper.reset_downloads_folder()
    if browser_name == constants.Browser.FIREFOX:
        try:
            try:
                # Use Geckodriver for Firefox if it's on the PATH
                profile = _create_firefox_profile(
                    downloads_path, proxy_string, user_agent, disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_capabilities['marionette'] = True
                options = webdriver.FirefoxOptions()
                if headless:
                    options.add_argument('-headless')
                if LOCAL_GECKODRIVER and os.path.exists(LOCAL_GECKODRIVER):
                    make_driver_executable_if_not(LOCAL_GECKODRIVER)
                elif not is_geckodriver_on_path():
                    # Auto-install geckodriver, but only in single-threaded
                    # runs ("-c" in sys.argv signals a multithreaded run).
                    if not "".join(sys.argv) == "-c":
                        # Skip if multithreaded
                        from seleniumbase.console_scripts import sb_install
                        sys_args = sys.argv  # Save a copy of current sys args
                        print("\nWarning: geckodriver not found."
                              " Installing now:")
                        sb_install.main(override="geckodriver")
                        sys.argv = sys_args  # Put back the original sys args
                firefox_driver = webdriver.Firefox(
                    firefox_profile=profile,
                    capabilities=firefox_capabilities,
                    options=options)
            except WebDriverException:
                # Don't use Geckodriver: Only works for old versions of Firefox
                profile = _create_firefox_profile(
                    downloads_path, proxy_string, user_agent, disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_capabilities['marionette'] = False
                firefox_driver = webdriver.Firefox(
                    firefox_profile=profile,
                    capabilities=firefox_capabilities)
            return firefox_driver
        except Exception as e:
            # Headless runs fail hard; headed runs fall back to a plain
            # Firefox session with default settings.
            if headless:
                raise Exception(e)
            return webdriver.Firefox()
    elif browser_name == constants.Browser.INTERNET_EXPLORER:
        if not IS_WINDOWS:
            raise Exception(
                "IE Browser is for Windows-based operating systems only!")
        from selenium.webdriver.ie.options import Options
        ie_options = Options()
        ie_options.ignore_protected_mode_settings = False
        ie_options.ignore_zoom_level = True
        ie_options.require_window_focus = False
        ie_options.native_events = True
        ie_options.full_page_screenshot = True
        ie_options.persistent_hover = True
        ie_capabilities = ie_options.to_capabilities()
        if LOCAL_IEDRIVER and os.path.exists(LOCAL_IEDRIVER):
            make_driver_executable_if_not(LOCAL_IEDRIVER)
        return webdriver.Ie(capabilities=ie_capabilities)
    elif browser_name == constants.Browser.EDGE:
        if LOCAL_EDGEDRIVER and os.path.exists(LOCAL_EDGEDRIVER):
            make_driver_executable_if_not(LOCAL_EDGEDRIVER)
            # The new Microsoft Edge browser is based on Chromium,
            # so it is driven through chromedriver-compatible options.
            chrome_options = _set_chrome_options(
                downloads_path, headless,
                proxy_string, proxy_auth, proxy_user, proxy_pass,
                user_agent, disable_csp, enable_sync, user_data_dir,
                extension_zip, extension_dir, servername,
                mobile_emulator, device_width, device_height,
                device_pixel_ratio)
            return webdriver.Chrome(executable_path=LOCAL_EDGEDRIVER,
                                    options=chrome_options)
        else:
            # Without a local Edge driver, fall back to the legacy Edge.
            return webdriver.Edge()
    elif browser_name == constants.Browser.SAFARI:
        if "".join(sys.argv) == "-c":
            # Skip if multithreaded
            raise Exception("Can't run Safari tests in multi-threaded mode!")
        return webdriver.Safari()
    elif browser_name == constants.Browser.OPERA:
        if LOCAL_OPERADRIVER and os.path.exists(LOCAL_OPERADRIVER):
            make_driver_executable_if_not(LOCAL_OPERADRIVER)
        return webdriver.Opera()
    elif browser_name == constants.Browser.PHANTOM_JS:
        with warnings.catch_warnings():
            # Ignore "PhantomJS has been deprecated" UserWarning
            warnings.simplefilter("ignore", category=UserWarning)
            return webdriver.PhantomJS()
    elif browser_name == constants.Browser.GOOGLE_CHROME:
        try:
            chrome_options = _set_chrome_options(
                downloads_path, headless,
                proxy_string, proxy_auth, proxy_user, proxy_pass,
                user_agent, disable_csp, enable_sync, user_data_dir,
                extension_zip, extension_dir, servername,
                mobile_emulator, device_width, device_height,
                device_pixel_ratio)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
            elif not is_chromedriver_on_path():
                # Same auto-install dance as the Firefox branch above.
                if not "".join(sys.argv) == "-c":
                    # Skip if multithreaded
                    from seleniumbase.console_scripts import sb_install
                    sys_args = sys.argv  # Save a copy of current sys args
                    print("\nWarning: chromedriver not found. Installing now:")
                    sb_install.main(override="chromedriver")
                    sys.argv = sys_args  # Put back the original sys args
            return webdriver.Chrome(options=chrome_options)
        except Exception as e:
            if headless:
                raise Exception(e)
            # Headed fallback: retry with a default Chrome session.
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
            return webdriver.Chrome()
    else:
        raise Exception(
            "%s is not a valid browser option for this system!" % browser_name)
def main():
    """Check whether a test account already exists on Lieferando.

    Attempts a fresh registration; if the site shows the error
    notification, the account already exists. An earlier Yahoo check was
    already disabled (kept inside a string literal) and has been removed.

    Fixes applied: ported Python-2-only syntax (``print`` statements,
    ``except X, e``) to Python 3, and replaced the deprecated
    ``options.set_headless(True)`` with the ``-headless`` argument.
    """
    url = "https://www.lieferando.de"
    testuser = "******"
    surname = "Maik Kunze"
    testpassword = "******"
    options = webdriver.FirefoxOptions()
    # set_headless() is deprecated; pass the -headless flag instead.
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    driver.get(url)
    # Try a new Lieferando registration - if the account already exists,
    # the website responds with an error message.
    try:
        driver.find_element_by_xpath("//button[@class='menu button-myaccount userlogin']").click()
        wait = WebDriverWait(driver, 10)
        createbutton = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@data-click='register']")))
        createbutton.click()
        userfield = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='iaccountuser']")))
        userfield.send_keys(testuser)
        surnamefield = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='iaccountsurname']")))
        surnamefield.send_keys(surname)
        pass1field = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='iaccountpass']")))
        pass1field.send_keys(testpassword)
        pass2field = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='iaccountpass2']")))
        pass2field.send_keys(testpassword)
        checkagb = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='legal']/label[@class='checkbox-inline']")))
        checkagb.click()
        registerbutton = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='registerbutton']")))
        registerbutton.click()
        # Worst case, but no other possibility because only HTML comes back
        # for div "userpanel-wrapper": 3 seconds should be enough to get
        # an AJAX response.
        time.sleep(3)
        # Interpret the result: if the account already exists the page
        # contains <div id='notification'> and <form id='iaccountsignupform'>
        # inside <div id='userpanel-wrapper'>; after a successful
        # registration neither is present.
        try:
            notification = driver.find_element_by_xpath("//div[@id='notification']")
            signupform = driver.find_element_by_xpath("//form[@id='iaccountsignupform']")
            print("Lieferando %s exists" % testuser)
            driver.get_screenshot_as_file('/tmp/test.png')
        except NoSuchElementException:
            print("Lieferando no account %s " % testuser)
    except NoSuchElementException as e:
        print("Error: %s" % e)
    # NOTE(review): the driver is intentionally left open, matching the
    # original behaviour — confirm whether driver.quit() is wanted here.
def firefox_driver():
    """Create and return a headless Firefox webdriver."""
    options = webdriver.FirefoxOptions()
    # 'headless' without a leading dash is not a recognized Firefox
    # command-line switch; the correct flag is '-headless'.
    options.add_argument('-headless')
    return webdriver.Firefox(options=options)
def set_firefox_options(self):
    """Build and return FirefoxOptions configured for headless runs."""
    opts = webdriver.FirefoxOptions()
    opts.add_argument("--headless")
    return opts
def init_firefox_driver(self, headless):
    """Initialize ``self.driver`` with a Firefox session.

    :param headless: when truthy, start Firefox without a visible window.
    """
    options = webdriver.FirefoxOptions()
    if headless:
        options.add_argument('--headless')
    # The 'firefox_options' keyword is deprecated in Selenium;
    # 'options' is the supported parameter name.
    self.driver = webdriver.Firefox(options=options)
# ========================================================================= # 获取币种代码 # ========================================================================= # 在工作簿末尾插入新表 wsCodes = wbFX.create_sheet("Codes") # 顶端标题行 # append 方法将从最后一个被修改过的行(包括写入数据、设置单元格格式、设置行高列宽等) # 的下一行的最左侧单元格开始,依次将列表中的数据写入该行的各个单元格。 # 每调用一次 append 方法,按上述规则在新的一行写入数据。 wsCodes.append(["币种", "代码"]) # 浏览器无头模式(即不显示浏览器窗口) profile = webdriver.FirefoxOptions() profile.add_argument("-headless") browser = webdriver.Firefox(options=profile) # 访问页面 # 资料来源:站长之家。 browser.get("http://www.webmasterhome.cn/huilv/huobidaima.asp") # 隐式等待元素加载: # 指定时间内元素未加载完毕,则不再等待,代码继续执行; # 指定时间内元素加载完毕,在元素加载完毕后代码继续执行。 browser.implicitly_wait(5) # 访问页面并等待页面元素加载完毕后,即可进行网页元素的查找定位。 # 所有币种信息对应元素 elemCurrenciesList = browser.find_elements_by_tag_name("tr") print("") print("获取币种代码...")
class ObjectPage(object):
    """Base page object: wraps webdriver creation and common element actions.

    Browser choice and driver paths are read from the project config via
    ``ReadConfig``; every action is logged through ``log1`` and failures
    are captured as screenshots with :meth:`getImage`.

    Fixes applied: ``getCurrentUrl`` no longer calls the ``current_url``
    property; ``isDisplayed`` no longer concatenates a bool into the log
    string; ``sendKeys`` is a regular method (it was a ``@staticmethod``
    that still took ``self``).
    """

    # Class-level config and options are evaluated once at import time.
    browser = ReadConfig().getValue(section='browserType', name='browserName')
    chrome_driver_path = ReadConfig().getValue(section='located',
                                               name='chromedriverpath')
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('disable-infobars')
    chrome_options.add_argument("headless")
    chrome_options.add_argument(
        'profile.managed_default_content_settings.images')
    chrome_options.add_argument('lang=zh_CN.UTF-8')
    chrome_options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"'
    )
    firefox_driver_path = ReadConfig().getValue(section='located',
                                                name='firefoxdriverpath')
    firefox_log_path = ReadConfig().getValue(section='located',
                                             name='firefox_log')
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('--log error')
    firefox_options.add_argument('--headless')
    firefox_options.add_argument('--disable-gpu')  # disable GPU acceleration

    def __init__(self):
        """Start the configured browser and store the driver."""
        driver = self.getBrowsers()
        try:
            log1.info("-------------------- test start --------------------")
            self.driver = driver
            log1.info("Load Web Driver Success")
        except Exception:
            log1.error("Load Web Driver Fail", exc_info=1)
            self.getImage("Load Web Driver Fail")

    def getBrowsers(self):
        """Create the webdriver selected by the 'browserName' config entry."""
        # NOTE(review): 'chrome_options'/'firefox_options' kwargs are
        # deprecated in newer Selenium in favor of 'options' — kept here
        # to match the Selenium version this project pins.
        if self.browser == "Chrome":
            self.driver = webdriver.Chrome(
                executable_path=self.chrome_driver_path,
                chrome_options=self.chrome_options)
        elif self.browser == "Firefox":
            self.driver = webdriver.Firefox(
                executable_path=self.firefox_driver_path,
                firefox_options=self.firefox_options,
                log_path=self.firefox_log_path)
        return self.driver

    def getUrl(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url)
        log1.info("Set Url is: " + url)

    def hideWait(self, times):
        """Set the driver's implicit wait to *times* seconds."""
        self.driver.implicitly_wait(times)
        log1.info("Set Implicitly Wait: " + str(times))

    def maximizeWindow(self):
        """Maximize the browser window."""
        self.driver.maximize_window()
        log1.info("Set Browser Max")

    def clearCookies(self):
        """Delete all cookies in the current session."""
        self.driver.delete_all_cookies()
        log1.info("Clear All Cookies")

    def refreshBrowser(self):
        """Reload the current page."""
        self.driver.refresh()
        log1.info("Browser Refresh")

    def getCurrentUrl(self):
        """Return the URL of the current page."""
        # BUG FIX: current_url is a property, not a method — calling it
        # raised "'str' object is not callable".
        url = self.driver.current_url
        log1.info("Get Browser Url, The Url is: " + url)
        return url

    @staticmethod
    def isDisplayed(element):
        """Return whether *element* is visible on the page."""
        is_display = element.is_displayed()
        # BUG FIX: is_displayed() returns a bool; concatenating it to a
        # str raised TypeError — convert explicitly for logging.
        log1.info("Element displayed is: " + str(is_display))
        return is_display

    @staticmethod
    def sleepWait(times):
        """Hard sleep for *times* seconds."""
        time.sleep(times)
        log1.info("Set Sleep Time is: " + str(times))

    @staticmethod
    def isSelect(element):
        """Return whether *element* is selected."""
        log1.info("Element is Select")
        return element.is_selected()

    def findElement(self, by, value):
        """Wait up to 10s for one element located by *by* / *value*.

        *by* is one of 'id', 'name', 'class', 'tag', 'link', 'plink',
        'css', 'xpath'; returns the element, or None after logging and
        taking a screenshot on failure.
        """
        by_map = {
            'id': By.ID,
            'name': By.NAME,
            'class': By.CLASS_NAME,
            'tag': By.TAG_NAME,
            'link': By.LINK_TEXT,
            'plink': By.PARTIAL_LINK_TEXT,
            'css': By.CSS_SELECTOR,
            'xpath': By.XPATH
        }
        if by in by_map.keys():
            try:
                element = WebDriverWait(self.driver, 10,
                                        ignored_exceptions=None).until(
                    EC.presence_of_element_located((by_map[by], value)))
                log1.info(by + " Query Element: " + value)
                return element
            # NOTE(review): WebDriverWait.until raises TimeoutException on
            # timeout, which this handler does not catch — confirm intent.
            except NoSuchElementException:
                log1.error("Not Found Element Or Timeout", exc_info=1)
                self.getImage("Not Found Element Or Timeout")
        else:
            log1.error(by + " Variable Error", exc_info=1)
            self.getImage(by + " Variable Error")

    def findElements(self, by, value):
        """Wait up to 10s for all elements located by *by* / *value*.

        Same locator keys and failure handling as :meth:`findElement`,
        but returns a list of elements.
        """
        by_map = {
            'id': By.ID,
            'name': By.NAME,
            'class': By.CLASS_NAME,
            'tag': By.TAG_NAME,
            'link': By.LINK_TEXT,
            'plink': By.PARTIAL_LINK_TEXT,
            'css': By.CSS_SELECTOR,
            'xpath': By.XPATH
        }
        if by in by_map.keys():
            try:
                elements = WebDriverWait(
                    self.driver, 10, ignored_exceptions=None).until(
                    EC.presence_of_all_elements_located((by_map[by], value)))
                log1.info(by + " Query Element: " + value)
                return elements
            except NoSuchElementException:
                log1.error("Not Found Element Or Timeout", exc_info=1)
                self.getImage("Not Found Element Or Timeout")
        else:
            log1.error(by + " Variable Error", exc_info=1)
            self.getImage(by + " Variable Error")

    def sendKeys(self, element, text):
        """Clear *element* and type *text* into it."""
        # BUG FIX: this was decorated @staticmethod while still taking
        # `self` (and calling self.getImage), which broke instance calls.
        element.clear()
        log1.info("Element Clear Text")
        try:
            element.send_keys(text)
            log1.info("Element Input Text: " + text)
        except BaseException:
            log1.error("Not Found Element Or Input Error", exc_info=1)
            self.getImage("Not Found Element Or Input Error")

    def click(self, element):
        """Click *element*, retrying once after 3s if the click fails."""
        try:
            element.click()
            log1.info("Element Click")
        except BaseException:
            if self.isDisplayed(element) is True:
                self.sleepWait(3)
                element.click()
                log1.info("Element Click")
            else:
                log1.error('Not Found Element', exc_info=1)

    def getTitle(self):
        """Return the current page title."""
        log1.info("Get Title")
        return self.driver.title

    def actionsKeyDown(self):
        """Press and hold the CTRL key."""
        ActionChains(self.driver).key_down(Keys.CONTROL).perform()

    def actionsKeyUp(self):
        """Release the CTRL key."""
        ActionChains(self.driver).key_up(Keys.CONTROL).perform()

    @staticmethod
    def select(type, element, value):
        """Select an option of *element* by 'index', 'value' or 'text'."""
        try:
            if type == "index":
                Select(element).select_by_index(value)
                log1.info("Select Element Index")
            elif type == "value":
                Select(element).select_by_value(value)
                log1.info("Select Element Value")
            elif type == "text":
                Select(element).select_by_visible_text(value)
                log1.info("Select Element text")
            else:
                log1.error('please input type', exc_info=1)
        except BaseException:
            log1.error('Not Found Element', exc_info=1)

    @staticmethod
    def deselect(type, element, value=""):
        """Deselect by 'index', 'value', 'text', or 'all' (empty value)."""
        try:
            if type == "index" and value != "":
                Select(element).deselect_by_index(value)
                log1.info("Deselect Element Index")
            elif type == "value" and value != "":
                Select(element).deselect_by_value(value)
                log1.info("Deselect Element Value")
            elif type == "text" and value != "":
                Select(element).deselect_by_visible_text(value)
                log1.info("Deselect Element Text")
            elif type == "all" and value == "":
                Select(element).deselect_all()
                log1.info("Deselect All Element")
            else:
                log1.error('please input type', exc_info=1)
        except BaseException:
            log1.error("Not Found Select")

    @staticmethod
    def getAllSelect(element):
        """Return all selected options of a <select> element."""
        try:
            log1.info("Get All Select")
            return Select(element).all_selected_options
        except BaseException:
            log1.error("Not Found All Select")

    @staticmethod
    def getAttribute(element, attribute):
        """Return the value of *attribute* on *element*."""
        log1.info("Get Element Attribute")
        return element.get_attribute(attribute)

    @staticmethod
    def getText(element):
        """Return the text content of *element*."""
        log1.info("Get Element Text")
        return element.text

    def getImage(self, imageName):
        """Save a screenshot named *imageName* to the configured folder."""
        img = ReadConfig().getValue(section='located', name='image')
        try:
            self.driver.get_screenshot_as_file(img + imageName + ".png")
            log1.info("Screenshot Image")
        except BaseException:
            log1.error("Screenshot Image Fail", exc_info=1)

    def textAlert(self):
        """Return the alert text, accepting the alert afterwards."""
        t = str(self.driver.switch_to.alert.text)
        log1.info("Get Alert Text: " + t)
        self.acceptAlert()
        return t

    def sendKeysAlert(self, text):
        """Type *text* into the alert, accepting it afterwards."""
        self.driver.switch_to.alert.send_keys(text)
        log1.info("Input Alert Text: " + text)
        self.acceptAlert()

    def acceptAlert(self):
        """Accept (OK) the current alert."""
        self.driver.switch_to.alert.accept()
        log1.info("Alert Accept")

    def dismissAlert(self):
        """Dismiss (Cancel) the current alert."""
        self.driver.switch_to.alert.dismiss()
        log1.info("Alert Dismiss")

    def quitBrowser(self):
        """Quit the whole browser session after a short pause."""
        self.sleepWait(3)
        self.driver.quit()
        log1.info("Quit Browser")
        log1.info("-------------------- test end --------------------")

    def closeBrowser(self):
        """Close only the current window after a short pause."""
        self.sleepWait(3)
        self.driver.close()
        log1.info("Close Browser")
        log1.info("-------------------- test end --------------------")
def ready(proxy_ip_port=None):
    """Build a configured Firefox or Chrome webdriver.

    Module-level globals ``system``, ``driver_type`` and ``mode`` select
    the driver binary path, the browser, and headless/debug behaviour.

    :param proxy_ip_port: optional dict with keys 'proxy_type'
        ('http' or a socks variant) and 'ip_with_port' ("host:port").
    :return: a webdriver with a 10s implicit wait and page-load timeout.
    """
    # Pick the driver binary for the current OS / browser combination.
    if system == 'windows':
        executable_path = 'D:/geckodriver/geckodriver.exe' if driver_type == 'firefox' else 'D:/geckodriver/chromedriver.exe'
    elif system == 'linux':
        executable_path = '../temp_file/geckodriver' if driver_type == 'firefox' else '../temp_file/chromedriver'
    else:
        logger.warning('没有这个系统:%s,暂时默认为windows' % system)
        executable_path = 'D:/geckodriver/geckodriver.exe' if driver_type == 'firefox' else 'D:/geckodriver/chromedriver.exe'
    if driver_type == 'firefox':
        firefox_options = webdriver.FirefoxOptions()
        # set_headless() is deprecated; the -headless argument is the
        # supported equivalent. Headless is still only effective outside
        # debug mode, because debug passes options=None below.
        firefox_options.add_argument('-headless')
        if proxy_ip_port:
            host, port = proxy_ip_port['ip_with_port'].split(':')
            profile = webdriver.FirefoxProfile()
            profile.set_preference("network.proxy.type", 1)  # manual proxy
            if proxy_ip_port['proxy_type'] == 'http':
                profile.set_preference("network.proxy.http", host)
                profile.set_preference("network.proxy.http_port", int(port))
            else:
                # Anything other than 'http' is treated as a socks proxy.
                profile.set_preference('network.proxy.socks', host)
                profile.set_preference('network.proxy.socks_port', int(port))
            profile.update_preferences()
            driver = webdriver.Firefox(
                executable_path=executable_path,
                firefox_profile=profile,
                options=None if mode == 'debug' else firefox_options)
        else:
            driver = webdriver.Firefox(
                executable_path=executable_path,
                options=None if mode == 'debug' else firefox_options)
    else:
        # Anything that is not firefox is treated as chrome.
        chrome_options = webdriver.ChromeOptions()
        if not mode == 'debug':
            # set_headless() is deprecated; use the --headless argument.
            chrome_options.add_argument('--headless')
        if proxy_ip_port:
            chrome_options.add_argument(
                '--proxy-server=%s://%s'
                % (proxy_ip_port['proxy_type'], proxy_ip_port['ip_with_port']))
        # BUG FIX: use the OS-specific path computed above instead of a
        # hard-coded Windows path (the linux branch was otherwise ignored);
        # also use 'options=' instead of the deprecated 'chrome_options='.
        driver = webdriver.Chrome(
            executable_path=executable_path,
            options=chrome_options)
    # Implicit wait covers (possibly async-loaded) elements; without it,
    # elements that load asynchronously might not be found.
    driver.implicitly_wait(10)
    driver.set_page_load_timeout(10)  # page-load timeout
    return driver
from selenium import webdriver
from selenium.webdriver.common.touch_actions import TouchActions
import time
import openpyxl
# Target page: a (very long, token-carrying) WeiXin bespeak-service URL.
wx_rul = "https://testmy.orangebank.com.cn/WeiXin/773A50B48B2286170DE9A2D3FF014EAF92D09B8C815C0E0C51741AAA2FD67D" \
         "1B96C10BD04CA2737C73F42DC5CAE1AC6383D4217E902E8DC6C0A7A92A3767427EF47ADAB1B02F2BA828AACA271C56C0E10D12" \
         "D927DE2DDF5252C27C145BF42C38A810BD7CFDC2DAB28005E17A459EA7FF2363ADB5C01BEB85C7CE809B977013162D54FA5826" \
         "A647966ADDF84F3E99B6195E18E2CE916137A91B1C1820539D9D60354C504A15D997C3386D19641DD832DB95715AFF7EA0E8B0" \
         "D77C9625EF6AE9193D42D869233E79BFC031DE443B6B8E717C8CD9B5159F19DF8544AB7EE3F13FCF_bespeakService_1_1.do"
# Define browser launch arguments.
# NOTE(review): "mobileEmulation" is a Chrome-style option name passed as a
# Firefox command-line argument here — confirm it has any effect on Firefox.
firefox_start_options = webdriver.FirefoxOptions()
firefox_start_options.add_argument("mobileEmulation")
# Configure `desired_capabilities` for the remote Selenium grid node.
desired_caps = {}
desired_caps['platform'] = 'WINDOWS'
desired_caps['browserName'] = 'firefox'
desired_caps['deviceName'] = 'iPhone 6'
# Instantiate a Remote driver against the grid hub with the config above.
remote_firefox = webdriver.Remote('http://192.168.0.122:4444/wd/hub',
                                  desired_caps,
                                  options=firefox_start_options)
remote_firefox.maximize_window()
remote_firefox.get("https://www.baidu.com")
def test_Beyonic_file(self):
    """Scrape the Beyonic API docs and diff documented endpoints.

    Walks the "Listing all ..." links and the per-resource sections of
    https://apidocs.beyonic.com/, prints each section's text, writes the
    section texts to D:/test.txt, then writes the symmetric difference
    between those words and D:/endpoints.txt into
    D:/undocumented_endpoints.txt.

    Fixes applied: removed an unused FirefoxOptions object that was
    created but never passed to the driver; collapsed the copy-pasted
    per-link blocks into data-driven loops; file handles now use `with`.
    """
    driver_plain = webdriver.Firefox(executable_path="d:\\geckodriver.exe")
    driver = EventFiringWebDriver(driver_plain, MyListener())
    driver.get("https://apidocs.beyonic.com/")
    driver.maximize_window()
    sleep(12)  # let the single-page docs finish rendering
    action = ActionChains(driver)
    driver.find_element(By.ID, "input-search").click()
    action.send_keys("Listing all")
    action.key_down(Keys.ENTER).key_up(Keys.ENTER).perform()
    action.key_down(Keys.ENTER).key_up(Keys.ENTER).perform()

    # (link text, in-page anchor href) pairs — casing matches the site.
    listing_links = [
        ("Listing all Payments", "#listing-all-payments"),
        ("Listing all Contacts", "#listing-all-contacts"),
        ("Listing all transactions", "#listing-all-transactions"),
        ("Listing all Events", "#listing-all-events"),
        ("Listing all Collections", "#listing-all-collections"),
        ("Listing all webhooks", "#listing-all-webhooks"),
        ("Listing all networks", "#listing-all-networks"),
        ("Listing all accounts", "#listing-all-accounts"),
        ("Listing all currencies", "#listing-all-currencies"),
        ("Listing all Collection Requests", "#listing-all-collection-requests"),
    ]
    for link_text, href in listing_links:
        driver.find_element(By.LINK_TEXT, link_text).click()
        heading = driver.find_element_by_xpath("//*[@href='%s']" % href).text
        print("This will return a", heading)
        sleep(2)

    # Per-resource sections correspond to //aside[8] .. //aside[18].
    section_links = ["Collection Requests", "Collections", "Payments",
                     "Currencies", "Networks", "Banks", "Accounts",
                     "Transactions", "Contacts", "Events", "Webhooks"]
    section_texts = []
    for aside_index, link_text in enumerate(section_links, start=8):
        driver.find_element(By.LINK_TEXT, link_text).click()
        text = driver.find_element_by_xpath("//aside[%d]" % aside_index).text
        print(text)
        sleep(2)
        section_texts.append(text)

    with open("D:/test.txt", "w+") as test_output:
        for text in section_texts:
            test_output.write(text + '\n')

    # Symmetric difference between scraped words and the reference list.
    with open("D:/test.txt", "r") as f1, \
            open("D:/endpoints.txt", "r") as f2, \
            open("D:/undocumented_endpoints.txt", "w") as f3:
        file1_words = f1.read().split()
        file2_words = f2.read().split()
        result1 = set(file1_words).difference(file2_words)
        result2 = set(file2_words).difference(file1_words)
        results = result1.union(result2)
        for endpoints in set(results):
            f3.write(endpoints + "\n")
    driver.quit()
def __init__(self, *args, **kwargs):
    """Starts a new local session of Firefox.

    Based on the combination and specificity of the various keyword
    arguments, a capabilities dictionary will be constructed that is
    passed to the remote end.

    The keyword arguments given to this constructor are helpers to
    more easily allow Firefox WebDriver sessions to be customised
    with different options. They are mapped on to a capabilities
    dictionary that is passed on to the remote end.

    As some of the options, such as `firefox_profile` and
    `options.profile` are mutually exclusive, precedence is given from
    how specific the setting is. `capabilities` is the least specific
    keyword argument, followed by `options`, followed by
    `firefox_binary` and `firefox_profile`.

    In practice this means that if `firefox_profile` and
    `options.profile` are both set, the selected profile instance will
    always come from the most specific variable. In this case that
    would be `firefox_profile`. This will result in `options.profile`
    to be ignored because it is considered a less specific setting
    than the top-level `firefox_profile` keyword argument. Similarly,
    if you had specified a
    `capabilities["moz:firefoxOptions"]["profile"]` Base64 string,
    this would rank below `options.profile`.

    - option_args - [List] of arguments to be added to options
      (options.add_argument(x))
    - log_level - [Str] Selenium and urllib3 log level
      (CRITICAL / FATAL / ERROR / WARNING / INFO / DEBUG / NOTSET)
    - launch_attempts - [int] Permitted number of attempts to launch
      the selenium web driver
    -----------
    - firefox_profile - Instance of ``FirefoxProfile`` object or a
      string. If undefined, a fresh profile will be created in a
      temporary location on the system.
    - firefox_binary - Instance of ``FirefoxBinary`` or full path to
      the Firefox binary. If undefined, the system default Firefox
      installation will be used.
    - timeout - Time to wait for Firefox to launch when using the
      extension connection.
    - capabilities - Dictionary of desired capabilities.
    - proxy - The proxy settings to us when communicating with Firefox
      via the extension connection.
    - executable_path - Full path to override which geckodriver binary
      to use for Firefox 47.0.1 and greater, which defaults to picking
      up the binary from the system path.
    - options - Instance of ``options.Options``.
    - service_log_path - Where to log information from the driver.
    - firefox_options - Deprecated argument for options
    - service_args - List of args to pass to the driver service
    - desired_capabilities - alias of capabilities. In future versions
      of this library, this will replace 'capabilities'. This will
      make the signature consistent with RemoteWebDriver.
    - log_path - Deprecated argument for service_log_path
    - keep_alive - Whether to configure
      remote_connection.RemoteConnection to use HTTP keep-alive.
    """
    self.launch_attempts = kwargs.pop('launch_attempts', 2)
    self._platform_release = platform.release()
    if kwargs.get('log_level'):
        self.set_selenium_log_level(
            kwargs.pop('log_level', self.LOGGER_DEFAULT_LEVEL))
    self.options = kwargs.pop('options', webdriver.FirefoxOptions())
    for arg in kwargs.pop('option_args', []):
        self.options.add_argument(arg)
    # FIX: message previously said 'Launching Chrome webdriver' in this
    # Firefox driver class.
    logger.info('Launching Firefox webdriver')
    last_exception = None
    for i in range(self.launch_attempts):
        try:
            # Windows XP (platform.release() == 'XP') only accepts the
            # legacy 'desired_capabilities' keyword; newer platforms
            # take 'options'.
            super().__init__(
                *args,
                **{'desired_capabilities' if self._platform_release == 'XP'
                   else 'options': self.options},
                **kwargs
            )
            break
        except WebDriverException as ex:
            logger.error(ex)
            last_exception = ex
            # A missing/stale geckodriver shows up as one of these two
            # failures; try to update the driver before the next attempt.
            # (isinstance replaces the fragile ``type(ex) == ...`` check.)
            if isinstance(ex, SessionNotCreatedException) or \
                    "executable needs to be in PATH" in str(ex):
                self.update()
        except Exception as ex:
            logger.error(ex)
            last_exception = ex
    else:
        # All launch attempts failed; surface the most recent error.
        raise last_exception
    logger.info('Firefox webdriver launched successfully')
def get_local_driver(browser_name, headless, proxy_string, proxy_auth,
                     proxy_user, proxy_pass, user_agent, disable_csp,
                     enable_sync, user_data_dir, extension_zip, extension_dir):
    '''
    Spins up a new web browser and returns the driver.
    Can also be used to spin up additional browsers for the same test.
    '''
    downloads_path = download_helper.get_downloads_folder()
    download_helper.reset_downloads_folder()
    if browser_name == constants.Browser.FIREFOX:
        try:
            try:
                # Use Geckodriver for Firefox if it's on the PATH
                profile = _create_firefox_profile(downloads_path, proxy_string,
                                                  user_agent, disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_capabilities['marionette'] = True
                options = webdriver.FirefoxOptions()
                if headless:
                    options.add_argument('-headless')
                # Prefer a locally-bundled geckodriver when present.
                if LOCAL_GECKODRIVER and os.path.exists(LOCAL_GECKODRIVER):
                    make_driver_executable_if_not(LOCAL_GECKODRIVER)
                    firefox_driver = webdriver.Firefox(
                        firefox_profile=profile,
                        capabilities=firefox_capabilities,
                        options=options,
                        executable_path=LOCAL_GECKODRIVER)
                else:
                    firefox_driver = webdriver.Firefox(
                        firefox_profile=profile,
                        capabilities=firefox_capabilities,
                        options=options)
            except WebDriverException:
                # Don't use Geckodriver: Only works for old versions of Firefox
                # (marionette=False selects the legacy extension connection).
                profile = _create_firefox_profile(downloads_path, proxy_string,
                                                  user_agent, disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_capabilities['marionette'] = False
                firefox_driver = webdriver.Firefox(
                    firefox_profile=profile,
                    capabilities=firefox_capabilities)
            return firefox_driver
        except Exception as e:
            if headless:
                # Headless runs have no fallback UI; fail loudly.
                raise Exception(e)
            # Last resort: a default (un-configured) Firefox session.
            return webdriver.Firefox()
    elif browser_name == constants.Browser.INTERNET_EXPLORER:
        if not IS_WINDOWS:
            raise Exception(
                "IE Browser is for Windows-based operating systems only!")
        from selenium.webdriver.ie.options import Options
        ie_options = Options()
        ie_options.ignore_protected_mode_settings = False
        ie_options.ignore_zoom_level = True
        ie_options.require_window_focus = False
        ie_options.native_events = True
        ie_options.full_page_screenshot = True
        ie_options.persistent_hover = True
        ie_capabilities = ie_options.to_capabilities()
        if LOCAL_IEDRIVER and os.path.exists(LOCAL_IEDRIVER):
            make_driver_executable_if_not(LOCAL_IEDRIVER)
            return webdriver.Ie(capabilities=ie_capabilities,
                                executable_path=LOCAL_IEDRIVER)
        else:
            return webdriver.Ie(capabilities=ie_capabilities)
    elif browser_name == constants.Browser.EDGE:
        if not IS_WINDOWS:
            raise Exception(
                "Edge Browser is for Windows-based operating systems only!")
        edge_capabilities = DesiredCapabilities.EDGE.copy()
        if LOCAL_EDGEDRIVER and os.path.exists(LOCAL_EDGEDRIVER):
            make_driver_executable_if_not(LOCAL_EDGEDRIVER)
            return webdriver.Edge(capabilities=edge_capabilities,
                                  executable_path=LOCAL_EDGEDRIVER)
        else:
            return webdriver.Edge(capabilities=edge_capabilities)
    elif browser_name == constants.Browser.SAFARI:
        return webdriver.Safari()
    elif browser_name == constants.Browser.OPERA:
        if LOCAL_OPERADRIVER and os.path.exists(LOCAL_OPERADRIVER):
            make_driver_executable_if_not(LOCAL_OPERADRIVER)
            return webdriver.Opera(executable_path=LOCAL_OPERADRIVER)
        else:
            return webdriver.Opera()
    elif browser_name == constants.Browser.PHANTOM_JS:
        with warnings.catch_warnings():
            # Ignore "PhantomJS has been deprecated" UserWarning
            warnings.simplefilter("ignore", category=UserWarning)
            return webdriver.PhantomJS()
    elif browser_name == constants.Browser.GOOGLE_CHROME:
        try:
            chrome_options = _set_chrome_options(downloads_path, headless,
                                                 proxy_string, proxy_auth,
                                                 proxy_user, proxy_pass,
                                                 user_agent, disable_csp,
                                                 enable_sync, user_data_dir,
                                                 extension_zip, extension_dir)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
                return webdriver.Chrome(executable_path=LOCAL_CHROMEDRIVER,
                                        options=chrome_options)
            else:
                return webdriver.Chrome(options=chrome_options)
        except Exception as e:
            if headless:
                raise Exception(e)
            # Fall back to a default Chrome session when not headless.
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
                return webdriver.Chrome(executable_path=LOCAL_CHROMEDRIVER)
            else:
                return webdriver.Chrome()
    else:
        raise Exception("%s is not a valid browser option for this system!"
                        % browser_name)
def get_local_driver(browser_name, headless, locale_code, servername,
                     proxy_string, proxy_auth, proxy_user, proxy_pass,
                     user_agent, disable_csp, enable_ws, enable_sync,
                     use_auto_ext, no_sandbox, disable_gpu, incognito,
                     guest_mode, devtools, swiftshader, block_images,
                     user_data_dir, extension_zip, extension_dir,
                     mobile_emulator, device_width, device_height,
                     device_pixel_ratio):
    '''
    Spins up a new web browser and returns the driver.
    Can also be used to spin up additional browsers for the same test.
    '''
    downloads_path = download_helper.get_downloads_folder()
    download_helper.reset_downloads_folder()
    if browser_name == constants.Browser.FIREFOX:
        try:
            try:
                # Use Geckodriver for Firefox if it's on the PATH
                profile = _create_firefox_profile(downloads_path, locale_code,
                                                  proxy_string, user_agent,
                                                  disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_capabilities['marionette'] = True
                options = webdriver.FirefoxOptions()
                if headless:
                    # Headless is requested both via options and via
                    # capabilities; which one takes effect depends on the
                    # launch branch below.
                    options.add_argument('-headless')
                    firefox_capabilities['moz:firefoxOptions'] = ({
                        'args': ['-headless']})
                if LOCAL_GECKODRIVER and os.path.exists(LOCAL_GECKODRIVER):
                    try:
                        make_driver_executable_if_not(LOCAL_GECKODRIVER)
                    except Exception as e:
                        logging.debug("\nWarning: Could not make geckodriver"
                                      " executable: %s" % e)
                elif not is_geckodriver_on_path():
                    args = " ".join(sys.argv)
                    if not ("-n" in sys.argv or "-n=" in args or args == "-c"):
                        # (Not multithreaded)
                        from seleniumbase.console_scripts import sb_install
                        sys_args = sys.argv  # Save a copy of current sys args
                        print("\nWarning: geckodriver not found!"
                              " Installing now:")
                        try:
                            sb_install.main(override="geckodriver")
                        except Exception as e:
                            print("\nWarning: Could not install geckodriver: "
                                  "%s" % e)
                        sys.argv = sys_args  # Put back the original sys args
                if "linux" in PLATFORM or not headless:
                    # On Linux (or headful runs) the headless flag is carried
                    # via capabilities only; 'options' is omitted.
                    firefox_driver = webdriver.Firefox(
                        firefox_profile=profile,
                        capabilities=firefox_capabilities)
                else:
                    firefox_driver = webdriver.Firefox(
                        firefox_profile=profile,
                        capabilities=firefox_capabilities,
                        options=options)
            except Exception:
                # Fallback: retry with a fresh profile and plain capabilities.
                profile = _create_firefox_profile(downloads_path, locale_code,
                                                  proxy_string, user_agent,
                                                  disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_driver = webdriver.Firefox(
                    firefox_profile=profile,
                    capabilities=firefox_capabilities)
            return firefox_driver
        except Exception as e:
            if headless:
                raise Exception(e)
            # Last resort for headful runs: default Firefox session.
            return webdriver.Firefox()
    elif browser_name == constants.Browser.INTERNET_EXPLORER:
        if not IS_WINDOWS:
            raise Exception(
                "IE Browser is for Windows-based operating systems only!")
        from selenium.webdriver.ie.options import Options
        ie_options = Options()
        ie_options.ignore_protected_mode_settings = False
        ie_options.ignore_zoom_level = True
        ie_options.require_window_focus = False
        ie_options.native_events = True
        ie_options.full_page_screenshot = True
        ie_options.persistent_hover = True
        ie_capabilities = ie_options.to_capabilities()
        if LOCAL_IEDRIVER and os.path.exists(LOCAL_IEDRIVER):
            try:
                make_driver_executable_if_not(LOCAL_IEDRIVER)
            except Exception as e:
                logging.debug("\nWarning: Could not make iedriver"
                              " executable: %s" % e)
        return webdriver.Ie(capabilities=ie_capabilities)
    elif browser_name == constants.Browser.EDGE:
        try:
            chrome_options = _set_chrome_options(
                browser_name, downloads_path, headless, locale_code,
                proxy_string, proxy_auth, proxy_user, proxy_pass, user_agent,
                disable_csp, enable_ws, enable_sync, use_auto_ext, no_sandbox,
                disable_gpu, incognito, guest_mode, devtools, swiftshader,
                block_images, user_data_dir, extension_zip, extension_dir,
                servername,
                mobile_emulator, device_width, device_height,
                device_pixel_ratio)
            if LOCAL_EDGEDRIVER and os.path.exists(LOCAL_EDGEDRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_EDGEDRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make edgedriver"
                                  " executable: %s" % e)
            elif not is_edgedriver_on_path():
                args = " ".join(sys.argv)
                if not ("-n" in sys.argv or "-n=" in args or args == "-c"):
                    # (Not multithreaded)
                    from seleniumbase.console_scripts import sb_install
                    sys_args = sys.argv  # Save a copy of current sys args
                    print("\nWarning: msedgedriver not found. Installing now:")
                    sb_install.main(override="edgedriver")
                    sys.argv = sys_args  # Put back the original sys args
            # For Microsoft Edge (Chromium) version 79 or lower
            return webdriver.Chrome(executable_path=LOCAL_EDGEDRIVER,
                                    options=chrome_options)
        except Exception:
            # For Microsoft Edge (Chromium) version 80 or higher
            from msedge.selenium_tools import Edge, EdgeOptions
            if LOCAL_EDGEDRIVER and os.path.exists(LOCAL_EDGEDRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_EDGEDRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make edgedriver"
                                  " executable: %s" % e)
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            prefs = {
                "download.default_directory": downloads_path,
                "local_discovery.notifications_enabled": False,
                "credentials_enable_service": False,
                "download.prompt_for_download": False,
                "download.directory_upgrade": True,
                "safebrowsing.enabled": False,
                "safebrowsing.disable_download_protection": True,
                "profile": {
                    "password_manager_enabled": False,
                    "default_content_setting_values.automatic_downloads": 1
                }
            }
            if locale_code:
                prefs["intl.accept_languages"] = locale_code
            if block_images:
                prefs["profile.managed_default_content_settings.images"] = 2
            edge_options.add_experimental_option("prefs", prefs)
            edge_options.add_experimental_option("w3c", True)
            edge_options.add_experimental_option("useAutomationExtension",
                                                 False)
            edge_options.add_experimental_option(
                "excludeSwitches",
                ["enable-automation", "enable-logging"])
            if guest_mode:
                edge_options.add_argument("--guest")
            if headless:
                edge_options.add_argument("--headless")
            if mobile_emulator:
                emulator_settings = {}
                device_metrics = {}
                # Use caller-supplied metrics only when all three are ints;
                # otherwise fall back to the defaults below.
                if type(device_width) is int and (
                        type(device_height) is int and (
                        type(device_pixel_ratio) is int)):
                    device_metrics["width"] = device_width
                    device_metrics["height"] = device_height
                    device_metrics["pixelRatio"] = device_pixel_ratio
                else:
                    device_metrics["width"] = 411
                    device_metrics["height"] = 731
                    device_metrics["pixelRatio"] = 3
                emulator_settings["deviceMetrics"] = device_metrics
                if user_agent:
                    emulator_settings["userAgent"] = user_agent
                edge_options.add_experimental_option("mobileEmulation",
                                                     emulator_settings)
            edge_options.add_argument("--enable-sync")
            edge_options.add_argument("--disable-infobars")
            edge_options.add_argument("--disable-save-password-bubble")
            edge_options.add_argument("--disable-single-click-autofill")
            edge_options.add_argument(
                "--disable-autofill-keyboard-accessory-view[8]")
            edge_options.add_argument("--disable-translate")
            if not enable_ws:
                edge_options.add_argument("--disable-web-security")
            edge_options.add_argument("--homepage=about:blank")
            edge_options.add_argument("--dns-prefetch-disable")
            edge_options.add_argument("--dom-automation")
            edge_options.add_argument("--disable-hang-monitor")
            edge_options.add_argument("--disable-prompt-on-repost")
            if proxy_string:
                edge_options.add_argument('--proxy-server=%s' % proxy_string)
            edge_options.add_argument("--test-type")
            edge_options.add_argument("--log-level=3")
            edge_options.add_argument("--no-first-run")
            edge_options.add_argument("--ignore-certificate-errors")
            if devtools and not headless:
                edge_options.add_argument("--auto-open-devtools-for-tabs")
            edge_options.add_argument("--allow-file-access-from-files")
            edge_options.add_argument("--allow-insecure-localhost")
            edge_options.add_argument("--allow-running-insecure-content")
            if user_agent:
                edge_options.add_argument("--user-agent=%s" % user_agent)
            edge_options.add_argument("--no-sandbox")
            if swiftshader:
                edge_options.add_argument("--use-gl=swiftshader")
            else:
                edge_options.add_argument("--disable-gpu")
            if "linux" in PLATFORM:
                edge_options.add_argument("--disable-dev-shm-usage")
            capabilities = edge_options.to_capabilities()
            capabilities["platform"] = ''
            return Edge(executable_path=LOCAL_EDGEDRIVER,
                        capabilities=capabilities)
    elif browser_name == constants.Browser.SAFARI:
        arg_join = " ".join(sys.argv)
        if ("-n" in sys.argv) or ("-n=" in arg_join) or (arg_join == "-c"):
            # Skip if multithreaded
            raise Exception("Can't run Safari tests in multi-threaded mode!")
        safari_capabilities = _set_safari_capabilities()
        return webdriver.Safari(desired_capabilities=safari_capabilities)
    elif browser_name == constants.Browser.OPERA:
        try:
            if LOCAL_OPERADRIVER and os.path.exists(LOCAL_OPERADRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_OPERADRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make operadriver"
                                  " executable: %s" % e)
            opera_options = _set_chrome_options(
                browser_name, downloads_path, headless, locale_code,
                proxy_string, proxy_auth, proxy_user, proxy_pass, user_agent,
                disable_csp, enable_ws, enable_sync, use_auto_ext, no_sandbox,
                disable_gpu, incognito, guest_mode, devtools, swiftshader,
                block_images, user_data_dir, extension_zip, extension_dir,
                servername,
                mobile_emulator, device_width, device_height,
                device_pixel_ratio)
            opera_options.headless = False  # No support for headless Opera
            return webdriver.Opera(options=opera_options)
        except Exception:
            return webdriver.Opera()
    elif browser_name == constants.Browser.PHANTOM_JS:
        with warnings.catch_warnings():
            # Ignore "PhantomJS has been deprecated" UserWarning
            warnings.simplefilter("ignore", category=UserWarning)
            return webdriver.PhantomJS()
    elif browser_name == constants.Browser.GOOGLE_CHROME:
        try:
            chrome_options = _set_chrome_options(
                browser_name, downloads_path, headless, locale_code,
                proxy_string, proxy_auth, proxy_user, proxy_pass, user_agent,
                disable_csp, enable_ws, enable_sync, use_auto_ext, no_sandbox,
                disable_gpu, incognito, guest_mode, devtools, swiftshader,
                block_images, user_data_dir, extension_zip, extension_dir,
                servername,
                mobile_emulator, device_width, device_height,
                device_pixel_ratio)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make chromedriver"
                                  " executable: %s" % e)
            elif not is_chromedriver_on_path():
                args = " ".join(sys.argv)
                if not ("-n" in sys.argv or "-n=" in args or args == "-c"):
                    # (Not multithreaded)
                    from seleniumbase.console_scripts import sb_install
                    sys_args = sys.argv  # Save a copy of current sys args
                    print("\nWarning: chromedriver not found. Installing now:")
                    sb_install.main(override="chromedriver")
                    sys.argv = sys_args  # Put back the original sys args
            if not headless or "linux" not in PLATFORM:
                return webdriver.Chrome(options=chrome_options)
            else:  # Running headless on Linux
                try:
                    return webdriver.Chrome(options=chrome_options)
                except Exception:
                    # Use the virtual display on Linux during headless errors
                    logging.debug("\nWarning: Chrome failed to launch in"
                                  " headless mode. Attempting to use the"
                                  " SeleniumBase virtual display on Linux...")
                    chrome_options.headless = False
                    return webdriver.Chrome(options=chrome_options)
        except Exception as e:
            if headless:
                raise Exception(e)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make chromedriver"
                                  " executable: %s" % e)
            return webdriver.Chrome()
    else:
        raise Exception("%s is not a valid browser option for this system!"
                        % browser_name)
def get_data_for_stock(stock, verbose=False):
    # get_data_for_stock()
    # Takes input "stock" and outputs a {stock}.csv file to the reuters_data
    # directory. Also, a folder called "processed" contains more folders whose
    # names are of the stocks that have already been processed. This is for
    # making it easy to just run the script again if it is stopped (e.g. your
    # PC crashes, you have to kill the script).
    # Input: stock (str) - ticker symbol of a designated stock
    #        verbose (bool) - print progress messages while scraping
    # Output: None (or, when massive_scrape_mode is off, a DataFrame /
    #         False on failure)

    # Set the Firefox webdriver to run headless in the background.
    # FIX: options.set_headless() was removed in Selenium 4; assign the
    # ``headless`` attribute instead (matches usage elsewhere in this file).
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.headless = True
    if massive_scrape_mode:
        if stock.replace('.', '_') not in os.listdir('processed'):
            os.mkdir('processed/{}'.format(stock.replace(
                '.', '_')))  # Instantiate a folder whose name is {stock}
            # to mark that this stock has already been processed
    driver = webdriver.Firefox(
        options=fireFoxOptions
    )  # Initialize the webdriver instance in the background
    try:
        # Search Reuters for {stock}
        driver.get('https://www.reuters.com/search/news?blob={}'.format(stock))
        time.sleep(2)  # Wait for the page to load
        # Reuters should query the company if they have written articles on
        # it. If they haven't, an error will be thrown and no files will be
        # outputted. This line gets the queried company's element, which
        # contains the stock's name and URL to Reuters's page on the stock.
        text = driver.find_element_by_xpath(
            '/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a'
        ).text
        # {condition} will determine if the stock queried is actually
        # the stock that we're trying to get articles on
        condition = False
        # Reuters will format the element's text in 2 ways:
        #   {company name} ({ticker}.{some additional text}), e.g.
        #       Apple Inc (AAPL.OQ)
        #   {company name} ({ticker}), e.g.
        #       Alcoa Corp (AA)
        # The following lines filter the element's text down to {ticker}:
        #   Apple Inc (AAPL.OQ) --> AAPL
        #   Alcoa Corp (AA)     --> AA
        if '.' in text:  # Check if a period is in the text
            # Check if {ticker}.upper() == {stock}.upper()
            if text[text.find('(') + 1:text.find('.')].upper() == stock.upper():
                condition = True  # The queried stock is a match
        else:  # If there is no period
            # Check if {ticker}.upper() == {stock}.upper()
            if text[text.find('(') + 1:text.find(')')].upper() == stock.upper():
                condition = True  # The queried stock is a match
        if condition:  # If {stock} has been found in Reuters, continue
            if verbose:
                print('Stock was found on Reuters.')
            # Click the element's link, going to Reuters's stock page
            driver.find_element_by_xpath(
                '/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a'
            ).click()
            time.sleep(0.5)  # Let the stock's Reuters page load
            # Go to the "News" section of the stock's Reuters page
            driver.find_element_by_xpath(
                '/html/body/div[1]/div/div[3]/div/div/nav/div[1]/div/div/ul/li[2]/button'
            ).click()
            time.sleep(5)
            # The next segment will scroll down to the bottom of the "News"
            # page of the stock's Reuters page
            SCROLL_PAUSE_TIME = 0.5
            # This is how much time it waits before scrolling to
            # the bottom of the page again
            # Get the last height of the page
            last_height = driver.execute_script(
                "return document.body.scrollHeight")
            it_num = 0
            if verbose:
                print('Scrolling to the bottom of the news page...')
            while True:
                if verbose:
                    if it_num % 10 == 0:
                        print('{} - Scroll: Iteration #{}'.format(
                            stock, it_num))
                it_num += 1
                # Scroll to the bottom of the page
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                # Wait for more content to load
                time.sleep(SCROLL_PAUSE_TIME)
                # Get the current height of the page
                new_height = driver.execute_script(
                    "return document.body.scrollHeight")
                if new_height == last_height:
                    # Page height stopped changing: no more content to load.
                    break
                last_height = new_height  # Remember the previous height
            if verbose:
                print('Scroll completed.')
            i = 1  # Article index number starts at one in the HTML
            datas = []  # Put all of the data in here
            tol = 0
            # Tol is the amount of times an error has been thrown (for this
            # stock news page). When it hits 2 (see the break below), it's
            # confirmed that all articles for this stock have been queried.
            # The amount of articles on the page is unknown, so a while loop
            # is used to iterate until no more new articles are found
            if verbose:
                print('Scraping the site...')
            it_num = 0
            while True:
                try:
                    if verbose:
                        if it_num % 10 == 0:
                            print('{} - Scrape: Iteration #{}'.format(
                                stock, it_num))
                    it_num += 1
                    # This is the xpath for the title of the article
                    xpath = '/html/body/div[1]/div/div[4]/div[1]/div/div/div/div[2]/div[{}]/div/a'.format(
                        i)
                    i += 1
                    # An error will be thrown if there are no more articles
                    # left because the driver won't be able to find the
                    # non-existent next article.
                    header = driver.find_element_by_xpath(
                        xpath).text  # Get the article's header text
                    link = driver.find_element_by_xpath(xpath).get_attribute(
                        'href')  # Get the link of the article
                    # The links are processed by a separate script because
                    # less threads have to be devoted to it
                    datas.append([header, link])
                except Exception:
                    tol += 1  # Increase the error tally by 1
                    time.sleep(30)
                    if tol >= 2:
                        # Two consecutive misses confirm the article list is
                        # exhausted.
                        break
            datas = pd.DataFrame(datas, columns=[
                'text', 'link'
            ])  # Compile the headers and links into a pandas DataFrame
            # FIX: progress message was gated on ``verbose == False`` (i.e.
            # printed only in quiet mode) — inverted verbosity check.
            if verbose:
                print('Scraping for further information....')
            links_data = Parallel(1, 'threading', verbose=0)(
                delayed(convert_link_to_data)(link)
                for link in datas['link'].values.tolist())
            links_data = pd.DataFrame(
                links_data, columns=['author', 'publish_date', 'body_text'])
            if massive_scrape_mode:
                links_data.to_csv(
                    'reuters_data/{}.csv'.format(stock.replace('.', '_'))
                )  # Export the data to the reuters data folder under the
                # name {stock}.csv
            else:
                pass
        else:
            if verbose:
                print('Stock not found on reuters.')
        # Stop the driver
        driver.quit()
        try:
            if not massive_scrape_mode:
                return links_data
        except NameError:
            # links_data is unbound when the stock wasn't found above.
            if not massive_scrape_mode:
                return False
    except Exception as e:
        print(e)
        time.sleep(
            30
        )  # Make this worker wait a bit before killing incase Reuters.com
        # is acting up
        driver.quit()  # The webdriver will be killed upon receiving an error
def dump(update=UPDATE_MODE):
    # Logs into QZone with Selenium, harvests cookies + CSRF token, then
    # pages through the mood list over plain HTTP and saves new entries.
    # ``update`` (default captured from UPDATE_MODE at def time): when True,
    # stop at the first already-saved mood instead of skipping it.
    # open browser
    profile = webdriver.FirefoxProfile()
    if HEADLESS:
        # Skip image loading in headless mode to speed things up.
        profile.set_preference('permissions.default.image', 2)
        profile.set_preference('browser.migration.version', 9001)
    options = webdriver.FirefoxOptions()
    options.add_argument('log-level=3')  # FATAL
    options.add_argument('--disable-extensions')
    if HEADLESS:
        options.add_argument('-headless')
    browser = webdriver.Firefox(options=options, firefox_profile=profile)
    browser.implicitly_wait(5)
    wait = WebDriverWait(browser, 10, poll_frequency=1)
    # login
    browser.get(URL_BASE)
    wait.until(cond.frame_to_be_available_and_switch_to_it('login_frame'))
    wait.until(cond.element_to_be_clickable((By.ID, 'switcher_plogin')))
    browser.find_element_by_id('switcher_plogin').click()
    qq, passwd = os.getenv('qq'), os.getenv('passwd')
    if qq and passwd:
        # Automated login from the 'qq'/'passwd' environment variables.
        print('[Login] using QQ code and password from envvar')
        browser.find_element_by_id('u').send_keys(qq)
        browser.find_element_by_id('p').send_keys(passwd)
        wait.until(cond.element_to_be_clickable((By.ID, 'login_button')))
        browser.find_element_by_id('login_button').click()
        time.sleep(5)  # await for cookies
    else:
        print('[Login] manual interactive login then press Enter to continue')
        input()

    def getCSRFToken(skey):
        # transcribed from qzone source javascript snippet
        # (djb2-style hash of the p_skey cookie, masked to 31 bits)
        hs = 5381
        for i in skey:
            hs += (hs << 5) + ord(i)
        return hs & 0x7fffffff

    # Carry the browser session's cookies over to the raw-HTTP phase.
    cookies = {c.get('name'): c.get('value') for c in browser.get_cookies()}
    token = getCSRFToken(browser.get_cookie('p_skey').get('value'))
    # NOTE(review): qzonetoken is hard-coded here rather than scraped from the
    # page; it will stop working when the server rotates it — confirm.
    qzonetoken = '6938137d34171f799bd85ccfb42b80474825b4c604adfc9adbfae0bc512241f658514dea51e79051be8f'
    browser.close()
    browser.quit()
    # dump
    http = Session()
    http.headers.update(HEADERS)
    # cnt: saved items; totcnt: server-reported total (-1 until known);
    # pid: 1-based page number.
    cnt, totcnt, pid = 0, -1, 1
    params = {
        'uin': qq,
        'ftype': 0,
        'sort': 0,
        'pos': 0,
        'num': PAGE_SIZE,
        'replynum': 0,
        'g_tk': token,
        'callback': 'preloadCallback',
        'code_version': 1,
        'format': 'jsonp',
        'need_private_comment': 1,
        'qzonetoken': qzonetoken,
    }
    while pid >= 0:
        if totcnt > 0 and pid * PAGE_SIZE > totcnt:
            break
        time.sleep(1)  # be gentle with the server between pages
        logging.info('[Dumper] crawling on page %d' % pid)
        params['pos'] = (pid - 1) * PAGE_SIZE
        res = http.get(URL_LIST, params=params, cookies=cookies)
        # Strip the jsonp wrapper: 'preloadCallback(' prefix (16 chars)
        # and the trailing ');'.
        data = json.loads(res.text[16:-2])
        if totcnt < 0:
            totcnt = data.get('total')
            logging.info('[Dumper] %d mood in total' % totcnt)
        for mood in data.get('msglist'):
            timestamp = datetime.fromtimestamp(mood.get('created_time'))
            if Mood.exists(timestamp=timestamp):
                # Already stored: in update mode we've caught up, so stop;
                # otherwise just skip this one.
                if update:
                    return
                else:
                    continue
            if mood.get('has_more_con'):
                # expand if has_more
                params['tid'] = mood.get('tid')
                params['t1_source'] = mood.get('t1_source')
                params['pos'] = 0  # reset offset
                res = http.get(URL_DETAIL, params=params)
                try:
                    data = json.loads(res.text[16:-2])
                except json.decoder.JSONDecodeError:
                    # NOTE(review): on decode failure ``data`` keeps its
                    # previous (list-page) value and is still used below —
                    # confirm this is intended.
                    logging.info(res.text)
            # NOTE(review): for moods without has_more_con this reads
            # 'content' from the page payload, not from ``mood`` — verify.
            content = data.get('content')
            # Title: first line of multi-line content, else first 16 chars.
            title = '\n' in content and content.split('\n')[0] or content[:16]
            mood = Mood(title=title, content=content, timestamp=timestamp)
            mood.save()
            logging.info('[Save] %r' % mood)
            cnt += 1
        pid += 1
    logging.info('[Save] updated %d items in total' % cnt)
def get_data_for_stock_lb_base(stock: str, days_to_look_back: int):
    """Scrape Reuters news headlines and links for *stock*.

    Searches reuters.com for the ticker, verifies that the first search
    result actually matches it, opens the stock's "News" tab, scrolls to
    the bottom so all articles lazy-load, then collects each article's
    headline and URL until articles are older than *days_to_look_back*.

    Args:
        stock: ticker symbol of the designated stock (e.g. "AAPL").
        days_to_look_back: stop collecting once article dates exceed this
            many days in the past.

    Returns:
        pandas.DataFrame with columns ['text', 'link'] on success;
        numpy.nan when the stock cannot be matched or scraping fails.
    """
    # Run Firefox headless in the background.
    # FIX: FirefoxOptions.set_headless() was deprecated and removed in
    # Selenium 4 -- pass the "-headless" argument instead.
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument('-headless')
    driver = webdriver.Firefox(
        options=fireFoxOptions
    )  # Initialize the webdriver instance in the background
    try:
        # Search Reuters for {stock}
        driver.get('https://www.reuters.com/search/news?blob={}'.format(stock))
        time.sleep(2)  # Wait for the page to load
        # First search result: contains the company name and the link to
        # Reuters's page on the stock. Raises if Reuters has nothing.
        text = driver.find_element_by_xpath(
            '/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a'
        ).text
        # condition: does the queried result match the requested ticker?
        condition = False
        # Reuters formats the result as "{name} ({ticker}.{suffix})" --
        # e.g. "Apple Inc (AAPL.OQ)" -- or "{name} ({ticker})" -- e.g.
        # "Alcoa Corp (AA)". Extract just {ticker} and compare.
        if '.' in text:  # Check if a period is in the text
            if text[text.find('(') + 1:text.find('.')].upper() == stock.upper():
                condition = True  # queried stock is a match
        else:  # If there is no period
            if text[text.find('(') + 1:text.find(')')].upper() == stock.upper():
                condition = True  # queried stock is a match
        if condition:  # {stock} has been found on Reuters: continue
            # Open the stock's Reuters page, then its "News" tab.
            driver.find_element_by_xpath(
                '/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a'
            ).click()
            time.sleep(0.5)  # Let the stock's Reuters page load
            driver.find_element_by_xpath(
                '/html/body/div[1]/div/div[3]/div/div/nav/div[1]/div/div/ul/li[2]/button'
            ).click()
            time.sleep(5)
            # Scroll to the bottom repeatedly until the page height stops
            # growing, forcing every article to lazy-load.
            SCROLL_PAUSE_TIME = 0.5  # wait between scrolls
            last_height = driver.execute_script(
                "return document.body.scrollHeight")
            while True:
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SCROLL_PAUSE_TIME)  # Wait for more content to load
                new_height = driver.execute_script(
                    "return document.body.scrollHeight")
                if new_height == last_height:
                    break  # nothing more to load
                last_height = new_height
            i = 1  # Article index number starts at one in the HTML
            datas = []  # Collected [header, link] pairs
            tol = 0  # consecutive-error tally; 3 in a row => no more articles
            while True:
                try:
                    # xpath for the title/link of the i-th article
                    xpath = ('/html/body/div[1]/div/div[4]/div[1]/div/div/div/'
                             'div[2]/div[{}]/div/a'.format(i))
                    i += 1
                    # Raises when the next article does not exist.
                    header = driver.find_element_by_xpath(xpath).text
                    link = driver.find_element_by_xpath(xpath).get_attribute(
                        'href')
                    # NOTE(review): the date xpath uses the already-incremented
                    # index (one past the article just read) -- confirm this
                    # offset matches Reuters's markup.
                    date = driver.find_element_by_xpath(
                        '/html/body/div[1]/div/div[4]/div[1]/div/div/div/div[2]/div[{}]/div/div/time'
                        .format(i)).text
                    datas.append([header, link])
                    try:
                        # Turn a relative "N units ago" label into a Timedelta
                        # by stripping the trailing word; stop when too old.
                        units_behind = date[::-1]
                        units_behind = units_behind[units_behind.find(' ') +
                                                    1:][::-1]
                        units_behind = pd.Timedelta(units_behind)
                        if units_behind > pd.Timedelta(
                                '{} days'.format(days_to_look_back)):
                            break
                    except Exception:
                        pass  # best-effort date parsing; keep collecting
                except Exception:
                    tol += 1  # Increase the error tally by 1
                    time.sleep(30)
                    if tol > 2:
                        # 3 errors in a row: all articles have been found.
                        break
            datas = pd.DataFrame(datas, columns=['text', 'link'])
            driver.quit()  # Stop the driver
            return datas
        # FIX: the original fell off the `try` here, leaking the driver and
        # implicitly returning None. Clean up and report failure instead.
        driver.quit()
        return np.nan
    except Exception:
        time.sleep(
            30
        )  # Make this worker wait a bit before killing in case Reuters.com
        # is acting up
        driver.quit()  # kill the webdriver to save RAM
        return np.nan
def setUp(self):
    """Create a fresh headless Firefox driver before each test."""
    firefox_opts = webdriver.FirefoxOptions()
    firefox_opts.add_argument('-headless')
    self.driver = webdriver.Firefox(options=firefox_opts)
class WPSpider(scrapy.Spider):
    """Scrapy spider that drives a headless Chrome browser to crawl
    Washington Post search results and dump each article to a text file.

    Scrapy provides scheduling only; page fetching and parsing go through
    Selenium because the search UI is rendered client-side.
    """
    name = "washingtonPostCrawler"
    # Search query: last 7 days, keyword "abu", sorted by date.
    start_urls = [
        'https://www.washingtonpost.com/newssearch/?datefilter=7%20Days&query=abu&sort=Date'
    ]
    base_url = 'https://www.washingtonpost.com/newssearch/?datefilter=7%20Days&query=abu&sort=Date'
    # NOTE(review): these Firefox options are built but never used (Chrome
    # is used below); set_headless() is also deprecated in newer Selenium.
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.set_headless()
    options = webdriver.ChromeOptions()
    options.binary_location = '/usr/bin/google-chrome-unstable'
    options.add_argument('headless')  #TBD
    options.add_argument('no-sandbox')
    options.add_argument('disable-gpu')
    options.add_argument('disable-dev-shm-usage')
    options.add_argument('window-size=1200x600')
    folder = "./WashingtonPost"
    # Run timestamp; used to name this run's results directory.
    timestamp = datetime.datetime.now()
    resultsPath = "./WashingtonPost" + "/" + str(timestamp)
    next_page = True
    number_pages = 1
    number_of_search_result = 0

    def __init__(self):
        # Start the shared Chrome driver and open the search page.
        #self.driver = webdriver.Firefox(firefox_options=self.fireFoxOptions)
        #self.driver = webdriver.Chrome('chromedriver',chrome_options=self.chromeOptions)
        self.driver = webdriver.Chrome(chrome_options=self.options)
        self.driver.implicitly_wait(3)
        self.driver.get(self.base_url)
        # Create folder and subfolder based on command execution timestamp
        try:
            os.makedirs(self.resultsPath)
        except:
            # NOTE(review): bare except treats any makedirs failure as
            # "folder existed" -- confirm that's acceptable.
            print("Folder existed")

    def parse(self, response):
        """Walk every search-result page; open each article in its own
        driver and yield/write its title, url and content."""
        driver = self.driver
        # Deal with the anti-robot page by clicking buttons
        driver.find_element_by_xpath(
            "(.//*[normalize-space(text()) and normalize-space(.)='Sign in here'])[1]/following::button[1]"
        ).click()
        driver.find_element_by_id("agree").click()
        driver.find_element_by_xpath(
            "(.//*[normalize-space(text()) and normalize-space(.)='I agree'])[1]/following::button[1]"
        ).click()
        # Parse the number of search results; 20 results per page.
        self.number_of_search_result = int(
            driver.find_element_by_xpath(
                ".//span[@class='pb-search-number ng-binding']").text)
        self.number_pages = int(self.number_of_search_result / 20) + 1
        print("\nNumber of search result:%d\n" % self.number_of_search_result)
        print("\nNumber of pages:%d\n" % self.number_pages)
        for current_page in range(self.number_pages):
            # Collect all article links on the current result page.
            elements = driver.find_elements_by_css_selector('a.ng-binding')
            for elem in elements:
                article_url = elem.get_attribute("href")
                print("Article link:%s" % article_url)
                if (article_url != None and article_url != ""):
                    # Fresh driver per article so the result page keeps state.
                    single_article_driver = webdriver.Chrome(
                        chrome_options=self.options)
                    #single_article_driver = webdriver.Firefox(firefox_options=self.fireFoxOptions)
                    time.sleep(2)
                    single_article_driver.get(article_url)
                    # Same anti-robot clicks on the article page.
                    single_article_driver.find_element_by_xpath(
                        "(.//*[normalize-space(text()) and normalize-space(.)='Sign in here'])[1]/following::button[1]"
                    ).click()
                    single_article_driver.find_element_by_id("agree").click()
                    single_article_driver.find_element_by_xpath(
                        "(.//*[normalize-space(text()) and normalize-space(.)='I agree'])[1]/following::button[1]"
                    ).click()
                    #time.sleep(4)
                    try:
                        title_CSS_SELECTOR = 'div.topper-headline'
                        content_xpath = ".//article[@class='paywall']"
                        title = single_article_driver.find_element_by_css_selector(
                            title_CSS_SELECTOR).text
                        content = single_article_driver.find_element_by_xpath(
                            content_xpath).text
                        print('title is:%s' % title)
                        print('content is:%s' % content)
                        yield {
                            'title': ''.join(title),
                            'url': ''.join(article_url),
                            'content': ''.join(content)
                        }
                        self.file_write(title, article_url, content)
                        single_article_driver.close()
                        continue
                    except:
                        # Article failed to parse (e.g. paywall markup
                        # missing); close its driver and move on.
                        single_article_driver.close()
                        print("This is in except!")
                        continue
            # Config the next page's url and load it.
            next_page_url = self.base_url + "&startat=" + str(
                (current_page + 1) * 20)
            print("\nNext page url:%s\n" % next_page_url)
            driver.get(next_page_url)
            time.sleep(5)
            continue
        # (A large block of commented-out legacy pagination code that
        # duplicated the loop above was removed here for readability.)
        driver.close()

    def parse_article(self, response):
        """Scrapy-native article parser (unused by the Selenium flow)."""
        content = response.xpath(
            ".//article[@itemprop='articleBody']/descendant::text()").extract(
            )
        yield {'article': ''.join(content)}

    def file_write(self, title, url, content):
        """Write one article (title, url, content) to <resultsPath>/<title>.txt."""
        fileName = str(title) + ".txt"
        file = open(self.resultsPath + "/" + fileName, "w")
        # Write title, URL, content into the file
        file.write(str(title) + "\n")
        file.write(str(url) + "\n")
        file.write(str(content))
        file.close()
def inicializar_webdriver_firefox(self):
    """Build and return a configured Firefox WebDriver.

    Reads headless mode, log destination and profile directory from the
    .ini config file, configures a download directory that never prompts
    for the listed MIME types, and relaxes certificate validation (for
    user convenience only). Exits the process on any start-up error.
    """
    archivo_config_ini = FormatUtils.lector_archivo_ini()
    modo_headless = archivo_config_ini.getboolean('Driver', 'headless')
    mandar_log_a_dev_null = archivo_config_ini.getboolean(
        'Driver', 'log_path_dev_null')
    data_profile = archivo_config_ini.get('Driver', 'data_profile')
    #profile_data = archivo_config_ini.getboolean('Driver', 'data_profile')
    # MIME types that download straight to disk without a save dialog.
    mimeTypes = "application/zip, application/octet-stream, image/jpeg, image/png, image/x-png, " \
        "application/vnd.ms-outlook, text/html, application/pdf, image/png"
    # path used to silence unnecessary geckodriver logging
    opciones_firefox = webdriver.FirefoxOptions()
    perfil_firefox = webdriver.FirefoxProfile(data_profile)
    firefox_capabilities = webdriver.DesiredCapabilities().FIREFOX.copy()
    firefox_capabilities.update({
        'acceptInsecureCerts': True,
        'acceptSslCerts': True
    })
    firefox_capabilities['acceptSslCerts'] = True
    # Ignore security certificates; done purely for user experience.
    opciones_firefox.add_argument('--ignore-certificate-errors')
    opciones_firefox.accept_insecure_certs = True
    perfil_firefox.accept_untrusted_certs = True
    perfil_firefox.assume_untrusted_cert_issuer = False
    # Download to the configured folder without prompting (folderList=2
    # means "use browser.download.dir").
    perfil_firefox.set_preference("browser.download.folderList", 2)
    perfil_firefox.set_preference("browser.download.dir",
                                  config_constantes.PATH_CARPETA_DESCARGA)
    perfil_firefox.set_preference(
        "browser.download.manager.showWhenStarting", False)
    perfil_firefox.set_preference("browser.helperApps.neverAsk.saveToDisk",
                                  mimeTypes)
    perfil_firefox.set_preference(
        "browser.download.viewableInternally.enabledTypes", "")
    perfil_firefox.update_preferences()
    opciones_firefox.headless = modo_headless
    if mandar_log_a_dev_null:
        param_log_path = config_constantes.DEV_NULL
    else:
        param_log_path = None
    try:
        # NOTE(review): firefox_options=/log_path= are legacy keyword names
        # -- presumably a pinned Selenium 3.x is in use; confirm.
        webdriver_firefox = webdriver.Firefox(
            executable_path=self.ruta_web_driver,
            firefox_options=opciones_firefox,
            firefox_profile=perfil_firefox,
            capabilities=firefox_capabilities,
            log_path=param_log_path)
    except FileNotFoundError as e:
        print('Sucedio un error al intentar configurar el webdriver: {}'.
              format(e))
        sys.exit()
    except Exception as e:
        print(
            'Sucedio una excepcion al intentar configurar el webdriver {}'.
            format(e))
        sys.exit()
    return webdriver_firefox
def get_webdriver(which_driver="gecko", headless=False):
    """Initialize web driver.

    Will use the Gecko web driver unless "chrome" is passed to the
    "which_driver" argument.

    Args:
        which_driver (string): use either gecko or chrome webdriver.
        headless (bool): initialize webdriver in headless mode.

    Returns:
        driver (web driver object): -1 on failure
    """
    # Driver binaries live next to this module, versioned by constant.
    driver_path = os.path.dirname(os.path.realpath(__file__))
    # Gecko Driver Usage
    if which_driver == "gecko":
        geckodriver = os.path.join(driver_path, "webdrivers",
                                   f"geckodriver-{MIN_GECKO_DRIVER_VERSION}")
        logging.info(f"using webdriver version {MIN_GECKO_DRIVER_VERSION}")
        logging.info(f"webdriver located at: {geckodriver}")
        try:
            if headless:
                logging.info("running headless")
                options = webdriver.FirefoxOptions()
                options.add_argument("-headless")
                logging.info("initializing headless Gecko webdriver")
                # FIX: replaced the deprecated `firefox_options=` alias
                # (removed in Selenium 4) with `options=`, matching the
                # Chrome branch below.
                driver = webdriver.Firefox(
                    executable_path=geckodriver,
                    options=options,
                    service_log_path="/dev/null",
                )
                # specify webdriver window resolution, helps clicking
                driver.set_window_size(1440, 900)
            else:
                logging.info("initializing Gecko webdriver")
                driver = webdriver.Firefox(executable_path=geckodriver,
                                           service_log_path="/dev/null")
        except Exception as err:
            logging.debug(err)
            return -1
        logging.info("geckodriver ready.")
        return driver
    # ChromeDriver Usage
    else:
        chromedriver = os.path.join(
            driver_path, "webdrivers",
            f"chromedriver_{MIN_CHROME_DRIVER_VERSION}")
        logging.info(f"using webdriver version {MIN_CHROME_DRIVER_VERSION}")
        logging.info(f"webdriver located at: {chromedriver}")
        try:
            if headless:
                logging.info("running headless")
                options = webdriver.ChromeOptions()
                options.add_argument("headless")
                logging.info("initializing headless Chrome webdriver")
                driver = webdriver.Chrome(chromedriver,
                                          options=options,
                                          service_log_path="/dev/null")
                # specify webdriver window resolution, helps clicking
                driver.set_window_size(1440, 900)
            else:
                logging.info("initializing Chrome webdriver")
                driver = webdriver.Chrome(chromedriver,
                                          service_log_path="/dev/null")
        except Exception as err:
            logging.debug(err)
            return -1
        logging.info("chromedriver ready.")
        return driver
def driver():
    """Build and return a headless Firefox WebDriver instance."""
    options = webdriver.FirefoxOptions()
    # FIX: use the documented lowercase "-headless" flag; "-Headless" is
    # not the spelling Firefox documents for headless mode.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    return driver
def working():
    """Resume-able crawler loop over wenshu.court.gov.cn (China Judgements
    Online).

    Reads a list of search-condition dicts from xs.txt, resumes from the
    entry recorded in 状态.txt, and for each condition drives a headless
    Firefox through the advanced-search UI, paging or splitting by region
    depending on how many results the query returns.

    NOTE(review): relies on eval() over file contents and on several
    module-level helpers/globals (getProfirle, find, caseNember, data,
    getdata, ...) not visible here -- behavior documented only as far as
    this block shows it.
    """
    global driver
    f = open(r"xs.txt", "r", encoding='utf-8')
    s = f.read()
    s = s.split()
    ff = open(r"状态.txt", "r", encoding='utf-8-sig')  # read the resume-state file
    aa = ff.read()
    aa = s.index(aa)  # index of the entry to resume from
    for i in range(len(s) - aa):
        print(eval(s[i + aa]))
        run_log(eval(s[i + aa]))
        # Record the current entry so a crash can be resumed.
        fff = open(r"状态.txt", "w", encoding='utf-8')
        fff.write(s[i + aa])
        fff.close()
        dict0 = eval(s[i + aa])  # current search-condition dict
        profile_ = getProfirle()
        options = webdriver.FirefoxOptions()
        options.add_argument('--headless')
        driver = webdriver.Firefox(executable_path="driver\geckodriver",
                                   firefox_profile=profile_)
        url = "http://wenshu.court.gov.cn/website/wenshu/181217BMTKHNT2W0/index.html"
        driver.get(url)
        # Retry loop: refresh every 10th attempt, give up after maxWait.
        maxWait = 50
        for x in range(maxWait):
            try:
                if ((x + 1) % 10) == 0:
                    driver.refresh()
                # Open the advanced-search panel and apply the condition.
                a = driver.find_element_by_css_selector(
                    '#_view_1545034775000 > div > div.search-wrapper.clearfix > div.advenced-search')
                a.click()
                find(dict0, driver)
                break
            except Exception as e:
                if x == maxWait - 1:
                    raise ValueError("高级检索") from e
                time.sleep(0.5)
        judgePath = "#_view_1545184311000 > div:nth-child(3) > div.list_title.clearfix > h4 > a"
        all_number = caseNember(driver)[1]  # total result count
        if all_number <= 600 and all_number > 15:
            # Small enough to page through directly (site caps at 600).
            set15CasePerPg(driver, maxWait=20)
            caseList(driver)
            getdata(data)
            data.clear()
            pageNember = 1
            Auto_pageturn(pageNember, driver, judgePath, maxWait=20)
            webdriver.Firefox.quit(driver)
            continue
        elif (all_number > 0 and all_number <= 15):
            # Single page of results.
            set15CasePerPg(driver, maxWait=20)
            caseList(driver)
            getdata(data)
            data.clear()
            webdriver.Firefox.quit(driver)
            continue
        elif all_number <= 0:
            # No results for this condition.
            webdriver.Firefox.quit(driver)
            continue
        else:
            # More than 600 results: split the query by region/court.
            global area_list
            maxWait = 50
            for x in range(maxWait):
                try:
                    if ((x + 1) % 10) == 0:
                        driver.refresh()
                    # Extract the region names (CJK runs) from the filter panel.
                    area_str = driver.find_element_by_xpath('//*[@id="_view_1545095958000"]/div/div[2]')
                    area_list = re.findall(u'[\u4e00-\u9fa5]+', area_str.text, re.S)
                    print(len(area_list))
                    print(area_list)
                    break
                except Exception as e:
                    if x == maxWait - 1:
                        raise ValueError("地域列表") from e
                    time.sleep(0.5)
            # Region name -> anchor xpath in the filter panel.
            area_dict = {'最高人民法院': '//*[@id="0_anchor"]',
                         '北京市': '//*[@id="100_anchor"]',
                         '天津市': '//*[@id="200_anchor"]',
                         '河北省': '//*[@id="300_anchor"]',
                         '山西省': '//*[@id="400_anchor"]',
                         '内蒙古自治区': '//*[@id="500_anchor"]',
                         '辽宁省': '//*[@id="600_anchor"]',
                         '吉林省': '//*[@id="700_anchor"]',
                         '黑龙江省': '//*[@id="800_anchor"]',
                         '上海市': '//*[@id="900_anchor"]',
                         '江苏省': '//*[@id="A00_anchor"]',
                         '浙江省': '//*[@id="B00_anchor"]',
                         '安徽省': '//*[@id="C00_anchor"]',
                         '福建省': '//*[@id="D00_anchor"]',
                         '江西省': '//*[@id="E00_anchor"]',
                         '山东省': '//*[@id="F00_anchor"]',
                         '河南省': '//*[@id="G00_anchor"]',
                         '湖北省': '//*[@id="H00_anchor"]',
                         '湖南省': '//*[@id="I00_anchor"]',
                         '广东省': '//*[@id="J00_anchor"]',
                         '广西壮族自治区': '//*[@id="K00_anchor"]',
                         '海南省': '//*[@id="L00_anchor"]',
                         '重庆市': '//*[@id="M00_anchor"]',
                         '四川省': '//*[@id="N00_anchor"]',
                         '贵州省': '//*[@id="O00_anchor"]',
                         '云南省': '//*[@id="P00_anchor"]',
                         '西藏自治区': '//*[@id="Q00_anchor"]',
                         '陕西省': '//*[@id="R00_anchor"]',
                         '甘肃省': '//*[@id="S00_anchor"]',
                         '青海省': '//*[@id="T00_anchor"]',
                         '宁夏回族自治区': '//*[@id="U00_anchor"]',
                         '新疆维吾尔自治区': '//*[@id="V00_anchor"]',
                         '新疆维吾尔自治区高级人民法院生产建设兵团分院': '//*[@id="X00_anchor"]'}
            # Resume position within the region list ("area" = start over).
            farea = open('地域.txt', 'r', encoding='utf-8')
            areaname = farea.read()
            if areaname == 'area':
                bb = 0
            else:
                bb = area_list.index(areaname)
            for area in range(len(area_list) - bb):
                global k1
                maxWait = 50
                for x in range(maxWait):
                    try:
                        if ((x + 1) % 10) == 0:
                            driver.refresh()
                        # Click the region filter and wait until the count
                        # actually narrows below the unfiltered total.
                        area_ss = area_dict[area_list[area + bb]]
                        area_c = driver.find_element_by_xpath(area_ss)
                        time.sleep(0.5)
                        area_c.click()
                        time.sleep(0.5)
                        k1, part_number = caseNember(driver)
                        if part_number >= all_number:
                            continue
                        else:
                            break
                    except Exception as e:
                        if x == maxWait - 1:
                            raise ValueError("地域点击") from e
                        time.sleep(0.5)
                # Persist region progress for crash recovery.
                if area == (len(area_list)-bb-1):
                    farea = open('地域.txt', 'w', encoding='utf-8')
                    farea.write('area')
                    farea.close()
                else:
                    farea = open('地域.txt', 'w', encoding='utf-8')
                    farea.write(str(area_list[area + bb]))
                    farea.close()
                # k1 values presumably classify the filtered result size --
                # TODO confirm against caseNember's definition.
                if k1 == 1:
                    docodition(dict0)
                    # webdriver.Firefox.quit(driver)
                    continue
                if k1 == 2:
                    wpagenumber(1)
                    set15CasePerPg(driver, maxWait=20)
                    caseList(driver)
                    getdata(data)
                    data.clear()
                    wpagenumber(0)
                    docodition(dict0)
                    # webdriver.Firefox.quit(driver)
                    continue
                else:
                    if bb == 0:
                        # Fresh region pass: crawl from page 1.
                        set15CasePerPg(driver, maxWait=20)
                        wpagenumber(1)
                        caseList(driver)
                        getdata(data)
                        data.clear()
                        pageNember = 1
                        Auto_pageturn(pageNember, driver, judgePath, maxWait=20)
                        docodition(dict0)
                    else:
                        # Resumed region: jump to the saved page number.
                        g = open(r"页码.txt", "r", encoding='utf-8-sig')
                        ym = g.read()
                        if ym != '0':
                            pageNember = int(ym)
                            set15CasePerPg(driver, maxWait=20)
                            tiaoye(pageNember, driver, ym)
                            caseList(driver)
                            getdata(data)
                            data.clear()
                            Auto_pageturn(pageNember, driver, judgePath, maxWait=20)
                            docodition(dict0)
                            # webdriver.Firefox.quit(driver)
                            continue
                        else:
                            set15CasePerPg(driver, maxWait=20)
                            wpagenumber(1)
                            caseList(driver)
                            getdata(data)
                            data.clear()
                            pageNember = 1
                            Auto_pageturn(pageNember, driver, judgePath, maxWait=20)
                            docodition(dict0)
            webdriver.Firefox.quit(driver)
def _set_firefox_options(downloads_path, headless, locale_code, proxy_string,
                         user_agent, disable_csp):
    """Build a FirefoxOptions object with automation-friendly defaults.

    Silences update/telemetry/notification noise, wires the download
    directory so the listed MIME types save without prompting, and
    optionally configures a proxy (HTTP or SOCKS4/5), a user-agent
    override, a locale, headless mode and CSP disabling.

    Args:
        downloads_path: directory used for automatic downloads.
        headless: run without a visible window (skipped on Linux, where
            the existing flow handles display differently).
        locale_code: value for intl.accept_languages, or None.
        proxy_string: "host:port" or "socks4|socks5:host:port", or None.
        user_agent: general.useragent.override value, or None.
        disable_csp: disable Content-Security-Policy enforcement.

    Returns:
        A configured selenium.webdriver.FirefoxOptions instance.
    """
    options = webdriver.FirefoxOptions()
    options.accept_untrusted_certs = True
    options.set_preference("reader.parse-on-load.enabled", False)
    options.set_preference("pdfjs.disabled", True)
    options.set_preference("app.update.auto", False)
    options.set_preference("app.update.enabled", False)
    options.set_preference("app.update.silent", True)
    options.set_preference("browser.formfill.enable", False)
    options.set_preference("browser.privatebrowsing.autostart", True)
    options.set_preference("devtools.errorconsole.enabled", True)
    options.set_preference("dom.webnotifications.enabled", False)
    options.set_preference("dom.disable_beforeunload", True)
    options.set_preference("browser.contentblocking.database.enabled", False)
    options.set_preference("extensions.allowPrivateBrowsingByDefault", True)
    options.set_preference("extensions.PrivateBrowsing.notification", False)
    options.set_preference("extensions.systemAddon.update.enabled", False)
    options.set_preference("extensions.update.autoUpdateDefault", False)
    options.set_preference("extensions.update.enabled", False)
    options.set_preference("extensions.update.silent", True)
    options.set_preference("datareporting.healthreport.logging.consoleEnabled",
                           False)
    options.set_preference("datareporting.healthreport.service.enabled",
                           False)
    options.set_preference("datareporting.healthreport.service.firstRun",
                           False)
    options.set_preference("datareporting.healthreport.uploadEnabled", False)
    options.set_preference("datareporting.policy.dataSubmissionEnabled", False)
    options.set_preference("datareporting.policy.dataSubmissionPolicyAccepted",
                           False)
    options.set_preference("toolkit.telemetry.unified", False)
    if proxy_string:
        socks_proxy = False
        socks_ver = 0
        chunks = proxy_string.split(':')
        if len(chunks) == 3 and (chunks[0] == "socks4"
                                 or chunks[0] == "socks5"):
            socks_proxy = True
            socks_ver = int(chunks[0][5])  # the digit of "socks4"/"socks5"
            if chunks[1].startswith("//") and len(chunks[1]) > 2:
                chunks[1] = chunks[1][2:]  # strip "//" of "socks5://host"
            proxy_server = chunks[1]
            proxy_port = chunks[2]
        else:
            proxy_server = proxy_string.split(':')[0]
            proxy_port = proxy_string.split(':')[1]
        options.set_preference("network.proxy.type", 1)  # manual proxy config
        if socks_proxy:
            options.set_preference('network.proxy.socks', proxy_server)
            options.set_preference('network.proxy.socks_port',
                                   int(proxy_port))
            options.set_preference('network.proxy.socks_version', socks_ver)
        else:
            options.set_preference("network.proxy.http", proxy_server)
            options.set_preference("network.proxy.http_port", int(proxy_port))
            options.set_preference("network.proxy.ssl", proxy_server)
            options.set_preference("network.proxy.ssl_port", int(proxy_port))
    if user_agent:
        options.set_preference("general.useragent.override", user_agent)
    options.set_preference("security.mixed_content.block_active_content",
                           False)
    if settings.DISABLE_CSP_ON_FIREFOX or disable_csp:
        options.set_preference("security.csp.enable", False)
    options.set_preference("browser.download.manager.showAlertOnComplete",
                           False)
    if headless and "linux" not in PLATFORM:
        options.add_argument("--headless")
    if locale_code:
        options.set_preference("intl.accept_languages", locale_code)
    options.set_preference("browser.shell.checkDefaultBrowser", False)
    options.set_preference("browser.startup.page", 0)
    options.set_preference("browser.download.panel.shown", False)
    options.set_preference("browser.download.animateNotifications", False)
    options.set_preference("browser.download.dir", downloads_path)
    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.helperApps.alwaysAsk.force", False)
    options.set_preference("browser.download.manager.showWhenStarting", False)
    # FIX: added the missing comma after "application/x-tar" -- the pref is
    # a comma-separated MIME list, and the original string concatenation
    # produced the broken entry
    # "application/x-tar application/vnd.openxmlformats-...".
    options.set_preference(
        "browser.helperApps.neverAsk.saveToDisk",
        ("application/pdf, application/zip, application/octet-stream, "
         "text/csv, text/xml, application/xml, text/plain, "
         "text/octet-stream, application/x-gzip, application/x-tar, "
         "application/"
         "vnd.openxmlformats-officedocument.spreadsheetml.sheet"))
    return options
def __init__(self, **kwargs):
    """Initialize the spider and attach a Firefox WebDriver instance."""
    scrapy.Spider.__init__(self, **kwargs)
    options = webdriver.FirefoxOptions()
    #options.add_argument('--headless')
    # FIX: `firefox_options=` is a deprecated alias removed in Selenium 4;
    # the supported keyword is `options=`.
    self.driver = webdriver.Firefox(options=options)
def send_text(self):
    """Log in to the TextNow web UI with headless Firefox and send
    self.MESSAGE to every phone number in self.PHONE_NUMBER (comma
    separated), logging out after each send.

    Each UI step is attempted with native Selenium clicks first and falls
    back to injected jQuery when the element is hidden or missing.
    """
    #profile = webdriver.FirefoxProfile()
    #proxy = '127.0.0.1:10808'
    #ip, port = proxy.split(":")
    #port = int(port)
    ## To skip proxying a protocol, comment out the matching option below.
    #settings = {
    #    'network.proxy.type': 1,
    #    'network.proxy.http': ip,
    #    'network.proxy.http_port': port,
    #    'network.proxy.ssl': ip,  # for https sites
    #    'network.proxy.ssl_port': port,
    #}
    ## apply the settings to the profile
    #for key, value in settings.items():
    #    profile.set_preference(key, value)
    #profile.update_preferences()
    #
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')  # run headless
    #https://sites.google.com/a/chromium.org/chromedriver/home
    #driver = webdriver.Chrome(r'C:/Python27/Scripts/chromedriver')
    #https://github.com/mozilla/geckodriver/releases
    driver = webdriver.Firefox(executable_path='geckodriver', options=options)
    #driver = webdriver.Firefox(firefox_profile=profile, options=options)
    #driver = webdriver.Firefox(proxy = proxy)  # both settings must be applied for the proxy to work
    #driver.set_page_load_timeout(5)
    #driver.set_script_timeout(5)
    try:
        driver.get(self.url)
    except:
        # NOTE(review): page-load errors are deliberately ignored; the
        # fixed sleeps below are relied on instead.
        pass
    # Hard wait of 8s, mainly to let reCaptcha load.
    time.sleep(8)
    # resolution 1920*1080
    driver.set_window_size(1920, 1080)
    time.sleep(3)
    # presence_of_element_located: element exists in the DOM (visibility not
    # required). visibility_of_element_located: element exists AND is visible.
    WebDriverWait(driver, 3).until(
        EC.visibility_of_element_located(
            (By.XPATH, "//input[@name='username']")))
    uname_box = driver.find_element_by_xpath("//input[@name='username']")
    pass_box = driver.find_element_by_xpath("//input[@name='password']")
    uname_box.send_keys(self.TN_USER)
    pass_box.send_keys(self.TN_PASS)
    login_btn = driver.find_element_by_xpath("//button[@type='submit']")
    login_btn.click()
    # Explicit wait: poll for the post-login button for up to 3s.
    try:
        WebDriverWait(driver, 3).until(
            EC.presence_of_element_located(
                (By.XPATH, "//button[@id='newText']")))
    except:
        pass
    print(u'登录成功')
    # Implicit wait: up to 30 seconds for subsequent lookups.
    driver.implicitly_wait(30)
    # Remove notification toasts/modals that block clicks.
    driver.execute_script(
        "document.querySelectorAll('#recent-header .toast-container').forEach(function(e,i){console.log(e.href)})"
    )
    time.sleep(1)
    driver.execute_script(
        "document.querySelectorAll('.notification-priming-modal').forEach(function(e,i){console.log(e.href)})"
    )
    time.sleep(1)
    driver.execute_script("$('#recent-header .toast-container').remove();")
    driver.execute_script("$('.notification-priming-modal').remove();")
    driver.execute_script("$('.modal').remove();")
    time.sleep(2)
    for phone in self.PHONE_NUMBER.split(','):
        try:
            # Mask the last 4 digits of the number in the log output.
            print(u'开始给%s发短信' %
                  (phone.replace(''.join(list(phone)[-4:]), '****')))
            # Click the "new text" button.
            try:
                new_text_btn = driver.find_element_by_id("newText")
                if new_text_btn.is_displayed():
                    new_text_btn.click()
                else:
                    driver.execute_script("arguments[0].scrollIntoView();",
                                          new_text_btn)
                    if new_text_btn.is_displayed():
                        new_text_btn.click()
                    else:
                        driver.execute_script("$(arguments[0]).click()",
                                              "#newText")
            except:
                driver.execute_script("$(arguments[0]).click()", "#newText")
            time.sleep(2)
            # Enter the message text.
            try:
                text_field = driver.find_element_by_id("text-input")
                if text_field.is_displayed():
                    text_field.click()
                    text_field.send_keys(self.MESSAGE)
                else:
                    driver.execute_script("arguments[0].scrollIntoView();",
                                          text_field)
                    if text_field.is_displayed():
                        text_field.click()
                        text_field.send_keys(self.MESSAGE)
                    else:
                        driver.execute_script(
                            "$(arguments[0]).val('arguments[1]')",
                            "#text-input", self.MESSAGE)
            except:
                driver.execute_script(
                    "$(arguments[0]).val('arguments[1]')",
                    "#text-input", self.MESSAGE)
            time.sleep(2)
            # Enter the destination phone number.
            try:
                number_field = driver.find_element_by_class_name(
                    "newConversationTextField")
                if number_field.is_displayed():
                    number_field.send_keys(phone)
                else:
                    driver.execute_script("arguments[0].scrollIntoView();",
                                          number_field)
                    if number_field.is_displayed():
                        number_field.send_keys(phone)
                    else:
                        driver.execute_script(
                            "$(arguments[0]).val('arguments[1]')",
                            ".newConversationTextField", phone)
            except:
                driver.execute_script(
                    "$(arguments[0]).val('arguments[1]')",
                    ".newConversationTextField", phone)
            time.sleep(10)
            # Click back into the message text field.
            try:
                text_field = driver.find_element_by_id("text-input")
                if text_field.is_displayed():
                    text_field.click()
                else:
                    driver.execute_script("arguments[0].scrollIntoView();",
                                          text_field)
                    if text_field.is_displayed():
                        text_field.click()
                    else:
                        driver.execute_script("$(arguments[0]).focus()",
                                              "#text-input")
            except:
                driver.execute_script("$(arguments[0]).focus()", "#text-input")
            time.sleep(5)
            # Click the send button.
            try:
                send_btn = driver.find_element_by_id("send_button")
                if send_btn.is_displayed():
                    send_btn.click()
                else:
                    driver.execute_script("arguments[0].scrollIntoView();",
                                          send_btn)
                    if send_btn.is_displayed():
                        send_btn.click()
                    else:
                        driver.execute_script("$(arguments[0]).click()",
                                              "#send_button")
                        driver.execute_script(
                            "setTimeout($(arguments[0]).click,2000)",
                            "#send_button")
            except:
                driver.execute_script("$(arguments[0]).click()",
                                      "#send_button")
                driver.execute_script(
                    "setTimeout($(arguments[0]).click,2000)", "#send_button")
            time.sleep(5)
            # Log out before the next number.
            driver.execute_script('window.location.href="/logout"')
            time.sleep(10)
            # Refresh the page (disabled).
            #try:
            #    driver.get(self.url.replace('/login','/messaging'))
            #    time.sleep(10)
            #    driver.implicitly_wait(30)
            #    WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.XPATH, "//button[@id='newText']")))
            #    print(u'刷新页面完成')
            #except:
            #    pass
        except:
            # Log the exception for this number and continue with the rest.
            print(u'给%s发短信时发生异常:' % phone)
            info = sys.exc_info()
            #print(info)
            #print(info[0])
            print(info[1])
            time.sleep(2)
            pass
        continue
    print(u'处理完毕---end')
    driver.close()
def process_request(self, request, spider):
    """Render ``request.url`` in headless Firefox and return its HTML.

    Scrapy downloader-middleware hook.  Up to two page-source snapshots are
    collected into ``data``: one taken right after navigation, and a second
    taken after the cookie banner is dismissed and the historical date range
    (Nov 17, 2017 - Oct 18, 2020) is applied through the page's date-picker
    inputs.  If any Selenium step fails, only the first snapshot is kept.

    Parameters
    ----------
    request : scrapy.Request
        The request being processed; its URL and ``User-Agent`` header are
        forwarded to the browser.
    spider : scrapy.Spider
        Unused; required by the middleware interface.

    Returns
    -------
    scrapy.http.HtmlResponse
        Status 200 with the JSON-encoded list of snapshots as the body.
    """
    data = []
    url = request.url
    # webdriver setting
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')
    # options.add_argument('--proxy-server=%s' % request.meta["proxy"])
    options.add_argument('--user-agent=%s' % request.headers["User-Agent"])
    # webdriver request
    driver = webdriver.Firefox(executable_path=GECKODRIVER,
                               firefox_options=options,
                               firefox_binary=FIREFOX_BINARY)
    try:
        driver.set_window_size(1440, 800)
        driver.delete_all_cookies()
        driver.get(url)
        loguru.logger.info("Hold URL {url}".format(url=url))
        data.append(driver.page_source)
        try:
            # Dismiss the cookie-policy banner; it overlays the date picker.
            popup_xpath = (
                './/div[@class = "cmc-cookie-policy-banner__close"]')
            popup_element = WebDriverWait(driver, 60).until(
                EC.element_to_be_clickable((By.XPATH, popup_xpath)))
            loguru.logger.warning(popup_element.text)
            popup_element.click()
            time.sleep(5)
            # Crawl from 3 years ago: Nov 17, 2017 to now.
            # Set start date — 12 backspaces clear the pre-filled
            # "MMM d, yyyy" value before typing the new one.
            start_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="__next"]/div[1]/div[2]/div[1]/div[2]/div[3]/div/ul[2]/li[5]/div/div/div[1]/div/div/span/span/div/div[1]/input'
                )))
            start_element.send_keys(Keys.BACKSPACE * 12)
            start_element.send_keys("Nov 17, 2017")
            start_element.send_keys(Keys.RETURN)
            time.sleep(5)
            # Set end date
            end_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="__next"]/div[1]/div[2]/div[1]/div[2]/div[3]/div/ul[2]/li[5]/div/div/div[1]/div/div/span/span/div/div[2]/input'
                )))
            end_element.send_keys(Keys.BACKSPACE * 12)
            end_element.send_keys("Oct 18, 2020")
            # end_element.send_keys(Keys.RETURN)
            time.sleep(5)
            # driver.refresh()
            url = driver.current_url
            loguru.logger.info("Hold URL {url}".format(url=url))
            data.append(driver.page_source)
        except Exception:
            # BUG FIX: was a bare ``except:`` that silently swallowed every
            # failure (including KeyboardInterrupt); log it so broken
            # selectors become visible, then fall through with one snapshot.
            loguru.logger.exception(
                "date-range selection failed for {url}".format(url=url))
    finally:
        # BUG FIX: the original called driver.quit() only on failure,
        # leaking one Firefox process for every successful request.
        driver.quit()
    return scrapy.http.HtmlResponse(url=url,
                                    status=200,
                                    body=json.dumps(data).encode('utf-8'),
                                    encoding='utf-8')
# Script fragment: launch Firefox with a randomized User-Agent and the
# ``dom.webdriver.enabled`` flag turned off, then load a headless-detection
# test page.  NOTE(review): the ``except``/``finally`` clause for the trailing
# ``try:`` lies beyond this chunk — the block is truncated here.
from selenium import webdriver
from fake_useragent import UserAgent
import time

# fake_user agent — random real-browser UA string per run
user_agent = UserAgent()
# options
options = webdriver.FirefoxOptions()
# user-agent override via Firefox preference (Firefox has no
# --user-agent CLI flag the way Chrome does)
options.set_preference("general.useragent.override",
                       f"user-agent={user_agent.random}")
# disable webdriver mode so navigator.webdriver reports false
options.set_preference("dom.webdriver.enabled", False)
driver = webdriver.Firefox(
    executable_path="/home/cain/PycharmProjects/selenium_python/"
                    "firefoxdriver/geckodriver",
    options=options)
# "C:\\users\\selenium_python\\chromedriver\\chromedriver.exe"
# r"C:\users\selenium_python\chromedriver\chromedriver.exe"
try:
    # Intoli's headless-detection test page — shows which automation
    # signals the browser leaks
    driver.get("https://intoli.com/blog/not-possible-to-block-chrome-headless/"
               "chrome-headless-test.html")
    time.sleep(10)
def extract_play(company, headers, max_results=None, headless=False,
                 phantom=False, gchrome=False, time_sleep=1):
    """Scrape Google Play reviews for *company* and stream them to ``writer``.

    Opens the Play Store "all reviews" page for the app id *company*,
    repeatedly scrolls (and clicks the "show more" button when scrolling
    stalls) to load more reviews, and writes one CSV row per review via the
    module-level ``writer`` in the column order given by *headers*.

    Parameters
    ----------
    company : str
        Play Store application id (the ``id=`` query parameter).
    headers : list of str
        Record keys, in output order; each must be one of
        ``author``/``date``/``review``/``rating``/``vote_count``/``reply``.
    max_results : int or None
        Stop after saving this many reviews (None = unlimited).
    headless, phantom, gchrome : bool
        Browser selection: Chrome if *gchrome*, else PhantomJS if *phantom*,
        else Firefox; *headless* applies to Chrome/Firefox.
    time_sleep : float
        Seconds to wait after each scroll for new reviews to load.

    Returns
    -------
    None.  Side effect: rows written to ``writer``; browser window closed.
    """
    # --- browser setup -------------------------------------------------
    if gchrome:
        chrome_options = webdriver.ChromeOptions()
        if headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome('chromedriver', options=chrome_options)
    elif phantom:
        driver = webdriver.PhantomJS()
        driver.set_window_size(1120, 550)
    else:
        options = webdriver.FirefoxOptions()
        if headless:
            options.add_argument('-headless')
        driver = webdriver.Firefox(firefox_options=options)
    url = ("https://play.google.com/store/apps/details?id=" + company +
           "&showAllReviews=true")
    driver.get(url)
    selector = "h3 + div > div"  # one element per review card
    last_elems = 0
    saved = 0
    elems = driver.find_elements_by_css_selector(selector)
    num_elems = len(elems)
    while num_elems > last_elems:
        print(num_elems)
        # Only process the elements added since the previous pass.
        num_it = -1 * (num_elems - last_elems)
        last_elems = num_elems
        for elem in elems[num_it:]:
            record = {}
            spans = elem.find_elements_by_css_selector('span')
            # The comment text sits at span offset 12 or 13 depending on
            # the card layout — presumably varies with the rating widget;
            # TODO confirm against the current Play Store DOM.
            try:
                comment = spans[12].text
            except:
                comment = ''
            comment_index = 12 if comment else 13
            if not (spans is not None and len(spans) > comment_index):
                comment_obj = None
            else:
                comment_obj = spans[comment_index]
            try:
                comment = spans[comment_index].text
            except:
                comment = ''
            # BUG FIX: the original dereferenced comment_obj unconditionally,
            # raising an uncaught AttributeError (and aborting the whole
            # scrape) whenever the card had too few spans.
            if comment_obj is not None:
                # Expand a truncated review ("Full review" button).
                buttons = comment_obj.find_elements_by_css_selector(
                    'div > button')
                if len(buttons) > 0:
                    click_element(driver, buttons[0])
                    time.sleep(0.1)
                    spans = elem.find_elements_by_css_selector('span')
                    comment_obj = spans[comment_index + 1]
            record["author"] = spans[0].text
            record["date"] = spans[2].text
            record["review"] = comment
            try:
                stars = elem.find_element_by_css_selector(
                    'div[aria-label][role="img"]')
                record["rating"] = stars.get_attribute("aria-label")
            except:
                record["rating"] = ''
            try:
                record["vote_count"] = elem.find_element_by_css_selector(
                    'div[aria-label="Number of times this review was rated helpful"]'
                ).text
            except:
                record["vote_count"] = ''
            if comment_obj is not None:
                # The developer reply, when present, is the third sibling of
                # the comment node's grandparent.
                siblings = comment_obj.find_elements_by_xpath('../../*')
                record['reply'] = siblings[2].text if len(siblings) > 2 else ''
            else:
                record['reply'] = ''
            row = []
            for header in headers:
                row.append(record[header])
            writer.writerow([unicode(s).encode("utf-8") for s in row])
            saved += 1
            if max_results is not None and saved >= max_results:
                break
        # BUG FIX: the original only broke the inner for-loop, so the outer
        # scroll loop kept loading and saving reviews past max_results.
        if max_results is not None and saved >= max_results:
            break
        # --- load more reviews ----------------------------------------
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(time_sleep)
        elems = driver.find_elements_by_css_selector(selector)
        num_elems = len(elems)
        if num_elems == last_elems:
            # First stall: scroll once more and re-check.
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(time_sleep)
            elems = driver.find_elements_by_css_selector(selector)
            num_elems = len(elems)
            if num_elems == last_elems:
                # Second stall: click the "show more" button if present.
                buttons = driver.find_elements_by_css_selector(
                    'h3 + div + div > div[role="button"]')
                if len(buttons) > 0:
                    click_element(driver, buttons[0])
                    time.sleep(1)
                    elems = driver.find_elements_by_css_selector(selector)
                    num_elems = len(elems)
    driver.close()