コード例 #1
0
def startBrowser(headlessness=True):
    """Launch a Firefox WebDriver session and return it.

    Args:
        headlessness: When truthy (the default) Firefox is started with the
            ``-headless`` command-line flag, so no browser window is shown.

    Returns:
        The ``webdriver.Firefox`` instance that was created.
    """
    fireFoxOptions = webdriver.FirefoxOptions()
    if headlessness:
        # Pass the flag as an argument rather than assigning
        # ``Options.headless``: that setter is deprecated in Selenium 4 and
        # absent from later releases, while the argument form works on both
        # Selenium 3 and Selenium 4.
        fireFoxOptions.add_argument('-headless')
    browser = webdriver.Firefox(options=fireFoxOptions)
    return browser
コード例 #2
0
from selenium import webdriver
import time, re
# Log in to Qzone with headless Firefox using credentials read from stdin,
# then collect the session cookies into a plain name -> value dict.
print('输入QQ账号:', end='')
username = str(input())  # QQ account
print('输入登录密码:', end='')
password = str(input())  # QQ password
print('代码执行中,请稍后')
opt = webdriver.FirefoxOptions()  # use Firefox
opt.add_argument('--headless')  # start Firefox in the background (headless)
browser = webdriver.Firefox(
    options=opt,
    executable_path="C:\爬虫驱动\geckodriver.exe")  # load the driver and create the headless Firefox instance
# browser = webdriver.Chrome(executable_path="C:\爬虫驱动\chromedriver.exe")  # Google Chrome alternative
browser.implicitly_wait(2)
browser.get('https://qzone.qq.com/')
# The element lookups below are performed inside the 'login_frame' frame.
browser.switch_to.frame('login_frame')
# presumably "#switcher_plogin" switches the form to account/password login - confirm against the page
browser.find_element_by_css_selector("#switcher_plogin").click()
browser.find_element_by_css_selector("#u").send_keys(username)
browser.find_element_by_css_selector("#p").send_keys(password)
browser.find_element_by_css_selector("#login_button").click()
cookies = {}

# Fixed pause after clicking the login button before reading cookies.
time.sleep(3)

# NOTE(review): the result of this call is discarded; the cookies are
# fetched again on the next two lines.
browser.get_cookies()
print(len(browser.get_cookies()))
# Flatten the list of cookie dicts into {name: value}.
for i in browser.get_cookies():
    cookies[i.get('name')] = i.get('value')
print(cookies)
browser.switch_to.frame(None)
qzonetoken = re.findall(
コード例 #3
0
def get_local_driver(
        browser_name, headless, servername,
        proxy_string, proxy_auth, proxy_user, proxy_pass, user_agent,
        disable_csp, enable_sync, user_data_dir,
        extension_zip, extension_dir,
        mobile_emulator, device_width, device_height, device_pixel_ratio):
    '''
    Spins up a new web browser and returns the driver.
    Can also be used to spin up additional browsers for the same test.

    browser_name is compared against the constants.Browser values
    (FIREFOX, INTERNET_EXPLORER, EDGE, SAFARI, OPERA, PHANTOM_JS,
    GOOGLE_CHROME) to pick the driver class.

    headless adds "-headless" to the Firefox options and is forwarded to
    _set_chrome_options() for Chrome and Chromium-based Edge. The proxy,
    user-agent, CSP, sync, user-data-dir, extension and mobile-emulation
    arguments are only forwarded to _create_firefox_profile() (Firefox:
    proxy_string, user_agent, disable_csp) or _set_chrome_options()
    (Chrome / Edge: all of them); the other browsers ignore them.

    Raises a plain Exception when browser_name matches no known browser,
    when IE is requested and IS_WINDOWS is false, when Safari is requested
    and the joined sys.argv equals "-c", and when a headless Firefox or
    Chrome launch fails.
    '''
    # The downloads folder is looked up and reset on every call, before the
    # browser is chosen.
    downloads_path = download_helper.get_downloads_folder()
    download_helper.reset_downloads_folder()

    if browser_name == constants.Browser.FIREFOX:
        try:
            try:
                # Use Geckodriver for Firefox if it's on the PATH
                profile = _create_firefox_profile(
                    downloads_path, proxy_string, user_agent, disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_capabilities['marionette'] = True
                options = webdriver.FirefoxOptions()
                if headless:
                    options.add_argument('-headless')
                if LOCAL_GECKODRIVER and os.path.exists(LOCAL_GECKODRIVER):
                    make_driver_executable_if_not(LOCAL_GECKODRIVER)
                elif not is_geckodriver_on_path():
                    if not "".join(sys.argv) == "-c":  # Skip if multithreaded
                        from seleniumbase.console_scripts import sb_install
                        # NOTE(review): this binds the same list object as
                        # sys.argv rather than copying it, so an in-place
                        # change made by sb_install would not be undone by
                        # the restore below - confirm.
                        sys_args = sys.argv  # Save a copy of current sys args
                        print("\nWarning: geckodriver not found."
                              " Installing now:")
                        sb_install.main(override="geckodriver")
                        sys.argv = sys_args  # Put back the original sys args
                firefox_driver = webdriver.Firefox(
                    firefox_profile=profile,
                    capabilities=firefox_capabilities,
                    options=options)
            except WebDriverException:
                # Don't use Geckodriver: Only works for old versions of Firefox
                # This retry passes no options object, so the "-headless"
                # argument added above is not applied on this path.
                profile = _create_firefox_profile(
                    downloads_path, proxy_string, user_agent, disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_capabilities['marionette'] = False
                firefox_driver = webdriver.Firefox(
                    firefox_profile=profile, capabilities=firefox_capabilities)
            return firefox_driver
        except Exception as e:
            # Headless runs fail hard (the error is re-wrapped in a plain
            # Exception); otherwise fall back to a default Firefox with no
            # profile, capabilities or options.
            if headless:
                raise Exception(e)
            return webdriver.Firefox()
    elif browser_name == constants.Browser.INTERNET_EXPLORER:
        if not IS_WINDOWS:
            raise Exception(
                "IE Browser is for Windows-based operating systems only!")
        from selenium.webdriver.ie.options import Options
        ie_options = Options()
        ie_options.ignore_protected_mode_settings = False
        ie_options.ignore_zoom_level = True
        ie_options.require_window_focus = False
        ie_options.native_events = True
        ie_options.full_page_screenshot = True
        ie_options.persistent_hover = True
        ie_capabilities = ie_options.to_capabilities()
        if LOCAL_IEDRIVER and os.path.exists(LOCAL_IEDRIVER):
            make_driver_executable_if_not(LOCAL_IEDRIVER)
        return webdriver.Ie(capabilities=ie_capabilities)
    elif browser_name == constants.Browser.EDGE:
        if LOCAL_EDGEDRIVER and os.path.exists(LOCAL_EDGEDRIVER):
            make_driver_executable_if_not(LOCAL_EDGEDRIVER)
            # The new Microsoft Edge browser is based on Chromium
            chrome_options = _set_chrome_options(
                downloads_path, headless,
                proxy_string, proxy_auth, proxy_user, proxy_pass,
                user_agent, disable_csp, enable_sync, user_data_dir,
                extension_zip, extension_dir, servername, mobile_emulator,
                device_width, device_height, device_pixel_ratio)
            return webdriver.Chrome(executable_path=LOCAL_EDGEDRIVER,
                                    options=chrome_options)
        else:
            # No local Edge driver file: start webdriver.Edge() with its
            # defaults; none of this function's settings are applied here.
            return webdriver.Edge()
    elif browser_name == constants.Browser.SAFARI:
        if "".join(sys.argv) == "-c":  # Skip if multithreaded
            raise Exception("Can't run Safari tests in multi-threaded mode!")
        return webdriver.Safari()
    elif browser_name == constants.Browser.OPERA:
        if LOCAL_OPERADRIVER and os.path.exists(LOCAL_OPERADRIVER):
            make_driver_executable_if_not(LOCAL_OPERADRIVER)
        return webdriver.Opera()
    elif browser_name == constants.Browser.PHANTOM_JS:
        with warnings.catch_warnings():
            # Ignore "PhantomJS has been deprecated" UserWarning
            warnings.simplefilter("ignore", category=UserWarning)
            return webdriver.PhantomJS()
    elif browser_name == constants.Browser.GOOGLE_CHROME:
        try:
            chrome_options = _set_chrome_options(
                downloads_path, headless,
                proxy_string, proxy_auth, proxy_user, proxy_pass,
                user_agent, disable_csp, enable_sync, user_data_dir,
                extension_zip, extension_dir, servername, mobile_emulator,
                device_width, device_height, device_pixel_ratio)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
            elif not is_chromedriver_on_path():
                if not "".join(sys.argv) == "-c":  # Skip if multithreaded
                    from seleniumbase.console_scripts import sb_install
                    # NOTE(review): same aliasing caveat as in the Firefox
                    # branch - this is a reference, not a copy.
                    sys_args = sys.argv  # Save a copy of current sys args
                    print("\nWarning: chromedriver not found. Installing now:")
                    sb_install.main(override="chromedriver")
                    sys.argv = sys_args  # Put back the original sys args
            return webdriver.Chrome(options=chrome_options)
        except Exception as e:
            # Same policy as Firefox: headless failures are re-raised,
            # otherwise retry with a default Chrome and no options.
            if headless:
                raise Exception(e)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
            return webdriver.Chrome()
    else:
        raise Exception(
            "%s is not a valid browser option for this system!" % browser_name)
コード例 #4
0
def main():
    # Checks whether an account already exists for `testuser` on
    # lieferando.de by driving the sign-up form in headless Firefox and
    # inspecting the page after submitting. The result is printed.
    # NOTE(review): this function uses Python 2 syntax (print statement,
    # "except X, e").
    # NOTE(review): in the lines visible here the Lieferando driver is never
    # closed or quit - confirm whether cleanup happens elsewhere.
    
    """
    Test for Yahoo
    """
    # The Yahoo check below is disabled: it is wrapped in a bare string
    # literal and therefore never executed.
    """
    url = "https://login.yahoo.com"
    testuser = "******"
    
    options = webdriver.FirefoxOptions()
    options.set_headless(True)
    
    driver = webdriver.Firefox(options=options)
    
    
        
    driver.get(url)
    
    try:
        #element = WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')
        with wait_for_page_load(driver):
            driver.find_element_by_id("login-username").send_keys(testuser)
            driver.find_element_by_id("login-signin").click()
        
    except Exception, e:
        sys.stderr.write("Error: " + repr(e) + "\n") 
    
    try:
        errortest = driver.find_element_by_id("username-error")
        if errortest != None:
            print "%s doesn't exist" % testuser
    except NoSuchElementException:
        print "%s exists" %testuser
    
    driver.close()
    """
    
    """
    Test for Lieferando
    """
    url = "https://www.lieferando.de"
    testuser = "******"
    surname = "Maik Kunze"
    testpassword = "******"
    
    options = webdriver.FirefoxOptions()
    options.set_headless(True)
    
    driver = webdriver.Firefox(options=options)
    
    driver.get(url)
    
    """
    try a new Lieferando registration - if account already exists, website gives error message
    """
    try:
    
        # Open the account menu, then the registration form.
        driver.find_element_by_xpath("//button[@class='menu button-myaccount userlogin']").click()
        
        # Each form control is waited for (up to 10 seconds) until it is
        # clickable before it is used.
        wait = WebDriverWait(driver, 10)
        createbutton = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@data-click='register']")))
        createbutton.click()
        
        userfield = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='iaccountuser']")))
        userfield.send_keys(testuser)
        
        surnamefield = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='iaccountsurname']")))
        surnamefield.send_keys(surname)
        
        pass1field = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='iaccountpass']")))
        pass1field.send_keys(testpassword)
        
        pass2field = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='iaccountpass2']")))
        pass2field.send_keys(testpassword)
        
        checkagb = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='legal']/label[@class='checkbox-inline']")))
        checkagb.click()
        
        registerbutton = wait.until(EC.element_to_be_clickable((By.XPATH, "//input[@id='registerbutton']")))
        registerbutton.click()
                
        # worst case but no other possibility because it comes only html back 
        # for div "userpanel-wrapper", so 3 seconds should be enough to get
        # a AJAX-response
        
        time.sleep(3)
        #driver.implicitly_wait(3)
        
        
        """
        interpret the result: if there exists already an account, then exist
        the following:
            <div id='userpanel-wrapper'>
               <div id='notification'>
               ...
               </div>
               <form id='iaccountsignupform'>
            </form>
            </div>
        
        if the registration was successful, the div 'userpanel-wrapper' doesn't
        contain the div 'notification' and the form 'iaccountsignupform',
        so testing for existence div and form mentioned shows the email-address
        is already registered or not
        """
        try:
            # Both lookups are used only for their side effect: if either
            # element is missing, NoSuchElementException selects the
            # "no account" branch. The returned elements are not used.
            notification = driver.find_element_by_xpath("//div[@id='notification']")
            signupform = driver.find_element_by_xpath("//form[@id='iaccountsignupform']")
            print "Lieferando %s exists" % testuser
            driver.get_screenshot_as_file('/tmp/test.png')
        except NoSuchElementException, e:
            print "Lieferando no account %s " % testuser
        
        
    # Only NoSuchElementException is handled here; any other exception
    # raised in the block above propagates to the caller.
    except NoSuchElementException, e:
        print "Error: %s" % e
コード例 #5
0
def firefox_driver():
    """Create and return a headless Firefox WebDriver.

    Returns:
        A new ``webdriver.Firefox`` instance started with ``-headless``.
    """
    options = webdriver.FirefoxOptions()
    # Firefox recognises the headless switch only with a leading dash
    # ("-headless" / "--headless"). The previous bare "headless" was passed
    # to the browser as an ordinary positional argument, so the window was
    # still shown.
    options.add_argument('-headless')

    return webdriver.Firefox(options=options)
コード例 #6
0
 def set_firefox_options(self):
     """Return a fresh ``FirefoxOptions`` carrying the ``--headless`` flag."""
     headless_opts = webdriver.FirefoxOptions()
     headless_opts.add_argument("--headless")
     return headless_opts
コード例 #7
0
ファイル: __init__.py プロジェクト: igor112/Python-Beaver
 def init_firefox_driver(self, headless):
     """Start Firefox and store the driver on ``self.driver``.

     Args:
         headless: When truthy, Firefox is launched with ``--headless``.
     """
     options = webdriver.FirefoxOptions()
     if headless:
         options.add_argument('--headless')
     # ``firefox_options=`` is the deprecated spelling of this keyword and
     # is no longer accepted by Selenium 4; ``options=`` is supported by
     # Selenium 3.8+ and 4.x.
     self.driver = webdriver.Firefox(options=options)
コード例 #8
0
# =========================================================================

# Fetch the currency codes
# =========================================================================

# Insert a new sheet at the end of the workbook
wsCodes = wbFX.create_sheet("Codes")
# Header row at the top
# append() starts at the left-most cell of the row following the last row
# that was modified (writing data, setting cell formats, setting row height
# or column width, etc.) and writes the list items into that row's cells in
# order.
# Every call to append() writes one new row according to the rule above.
wsCodes.append(["币种", "代码"])

# Headless browser mode (i.e. the browser window is not shown)
# NOTE: despite its name, `profile` holds a FirefoxOptions object.
profile = webdriver.FirefoxOptions()
profile.add_argument("-headless")
browser = webdriver.Firefox(options=profile)

# Open the page
# Data source: webmasterhome.cn ("站长之家").
browser.get("http://www.webmasterhome.cn/huilv/huobidaima.asp")
# Implicit wait for elements to load:
# if an element has not loaded within the given time, stop waiting and let
# the code continue;
# if it loads within the given time, the code continues once it has loaded.
browser.implicitly_wait(5)
# After the page has been opened and its elements have loaded, elements can
# be located.
# Elements holding all of the currency information
elemCurrenciesList = browser.find_elements_by_tag_name("tr")
print("")
print("获取币种代码...")
コード例 #9
0
class ObjectPage(object):
    """Logging wrapper around a Selenium WebDriver session.

    The browser type, driver paths, Firefox log path and screenshot
    location are read through ``ReadConfig``. Chrome and Firefox option
    objects are built once at class-definition time and shared by all
    instances. Every helper logs what it did through ``log1``; several of
    them also save a screenshot (see ``getImage``) when they fail.
    """

    browser = ReadConfig().getValue(section='browserType', name='browserName')

    chrome_driver_path = ReadConfig().getValue(section='located',
                                               name='chromedriverpath')
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('disable-infobars')
    chrome_options.add_argument("headless")
    chrome_options.add_argument(
        'profile.managed_default_content_settings.images')
    chrome_options.add_argument('lang=zh_CN.UTF-8')
    chrome_options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"'
    )

    firefox_driver_path = ReadConfig().getValue(section='located',
                                                name='firefoxdriverpath')
    firefox_log_path = ReadConfig().getValue(section='located',
                                             name='firefox_log')
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('--log error')
    firefox_options.add_argument('--headless')
    firefox_options.add_argument('--disable-gpu')  # disable GPU acceleration

    # Locator shorthand -> Selenium ``By`` strategy, shared by findElement()
    # and findElements().
    _BY_MAP = {
        'id': By.ID,
        'name': By.NAME,
        'class': By.CLASS_NAME,
        'tag': By.TAG_NAME,
        'link': By.LINK_TEXT,
        'plink': By.PARTIAL_LINK_TEXT,
        'css': By.CSS_SELECTOR,
        'xpath': By.XPATH,
    }

    def __init__(self):
        """Start the configured browser and keep it as ``self.driver``.

        Raises:
            Exception: whatever ``getBrowsers`` raised. The failure is
                logged first and then re-raised unchanged.
        """
        log1.info("-------------------- test start --------------------")
        try:
            # The browser launch is the call that can actually fail, so it
            # has to be inside the try block for the failure to be logged.
            self.driver = self.getBrowsers()
        except Exception:
            log1.error("Load Web Driver Fail", exc_info=1)
            raise
        log1.info("Load Web Driver Success")

    def getBrowsers(self):
        """Create the WebDriver selected by the ``browser`` setting.

        Returns:
            The new driver, which is also stored on ``self.driver``.

        Raises:
            ValueError: if ``browser`` is neither "Chrome" nor "Firefox".
        """
        # ``options=`` replaces the deprecated ``chrome_options=`` /
        # ``firefox_options=`` keywords (the latter is gone in Selenium 4).
        if self.browser == "Chrome":
            self.driver = webdriver.Chrome(
                executable_path=self.chrome_driver_path,
                options=self.chrome_options)
        elif self.browser == "Firefox":
            self.driver = webdriver.Firefox(
                executable_path=self.firefox_driver_path,
                options=self.firefox_options,
                log_path=self.firefox_log_path)
        else:
            # Previously this fell through to ``return self.driver`` and
            # failed with an unrelated AttributeError.
            raise ValueError(
                "Unsupported browserName in config: %r" % (self.browser,))
        return self.driver

    def getUrl(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)
        log1.info("Set Url is: " + url)

    def hideWait(self, times):
        """Set the driver's implicit wait to ``times`` seconds."""
        self.driver.implicitly_wait(times)
        log1.info("Set Implicitly Wait: " + str(times))

    def maximizeWindow(self):
        """Maximize the browser window."""
        self.driver.maximize_window()
        log1.info("Set Browser Max")

    def clearCookies(self):
        """Delete all cookies of the current session."""
        self.driver.delete_all_cookies()
        log1.info("Clear All Cookies")

    def refreshBrowser(self):
        """Reload the current page."""
        self.driver.refresh()
        log1.info("Browser Refresh")

    def getCurrentUrl(self):
        """Return the URL of the current page."""
        # ``current_url`` is a property; calling it raised TypeError.
        url = self.driver.current_url
        log1.info("Get Browser Url, The Url is: " + url)
        return url

    @staticmethod
    def isDisplayed(element):
        """Return whether ``element`` is displayed."""
        is_display = element.is_displayed()
        # is_displayed() returns a bool; it must be converted before being
        # concatenated into the log message.
        log1.info("Element displayed is: " + str(is_display))
        return is_display

    @staticmethod
    def sleepWait(times):
        """Sleep for ``times`` seconds."""
        time.sleep(times)
        log1.info("Set Sleep Time is: " + str(times))

    @staticmethod
    def isSelect(element):
        """Return whether ``element`` is selected."""
        log1.info("Element is Select")
        return element.is_selected()

    def _locate(self, by, value, condition):
        """Wait up to 10 seconds for ``condition`` on the given locator.

        Args:
            by: One of the keys of ``_BY_MAP``.
            value: Locator value for the chosen strategy.
            condition: An expected-condition factory taking a
                ``(By, value)`` tuple.

        Returns:
            What the condition produced, or ``None`` when ``by`` is unknown
            or nothing was found in time. In both failure cases an error is
            logged and a screenshot is attempted.
        """
        # Local import: the name is not otherwise used by this class.
        from selenium.common.exceptions import TimeoutException

        if by not in self._BY_MAP:
            log1.error(by + " Variable Error")
            self.getImage(by + " Variable Error")
            return None
        try:
            found = WebDriverWait(self.driver, 10).until(
                condition((self._BY_MAP[by], value)))
        except (NoSuchElementException, TimeoutException):
            # WebDriverWait.until() reports an expired wait as
            # TimeoutException, so it has to be caught here as well.
            log1.error("Not Found Element Or Timeout", exc_info=1)
            self.getImage("Not Found Element Or Timeout")
            return None
        log1.info(by + " Query Element: " + value)
        return found

    def findElement(self, by, value):
        """Return the first element matching the locator, or ``None``.

        ``by`` is one of 'id', 'name', 'class', 'tag', 'link', 'plink',
        'css' or 'xpath'.
        """
        return self._locate(by, value, EC.presence_of_element_located)

    def findElements(self, by, value):
        """Return all elements matching the locator, or ``None``.

        ``by`` accepts the same keys as ``findElement``.
        """
        return self._locate(by, value, EC.presence_of_all_elements_located)

    # NOTE(review): this is declared as a static method yet takes ``self``,
    # so ``page.sendKeys(element, text)`` fails with a missing-argument
    # TypeError; callers have to pass the page explicitly. The decorator is
    # kept so that existing explicit-``self`` calls keep working.
    @staticmethod
    def sendKeys(self, element, text):
        """Clear ``element`` and type ``text`` into it.

        A failure while typing is logged and a screenshot is attempted; it
        is not re-raised.
        """
        element.clear()
        log1.info("Element Clear Text")
        try:
            element.send_keys(text)
            log1.info("Element Input Text: " + text)
        except Exception:
            log1.error("Not Found Element Or Input Error", exc_info=1)
            self.getImage("Not Found Element Or Input Error")

    def click(self, element):
        """Click ``element``, retrying once after 3 seconds if it is shown."""
        try:
            element.click()
            log1.info("Element Click")
        except Exception:
            if self.isDisplayed(element):
                self.sleepWait(3)
                element.click()
                log1.info("Element Click")
            else:
                log1.error('Not Found Element', exc_info=1)

    def getTitle(self):
        """Return the title of the current page."""
        log1.info("Get Title")
        return self.driver.title

    def actionsKeyDown(self):
        """Press and hold the CONTROL key."""
        ActionChains(self.driver).key_down(Keys.CONTROL).perform()

    def actionsKeyUp(self):
        """Release the CONTROL key."""
        ActionChains(self.driver).key_up(Keys.CONTROL).perform()

    @staticmethod
    def select(type, element, value):
        """Select an option of a ``<select>`` element.

        Args:
            type: "index", "value" or "text"; decides how ``value`` is
                interpreted.
            element: The select element.
            value: Index, value attribute or visible text of the option.

        Failures are logged, not raised.
        """
        try:
            if type == "index":
                Select(element).select_by_index(value)
                log1.info("Select Element Index")
            elif type == "value":
                Select(element).select_by_value(value)
                log1.info("Select Element Value")
            elif type == "text":
                Select(element).select_by_visible_text(value)
                log1.info("Select Element text")
            else:
                # No exception is active here, so no exc_info is attached.
                log1.error('please input type')
        except Exception:
            log1.error('Not Found Element', exc_info=1)

    @staticmethod
    def deselect(type, element, value=""):
        """Deselect one option, or all options, of a ``<select>`` element.

        Args:
            type: "index", "value" or "text" (each needs a non-empty
                ``value``), or "all" (needs ``value`` left empty).
            element: The select element.
            value: Index, value attribute or visible text of the option.

        Failures are logged, not raised.
        """
        try:
            if type == "index" and value != "":
                Select(element).deselect_by_index(value)
                log1.info("Deselect Element Index")
            elif type == "value" and value != "":
                Select(element).deselect_by_value(value)
                log1.info("Deselect Element Value")
            elif type == "text" and value != "":
                Select(element).deselect_by_visible_text(value)
                log1.info("Deselect Element Text")
            elif type == "all" and value == "":
                Select(element).deselect_all()
                log1.info("Deselect All Element")
            else:
                log1.error('please input type')
        except Exception:
            log1.error("Not Found Select")

    @staticmethod
    def getAllSelect(element):
        """Return the selected options of ``element``, or ``None`` on error."""
        try:
            log1.info("Get All Select")
            return Select(element).all_selected_options
        except Exception:
            log1.error("Not Found All Select")

    @staticmethod
    def getAttribute(element, attribute):
        """Return the value of ``attribute`` on ``element``."""
        log1.info("Get Element Attribute")
        return element.get_attribute(attribute)

    @staticmethod
    def getText(element):
        """Return the text of ``element``."""
        log1.info("Get Element Text")
        return element.text

    def getImage(self, imageName):
        """Save a screenshot named ``<image setting><imageName>.png``.

        Failures are logged, not raised.
        """
        img = ReadConfig().getValue(section='located', name='image')
        try:
            self.driver.get_screenshot_as_file(img + imageName + ".png")
            log1.info("Screenshot Image")
        except Exception:
            log1.error("Screenshot Image Fail", exc_info=1)

    def textAlert(self):
        """Return the text of the open alert, then accept it."""
        t = str(self.driver.switch_to.alert.text)
        log1.info("Get Alert Text: " + t)
        self.acceptAlert()
        return t

    def sendKeysAlert(self, text):
        """Type ``text`` into the open alert, then accept it."""
        self.driver.switch_to.alert.send_keys(text)
        log1.info("Input Alert Text: " + text)
        self.acceptAlert()

    def acceptAlert(self):
        """Accept the open alert."""
        self.driver.switch_to.alert.accept()
        log1.info("Alert Accept")

    def dismissAlert(self):
        """Dismiss the open alert."""
        self.driver.switch_to.alert.dismiss()
        log1.info("Alert Dismiss")

    def quitBrowser(self):
        """Wait 3 seconds, then call ``quit()`` on the driver."""
        self.sleepWait(3)
        self.driver.quit()
        log1.info("Quit Browser")
        log1.info("-------------------- test end --------------------")

    def closeBrowser(self):
        """Wait 3 seconds, then call ``close()`` on the driver."""
        self.sleepWait(3)
        self.driver.close()
        log1.info("Close Browser")
        log1.info("-------------------- test end --------------------")
コード例 #10
0
def ready(proxy_ip_port=None):
    '''
    Create a configured Selenium driver.

    The browser, the driver binary and headless mode are taken from the
    module-level settings ``driver_type`` ('firefox', anything else means
    Chrome), ``system`` ('windows' or 'linux'; other values log a warning
    and use the Windows paths) and ``mode`` ('debug' shows the browser
    window, any other value runs headless).

    :param proxy_ip_port: optional mapping with the keys ``'proxy_type'``
        and ``'ip_with_port'`` (``'host:port'``). For Firefox, 'http'
        configures an HTTP proxy and any other type a SOCKS proxy; for
        Chrome both values are passed through ``--proxy-server``.
        ``None`` (or an empty mapping) means no proxy.
    :return: the started driver, with a 10-second implicit wait and a
        10-second page-load timeout.
    '''
    is_firefox = driver_type == 'firefox'
    headless = mode != 'debug'

    if system == 'linux':
        executable_path = ('../temp_file/geckodriver' if is_firefox
                           else '../temp_file/chromedriver')
    else:
        if system != 'windows':
            logger.warning('没有这个系统:%s,暂时默认为windows' % system)
        executable_path = ('D:/geckodriver/geckodriver.exe' if is_firefox
                           else 'D:/geckodriver/chromedriver.exe')

    if is_firefox:
        firefox_options = None
        if headless:
            firefox_options = webdriver.FirefoxOptions()
            # Argument form instead of the deprecated set_headless().
            firefox_options.add_argument('-headless')
        firefox_kwargs = {
            'executable_path': executable_path,
            'options': firefox_options,
        }
        if proxy_ip_port:
            address_parts = proxy_ip_port['ip_with_port'].split(':')
            proxy_host = address_parts[0]
            proxy_port = int(address_parts[1])
            profile = webdriver.FirefoxProfile()
            # 1 selects manual proxy configuration.
            profile.set_preference("network.proxy.type", 1)
            if proxy_ip_port['proxy_type'] == 'http':
                profile.set_preference("network.proxy.http", proxy_host)
                profile.set_preference("network.proxy.http_port", proxy_port)
            else:
                profile.set_preference('network.proxy.socks', proxy_host)
                profile.set_preference('network.proxy.socks_port', proxy_port)
            profile.update_preferences()
            firefox_kwargs['firefox_profile'] = profile
        driver = webdriver.Firefox(**firefox_kwargs)
    else:  # anything that is not Firefox is treated as Chrome
        chrome_options = webdriver.ChromeOptions()
        if headless:
            chrome_options.add_argument('--headless')
        if proxy_ip_port:
            chrome_options.add_argument(
                '--proxy-server=%s://%s' %
                (proxy_ip_port['proxy_type'], proxy_ip_port['ip_with_port']))
        # Use the path computed above; the Windows chromedriver path used to
        # be hard-coded here, which ignored the Linux setting.
        driver = webdriver.Chrome(
            executable_path=executable_path,
            options=chrome_options)
    # Time to wait when looking up elements (including asynchronously loaded
    # ones, which may otherwise not be found).
    driver.implicitly_wait(10)
    driver.set_page_load_timeout(10)  # page-load timeout
    return driver
コード例 #11
0
from selenium import webdriver
from selenium.webdriver.common.touch_actions import TouchActions
import time
import openpyxl
# NOTE(review): this URL is assigned but not used anywhere in the visible script.
wx_rul = "https://testmy.orangebank.com.cn/WeiXin/773A50B48B2286170DE9A2D3FF014EAF92D09B8C815C0E0C51741AAA2FD67D" \
             "1B96C10BD04CA2737C73F42DC5CAE1AC6383D4217E902E8DC6C0A7A92A3767427EF47ADAB1B02F2BA828AACA271C56C0E10D12" \
             "D927DE2DDF5252C27C145BF42C38A810BD7CFDC2DAB28005E17A459EA7FF2363ADB5C01BEB85C7CE809B977013162D54FA5826" \
             "A647966ADDF84F3E99B6195E18E2CE916137A91B1C1820539D9D60354C504A15D997C3386D19641DD832DB95715AFF7EA0E8B0" \
             "D77C9625EF6AE9193D42D869233E79BFC031DE443B6B8E717C8CD9B5159F19DF8544AB7EE3F13FCF_bespeakService_1_1.do"

# Start-up arguments for the browser.
# NOTE(review): "mobileEmulation" is handed to Firefox as a plain command-line
# argument here; confirm it has the intended effect.
firefox_start_options = webdriver.FirefoxOptions()
firefox_start_options.add_argument("mobileEmulation")

# Desired capabilities for the remote session.
desired_caps = {
    'platform': 'WINDOWS',
    'browserName': 'firefox',
    'deviceName': 'iPhone 6',
}

# Open a Remote session using the settings above, then load a page.
remote_firefox = webdriver.Remote(
    'http://192.168.0.122:4444/wd/hub',
    desired_caps,
    options=firefox_start_options,
)
remote_firefox.maximize_window()
remote_firefox.get("https://www.baidu.com")
コード例 #12
0
    def test_Beyonic_file(self):
        """Scrape the Beyonic API docs and diff their text against a known list.

        Steps:
          1. Open https://apidocs.beyonic.com/ in Firefox (wrapped in an
             ``EventFiringWebDriver`` with ``MyListener``) and search for
             "Listing all".
          2. Click each "Listing all ..." link and print the text of the
             element whose ``href`` is the matching anchor.
          3. Click each section link, print the text of the corresponding
             ``<aside>`` and collect it.
          4. Write the collected texts to ``D:/test.txt``, read it back
             together with ``D:/endpoints.txt``, and write every
             whitespace-separated word that appears in exactly one of the two
             files to ``D:/undocumented_endpoints.txt`` (one per line, sorted).

        The browser is always quit and every file is always closed, even when
        a step fails.
        """
        # (link text to click, anchor whose element text is printed)
        listing_links = (
            ("Listing all Payments", "#listing-all-payments"),
            ("Listing all Contacts", "#listing-all-contacts"),
            ("Listing all transactions", "#listing-all-transactions"),
            ("Listing all Events", "#listing-all-events"),
            ("Listing all Collections", "#listing-all-collections"),
            ("Listing all webhooks", "#listing-all-webhooks"),
            ("Listing all networks", "#listing-all-networks"),
            ("Listing all accounts", "#listing-all-accounts"),
            ("Listing all currencies", "#listing-all-currencies"),
            ("Listing all Collection Requests",
             "#listing-all-collection-requests"),
        )
        # (link text to click, 1-based index used in the //aside[n] XPath)
        section_links = (
            ("Collection Requests", 8),
            ("Collections", 9),
            ("Payments", 10),
            ("Currencies", 11),
            ("Networks", 12),
            ("Banks", 13),
            ("Accounts", 14),
            ("Transactions", 15),
            ("Contacts", 16),
            ("Events", 17),
            ("Webhooks", 18),
        )

        # NOTE: the browser is started without any options object, i.e. with
        # a visible window.
        driver_plain = webdriver.Firefox(executable_path="d:\\geckodriver.exe")
        driver = EventFiringWebDriver(driver_plain, MyListener())
        try:
            driver.get("https://apidocs.beyonic.com/")
            driver.maximize_window()
            sleep(12)

            action = ActionChains(driver)
            driver.find_element(By.ID, "input-search").click()
            action.send_keys("Listing all")
            # perform() is deliberately issued twice, as in the original flow.
            action.key_down(Keys.ENTER).key_up(Keys.ENTER).perform()
            action.key_down(Keys.ENTER).key_up(Keys.ENTER).perform()

            for link_text, anchor in listing_links:
                driver.find_element(By.LINK_TEXT, link_text).click()
                label = driver.find_element(
                    By.XPATH, "//*[@href='%s']" % anchor).text
                print("This will return a", label)
                sleep(2)

            section_texts = []
            for link_text, aside_index in section_links:
                driver.find_element(By.LINK_TEXT, link_text).click()
                section_text = driver.find_element(
                    By.XPATH, "//aside[%d]" % aside_index).text
                print(section_text)
                section_texts.append(section_text)
                sleep(2)

            with open("D:/test.txt", "w") as test_output:
                test_output.writelines(
                    text + '\n' for text in section_texts)

            with open("D:/test.txt", "r") as f1, \
                    open("D:/endpoints.txt", "r") as f2:
                file1_words = f1.read().split()
                file2_words = f2.read().split()

            # Words present in exactly one of the two files.
            mismatches = set(file1_words).symmetric_difference(file2_words)

            with open("D:/undocumented_endpoints.txt", "w") as f3:
                # Sorted so the output file is stable between runs.
                for endpoint in sorted(mismatches):
                    f3.write(endpoint + "\n")
        finally:
            driver.quit()
コード例 #13
0
    def __init__(self, *args, **kwargs):
        """Starts a new local session of Firefox.

        Based on the combination and specificity of the various keyword arguments,
        a capabilities dictionary will be constructed that is passed to the remote end.

        The keyword arguments given to this constructor are helpers to more easily allow Firefox WebDriver sessions
        to be customised with different options.  They are mapped on to a capabilities dictionary that is passed on
        to the remote end.

        As some of the options, such as `firefox_profile` and `options.profile` are mutually exclusive,
        precedence is given from how specific the setting is.  `capabilities` is the least specific keyword argument,
        followed by `options`, followed by `firefox_binary` and `firefox_profile`.

        In practice this means that if `firefox_profile` and `options.profile` are both set,
        the selected profile instance will always come from the most specific variable.
        In this case that would be `firefox_profile`.  This will result in `options.profile` to be ignored because it
        is considered a less specific setting than the top-level `firefox_profile` keyword argument.
        Similarly, if you had specified a `capabilities["moz:firefoxOptions"]["profile"]` Base64 string,
        this would rank below `options.profile`.

         - option_args - [List] of arguments to be added to options (options.add_argument(x))
         - log_level - [Str] Selenium and urllib3 log level (CRITICAL / FATAL / ERROR / WARNING / INFO / DEBUG / NOTSET)
         - launch_attempts - [int] Permitted number of attempts to launch the selenium web driver
        -----------
         - firefox_profile - Instance of ``FirefoxProfile`` object or a string.
           If undefined, a fresh profile will be created in a temporary location on the system.
         - firefox_binary - Instance of ``FirefoxBinary`` or full path to the Firefox binary.
           If undefined, the system default Firefox installation will  be used.
         - timeout - Time to wait for Firefox to launch when using the extension connection.
         - capabilities - Dictionary of desired capabilities.
         - proxy - The proxy settings to use when communicating with Firefox via the extension connection.
         - executable_path - Full path to override which geckodriver binary to use for Firefox 47.0.1 and greater,
           which defaults to picking up the binary from the system path.
         - options - Instance of ``options.Options``.
         - service_log_path - Where to log information from the driver.
         - firefox_options - Deprecated argument for options
         - service_args - List of args to pass to the driver service
         - desired_capabilities - alias of capabilities. In future versions of this library,
           this will replace 'capabilities'. This will make the signature consistent with RemoteWebDriver.
         - log_path - Deprecated argument for service_log_path
         - keep_alive - Whether to configure remote_connection.RemoteConnection to use HTTP keep-alive.

        Raises the last exception seen if every launch attempt fails, or
        ``ValueError`` if ``launch_attempts`` permits no attempt at all.
        """
        self.launch_attempts = kwargs.pop('launch_attempts', 2)
        self._platform_release = platform.release()

        # Always strip 'log_level' from kwargs, even when it is falsy, so it is
        # never forwarded to the parent constructor (which does not accept it).
        log_level = kwargs.pop('log_level', None)
        if log_level:
            self.set_selenium_log_level(log_level)

        # Only build a default options object when the caller supplied none.
        options = kwargs.pop('options', None)
        if options is None:
            options = webdriver.FirefoxOptions()
        self.options = options

        for arg in kwargs.pop('option_args', []):
            self.options.add_argument(arg)

        # NOTE(review): on Windows XP the options object is passed under the
        # 'desired_capabilities' keyword instead of 'options' — kept as-is,
        # confirm this is what the parent constructor expects there.
        if self._platform_release == 'XP':
            options_keyword = 'desired_capabilities'
        else:
            options_keyword = 'options'

        logger.info('Launching Firefox webdriver')
        last_exception = None
        for _ in range(self.launch_attempts):
            try:
                super().__init__(
                    *args,
                    **{options_keyword: self.options},
                    **kwargs
                )
                break
            except WebDriverException as ex:
                logger.error(ex)
                last_exception = ex
                # A missing or incompatible driver binary: update it before
                # the next attempt.
                if (isinstance(ex, SessionNotCreatedException)
                        or "executable needs to be in PATH" in str(ex)):
                    self.update()
            except Exception as ex:
                logger.error(ex)
                last_exception = ex
        else:
            # Reached only when no attempt succeeded (loop never hit `break`).
            if last_exception is None:
                raise ValueError(
                    'launch_attempts must be at least 1, got %r'
                    % (self.launch_attempts,))
            raise last_exception
        logger.info('Firefox webdriver launched successfully')
コード例 #14
0
def get_local_driver(browser_name, headless, proxy_string, proxy_auth,
                     proxy_user, proxy_pass, user_agent, disable_csp,
                     enable_sync, user_data_dir, extension_zip, extension_dir):
    """Launch a local browser and return its WebDriver instance.

    May be called more than once to open additional browsers for the same
    test.  The downloads folder is looked up and reset on every call.

    ``browser_name`` is compared against the ``constants.Browser`` values to
    pick the browser.  The remaining arguments are forwarded to the Firefox
    profile / Chrome options helpers where the chosen browser uses them.

    Raises ``Exception`` when the browser is Windows-only and the platform is
    not Windows, when a headless Firefox/Chrome launch fails, or when
    ``browser_name`` matches none of the known browsers.
    """
    downloads_path = download_helper.get_downloads_folder()
    download_helper.reset_downloads_folder()

    # ---- Firefox ---------------------------------------------------------
    if browser_name == constants.Browser.FIREFOX:
        try:
            try:
                # Preferred path: Geckodriver (marionette enabled).
                gecko_profile = _create_firefox_profile(
                    downloads_path, proxy_string, user_agent, disable_csp)
                gecko_caps = DesiredCapabilities.FIREFOX.copy()
                gecko_caps['marionette'] = True
                gecko_options = webdriver.FirefoxOptions()
                if headless:
                    gecko_options.add_argument('-headless')
                launch_kwargs = {
                    'firefox_profile': gecko_profile,
                    'capabilities': gecko_caps,
                    'options': gecko_options,
                }
                if LOCAL_GECKODRIVER and os.path.exists(LOCAL_GECKODRIVER):
                    make_driver_executable_if_not(LOCAL_GECKODRIVER)
                    launch_kwargs['executable_path'] = LOCAL_GECKODRIVER
                firefox_driver = webdriver.Firefox(**launch_kwargs)
            except WebDriverException:
                # Fallback without Geckodriver: only works for old versions
                # of Firefox.
                legacy_profile = _create_firefox_profile(
                    downloads_path, proxy_string, user_agent, disable_csp)
                legacy_caps = DesiredCapabilities.FIREFOX.copy()
                legacy_caps['marionette'] = False
                firefox_driver = webdriver.Firefox(
                    firefox_profile=legacy_profile, capabilities=legacy_caps)
            return firefox_driver
        except Exception as e:
            # A bare Firefox is only an acceptable substitute when the caller
            # did not ask for headless mode.
            if not headless:
                return webdriver.Firefox()
            raise Exception(e)

    # ---- Internet Explorer -----------------------------------------------
    if browser_name == constants.Browser.INTERNET_EXPLORER:
        if not IS_WINDOWS:
            raise Exception(
                "IE Browser is for Windows-based operating systems only!")
        from selenium.webdriver.ie.options import Options
        ie_options = Options()
        ie_options.ignore_protected_mode_settings = False
        ie_options.ignore_zoom_level = True
        ie_options.require_window_focus = False
        ie_options.native_events = True
        ie_options.full_page_screenshot = True
        ie_options.persistent_hover = True
        ie_capabilities = ie_options.to_capabilities()
        if not (LOCAL_IEDRIVER and os.path.exists(LOCAL_IEDRIVER)):
            return webdriver.Ie(capabilities=ie_capabilities)
        make_driver_executable_if_not(LOCAL_IEDRIVER)
        return webdriver.Ie(capabilities=ie_capabilities,
                            executable_path=LOCAL_IEDRIVER)

    # ---- Edge ------------------------------------------------------------
    if browser_name == constants.Browser.EDGE:
        if not IS_WINDOWS:
            raise Exception(
                "Edge Browser is for Windows-based operating systems only!")
        edge_capabilities = DesiredCapabilities.EDGE.copy()
        if not (LOCAL_EDGEDRIVER and os.path.exists(LOCAL_EDGEDRIVER)):
            return webdriver.Edge(capabilities=edge_capabilities)
        make_driver_executable_if_not(LOCAL_EDGEDRIVER)
        return webdriver.Edge(capabilities=edge_capabilities,
                              executable_path=LOCAL_EDGEDRIVER)

    # ---- Safari ----------------------------------------------------------
    if browser_name == constants.Browser.SAFARI:
        return webdriver.Safari()

    # ---- Opera -----------------------------------------------------------
    if browser_name == constants.Browser.OPERA:
        if not (LOCAL_OPERADRIVER and os.path.exists(LOCAL_OPERADRIVER)):
            return webdriver.Opera()
        make_driver_executable_if_not(LOCAL_OPERADRIVER)
        return webdriver.Opera(executable_path=LOCAL_OPERADRIVER)

    # ---- PhantomJS -------------------------------------------------------
    if browser_name == constants.Browser.PHANTOM_JS:
        with warnings.catch_warnings():
            # Silence the "PhantomJS has been deprecated" UserWarning.
            warnings.simplefilter("ignore", category=UserWarning)
            return webdriver.PhantomJS()

    # ---- Google Chrome ---------------------------------------------------
    if browser_name == constants.Browser.GOOGLE_CHROME:
        try:
            chrome_options = _set_chrome_options(
                downloads_path, headless, proxy_string, proxy_auth,
                proxy_user, proxy_pass, user_agent, disable_csp,
                enable_sync, user_data_dir, extension_zip, extension_dir)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
                return webdriver.Chrome(executable_path=LOCAL_CHROMEDRIVER,
                                        options=chrome_options)
            return webdriver.Chrome(options=chrome_options)
        except Exception as e:
            if headless:
                raise Exception(e)
            # Non-headless: retry once without any custom options.
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
                return webdriver.Chrome(executable_path=LOCAL_CHROMEDRIVER)
            return webdriver.Chrome()

    raise Exception("%s is not a valid browser option for this system!" %
                    browser_name)
コード例 #15
0
def get_local_driver(browser_name, headless, locale_code, servername,
                     proxy_string, proxy_auth, proxy_user, proxy_pass,
                     user_agent, disable_csp, enable_ws, enable_sync,
                     use_auto_ext, no_sandbox, disable_gpu, incognito,
                     guest_mode, devtools, swiftshader, block_images,
                     user_data_dir, extension_zip, extension_dir,
                     mobile_emulator, device_width, device_height,
                     device_pixel_ratio):
    '''
    Spins up a new web browser and returns the driver.
    Can also be used to spin up additional browsers for the same test.

    The downloads folder is looked up and reset on every call.
    ``browser_name`` is compared against the ``constants.Browser`` values to
    select a branch (Firefox, IE, Edge, Safari, Opera, PhantomJS, Chrome).
    Most of the remaining arguments are forwarded unchanged to
    ``_create_firefox_profile`` (Firefox) or ``_set_chrome_options``
    (Edge / Opera / Chrome); the Edge fallback path also reads several of
    them directly to build its own ``EdgeOptions``.

    Where a local driver binary is missing and the run does not look
    multithreaded (no ``-n`` / ``-n=`` / lone ``-c`` in ``sys.argv``), the
    Firefox, Edge and Chrome branches try to install the driver through
    ``sb_install.main``.

    Raises ``Exception`` when IE is requested off Windows, when Safari is
    requested in multi-threaded mode, when a headless Firefox/Chrome launch
    fails, or when ``browser_name`` matches no known browser.
    '''
    downloads_path = download_helper.get_downloads_folder()
    download_helper.reset_downloads_folder()

    if browser_name == constants.Browser.FIREFOX:
        try:
            try:
                # Use Geckodriver for Firefox if it's on the PATH
                profile = _create_firefox_profile(downloads_path, locale_code,
                                                  proxy_string, user_agent,
                                                  disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_capabilities['marionette'] = True
                options = webdriver.FirefoxOptions()
                if headless:
                    options.add_argument('-headless')
                    # Mirror the headless flag into the capabilities as well,
                    # because one of the launch paths below does not pass the
                    # options object to webdriver.Firefox.
                    firefox_capabilities['moz:firefoxOptions'] = ({
                        'args': ['-headless']
                    })
                if LOCAL_GECKODRIVER and os.path.exists(LOCAL_GECKODRIVER):
                    try:
                        make_driver_executable_if_not(LOCAL_GECKODRIVER)
                    except Exception as e:
                        logging.debug("\nWarning: Could not make geckodriver"
                                      " executable: %s" % e)
                elif not is_geckodriver_on_path():
                    args = " ".join(sys.argv)
                    if not ("-n" in sys.argv or "-n=" in args or args == "-c"):
                        # (Not multithreaded)
                        from seleniumbase.console_scripts import sb_install
                        sys_args = sys.argv  # Save a copy of current sys args
                        print("\nWarning: geckodriver not found!"
                              " Installing now:")
                        try:
                            sb_install.main(override="geckodriver")
                        except Exception as e:
                            print("\nWarning: Could not install geckodriver: "
                                  "%s" % e)
                        sys.argv = sys_args  # Put back the original sys args
                # On Linux, or whenever not headless, launch without the
                # options object; otherwise pass it along as well.
                if "linux" in PLATFORM or not headless:
                    firefox_driver = webdriver.Firefox(
                        firefox_profile=profile,
                        capabilities=firefox_capabilities)
                else:
                    firefox_driver = webdriver.Firefox(
                        firefox_profile=profile,
                        capabilities=firefox_capabilities,
                        options=options)
            except Exception:
                # Anything above failed: retry with a fresh profile and the
                # default Firefox capabilities (no marionette/headless tweaks).
                profile = _create_firefox_profile(downloads_path, locale_code,
                                                  proxy_string, user_agent,
                                                  disable_csp)
                firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
                firefox_driver = webdriver.Firefox(
                    firefox_profile=profile, capabilities=firefox_capabilities)
            return firefox_driver
        except Exception as e:
            # Last resort: a bare Firefox, but only when headless mode was not
            # requested; in headless mode the error is re-raised, wrapped in a
            # plain Exception.
            if headless:
                raise Exception(e)
            return webdriver.Firefox()
    elif browser_name == constants.Browser.INTERNET_EXPLORER:
        if not IS_WINDOWS:
            raise Exception(
                "IE Browser is for Windows-based operating systems only!")
        from selenium.webdriver.ie.options import Options
        ie_options = Options()
        ie_options.ignore_protected_mode_settings = False
        ie_options.ignore_zoom_level = True
        ie_options.require_window_focus = False
        ie_options.native_events = True
        ie_options.full_page_screenshot = True
        ie_options.persistent_hover = True
        ie_capabilities = ie_options.to_capabilities()
        if LOCAL_IEDRIVER and os.path.exists(LOCAL_IEDRIVER):
            try:
                make_driver_executable_if_not(LOCAL_IEDRIVER)
            except Exception as e:
                logging.debug("\nWarning: Could not make iedriver"
                              " executable: %s" % e)
        # NOTE(review): LOCAL_IEDRIVER is not passed as executable_path here;
        # presumably the driver is expected to be discoverable on PATH — confirm.
        return webdriver.Ie(capabilities=ie_capabilities)
    elif browser_name == constants.Browser.EDGE:
        try:
            chrome_options = _set_chrome_options(
                browser_name, downloads_path, headless, locale_code,
                proxy_string, proxy_auth, proxy_user, proxy_pass, user_agent,
                disable_csp, enable_ws, enable_sync, use_auto_ext, no_sandbox,
                disable_gpu, incognito, guest_mode, devtools, swiftshader,
                block_images, user_data_dir, extension_zip, extension_dir,
                servername, mobile_emulator, device_width, device_height,
                device_pixel_ratio)
            if LOCAL_EDGEDRIVER and os.path.exists(LOCAL_EDGEDRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_EDGEDRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make edgedriver"
                                  " executable: %s" % e)
            elif not is_edgedriver_on_path():
                args = " ".join(sys.argv)
                if not ("-n" in sys.argv or "-n=" in args or args == "-c"):
                    # (Not multithreaded)
                    from seleniumbase.console_scripts import sb_install
                    sys_args = sys.argv  # Save a copy of current sys args
                    print("\nWarning: msedgedriver not found. Installing now:")
                    sb_install.main(override="edgedriver")
                    sys.argv = sys_args  # Put back the original sys args
            # For Microsoft Edge (Chromium) version 79 or lower
            return webdriver.Chrome(executable_path=LOCAL_EDGEDRIVER,
                                    options=chrome_options)
        except Exception:
            # The Chrome-driver-based launch above failed (including a failed
            # driver install): build EdgeOptions by hand and launch through
            # msedge.selenium_tools instead.
            # For Microsoft Edge (Chromium) version 80 or higher
            from msedge.selenium_tools import Edge, EdgeOptions
            if LOCAL_EDGEDRIVER and os.path.exists(LOCAL_EDGEDRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_EDGEDRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make edgedriver"
                                  " executable: %s" % e)
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            # Browser preferences: download location/behaviour and disabling
            # of password-manager / safe-browsing prompts.
            prefs = {
                "download.default_directory": downloads_path,
                "local_discovery.notifications_enabled": False,
                "credentials_enable_service": False,
                "download.prompt_for_download": False,
                "download.directory_upgrade": True,
                "safebrowsing.enabled": False,
                "safebrowsing.disable_download_protection": True,
                "profile": {
                    "password_manager_enabled": False,
                    "default_content_setting_values.automatic_downloads": 1
                }
            }
            if locale_code:
                prefs["intl.accept_languages"] = locale_code
            if block_images:
                prefs["profile.managed_default_content_settings.images"] = 2
            edge_options.add_experimental_option("prefs", prefs)
            edge_options.add_experimental_option("w3c", True)
            edge_options.add_experimental_option("useAutomationExtension",
                                                 False)
            edge_options.add_experimental_option(
                "excludeSwitches", ["enable-automation", "enable-logging"])
            if guest_mode:
                edge_options.add_argument("--guest")
            if headless:
                edge_options.add_argument("--headless")
            if mobile_emulator:
                emulator_settings = {}
                device_metrics = {}
                # Use the caller's metrics only when all three are ints;
                # otherwise fall back to the fixed 411x731 @3x defaults.
                if type(device_width) is int and (
                        type(device_height) is int and
                    (type(device_pixel_ratio) is int)):
                    device_metrics["width"] = device_width
                    device_metrics["height"] = device_height
                    device_metrics["pixelRatio"] = device_pixel_ratio
                else:
                    device_metrics["width"] = 411
                    device_metrics["height"] = 731
                    device_metrics["pixelRatio"] = 3
                emulator_settings["deviceMetrics"] = device_metrics
                if user_agent:
                    emulator_settings["userAgent"] = user_agent
                edge_options.add_experimental_option("mobileEmulation",
                                                     emulator_settings)
                # NOTE(review): --enable-sync is added whenever mobile_emulator
                # is set; the enable_sync parameter is not consulted in this
                # fallback path — confirm this is intended.
                edge_options.add_argument("--enable-sync")
            edge_options.add_argument("--disable-infobars")
            edge_options.add_argument("--disable-save-password-bubble")
            edge_options.add_argument("--disable-single-click-autofill")
            edge_options.add_argument(
                "--disable-autofill-keyboard-accessory-view[8]")
            edge_options.add_argument("--disable-translate")
            if not enable_ws:
                edge_options.add_argument("--disable-web-security")
            edge_options.add_argument("--homepage=about:blank")
            edge_options.add_argument("--dns-prefetch-disable")
            edge_options.add_argument("--dom-automation")
            edge_options.add_argument("--disable-hang-monitor")
            edge_options.add_argument("--disable-prompt-on-repost")
            if proxy_string:
                edge_options.add_argument('--proxy-server=%s' % proxy_string)
            edge_options.add_argument("--test-type")
            edge_options.add_argument("--log-level=3")
            edge_options.add_argument("--no-first-run")
            edge_options.add_argument("--ignore-certificate-errors")
            if devtools and not headless:
                edge_options.add_argument("--auto-open-devtools-for-tabs")
            edge_options.add_argument("--allow-file-access-from-files")
            edge_options.add_argument("--allow-insecure-localhost")
            edge_options.add_argument("--allow-running-insecure-content")
            if user_agent:
                edge_options.add_argument("--user-agent=%s" % user_agent)
            # NOTE(review): --no-sandbox is added unconditionally and
            # --disable-gpu depends only on swiftshader; the no_sandbox and
            # disable_gpu parameters are not read in this fallback path.
            edge_options.add_argument("--no-sandbox")
            if swiftshader:
                edge_options.add_argument("--use-gl=swiftshader")
            else:
                edge_options.add_argument("--disable-gpu")
            if "linux" in PLATFORM:
                edge_options.add_argument("--disable-dev-shm-usage")
            capabilities = edge_options.to_capabilities()
            capabilities["platform"] = ''
            return Edge(executable_path=LOCAL_EDGEDRIVER,
                        capabilities=capabilities)
    elif browser_name == constants.Browser.SAFARI:
        arg_join = " ".join(sys.argv)
        if ("-n" in sys.argv) or ("-n=" in arg_join) or (arg_join == "-c"):
            # Skip if multithreaded
            raise Exception("Can't run Safari tests in multi-threaded mode!")
        safari_capabilities = _set_safari_capabilities()
        return webdriver.Safari(desired_capabilities=safari_capabilities)
    elif browser_name == constants.Browser.OPERA:
        try:
            if LOCAL_OPERADRIVER and os.path.exists(LOCAL_OPERADRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_OPERADRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make operadriver"
                                  " executable: %s" % e)
            opera_options = _set_chrome_options(
                browser_name, downloads_path, headless, locale_code,
                proxy_string, proxy_auth, proxy_user, proxy_pass, user_agent,
                disable_csp, enable_ws, enable_sync, use_auto_ext, no_sandbox,
                disable_gpu, incognito, guest_mode, devtools, swiftshader,
                block_images, user_data_dir, extension_zip, extension_dir,
                servername, mobile_emulator, device_width, device_height,
                device_pixel_ratio)
            opera_options.headless = False  # No support for headless Opera
            return webdriver.Opera(options=opera_options)
        except Exception:
            # Any failure above: fall back to Opera with default settings.
            return webdriver.Opera()
    elif browser_name == constants.Browser.PHANTOM_JS:
        with warnings.catch_warnings():
            # Ignore "PhantomJS has been deprecated" UserWarning
            warnings.simplefilter("ignore", category=UserWarning)
            return webdriver.PhantomJS()
    elif browser_name == constants.Browser.GOOGLE_CHROME:
        try:
            chrome_options = _set_chrome_options(
                browser_name, downloads_path, headless, locale_code,
                proxy_string, proxy_auth, proxy_user, proxy_pass, user_agent,
                disable_csp, enable_ws, enable_sync, use_auto_ext, no_sandbox,
                disable_gpu, incognito, guest_mode, devtools, swiftshader,
                block_images, user_data_dir, extension_zip, extension_dir,
                servername, mobile_emulator, device_width, device_height,
                device_pixel_ratio)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make chromedriver"
                                  " executable: %s" % e)
            elif not is_chromedriver_on_path():
                args = " ".join(sys.argv)
                if not ("-n" in sys.argv or "-n=" in args or args == "-c"):
                    # (Not multithreaded)
                    from seleniumbase.console_scripts import sb_install
                    sys_args = sys.argv  # Save a copy of current sys args
                    print("\nWarning: chromedriver not found. Installing now:")
                    sb_install.main(override="chromedriver")
                    sys.argv = sys_args  # Put back the original sys args
            if not headless or "linux" not in PLATFORM:
                return webdriver.Chrome(options=chrome_options)
            else:  # Running headless on Linux
                try:
                    return webdriver.Chrome(options=chrome_options)
                except Exception:
                    # Use the virtual display on Linux during headless errors
                    logging.debug("\nWarning: Chrome failed to launch in"
                                  " headless mode. Attempting to use the"
                                  " SeleniumBase virtual display on Linux...")
                    chrome_options.headless = False
                    return webdriver.Chrome(options=chrome_options)
        except Exception as e:
            # Headless launches re-raise (wrapped in a plain Exception);
            # otherwise retry once with a default, option-less Chrome.
            if headless:
                raise Exception(e)
            if LOCAL_CHROMEDRIVER and os.path.exists(LOCAL_CHROMEDRIVER):
                try:
                    make_driver_executable_if_not(LOCAL_CHROMEDRIVER)
                except Exception as e:
                    logging.debug("\nWarning: Could not make chromedriver"
                                  " executable: %s" % e)
            return webdriver.Chrome()
    else:
        raise Exception("%s is not a valid browser option for this system!" %
                        browser_name)
コード例 #16
0
def get_data_for_stock(stock, verbose=False):
    """Scrape Reuters news articles for ``stock``.

    Searches Reuters for the ticker, checks that the first company hit
    really is ``stock``, scrolls the company's "News" tab until the page
    stops growing, collects every article headline/link and passes each
    link to ``convert_link_to_data``.

    When the module-level ``massive_scrape_mode`` flag is set, a folder
    ``processed/<stock>`` is created up front to mark the ticker as already
    attempted (so an interrupted run can be resumed) and the result is
    written to ``reuters_data/<stock>.csv``; a '.' in the ticker is replaced
    by '_' in both names.

    Args:
        stock (str): Ticker symbol of the stock to look up.
        verbose (bool): Print progress messages while scraping.

    Returns:
        With ``massive_scrape_mode`` off: a DataFrame with the columns
        ``author``, ``publish_date`` and ``body_text``, or ``False`` when
        Reuters did not return a matching company.  ``None`` when
        ``massive_scrape_mode`` is on or when scraping raised an error
        (the error is printed, not propagated).
    """
    # Set the Firefox webdriver to run headless in the background
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.set_headless()
    if massive_scrape_mode:
        marker = stock.replace('.', '_')
        if marker not in os.listdir('processed'):
            # Instantiate a folder whose name is {stock} to mark that this
            # stock has already been processed
            os.mkdir('processed/{}'.format(marker))

    # Initialize the webdriver instance in the background
    driver = webdriver.Firefox(options=fireFoxOptions)

    # First company hit on the search page; its text holds the company name
    # and ticker, and its link leads to Reuters's page on the stock.
    company_xpath = (
        '/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a')
    news_tab_xpath = (
        '/html/body/div[1]/div/div[3]/div/div/nav/div[1]/div/div/ul/li[2]/button')
    article_xpath = (
        '/html/body/div[1]/div/div[4]/div[1]/div/div/div/div[2]/div[{}]/div/a')

    links_data = None  # Stays None when the stock is not found on Reuters

    try:
        # Search Reuters for {stock}
        driver.get('https://www.reuters.com/search/news?blob={}'.format(stock))
        time.sleep(2)  # Wait for the page to load

        # If Reuters has no company hit, this lookup raises and the outer
        # handler below reports the error.
        text = driver.find_element_by_xpath(company_xpath).text

        # Reuters formats the element's text in 2 ways:
        #   {company name} ({ticker}.{some additional text}), e.g. Apple Inc (AAPL.OQ)
        #   {company name} ({ticker}),                        e.g. Alcoa Corp (AA)
        # Cut the text down to just {ticker}:
        #   Apple Inc (AAPL.OQ) --> AAPL
        #   Alcoa Corp (AA)     --> AA
        ticker_end = text.find('.') if '.' in text else text.find(')')
        ticker = text[text.find('(') + 1:ticker_end]

        # {condition} = True means that the queried stock is a match
        condition = ticker.upper() == stock.upper()

        if condition:  # If {stock} has been found in Reuters, continue
            if verbose:
                print('Stock was found on Reuters.')
            # Click the element's link, going to Reuters's page on the stock
            driver.find_element_by_xpath(company_xpath).click()
            time.sleep(0.5)  # Let the stock's Reuters page load

            # Go to the "News" section of the stock's Reuters page
            driver.find_element_by_xpath(news_tab_xpath).click()
            time.sleep(5)

            # Scroll to the bottom of the "News" page until the page height
            # stops changing, i.e. no more content is being loaded.
            SCROLL_PAUSE_TIME = 0.5  # Seconds to wait between two scrolls

            last_height = driver.execute_script(
                "return document.body.scrollHeight")
            it_num = 0
            if verbose:
                print('Scrolling to the bottom of the news page...')
            while True:
                if verbose:
                    if it_num % 10 == 0:
                        print('{} - Scroll: Iteration #{}'.format(
                            stock, it_num))
                    it_num += 1
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")

                # Wait for more content to load
                time.sleep(SCROLL_PAUSE_TIME)

                new_height = driver.execute_script(
                    "return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height
            if verbose:
                print('Scroll completed.')

            i = 1  # Article index number starts at one in the HTML
            datas = []  # [header, link] pairs
            tol = 0  # Number of failed article lookups so far (never reset)

            # The amount of articles on the page is unknown, so loop until
            # the lookups start failing.
            if verbose:
                print('Scraping the site...')
            it_num = 0
            while True:
                try:
                    if verbose:
                        if it_num % 10 == 0:
                            print('{} - Scrape: Iteration #{}'.format(
                                stock, it_num))
                        it_num += 1
                    xpath = article_xpath.format(i)
                    i += 1
                    # Raises once there is no article at this index.
                    anchor = driver.find_element_by_xpath(xpath)
                    datas.append([anchor.text, anchor.get_attribute('href')])
                except Exception:
                    tol += 1
                    time.sleep(30)

                    # Give up on the second failed lookup: by then it is
                    # taken as confirmed that all articles were collected.
                    if tol >= 2:
                        break

            # Compile the list of headers and links into a pandas DataFrame
            datas = pd.DataFrame(datas, columns=['text', 'link'])
            if verbose:
                print('Scraping for further information....')
            # The article pages themselves are processed by
            # convert_link_to_data, one link at a time.
            links_data = Parallel(1, 'threading', verbose=0)(
                delayed(convert_link_to_data)(link)
                for link in datas['link'].values.tolist())
            links_data = pd.DataFrame(
                links_data, columns=['author', 'publish_date', 'body_text'])

            if massive_scrape_mode:
                # Export the data to the reuters data folder as {stock}.csv
                links_data.to_csv(
                    'reuters_data/{}.csv'.format(stock.replace('.', '_')))
        else:
            if verbose:
                print('Stock not found on reuters.')
    except Exception as e:
        print(e)
        # Make this worker wait a bit before killing the driver in case
        # Reuters.com is acting up
        time.sleep(30)
        return None
    finally:
        # The webdriver is shut down exactly once, on success and on error
        driver.quit()

    if massive_scrape_mode:
        return None
    return links_data if links_data is not None else False
コード例 #17
0
ファイル: dumper.py プロジェクト: Kahsolt/qzone_mood_dumper
def dump(update=UPDATE_MODE):
    """Log in to QZone with Firefox, then download and store the moods.

    The browser is only used to obtain the login cookies and the ``p_skey``
    value the CSRF token is derived from; it is shut down before crawling
    starts.  The mood list is then fetched page by page from ``URL_LIST``
    and every mood not yet stored is saved as a ``Mood`` record.  Moods
    flagged ``has_more_con`` are expanded through ``URL_DETAIL`` first.

    Credentials are read from the environment variables ``qq`` and
    ``passwd``; if either is missing the user logs in manually in the
    browser window and presses Enter.

    Args:
        update: When truthy, stop at the first mood that is already stored;
            otherwise skip stored moods and keep crawling.

    Returns:
        None.
    """

    def getCSRFToken(skey):  # transcribed from qzone source javascript snippet
        hs = 5381
        for i in skey:
            hs += (hs << 5) + ord(i)
        return hs & 0x7fffffff

    # open browser
    profile = webdriver.FirefoxProfile()
    if HEADLESS:
        profile.set_preference('permissions.default.image', 2)
    profile.set_preference('browser.migration.version', 9001)
    options = webdriver.FirefoxOptions()
    options.add_argument('log-level=3')  # FATAL
    options.add_argument('--disable-extensions')
    if HEADLESS:
        options.add_argument('-headless')

    browser = webdriver.Firefox(options=options, firefox_profile=profile)
    try:
        browser.implicitly_wait(5)
        wait = WebDriverWait(browser, 10, poll_frequency=1)

        # login
        browser.get(URL_BASE)
        wait.until(cond.frame_to_be_available_and_switch_to_it('login_frame'))
        wait.until(cond.element_to_be_clickable((By.ID, 'switcher_plogin')))
        browser.find_element_by_id('switcher_plogin').click()

        qq, passwd = os.getenv('qq'), os.getenv('passwd')
        if qq and passwd:
            print('[Login] using QQ code and password from envvar')
            browser.find_element_by_id('u').send_keys(qq)
            browser.find_element_by_id('p').send_keys(passwd)
            wait.until(cond.element_to_be_clickable((By.ID, 'login_button')))
            browser.find_element_by_id('login_button').click()
            time.sleep(5)  # await for cookies
        else:
            print('[Login] manual interactive login then press Enter to continue')
            input()

        cookies = {c.get('name'): c.get('value') for c in browser.get_cookies()}
        token = getCSRFToken(browser.get_cookie('p_skey').get('value'))
    finally:
        # Release the browser even when the login flow raised.
        browser.quit()

    # NOTE(review): hard-coded token value; presumably it should be taken
    # from the logged-in page instead - confirm.
    qzonetoken = '6938137d34171f799bd85ccfb42b80474825b4c604adfc9adbfae0bc512241f658514dea51e79051be8f'

    # dump
    http = Session()
    http.headers.update(HEADERS)
    cnt, totcnt, pid = 0, -1, 1
    params = {
        # NOTE(review): qq is None after a manual login without the envvar
        'uin': qq,
        'ftype': 0,
        'sort': 0,
        'pos': 0,
        'num': PAGE_SIZE,
        'replynum': 0,
        'g_tk': token,
        'callback': 'preloadCallback',
        'code_version': 1,
        'format': 'jsonp',
        'need_private_comment': 1,
        'qzonetoken': qzonetoken,
    }
    while True:
        # Stop once the offset of the page about to be fetched is past the
        # last mood; comparing the offset (not the page end) keeps a final,
        # partially filled page from being skipped.
        if totcnt >= 0 and (pid - 1) * PAGE_SIZE >= totcnt:
            break
        time.sleep(1)
        logging.info('[Dumper] crawling on page %d', pid)
        params['pos'] = (pid - 1) * PAGE_SIZE
        res = http.get(URL_LIST, params=params, cookies=cookies)

        # Strip the JSONP wrapper "preloadCallback(" ... ");"
        data = json.loads(res.text[16:-2])
        if totcnt < 0:
            totcnt = data.get('total')
            logging.info('[Dumper] %d mood in total', totcnt)
        for mood in data.get('msglist') or []:
            timestamp = datetime.fromtimestamp(mood.get('created_time'))
            if Mood.exists(timestamp=timestamp):
                if update:
                    return
                continue

            # By default the list entry itself carries the content.
            detail = mood
            if mood.get('has_more_con'):  # expand if has_more
                # Use a copy so tid/t1_source/pos do not leak into the
                # following list requests.
                detail_params = dict(params,
                                     tid=mood.get('tid'),
                                     t1_source=mood.get('t1_source'),
                                     pos=0)
                res = http.get(URL_DETAIL, params=detail_params,
                               cookies=cookies)
                try:
                    detail = json.loads(res.text[16:-2])
                except json.decoder.JSONDecodeError:
                    # Fall back to the (truncated) list entry.
                    logging.info(res.text)
            content = detail.get('content') or ''
            # Title: first line when it is non-empty, else the first 16 chars
            first_line = content.split('\n')[0] if '\n' in content else ''
            title = first_line or content[:16]

            record = Mood(title=title, content=content, timestamp=timestamp)
            record.save()
            logging.info('[Save] %r', record)
            cnt += 1

        pid += 1
    logging.info('[Save] updated %d items in total', cnt)
コード例 #18
0
def get_data_for_stock_lb_base(stock: str, days_to_look_back: int):
    """Collect recent Reuters headlines and links for ``stock``.

    Searches Reuters for the ticker, checks that the first company hit
    really is ``stock``, scrolls the company's "News" tab until the page
    stops growing and then walks the article list from the top.  The walk
    stops at the first article whose relative timestamp is older than
    ``days_to_look_back`` days (that article is still included), or after
    three failed article lookups.

    Args:
        stock: Ticker symbol of the stock to look up.
        days_to_look_back: Maximum article age, in days, to keep walking.

    Returns:
        A DataFrame with the columns ``text`` (headline) and ``link``, or
        ``np.nan`` when the stock was not matched on Reuters or scraping
        raised an error.
    """
    # Set the Firefox webdriver to run headless in the background
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.set_headless()

    # Initialize the webdriver instance in the background
    driver = webdriver.Firefox(options=fireFoxOptions)

    # First company hit on the search page; its text holds the company name
    # and ticker, and its link leads to Reuters's page on the stock.
    company_xpath = (
        '/html/body/div[4]/section[2]/div/div[1]/div[3]/div/div/div/div[1]/a')
    news_tab_xpath = (
        '/html/body/div[1]/div/div[3]/div/div/nav/div[1]/div/div/ul/li[2]/button')
    # Container of the i-th article; title link and timestamp live below it.
    article_xpath = (
        '/html/body/div[1]/div/div[4]/div[1]/div/div/div/div[2]/div[{}]')

    datas = None  # Stays None when the stock is not found on Reuters

    try:
        # Search Reuters for {stock}
        driver.get('https://www.reuters.com/search/news?blob={}'.format(stock))
        time.sleep(2)  # Wait for the page to load

        # If Reuters has no company hit, this lookup raises and the outer
        # handler below returns np.nan.
        text = driver.find_element_by_xpath(company_xpath).text

        # Reuters formats the element's text in 2 ways:
        #   {company name} ({ticker}.{some additional text}), e.g. Apple Inc (AAPL.OQ)
        #   {company name} ({ticker}),                        e.g. Alcoa Corp (AA)
        # Cut the text down to just {ticker}:
        #   Apple Inc (AAPL.OQ) --> AAPL
        #   Alcoa Corp (AA)     --> AA
        ticker_end = text.find('.') if '.' in text else text.find(')')
        ticker = text[text.find('(') + 1:ticker_end]

        # {condition} = True means that the queried stock is a match
        condition = ticker.upper() == stock.upper()

        if condition:  # If {stock} has been found in Reuters, continue
            # Click the element's link, going to Reuters's page on the stock
            driver.find_element_by_xpath(company_xpath).click()
            time.sleep(0.5)  # Let the stock's Reuters page load

            # Go to the "News" section of the stock's Reuters page
            driver.find_element_by_xpath(news_tab_xpath).click()
            time.sleep(5)

            # Scroll to the bottom of the "News" page until the page height
            # stops changing, i.e. no more content is being loaded.
            SCROLL_PAUSE_TIME = 0.5  # Seconds to wait between two scrolls

            last_height = driver.execute_script(
                "return document.body.scrollHeight")
            while True:
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")

                # Wait for more content to load
                time.sleep(SCROLL_PAUSE_TIME)

                new_height = driver.execute_script(
                    "return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            i = 1  # Article index number starts at one in the HTML
            rows = []  # [header, link] pairs
            tol = 0  # Number of failed article lookups so far (never reset)
            max_age = pd.Timedelta('{} days'.format(days_to_look_back))

            # The amount of articles on the page is unknown, so loop until
            # the lookups start failing or the articles get too old.
            while True:
                try:
                    base = article_xpath.format(i)
                    i += 1
                    # Raises once there is no article at this index.
                    anchor = driver.find_element_by_xpath(base + '/div/a')
                    header = anchor.text
                    link = anchor.get_attribute('href')
                    # Timestamp of this same article (same container index
                    # as the title link).
                    date = driver.find_element_by_xpath(
                        base + '/div/div/time').text
                    rows.append([header, link])

                    try:
                        # Drop the last word of the timestamp text and parse
                        # the rest as a duration.
                        units_behind = pd.Timedelta(date.rsplit(' ', 1)[0])
                    except ValueError:
                        # Not a parseable relative timestamp: no age check.
                        units_behind = None
                    if units_behind is not None and units_behind > max_age:
                        break

                except Exception:
                    tol += 1  # Increase the error tally by 1
                    time.sleep(30)

                    # Give up on the third failed lookup: by then it is
                    # taken as confirmed that all articles were collected.
                    if tol > 2:
                        break

            # Compile the list of headers and links into a pandas DataFrame
            datas = pd.DataFrame(rows, columns=['text', 'link'])
    except Exception:
        # Make this worker wait a bit before killing the driver in case
        # Reuters.com is acting up
        time.sleep(30)
        return np.nan
    finally:
        # The webdriver is shut down exactly once, on success and on error,
        # to save space on RAM
        driver.quit()

    if datas is None:
        # Stock not matched on Reuters
        return np.nan
    return datas
コード例 #19
0
 def setUp(self):
     """Create a Firefox driver started with ``-headless`` as ``self.driver``."""
     firefox_opts = webdriver.FirefoxOptions()
     firefox_opts.add_argument('-headless')
     self.driver = webdriver.Firefox(options=firefox_opts)
コード例 #20
0
class WPSpider(scrapy.Spider):
    """Spider that walks Washington Post search results with Selenium.

    A Chrome driver opened in ``__init__`` browses the search result pages;
    every article link found there is opened in its own short-lived Chrome
    driver.  Each article that could be read is yielded as an item and also
    written to a text file below ``resultsPath``.
    """
    name = "washingtonPostCrawler"
    start_urls = [
        'https://www.washingtonpost.com/newssearch/?datefilter=7%20Days&query=abu&sort=Date'
    ]
    base_url = 'https://www.washingtonpost.com/newssearch/?datefilter=7%20Days&query=abu&sort=Date'
    # Firefox options; not used by the Chrome drivers created in this class.
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.set_headless()
    # Chrome options shared by the search driver and the article drivers.
    options = webdriver.ChromeOptions()
    options.binary_location = '/usr/bin/google-chrome-unstable'
    options.add_argument('headless')  #TBD
    options.add_argument('no-sandbox')
    options.add_argument('disable-gpu')
    options.add_argument('disable-dev-shm-usage')
    options.add_argument('window-size=1200x600')
    folder = "./WashingtonPost"
    # Evaluated once, when the class body runs.
    timestamp = datetime.datetime.now()
    resultsPath = "./WashingtonPost" + "/" + str(timestamp)
    next_page = True
    number_pages = 1
    number_of_search_result = 0

    def __init__(self, *args, **kwargs):
        """Open the search page in Chrome and create the results folder."""
        super(WPSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Chrome(chrome_options=self.options)
        self.driver.implicitly_wait(3)
        self.driver.get(self.base_url)
        # Create folder and subfolder based on command execution timestamp
        try:
            os.makedirs(self.resultsPath)
        except OSError:
            print("Folder existed")

    def _dismiss_consent_wall(self, driver):
        """Deal with the anti-robot page by clicking its buttons."""
        driver.find_element_by_xpath(
            "(.//*[normalize-space(text()) and normalize-space(.)='Sign in here'])[1]/following::button[1]"
        ).click()
        driver.find_element_by_id("agree").click()
        driver.find_element_by_xpath(
            "(.//*[normalize-space(text()) and normalize-space(.)='I agree'])[1]/following::button[1]"
        ).click()

    def _fetch_article(self, article_url):
        """Open ``article_url`` in its own driver; return (title, content).

        Returns None when the page could not be read.  The driver is always
        shut down with ``quit()`` so no browser session is left behind.
        """
        article_driver = webdriver.Chrome(chrome_options=self.options)
        try:
            time.sleep(2)
            article_driver.get(article_url)
            self._dismiss_consent_wall(article_driver)
            title = article_driver.find_element_by_css_selector(
                'div.topper-headline').text
            content = article_driver.find_element_by_xpath(
                ".//article[@class='paywall']").text
            return title, content
        except Exception:
            print("This is in except!")
            return None
        finally:
            article_driver.quit()

    def parse(self, response):
        """Yield one item per readable article of every search result page.

        ``response`` itself is not used; all pages are loaded through the
        Selenium driver.  Each item has the keys ``title``, ``url`` and
        ``content``.  The search driver is shut down when the generator
        finishes or is closed.
        """
        driver = self.driver
        page_size = 20  # Step used for the "startat" offset of the next page
        try:
            self._dismiss_consent_wall(driver)
            # Parse for number of search result and calculate number of pages
            self.number_of_search_result = int(
                driver.find_element_by_xpath(
                    ".//span[@class='pb-search-number ng-binding']").text)
            # Ceiling division, but always visit at least the first page.
            self.number_pages = max(
                1, -(-self.number_of_search_result // page_size))

            print("\nNumber of search result:%d\n" %
                  self.number_of_search_result)
            print("\nNumber of pages:%d\n" % self.number_pages)

            for current_page in range(self.number_pages):
                # Parse the current page's articles via their links
                elements = driver.find_elements_by_css_selector('a.ng-binding')
                for elem in elements:
                    article_url = elem.get_attribute("href")
                    print("Article link:%s" % article_url)
                    if not article_url:
                        continue

                    article = self._fetch_article(article_url)
                    if article is None:
                        continue
                    title, content = article
                    print('title is:%s' % title)
                    print('content is:%s' % content)

                    # The yield is deliberately outside any except clause so
                    # closing the generator is never swallowed.
                    yield {
                        'title': title,
                        'url': article_url,
                        'content': content
                    }
                    try:
                        self.file_write(title, article_url, content)
                    except (OSError, IOError) as err:
                        print("Could not write article file: %s" % err)

                # Configure the next page's url
                next_page_url = self.base_url + "&startat=" + str(
                    (current_page + 1) * page_size)
                print("\nNext page url:%s\n" % next_page_url)
                driver.get(next_page_url)
                time.sleep(5)
        finally:
            driver.quit()

    def parse_article(self, response):
        """Yield the joined text nodes of the article body in ``response``."""
        content = response.xpath(
            ".//article[@itemprop='articleBody']/descendant::text()").extract(
            )
        yield {'article': ''.join(content)}

    def file_write(self, title, url, content):
        """Write title, URL and content to ``<resultsPath>/<title>.txt``.

        Path separators in the title are replaced by '_' so the title cannot
        point into a (non-existent) sub-directory.
        """
        safe_title = str(title).replace("/", "_").replace(os.sep, "_")
        fileName = safe_title + ".txt"

        with open(self.resultsPath + "/" + fileName, "w") as file:
            # Write title, URL, content into the file
            file.write(str(title) + "\n")
            file.write(str(url) + "\n")
            file.write(str(content))
コード例 #21
0
    def inicializar_webdriver_firefox(self):
        """Build and return a Firefox webdriver configured from the ini file.

        Reads ``headless``, ``log_path_dev_null`` and ``data_profile`` from
        the ``Driver`` section of the ini file, sets up a profile that
        downloads the listed MIME types to
        ``config_constantes.PATH_CARPETA_DESCARGA`` without asking, and
        accepts insecure certificates.

        Returns:
            The ``webdriver.Firefox`` instance.

        Exits the process with status 1 (after printing the error) when the
        driver cannot be created.
        """
        archivo_config_ini = FormatUtils.lector_archivo_ini()
        modo_headless = archivo_config_ini.getboolean('Driver', 'headless')
        mandar_log_a_dev_null = archivo_config_ini.getboolean(
            'Driver', 'log_path_dev_null')
        data_profile = archivo_config_ini.get('Driver', 'data_profile')

        # MIME types that are saved to disk without a download prompt
        mimeTypes = "application/zip, application/octet-stream, image/jpeg, image/png, image/x-png, " \
                    "application/vnd.ms-outlook, text/html, application/pdf, image/png"

        opciones_firefox = webdriver.FirefoxOptions()
        perfil_firefox = webdriver.FirefoxProfile(data_profile)

        firefox_capabilities = webdriver.DesiredCapabilities().FIREFOX.copy()
        firefox_capabilities.update({
            'acceptInsecureCerts': True,
            'acceptSslCerts': True
        })

        # Ignore certificate checks; this is done only for the sake of the
        # user experience
        opciones_firefox.add_argument('--ignore-certificate-errors')
        opciones_firefox.accept_insecure_certs = True
        perfil_firefox.accept_untrusted_certs = True
        perfil_firefox.assume_untrusted_cert_issuer = False
        perfil_firefox.set_preference("browser.download.folderList", 2)
        perfil_firefox.set_preference("browser.download.dir",
                                      config_constantes.PATH_CARPETA_DESCARGA)
        perfil_firefox.set_preference(
            "browser.download.manager.showWhenStarting", False)
        perfil_firefox.set_preference("browser.helperApps.neverAsk.saveToDisk",
                                      mimeTypes)
        perfil_firefox.set_preference(
            "browser.download.viewableInternally.enabledTypes", "")
        perfil_firefox.update_preferences()

        opciones_firefox.headless = modo_headless

        # Optionally discard the geckodriver log by pointing it at the
        # configured null device path
        if mandar_log_a_dev_null:
            param_log_path = config_constantes.DEV_NULL
        else:
            param_log_path = None

        try:
            webdriver_firefox = webdriver.Firefox(
                executable_path=self.ruta_web_driver,
                firefox_options=opciones_firefox,
                firefox_profile=perfil_firefox,
                capabilities=firefox_capabilities,
                log_path=param_log_path)

        except FileNotFoundError as e:
            print('Sucedio un error al intentar configurar el webdriver: {}'.
                  format(e))
            # Non-zero status so callers can tell the start-up failed
            sys.exit(1)

        except Exception as e:
            print(
                'Sucedio una excepcion al intentar configurar el webdriver {}'.
                format(e))
            sys.exit(1)

        return webdriver_firefox
コード例 #22
0
ファイル: clipper.py プロジェクト: samgutentag/safewayClipper
def get_webdriver(which_driver="gecko", headless=False):
    """Initialize a Selenium web driver.

    Uses the Gecko (Firefox) driver when ``which_driver`` is "gecko"; any
    other value selects ChromeDriver. The driver executable is looked up in
    the "webdrivers" directory next to this file.

    Args:
        which_driver (string): "gecko" for Firefox, anything else for Chrome.
        headless (bool): initialize webdriver in headless mode.

    Returns:
        driver (web driver object): -1 on failure. Start-up errors are
        logged and reported through the return value, never raised.

    """
    driver_path = os.path.dirname(os.path.realpath(__file__))

    if which_driver == "gecko":
        label, ready_name = "Gecko", "geckodriver"
        version = MIN_GECKO_DRIVER_VERSION
        executable = os.path.join(driver_path, "webdrivers",
                                  f"geckodriver-{version}")
        headless_flag = "-headless"
    else:
        label, ready_name = "Chrome", "chromedriver"
        version = MIN_CHROME_DRIVER_VERSION
        executable = os.path.join(driver_path, "webdrivers",
                                  f"chromedriver_{version}")
        headless_flag = "headless"

    logging.info(f"using webdriver version {version}")
    logging.info(f"webdriver located at: {executable}")

    try:
        if which_driver == "gecko":
            driver_cls = webdriver.Firefox
            options_cls = webdriver.FirefoxOptions
        else:
            driver_cls = webdriver.Chrome
            options_cls = webdriver.ChromeOptions

        # os.devnull instead of a hard-coded "/dev/null" keeps this portable.
        kwargs = {
            "executable_path": executable,
            "service_log_path": os.devnull,
        }

        if headless:
            logging.info("running headless")
            options = options_cls()
            options.add_argument(headless_flag)
            # Both browsers take ``options=``; the Firefox-only
            # ``firefox_options=`` keyword is the deprecated spelling.
            kwargs["options"] = options
            logging.info(f"initializing headless {label} webdriver")
        else:
            logging.info(f"initializing {label} webdriver")

        driver = driver_cls(**kwargs)

        if headless:
            # specify webdriver window resolution, helps clicking
            driver.set_window_size(1440, 900)

    except Exception as err:
        # Log at error level: the caller only sees -1, so this message is
        # the sole record of why start-up failed.
        logging.error("could not initialize %s webdriver: %s", label, err)
        return -1

    logging.info(f"{ready_name} ready.")
    return driver
コード例 #23
0
def driver():
    """Create and return a Firefox WebDriver started with "-Headless"."""
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument("-Headless")
    return webdriver.Firefox(options=firefox_options)
コード例 #24
0
def working():
    """Resume and run the scraping job over every condition in ``xs.txt``.

    ``xs.txt`` holds whitespace-separated entries; each entry is evaluated
    with ``eval`` and used as the search condition (``dict0``). The entry
    stored in ``状态.txt`` marks where to resume, and is rewritten before
    each entry is processed. For every entry a new Firefox driver is
    started, the search page is opened, and the result count returned by
    ``caseNember`` decides how the results are collected:

    * 16-600 results: scrape the first page, then page through the rest.
    * 1-15 results: scrape the single page.
    * 0 or fewer: nothing to do.
    * more than 600: iterate the region filters, persisting the current
      region in ``地域.txt`` (the value ``'area'`` means "start from the
      first region").

    Relies on module-level names defined elsewhere in the file (``data``,
    ``find``, ``caseNember``, ``caseList``, ``getdata``, ``Auto_pageturn``,
    ``docodition``, ``wpagenumber``, ``tiaoye``, ``set15CasePerPg``,
    ``getProfirle``, ``run_log``) and rebinds the globals ``driver``,
    ``area_list`` and ``k1``.

    Raises:
        ValueError: when one of the retry loops (advanced search, region
            list, region click) still fails on its last attempt.
    """
    global driver
    # NOTE(review): f, ff, farea (read mode) and g below are never closed.
    f = open(r"xs.txt", "r", encoding='utf-8')
    s = f.read()
    s = s.split()
    ff = open(r"状态.txt", "r", encoding='utf-8-sig')  # read the saved progress marker
    aa = ff.read()
    # Index of the saved entry; raises ValueError if it is not in xs.txt.
    aa = s.index(aa)
    for i in range(len(s) - aa):
        # SECURITY: eval() executes whatever xs.txt contains; only safe if
        # that file is fully trusted.
        print(eval(s[i + aa]))
        run_log(eval(s[i + aa]))
        # Persist the entry being processed so a later run resumes here.
        fff = open(r"状态.txt", "w", encoding='utf-8')
        fff.write(s[i + aa])
        fff.close()
        dict0 = eval(s[i + aa])
        profile_ = getProfirle()
        # NOTE(review): `options` is built but never passed to
        # webdriver.Firefox below, so the --headless flag has no effect.
        options = webdriver.FirefoxOptions()
        options.add_argument('--headless')
        driver = webdriver.Firefox(executable_path="driver\geckodriver", firefox_profile=profile_)
        url = "http://wenshu.court.gov.cn/website/wenshu/181217BMTKHNT2W0/index.html"
        driver.get(url)
        # Retry opening the advanced-search panel and applying the
        # condition: up to maxWait attempts, 0.5 s apart, refreshing the
        # page on every 10th attempt.
        maxWait = 50
        for x in range(maxWait):
            try:
                if ((x + 1) % 10) == 0:
                    driver.refresh()
                a = driver.find_element_by_css_selector(
                    '#_view_1545034775000 > div > div.search-wrapper.clearfix > div.advenced-search')
                a.click()
                find(dict0, driver)
                break
            except Exception as e:
                if x == maxWait - 1:
                    raise ValueError("高级检索") from e
                time.sleep(0.5)
        judgePath = "#_view_1545184311000 > div:nth-child(3) > div.list_title.clearfix > h4 > a"
        # Second element of caseNember's result is used as the total count.
        all_number = caseNember(driver)[1]
        if all_number <= 600  and all_number > 15:
            # More than one page: scrape page 1, then page through the rest.
            set15CasePerPg(driver, maxWait=20)
            caseList(driver)
            getdata(data)
            data.clear()
            pageNember = 1
            Auto_pageturn(pageNember, driver, judgePath, maxWait=20)
            webdriver.Firefox.quit(driver)
            continue
        elif (all_number > 0 and all_number <= 15):
            # Everything fits on a single page.
            set15CasePerPg(driver, maxWait=20)
            caseList(driver)
            getdata(data)
            data.clear()
            webdriver.Firefox.quit(driver)
            continue
        elif all_number <= 0:
            # No results for this condition.
            webdriver.Firefox.quit(driver)
            continue
        else:
            # More than 600 results: split the query by region.
            global area_list
            # Retry reading the region panel; region names are taken as the
            # runs of CJK characters in its text.
            maxWait = 50
            for x in range(maxWait):
                try:
                    if ((x + 1) % 10) == 0:
                        driver.refresh()
                    area_str = driver.find_element_by_xpath('//*[@id="_view_1545095958000"]/div/div[2]')
                    area_list = re.findall(u'[\u4e00-\u9fa5]+', area_str.text, re.S)
                    print(len(area_list))
                    print(area_list)
                    break
                except Exception as e:
                    if x == maxWait - 1:
                        raise ValueError("地域列表") from e
                    time.sleep(0.5)
            # Region name -> XPath of the element clicked to filter by it.
            area_dict = {'最高人民法院': '//*[@id="0_anchor"]', '北京市': '//*[@id="100_anchor"]',
                         '天津市': '//*[@id="200_anchor"]', '河北省': '//*[@id="300_anchor"]',
                         '山西省': '//*[@id="400_anchor"]', '内蒙古自治区': '//*[@id="500_anchor"]',
                         '辽宁省': '//*[@id="600_anchor"]', '吉林省': '//*[@id="700_anchor"]',
                         '黑龙江省': '//*[@id="800_anchor"]', '上海市': '//*[@id="900_anchor"]',
                         '江苏省': '//*[@id="A00_anchor"]', '浙江省': '//*[@id="B00_anchor"]',
                         '安徽省': '//*[@id="C00_anchor"]', '福建省': '//*[@id="D00_anchor"]',
                         '江西省': '//*[@id="E00_anchor"]', '山东省': '//*[@id="F00_anchor"]',
                         '河南省': '//*[@id="G00_anchor"]', '湖北省': '//*[@id="H00_anchor"]',
                         '湖南省': '//*[@id="I00_anchor"]', '广东省': '//*[@id="J00_anchor"]',
                         '广西壮族自治区': '//*[@id="K00_anchor"]', '海南省': '//*[@id="L00_anchor"]',
                         '重庆市': '//*[@id="M00_anchor"]', '四川省': '//*[@id="N00_anchor"]',
                         '贵州省': '//*[@id="O00_anchor"]', '云南省': '//*[@id="P00_anchor"]',
                         '西藏自治区': '//*[@id="Q00_anchor"]', '陕西省': '//*[@id="R00_anchor"]',
                         '甘肃省': '//*[@id="S00_anchor"]', '青海省': '//*[@id="T00_anchor"]',
                         '宁夏回族自治区': '//*[@id="U00_anchor"]', '新疆维吾尔自治区': '//*[@id="V00_anchor"]',
                         '新疆维吾尔自治区高级人民法院生产建设兵团分院': '//*[@id="X00_anchor"]'}

            # Resume from the region saved in 地域.txt; 'area' means start
            # from the first region.
            farea = open('地域.txt', 'r', encoding='utf-8')
            areaname = farea.read()
            if areaname == 'area':
                bb = 0
            else:
                bb = area_list.index(areaname)
            for area in range(len(area_list) - bb):
                global k1
                # Retry clicking the region filter until the reported count
                # drops below the unfiltered total.
                maxWait = 50
                for x in range(maxWait):
                    try:
                        if ((x + 1) % 10) == 0:
                            driver.refresh()
                        area_ss = area_dict[area_list[area + bb]]
                        area_c = driver.find_element_by_xpath(area_ss)
                        time.sleep(0.5)
                        area_c.click()
                        time.sleep(0.5)
                        k1, part_number = caseNember(driver)
                        if part_number >= all_number:
                            # Presumably the filter has not been applied
                            # yet; try again. NOTE(review): if this holds on
                            # the last attempt the loop ends without
                            # raising.
                            continue
                        else:
                            break
                    except Exception as e:
                        if x == maxWait - 1:
                            raise ValueError("地域点击") from e
                        time.sleep(0.5)
                # Save progress: reset to 'area' after the last region,
                # otherwise store the region just selected.
                if area == (len(area_list)-bb-1):
                    farea = open('地域.txt', 'w', encoding='utf-8')
                    farea.write('area')
                    farea.close()
                else:
                    farea = open('地域.txt', 'w', encoding='utf-8')
                    farea.write(str(area_list[area + bb]))
                    farea.close()
                # k1 is the first value returned by caseNember; its meaning
                # is defined there. NOTE(review): 1 looks like "no results"
                # and 2 like "single page" — confirm.
                if k1 == 1:
                    docodition(dict0)
                    # webdriver.Firefox.quit(driver)
                    continue
                if k1 == 2:
                    wpagenumber(1)
                    set15CasePerPg(driver, maxWait=20)
                    caseList(driver)
                    getdata(data)
                    data.clear()
                    wpagenumber(0)
                    docodition(dict0)
                    # webdriver.Firefox.quit(driver)
                    continue
                else:
                    if bb == 0:
                        # Fresh run over the regions: start at page 1.
                        set15CasePerPg(driver, maxWait=20)
                        wpagenumber(1)
                        caseList(driver)
                        getdata(data)
                        data.clear()
                        pageNember = 1
                        Auto_pageturn(pageNember, driver, judgePath, maxWait=20)
                        docodition(dict0)
                    else:
                        # Resumed run: 页码.txt holds the page to continue
                        # from ('0' means no page was saved).
                        g = open(r"页码.txt", "r", encoding='utf-8-sig')
                        ym = g.read()
                        if ym != '0':
                            pageNember = int(ym)
                            set15CasePerPg(driver, maxWait=20)
                            tiaoye(pageNember, driver, ym)
                            caseList(driver)
                            getdata(data)
                            data.clear()
                            Auto_pageturn(pageNember, driver, judgePath, maxWait=20)
                            docodition(dict0)
                            # webdriver.Firefox.quit(driver)
                            continue
                        else:
                            set15CasePerPg(driver, maxWait=20)
                            wpagenumber(1)
                            caseList(driver)
                            getdata(data)
                            data.clear()
                            pageNember = 1
                            Auto_pageturn(pageNember, driver, judgePath, maxWait=20)
                            docodition(dict0)
        # Reached only from the >600 branch; the other branches quit the
        # driver themselves before `continue`.
        webdriver.Firefox.quit(driver)
コード例 #25
0
def _set_firefox_options(downloads_path, headless, locale_code, proxy_string,
                         user_agent, disable_csp):
    """Build the ``FirefoxOptions`` used to launch a browser.

    Args:
        downloads_path: Directory set as ``browser.download.dir``.
        headless: Adds ``--headless`` when true, except on Linux platforms.
        locale_code: Value for ``intl.accept_languages``; skipped if falsy.
        proxy_string: ``host:port`` for an HTTP/SSL proxy, or
            ``socks4://host:port`` / ``socks5://host:port`` for SOCKS;
            skipped if falsy.
        user_agent: Value for ``general.useragent.override``; skipped if
            falsy.
        disable_csp: Disables ``security.csp.enable`` when true (also
            forced by ``settings.DISABLE_CSP_ON_FIREFOX``).

    Returns:
        The configured ``webdriver.FirefoxOptions`` instance.

    Raises:
        IndexError: if a non-SOCKS ``proxy_string`` has no ``:port`` part.
        ValueError: if the proxy port is not an integer.
    """
    options = webdriver.FirefoxOptions()
    # NOTE(review): `accept_untrusted_certs` is a FirefoxProfile setting; on
    # an Options object this presumably only sets a plain attribute. Confirm
    # whether `accept_insecure_certs` was intended.
    options.accept_untrusted_certs = True

    # Preferences that do not depend on the arguments.
    static_prefs = (
        ("reader.parse-on-load.enabled", False),
        ("pdfjs.disabled", True),
        ("app.update.auto", False),
        ("app.update.enabled", False),
        ("app.update.silent", True),
        ("browser.formfill.enable", False),
        ("browser.privatebrowsing.autostart", True),
        ("devtools.errorconsole.enabled", True),
        ("dom.webnotifications.enabled", False),
        ("dom.disable_beforeunload", True),
        ("browser.contentblocking.database.enabled", False),
        ("extensions.allowPrivateBrowsingByDefault", True),
        ("extensions.PrivateBrowsing.notification", False),
        ("extensions.systemAddon.update.enabled", False),
        ("extensions.update.autoUpdateDefault", False),
        ("extensions.update.enabled", False),
        ("extensions.update.silent", True),
        ("datareporting.healthreport.logging.consoleEnabled", False),
        ("datareporting.healthreport.service.enabled", False),
        ("datareporting.healthreport.service.firstRun", False),
        ("datareporting.healthreport.uploadEnabled", False),
        ("datareporting.policy.dataSubmissionEnabled", False),
        ("datareporting.policy.dataSubmissionPolicyAccepted", False),
        ("toolkit.telemetry.unified", False),
        ("security.mixed_content.block_active_content", False),
        ("browser.download.manager.showAlertOnComplete", False),
        ("browser.shell.checkDefaultBrowser", False),
        ("browser.startup.page", 0),
        ("browser.download.panel.shown", False),
        ("browser.download.animateNotifications", False),
        ("browser.download.folderList", 2),
        ("browser.helperApps.alwaysAsk.force", False),
        ("browser.download.manager.showWhenStarting", False),
    )
    for pref_name, pref_value in static_prefs:
        options.set_preference(pref_name, pref_value)

    if proxy_string:
        chunks = proxy_string.split(':')
        socks_proxy = len(chunks) == 3 and chunks[0] in ("socks4", "socks5")
        if socks_proxy:
            # "socks4" / "socks5" -> 4 / 5
            socks_ver = int(chunks[0][5])
            proxy_server = chunks[1]
            # Drop the "//" left over from the "scheme://host" form.
            if proxy_server.startswith("//") and len(proxy_server) > 2:
                proxy_server = proxy_server[2:]
            proxy_port = int(chunks[2])
        else:
            proxy_server = chunks[0]
            proxy_port = int(chunks[1])
        # 1 selects manual proxy configuration.
        options.set_preference("network.proxy.type", 1)
        if socks_proxy:
            options.set_preference('network.proxy.socks', proxy_server)
            options.set_preference('network.proxy.socks_port', proxy_port)
            options.set_preference('network.proxy.socks_version', socks_ver)
        else:
            options.set_preference("network.proxy.http", proxy_server)
            options.set_preference("network.proxy.http_port", proxy_port)
            options.set_preference("network.proxy.ssl", proxy_server)
            options.set_preference("network.proxy.ssl_port", proxy_port)

    if user_agent:
        options.set_preference("general.useragent.override", user_agent)
    if settings.DISABLE_CSP_ON_FIREFOX or disable_csp:
        options.set_preference("security.csp.enable", False)
    # NOTE(review): the flag is skipped on Linux; presumably headless runs
    # are handled differently there (e.g. a virtual display) — confirm.
    if headless and "linux" not in PLATFORM:
        options.add_argument("--headless")
    if locale_code:
        options.set_preference("intl.accept_languages", locale_code)
    options.set_preference("browser.download.dir", downloads_path)

    # MIME types saved without a prompt. Joining a list guarantees a
    # separator between every entry; the hand-written literal was missing
    # the comma after "application/x-tar", fusing it with the xlsx type.
    auto_save_types = (
        "application/pdf",
        "application/zip",
        "application/octet-stream",
        "text/csv",
        "text/xml",
        "application/xml",
        "text/plain",
        "text/octet-stream",
        "application/x-gzip",
        "application/x-tar",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    options.set_preference("browser.helperApps.neverAsk.saveToDisk",
                           ", ".join(auto_save_types))
    return options
コード例 #26
0
 def __init__(self, **kwargs):
     """Initialize the spider and attach a Firefox WebDriver to it.

     All keyword arguments are forwarded to ``scrapy.Spider.__init__``.
     The browser is started with default options (headless mode is not
     enabled) and stored as ``self.driver``.
     """
     scrapy.Spider.__init__(self, **kwargs)
     firefox_opts = webdriver.FirefoxOptions()
     browser = webdriver.Firefox(firefox_options=firefox_opts)
     self.driver = browser
コード例 #27
0
ファイル: textnow_sms.py プロジェクト: chickmy/py_scripts-1
    def send_text(self):
        """Log in at ``self.url`` and send ``self.MESSAGE`` to each number.

        Reads ``self.url``, ``self.TN_USER``, ``self.TN_PASS``,
        ``self.MESSAGE`` and ``self.PHONE_NUMBER`` (comma-separated
        numbers). A headless Firefox session is started, the login form is
        submitted, and for every number the "new text" form is filled in
        and sent. Each UI step first tries the real element and falls back
        to a jQuery call on the page when the element is hidden or the
        interaction raises.

        A failure for one number is printed and the loop moves on. A
        failure during login propagates to the caller. The browser is
        always shut down before returning.
        """
        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')  # run without a visible window

        # geckodriver releases: https://github.com/mozilla/geckodriver/releases
        driver = webdriver.Firefox(executable_path='geckodriver',
                                   options=options)

        def interact(locator, action, fallback):
            """Run *action* on the located element, else run *fallback*.

            The element is used directly when displayed, or after being
            scrolled into view. *fallback* runs when it is still hidden or
            when any step raises.
            """
            try:
                element = driver.find_element(*locator)
                if not element.is_displayed():
                    driver.execute_script("arguments[0].scrollIntoView();",
                                          element)
                    if not element.is_displayed():
                        fallback()
                        return
                action(element)
            except Exception:
                fallback()

        def set_value_js(selector, value):
            """Set an input's value through jQuery."""
            # arguments[1] must not be quoted: quoting it would write the
            # literal text "arguments[1]" into the field.
            driver.execute_script("$(arguments[0]).val(arguments[1])",
                                  selector, value)

        def click_send_js():
            """Click the send button through jQuery, now and 2 s later."""
            driver.execute_script("$(arguments[0]).click()", "#send_button")
            driver.execute_script("setTimeout($(arguments[0]).click,2000)",
                                  "#send_button")

        def type_message(element):
            """Focus the message box and type the message."""
            element.click()
            element.send_keys(self.MESSAGE)

        try:
            try:
                driver.get(self.url)
            except Exception:
                # Best effort: the page is probed by the waits below.
                pass
            # Fixed wait, mainly for reCaptcha to finish loading.
            time.sleep(8)

            driver.set_window_size(1920, 1080)
            time.sleep(3)

            # visibility_of_element_located: the element must exist AND be
            # visible (presence_of_element_located only checks existence).
            WebDriverWait(driver, 3).until(
                EC.visibility_of_element_located(
                    (By.XPATH, "//input[@name='username']")))
            uname_box = driver.find_element(By.XPATH,
                                            "//input[@name='username']")
            pass_box = driver.find_element(By.XPATH,
                                           "//input[@name='password']")
            uname_box.send_keys(self.TN_USER)
            pass_box.send_keys(self.TN_PASS)

            login_btn = driver.find_element(By.XPATH,
                                            "//button[@type='submit']")
            login_btn.click()

            # Explicit wait (up to 3 s) for the "new text" button; a
            # timeout is tolerated and handled by the per-step fallbacks.
            try:
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//button[@id='newText']")))
            except Exception:
                pass

            print(u'登录成功')
            # Implicit wait: element lookups poll for up to 30 seconds.
            driver.implicitly_wait(30)

            # Log, then remove, notification toasts and modal dialogs.
            driver.execute_script(
                "document.querySelectorAll('#recent-header .toast-container').forEach(function(e,i){console.log(e.href)})"
            )
            time.sleep(1)

            driver.execute_script(
                "document.querySelectorAll('.notification-priming-modal').forEach(function(e,i){console.log(e.href)})"
            )
            time.sleep(1)
            driver.execute_script(
                "$('#recent-header .toast-container').remove();")
            driver.execute_script(
                "$('.notification-priming-modal').remove();")
            driver.execute_script("$('.modal').remove();")
            time.sleep(2)

            for phone in self.PHONE_NUMBER.split(','):
                try:
                    # Mask only the trailing four characters in the log.
                    print(u'开始给%s发短信' % (phone[:-4] + '****'))

                    # Click the "new text" button.
                    interact(
                        (By.ID, "newText"),
                        lambda element: element.click(),
                        lambda: driver.execute_script(
                            "$(arguments[0]).click()", "#newText"))
                    time.sleep(2)

                    # Enter the message body.
                    interact(
                        (By.ID, "text-input"),
                        type_message,
                        lambda: set_value_js("#text-input", self.MESSAGE))
                    time.sleep(2)

                    # Enter the recipient number.
                    interact(
                        (By.CLASS_NAME, "newConversationTextField"),
                        lambda element: element.send_keys(phone),
                        lambda: set_value_js(".newConversationTextField",
                                             phone))
                    time.sleep(10)

                    # Put focus back on the message body.
                    interact(
                        (By.ID, "text-input"),
                        lambda element: element.click(),
                        lambda: driver.execute_script(
                            "$(arguments[0]).focus()", "#text-input"))
                    time.sleep(5)

                    # Click the send button.
                    interact((By.ID, "send_button"),
                             lambda element: element.click(),
                             click_send_js)
                    time.sleep(5)

                    # Log out of the account.
                    driver.execute_script('window.location.href="/logout"')
                    time.sleep(10)

                except Exception as exc:
                    print(u'给%s发短信时发生异常:' % phone)
                    print(exc)
                    time.sleep(2)

            print(u'处理完毕---end')
        finally:
            # quit() also stops the geckodriver process; close() would only
            # close the window.
            driver.quit()
コード例 #28
0
    def process_request(self, request, spider):
        """Render ``request.url`` in headless Firefox and return the HTML.

        The page source is captured once right after loading. The method
        then tries to dismiss the cookie banner and fill in the start and
        end date inputs, and captures the page source a second time. If
        that interaction fails, only the first capture is returned.

        Args:
            request: The request being processed; its URL and
                ``User-Agent`` header are used.
            spider: The spider issuing the request (unused).

        Returns:
            A ``scrapy.http.HtmlResponse`` with status 200 whose body is
            the JSON-encoded list of captured page sources.
        """
        data = []
        url = request.url

        # webdriver setting
        options = webdriver.FirefoxOptions()
        options.add_argument('--headless')
        # NOTE(review): `--user-agent=` looks like a Chrome-style flag;
        # confirm Firefox honours it (the `general.useragent.override`
        # preference is the usual Firefox mechanism).
        options.add_argument('--user-agent=%s' % request.headers["User-Agent"])

        # webdriver request
        driver = webdriver.Firefox(executable_path=GECKODRIVER,
                                   firefox_options=options,
                                   firefox_binary=FIREFOX_BINARY)
        try:
            driver.set_window_size(1440, 800)
            driver.delete_all_cookies()
            driver.get(url)
            loguru.logger.info("Hold URL {url}".format(url=url))
            data.append(driver.page_source)

            try:
                # Dismiss the cookie-policy banner.
                popup_xpath = (
                    './/div[@class = "cmc-cookie-policy-banner__close"]')
                popup_element = WebDriverWait(driver, 60).until(
                    EC.element_to_be_clickable((By.XPATH, popup_xpath)))
                loguru.logger.warning(popup_element.text)
                popup_element.click()
                time.sleep(5)

                # Set start date: clear the field, then type the value.
                start_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        '//*[@id="__next"]/div[1]/div[2]/div[1]/div[2]/div[3]/div/ul[2]/li[5]/div/div/div[1]/div/div/span/span/div/div[1]/input'
                    )))
                start_element.send_keys(Keys.BACKSPACE * 12)
                start_element.send_keys("Nov 17, 2017")
                start_element.send_keys(Keys.RETURN)
                time.sleep(5)

                # Set end date (not confirmed with RETURN).
                end_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        '//*[@id="__next"]/div[1]/div[2]/div[1]/div[2]/div[3]/div/ul[2]/li[5]/div/div/div[1]/div/div/span/span/div/div[2]/input'
                    )))
                end_element.send_keys(Keys.BACKSPACE * 12)
                end_element.send_keys("Oct 18, 2020")
                time.sleep(5)

                url = driver.current_url
                loguru.logger.info("Hold URL {url}".format(url=url))

                data.append(driver.page_source)
            except Exception as exc:
                # Best effort: fall back to the first capture, but record
                # why the date-range step failed.
                loguru.logger.warning(
                    "Date range selection failed: {exc}".format(exc=exc))
        finally:
            # Always stop the browser; previously it was only quit when the
            # interaction failed, leaking one Firefox per successful request.
            driver.quit()

        return scrapy.http.HtmlResponse(url=url,
                                        status=200,
                                        body=json.dumps(data).encode('utf-8'),
                                        encoding='utf-8')
コード例 #29
0
from selenium import webdriver

from fake_useragent import UserAgent

import time

# Source of random User-Agent strings.
user_agent = UserAgent()

# Firefox launch options.
options = webdriver.FirefoxOptions()

# Override the User-Agent. The preference value is the header value itself;
# a "user-agent=" prefix (Chrome's command-line form) would be sent to
# servers as part of the User-Agent string.
options.set_preference("general.useragent.override", user_agent.random)

# Disable the webdriver flag exposed to pages.
options.set_preference("dom.webdriver.enabled", False)

driver = webdriver.Firefox(
    executable_path="/home/cain/PycharmProjects/selenium_python/"
    "firefoxdriver/geckodriver",
    options=options)

# "C:\\users\\selenium_python\\chromedriver\\chromedriver.exe"
# r"C:\users\selenium_python\chromedriver\chromedriver.exe"

try:
    driver.get("https://intoli.com/blog/not-possible-to-block-chrome-headless/"
               "chrome-headless-test.html")
    time.sleep(10)
コード例 #30
0
def _make_play_driver(headless, phantom, gchrome):
    """Create the Selenium driver selected by the ``extract_play`` flags.

    ``gchrome`` takes precedence over ``phantom``; when neither is set a
    Firefox driver is created. ``headless`` only affects Chrome and Firefox.
    """
    if gchrome:
        chrome_options = webdriver.ChromeOptions()
        if headless:
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
        return webdriver.Chrome('chromedriver', options=chrome_options)
    if phantom:
        driver = webdriver.PhantomJS()
        driver.set_window_size(1120, 550)
        return driver
    options = webdriver.FirefoxOptions()
    if headless:
        options.add_argument('-headless')
    # ``options=`` replaces the deprecated ``firefox_options=`` keyword and
    # matches how the Chrome branch passes its options.
    return webdriver.Firefox(options=options)


def _parse_play_review(driver, elem):
    """Build the record dict for a single review element.

    Returns a dict with the keys ``author``, ``date``, ``review``,
    ``rating``, ``vote_count`` and ``reply``. Fields that cannot be read
    fall back to an empty string, except ``author`` and ``date`` which are
    read unconditionally from the first and third ``span``.
    """
    record = {}
    spans = elem.find_elements_by_css_selector('span')
    try:
        comment = spans[12].text
    except Exception:
        comment = ''
    # The review text is taken from span 12, or from span 13 when span 12 is
    # missing or has no visible text.
    comment_index = 12 if comment else 13
    comment_obj = None
    if len(spans) > comment_index:
        comment_obj = spans[comment_index]
        try:
            comment = comment_obj.text
        except Exception:
            comment = ''
        # A button nested in the comment span is presumably the control that
        # expands a truncated review -- TODO confirm against the live page.
        buttons = comment_obj.find_elements_by_css_selector('div > button')
        if buttons:
            click_element(driver, buttons[0])
            time.sleep(0.1)
            # Re-query: the click changes the DOM under this review.
            spans = elem.find_elements_by_css_selector('span')
            if len(spans) > comment_index + 1:
                comment_obj = spans[comment_index + 1]
                try:
                    expanded = comment_obj.text
                except Exception:
                    expanded = ''
                # Prefer the expanded text; keep the earlier text when the
                # following span turns out to be empty.
                if expanded:
                    comment = expanded
    record["author"] = spans[0].text
    record["date"] = spans[2].text
    record["review"] = comment
    try:
        stars = elem.find_element_by_css_selector(
            'div[aria-label][role="img"]')
        record["rating"] = stars.get_attribute("aria-label")
    except Exception:
        record["rating"] = ''
    try:
        record["vote_count"] = elem.find_element_by_css_selector(
            'div[aria-label="Number of times this review was rated helpful"]'
        ).text
    except Exception:
        record["vote_count"] = ''
    if comment_obj is not None:
        # The third child of the comment's grandparent is used as the reply.
        siblings = comment_obj.find_elements_by_xpath('../../*')
        record['reply'] = siblings[2].text if len(siblings) > 2 else ''
    else:
        record['reply'] = ''
    return record


def extract_play(company,
                 headers,
                 max_results=None,
                 headless=False,
                 phantom=False,
                 gchrome=False,
                 time_sleep=1):
    """Scrape the reviews page of a Google Play app and write them as rows.

    Opens ``https://play.google.com/store/apps/details?id=<company>`` with
    ``showAllReviews=true``, repeatedly scrolls to the bottom to load more
    reviews, and passes one row per review to ``writer.writerow``.

    ``writer`` is not a parameter: it is resolved from the enclosing module
    at call time (presumably a ``csv`` writer -- verify against the caller).

    Args:
        company: Value placed in the ``id`` query parameter of the URL.
        headers: Iterable of record keys selecting and ordering the columns
            of each row. Keys produced per review: ``author``, ``date``,
            ``review``, ``rating``, ``vote_count``, ``reply``.
        max_results: Stop once at least this many rows were written. The
            limit is only checked after each loaded batch, so the final
            count may exceed it. ``None`` means no limit.
        headless: Run Chrome/Firefox without a visible window.
        phantom: Use PhantomJS (ignored when ``gchrome`` is true).
        gchrome: Use Chrome; otherwise PhantomJS or Firefox is used.
        time_sleep: Seconds to wait after each scroll for content to load.

    Returns:
        None. The browser is always shut down, even when scraping fails.
    """
    driver = _make_play_driver(headless, phantom, gchrome)
    try:
        url = "https://play.google.com/store/apps/details?id=" + company + "&showAllReviews=true"
        driver.get(url)
        selector = "h3 + div > div"
        last_elems = 0
        saved = 0
        elems = driver.find_elements_by_css_selector(selector)
        num_elems = len(elems)
        while num_elems > last_elems:
            print(num_elems)
            # Negative slice start: process only the elements that appeared
            # since the previous pass.
            num_it = -1 * (num_elems - last_elems)
            last_elems = num_elems
            for elem in elems[num_it:]:
                record = _parse_play_review(driver, elem)
                row = [record[header] for header in headers]
                # NOTE(review): ``unicode`` is a Python 2 builtin; this line
                # needs porting together with ``writer`` for Python 3.
                writer.writerow([unicode(s).encode("utf-8") for s in row])
                saved += 1
            if max_results is not None and saved >= max_results:
                break
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(time_sleep)
            elems = driver.find_elements_by_css_selector(selector)
            num_elems = len(elems)
            if num_elems == last_elems:
                # Nothing new appeared: scroll once more before giving up.
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(time_sleep)
                elems = driver.find_elements_by_css_selector(selector)
                num_elems = len(elems)
                if num_elems == last_elems:
                    # Still nothing: click the button below the review list,
                    # if present (presumably "Show more" -- TODO confirm).
                    buttons = driver.find_elements_by_css_selector(
                        'h3 + div + div > div[role="button"]')
                    if buttons:
                        click_element(driver, buttons[0])
                        time.sleep(1)
                        elems = driver.find_elements_by_css_selector(selector)
                        num_elems = len(elems)
    finally:
        # quit() ends the whole driver session; the finally block guarantees
        # the browser does not outlive a failed scrape.
        driver.quit()