コード例 #1
0
 def setUp(self):
     """Start a Chromium-based Edge Dev session and open the local app.

     The driver executable is looked up in an ``edgedriver_win64`` folder
     next to this file; the browser binary path is hard-coded for Windows.
     """
     here = os.path.dirname(os.path.realpath(__file__))
     driver_exe = here + "\\edgedriver_win64\\msedgedriver.exe"

     edge_opts = EdgeOptions()
     edge_opts.use_chromium = True
     edge_opts.binary_location = "C:\\Program Files (x86)\\Microsoft\\Edge Dev\\Application\\msedge.exe"

     # Store the driver on the instance first so it is reachable even if a
     # later setup call fails.
     self.driver = browser = Edge(options=edge_opts, executable_path=driver_exe)
     browser.implicitly_wait(30)
     browser.maximize_window()
     browser.get("http://localhost:4200")
コード例 #2
0
def __edge_driver(browser_options):
    """Create an Edge driver configured from ``browser_options``.

    Reads the ``headless``, ``browser_binary_location``, ``operating_system``
    and ``webdriver_location`` attributes of ``browser_options``. When a
    webdriver location is given, ``Edge`` is started with that executable;
    otherwise ``EdgeDriver.WebDriver`` is used with the options only.
    """
    opts = EdgeOptions()
    opts.use_chromium = True

    if browser_options.headless:
        opts.add_argument('headless')

    binary = browser_options.browser_binary_location
    if binary:
        opts.binary_location = binary

    # NOTE(review): any truthy operating_system selects 'LINUX'; the actual
    # value is not consulted -- confirm this is intended.
    if browser_options.operating_system:
        opts.set_capability('platform', 'LINUX')

    driver_path = browser_options.webdriver_location
    if not driver_path:
        return EdgeDriver.WebDriver(options=opts)
    return Edge(options=opts, executable_path=driver_path)
コード例 #3
0
    def spider_opened(self, spider):
        """Launch the Edge driver when the spider opens and allow a manual login.

        Original author's guidance: in Scrapy, create the driver object as
        rarely as possible -- in the initializer or when the spider is
        opened, and never inside process_request().
        """
        # Edge user-data folder of the local Windows account (hard-coded).
        profile_root = "C:\\Users\\wangy\\AppData\\Local\\Microsoft\\Edge\\User Data"

        edge_opts = EdgeOptions()

        # Original author's notes on options that were tried and left off:
        #   * use_chromium = True -- reported to fail with "chrome is not
        #     reachable"; in that mode the driver must be named
        #     msedgedriver.exe instead of MicrosoftWebDriver.exe.
        #   * --remote-debugging-port=59692
        #   * headless / --headless -- no visible window; on Linux without a
        #     display, startup fails unless this is set.
        #   * --no-sandbox -- lets Chrome run as root.
        #   * --disable-dev-shm-usage
        #   * disable-gpu -- mentioned in Google's docs as a bug workaround.
        #   * -inprivate -- private browsing mode.
        #   * binary_location pointing at
        #     C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe

        # Original comment: absolute path of the browser executable, to pick
        # the browser location manually.
        # NOTE(review): the value is a driver file name, not a browser path
        # -- confirm this is intended.
        edge_opts.binary_location = r"MicrosoftWebDriver.exe"

        edge_opts.add_argument("user-data-dir=" + profile_root)
        edge_opts.add_argument("profile-directory=" + profile_root + "\\Default")

        self.driver = Edge(options=edge_opts)

        # give time to login manually
        self.driver.get('https://i.cnblogs.com/posts?pageSize=100')
        time.sleep(30)

        opened_msg = 'Spider opened: %s' % spider.name
        spider.logger.info(opened_msg)
コード例 #4
0
def img_download(url, path, count):
    """Open ``url`` in Edge, collect its images and download each of them.

    Parameters
    ----------
    url : str
        Page to load and scan for images.
    path : str
        Destination passed through to ``download`` for every image.
    count : int
        Running counter passed to ``download``; each call's return value
        replaces it (presumably the updated number of saved images --
        confirm against ``download``).

    Returns
    -------
    The last value returned by ``download``, or ``count`` unchanged when no
    image was processed or the page could not be loaded.
    """
    chrome_driver_path = "D:\\programming\\Machine learning\\ml_projects\\google image scraper\\msedgedriver.exe"
    browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge Beta\\Application\\msedge.exe"
    option = EdgeOptions()
    option.binary_location = browser_path
    driver = Edge(executable_path=chrome_driver_path, options=option)
    try:
        driver.get(url)
        # Scroll repeatedly so content loaded on scroll has a chance to appear.
        for _ in range(10):
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(.2)
        for img in get_all_images(url, driver):
            # for each img, download it
            count = download(img, path, count)
    except WebDriverException:
        print("page down")
    finally:
        # Every call starts its own browser; always shut it down so repeated
        # calls do not pile up Edge/driver processes.
        driver.quit()
    return count
コード例 #5
0
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import TimeoutException
import os, datetime, csv
import pickle

# Module-level browser setup: a headless, Chromium-based Edge session that the
# rest of the script drives through ``browser``.
driverOptions = EdgeOptions()
driverOptions.use_chromium = True
# Hard-coded Windows install location of the Edge browser binary.
driverOptions.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
driverOptions.add_argument("--headless")
# Drop the "enable-logging" switch (presumably to silence driver console
# output -- confirm).
driverOptions.add_experimental_option("excludeSwitches", ["enable-logging"])
# No executable_path is given, so the Edge driver's default is used.
browser = Edge(options=driverOptions)

try:
    try:
        browser.get('https://www.netztest.at/')
        cookies = pickle.load(
            open(os.path.join(os.path.dirname(__file__), 'cookies.pkl'), "rb"))

        for cookie in cookies:
            if cookie['name'] == 'RMBTuuid':
                cookie['expiry'] = int(
                    (datetime.datetime.utcnow().replace(
                        hour=0, minute=0, second=0, microsecond=0) -
                     datetime.datetime(1970, 1, 1)).total_seconds() + 1209600)
                browser.add_cookie(cookie)
                print("added cookie ", cookie)
    except Exception:
コード例 #6
0
def get_browser(_config):
    """Build a Selenium browser driver from the configuration.

    Reads ``_config['selenium']`` (``browserType``, ``headless``, ``binary``)
    and the first entry of ``_config['user-agent']``. Supported browser
    types are ``'Chrome'``, ``'Edge'`` and ``'Firefox'``; supported
    platforms are linux, darwin and win32.

    :param _config: configuration mapping as described above
    :return: the driver with its window sized to 500x700, or ``None`` after
        printing an error when the driver could not be created (unknown
        browser type, unsupported platform, or a ``WebDriverException``
        raised by Selenium)
    """
    browser_type = _config['selenium']['browserType']
    headless = _config['selenium']['headless']
    binary = _config['selenium']['binary']
    user_agent = _config['user-agent'][0]
    try:
        # Previously an unsupported platform left the driver variable unbound
        # and crashed with UnboundLocalError; report it through the regular
        # error path instead.
        if sys.platform not in ('linux', 'darwin', 'win32'):
            raise WebDriverException(
                "unsupported platform: %s" % sys.platform)

        if browser_type == 'Chrome':
            chrome_options = webdriver.ChromeOptions()
            # Prevents errors in some environments
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_experimental_option(
                "excludeSwitches", ['enable-automation', 'enable-logging'])
            chrome_options.add_argument(f'user-agent={user_agent}')
            if binary != "":
                # When the browser cannot be found, its path has to be set
                # in the config
                chrome_options.binary_location = binary
            if headless:
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
            _browser = webdriver.Chrome(
                executable_path=get_file("./drivers/chromedriver"),
                desired_capabilities={},
                options=chrome_options)
        elif browser_type == 'Edge':
            from msedge.selenium_tools import Edge, EdgeOptions
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            edge_options.add_argument('--no-sandbox')
            edge_options.add_argument('--disable-dev-shm-usage')
            edge_options.add_experimental_option(
                "excludeSwitches", ['enable-automation', 'enable-logging'])
            if binary != "":
                edge_options.binary_location = binary
            if headless:
                edge_options.add_argument('--headless')
                edge_options.add_argument('--disable-gpu')
            _browser = Edge(
                executable_path=get_file("./drivers/msedgedriver"),
                capabilities={},
                options=edge_options)
        elif browser_type == 'Firefox':
            # Clear the log left over from the previous run first
            if not os.path.exists(get_file("./logs")):
                os.mkdir(get_file("./logs/"))
            open(get_file("./logs/geckodriver.log"), "w").close()

            firefox_options = webdriver.FirefoxOptions()
            firefox_options.log.level = "fatal"
            if binary != "":
                firefox_options.binary_location = binary
            if headless:
                firefox_options.add_argument('--headless')
                firefox_options.add_argument('--disable-gpu')
            firefox_kwargs = {
                'executable_path': get_file('./drivers/geckodriver'),
                'options': firefox_options,
            }
            # Only the linux setup redirects the geckodriver service log.
            if sys.platform == 'linux':
                firefox_kwargs['service_log_path'] = get_file(
                    "./logs/geckodriver.log")
            _browser = webdriver.Firefox(**firefox_kwargs)
        else:
            raise WebDriverException
        _browser.set_window_size(500, 700)
        return _browser
    except WebDriverException:
        # Driver problem
        print("ERROR", "浏览器错误", "请检查你下载并解压好的驱动是否放在drivers目录下")
        return None
コード例 #7
0
def main():
    """Search Google Images for a term and download the images found.

    Reads two lines from standard input: the search text and the number of
    images requested. The results page is opened in Edge and scrolled, image
    URLs are extracted from the page source and saved as
    ``<download_path><search_text>/<n>.jpeg``. The remaining links are
    written to ``page_source.txt`` and handed to ``img_scp.img_download``.
    The browser is always closed, even when a step fails.
    """
    searchtext = input()
    num_requested = int(input())
    # number_of_scrolls * 400 images will be opened in the browser
    number_of_scrolls = num_requested // 400 + 1

    target_dir = download_path + searchtext.replace(" ", "_")
    os.makedirs(target_dir, exist_ok=True)

    url = "https://www.google.co.in/search?q="+searchtext+"&source=lnms&tbm=isch"
    chrome_driver_path = "msedgedriver.exe"
    browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge Beta\\Application\\msedge.exe"
    option = EdgeOptions()
    option.binary_location = browser_path
    driver = Edge(executable_path=chrome_driver_path, options=option)
    try:
        driver.get(url)

        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
        extensions = {"jpg", "jpeg", "png", "gif"}
        img_count = 0
        downloaded_img_count = 0

        for _ in range(number_of_scrolls):
            for __ in range(15):
                driver.execute_script("window.scrollBy(0, 1000000)")
                time.sleep(0.2)
            time.sleep(0.5)
            try:
                driver.find_element_by_xpath(
                    "//input[@value='Show more results']").click()
            except Exception as e:
                print("Less images found: {}".format(e))
                break

        # Splitting on '"' leaves every quoted attribute value as its own
        # token, so no token contains a quote character.
        tokens = driver.page_source.split('"')
        link_set = set()
        image_set = set()
        for token in tokens:
            if (token.startswith('https:') and 'gstatic' not in token
                    and 'google' not in token):
                link_set.add(token)
            if token.startswith('http'):
                tail = token.split('.')[-1]
                if 'usqp=CAU' in tail or tail in extensions:
                    image_set.add(token)
        links = list(link_set)
        imges = list(image_set)
        print(imges)
        links_left = Diff(links, imges)

        # removing duplicates while keeping first-seen order
        urls_new = list(dict.fromkeys(links_left))

        # NOTE(review): the URLs are written without separators, exactly as
        # before -- confirm whether one URL per line was intended.
        with open("page_source.txt", "w", encoding='utf8') as file1:
            file1.writelines(urls_new)

        print("Total images: {}\n".format(len(imges)))
        for img in imges:
            img_count += 1
            print("Downloading image {}:{}".format(img_count, img))
            try:
                req = Request(img, headers=headers)
                with urlopen(req) as response:
                    raw_img = response.read()
                # The context manager closes the file; the original
                # `f.close` lacked the call parentheses and never did.
                with open(target_dir + "/" + str(downloaded_img_count) +
                          "." + "jpeg", "wb") as f:
                    f.write(raw_img)
                downloaded_img_count += 1
            except Exception as e:
                print("Download failed: {}".format(e))
            if downloaded_img_count >= num_requested:
                break

        print("Total downloaded: {}/{}".format(downloaded_img_count, img_count))
        print("Total images: {}\n".format(len(urls_new)))

        for url in urls_new:
            img_count = img_scp.img_download(url, target_dir + "/", img_count)
    finally:
        # Release the browser even when scraping or downloading fails.
        driver.quit()
コード例 #8
0
def get_browser(_config_, path_prefix=""):
    """Build a Selenium browser driver from the configuration.

    Reads ``_config_['selenium']`` (``browserType``, ``headless``,
    ``binary``) and the first entry of ``_config_['user-agent']``. Supported
    browser types are ``'Chrome'``, ``'Edge'`` and ``'Firefox'``; supported
    platforms are linux, darwin and win32.

    :param _config_: configuration mapping as described above
    :param path_prefix: string prepended to the Chrome and Edge driver paths
        before they are resolved with ``get_file``
    :return: the driver with its window sized to 500x700; on a
        ``WebDriverException`` (including unknown browser type or
        unsupported platform) an error is printed and the process exits
        with status 1
    """
    browser_type = _config_['selenium']['browserType']
    headless = _config_['selenium']['headless']
    binary = _config_['selenium']['binary']
    user_agent = _config_['user-agent'][0]
    try:
        # Previously an unsupported platform left the driver as None and
        # crashed with an uncaught AttributeError; report it through the
        # regular error path instead.
        if sys.platform not in ('linux', 'darwin', 'win32'):
            raise WebDriverException(
                "unsupported platform: %s" % sys.platform)

        if browser_type == 'Chrome':
            chrome_options = webdriver.ChromeOptions()
            # Prevents errors in some environments
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_experimental_option(
                "excludeSwitches", ['enable-automation', 'enable-logging'])
            chrome_options.add_argument(f'user-agent={user_agent}')
            if binary != "":
                # When the browser cannot be found, its path has to be set
                # in the config
                chrome_options.binary_location = binary
            if headless:
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
            _browser_ = webdriver.Chrome(
                executable_path=get_file(path_prefix +
                                         "./drivers/chromedriver"),
                desired_capabilities={},
                options=chrome_options)
        elif browser_type == 'Edge':
            from msedge.selenium_tools import Edge, EdgeOptions
            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            edge_options.add_argument('--no-sandbox')
            edge_options.add_argument('--disable-dev-shm-usage')
            edge_options.add_experimental_option(
                "excludeSwitches", ['enable-automation', 'enable-logging'])
            if binary != "":
                edge_options.binary_location = binary
            if headless:
                edge_options.add_argument('--headless')
                edge_options.add_argument('--disable-gpu')
            _browser_ = Edge(
                executable_path=get_file(path_prefix +
                                         "./drivers/msedgedriver"),
                capabilities={},
                options=edge_options)
        elif browser_type == 'Firefox':
            # NOTE(review): unlike Chrome/Edge, the Firefox driver and log
            # paths do not use path_prefix -- confirm whether intended.
            # Clear the log left over from the previous run first
            if not os.path.exists(get_file("./logs")):
                os.mkdir(get_file("./logs/"))
            open(get_file("./logs/geckodriver.log"), "w").close()

            firefox_options = webdriver.FirefoxOptions()
            firefox_options.log.level = "fatal"
            if binary != "":
                firefox_options.binary_location = binary
            if headless:
                firefox_options.add_argument('--headless')
                firefox_options.add_argument('--disable-gpu')
            firefox_kwargs = {
                'executable_path': get_file('./drivers/geckodriver'),
                'options': firefox_options,
            }
            # Only the linux setup redirects the geckodriver service log.
            if sys.platform == 'linux':
                firefox_kwargs['service_log_path'] = get_file(
                    "./logs/geckodriver.log")
            _browser_ = webdriver.Firefox(**firefox_kwargs)
        else:
            raise WebDriverException
        _browser_.set_window_size(500, 700)
        return _browser_
    except WebDriverException as e:
        # Driver problem: pick the most specific hint for the user.
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        details = str(e.args)
        if "This version of ChromeDriver only supports Chrome version" in details:
            print("\r[%s] [ERROR] 浏览器错误(chromedriver版本错误),请比对前三位版本号" %
                  (now))
        elif "'chromedriver' executable needs to be in PATH" in details:
            print("\r[%s] [ERROR] 浏览器错误,请检查你下载并解压好的驱动是否放在drivers目录下" %
                  (now))
        elif "unknown error: cannot find Chrome binary" in details:
            print(
                "\r[%s] [ERROR] 浏览器错误(Chrome浏览器可执行文件路径未成功识别),请在配置文件中修改selenium.binary为浏览器可执行文件绝对路径"
                % (now))
        else:
            print(
                "\r[%s] [ERROR] 浏览器错误, 请检查你下载并解压好的驱动是否放在drivers目录下,如需帮助请及时反馈; err: %s"
                % (now, details))
        sys.exit(1)
コード例 #9
0
def open_browser(executable_path="msedgedriver",
                 edge_args=None,
                 desired_capabilities=None,
                 **kwargs):
    """Open a Chromium-based Edge browser instance and cache the driver.

    Parameters
    ----------
    executable_path : str (Default "msedgedriver")
        Driver executable to use when the ``${EDGEDRIVER_PATH}`` variable
        has no truthy value.
    edge_args : iterable of str (Default None)
        Extra browser arguments; each is added with leading whitespace
        stripped. If any contains ``--headless`` the ``Headless`` config
        value is set to True.
    desired_capabilities : dict (Default None)
        Passed unchanged to the ``Edge`` constructor.
    **kwargs
        ``edge_path``: browser binary location (falls back to the
        ``${EDGE_PATH}`` variable). ``headless``: when the key is present,
        headless mode is enabled. ``prefs``: dict, or a string converted
        with ``util.prefs_to_dict``, set as the ``prefs`` experimental option.

    Returns
    -------
    The created ``Edge`` driver, after it has been handed to
    ``browser.cache_browser``.
    """
    options = EdgeOptions()
    options.use_chromium = True

    binary_path = kwargs.get('edge_path', None)
    if not binary_path:
        binary_path = BuiltIn().get_variable_value('${EDGE_PATH}')
    if binary_path:
        options.binary_location = binary_path

    if user.is_root():
        options.add_argument("no-sandbox")

    if edge_args:
        if any('--headless' in arg.lower() for arg in edge_args):
            CONFIG.set_value('Headless', True)
        for arg in edge_args:
            options.add_argument(arg.lstrip())

    options.add_argument("start-maximized")
    options.add_argument("--disable-notifications")

    # NOTE(review): only the presence of the key is checked, so
    # headless=False also enables headless mode -- confirm intended.
    if 'headless' in kwargs:
        CONFIG.set_value('Headless', True)
        options.add_argument("--headless")

    if 'prefs' in kwargs:
        prefs = kwargs.get('prefs')
        if not isinstance(prefs, dict):
            prefs = util.prefs_to_dict(prefs.strip())
        options.add_experimental_option('prefs', prefs)
        logger.warn("prefs: {}".format(prefs))

    driver_path = (BuiltIn().get_variable_value('${EDGEDRIVER_PATH}')
                   or executable_path)
    driver = Edge(driver_path,
                  options=options,
                  desired_capabilities=desired_capabilities)
    browser.cache_browser(driver)
    return driver