def setUp(self):
    """Start a Chromium-based Edge session and open the application.

    The browser binary is the Edge Dev install under ``Program Files (x86)``
    and the driver is ``edgedriver_win64\\msedgedriver.exe`` located next to
    this file.  After start-up the driver gets a 30 second implicit wait, the
    window is maximized and ``http://localhost:4200`` is loaded.
    """
    # Directory of this file; the bundled driver lives in a sub-folder of it.
    here = os.path.dirname(os.path.realpath(__file__))

    edge_options = EdgeOptions()
    edge_options.use_chromium = True
    edge_options.binary_location = (
        "C:\\Program Files (x86)\\Microsoft\\Edge Dev\\Application\\msedge.exe"
    )

    self.driver = Edge(
        options=edge_options,
        executable_path=here + "\\edgedriver_win64\\msedgedriver.exe",
    )
    self.driver.implicitly_wait(30)
    self.driver.maximize_window()
    self.driver.get("http://localhost:4200")
def __edge_driver(browser_options):
    """Create an Edge WebDriver configured from ``browser_options``.

    The following attributes of ``browser_options`` are read:

    * ``headless`` -- when truthy, the ``headless`` argument is added.
    * ``browser_binary_location`` -- when truthy, used as the browser binary.
    * ``operating_system`` -- when truthy, the ``platform`` capability is set.
    * ``webdriver_location`` -- when truthy, passed to ``Edge`` as the driver
      executable; otherwise ``EdgeDriver.WebDriver`` is created without an
      explicit executable path.

    :return: the started driver instance.
    """
    edge_options = EdgeOptions()
    edge_options.use_chromium = True

    if browser_options.headless:
        edge_options.add_argument('headless')

    if browser_options.browser_binary_location:
        edge_options.binary_location = browser_options.browser_binary_location

    if browser_options.operating_system:
        # NOTE(review): the capability is always 'LINUX', whatever value
        # ``operating_system`` holds -- confirm this is intended.
        edge_options.set_capability('platform', 'LINUX')

    if not browser_options.webdriver_location:
        return EdgeDriver.WebDriver(options=edge_options)

    return Edge(
        options=edge_options,
        executable_path=browser_options.webdriver_location,
    )
def spider_opened(self, spider):
    """Create the shared Edge driver when the spider opens.

    Presumably wired to Scrapy's ``spider_opened`` signal (verify against the
    middleware registration).  The driver is stored on ``self.driver``, the
    cnblogs post list is opened, and the method then blocks for 30 seconds so
    the user can log in by hand before crawling starts.

    Guideline kept from the original notes: create the driver as rarely as
    possible -- in the initializer or when the spider opens, never inside
    ``process_request()``.
    """
    edge_options = EdgeOptions()

    # Options the original author tried and left disabled:
    #   * use_chromium = True -- reported to fail with "chrome is not
    #     reachable"; with it the driver must be named msedgedriver.exe,
    #     otherwise MicrosoftWebDriver.exe.
    #   * --remote-debugging-port=59692
    #   * headless mode (needed on Linux systems without a display)
    #   * --no-sandbox (lets the browser run as root), --disable-dev-shm-usage
    #   * disable-gpu (documented workaround for a bug)
    #   * -inprivate (private browsing)
    #   * binary_location pointing at
    #     C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe

    # Manually specified browser executable location.
    # NOTE(review): this value is the legacy *driver* file name, not a browser
    # binary -- confirm it is what was intended.
    edge_options.binary_location = r"MicrosoftWebDriver.exe"

    # Reuse the local Edge profile so existing state is available.
    profile_arguments = (
        "user-data-dir=C:\\Users\\wangy\\AppData\\Local\\Microsoft\\Edge\\User Data",
        "profile-directory=C:\\Users\\wangy\\AppData\\Local\\Microsoft\\Edge\\User Data\\Default",
    )
    for argument in profile_arguments:
        edge_options.add_argument(argument)

    self.driver = Edge(options=edge_options)

    # Open the post list, then give the user time to log in manually.
    self.driver.get('https://i.cnblogs.com/posts?pageSize=100')
    time.sleep(30)

    spider.logger.info('Spider opened: %s' % spider.name)
def img_download(url, path, count):
    """Open *url* in Edge, scroll the page and download every image found.

    A fresh Edge (Beta) session is started for each call.  The page is
    scrolled ten times so lazily loaded images appear, then
    ``get_all_images`` collects the image references and ``download`` saves
    each one under *path*.

    :param url: address of the page to scrape.
    :param path: destination passed through to ``download``.
    :param count: running counter passed to, and updated by, ``download``.
    :return: the counter after the last successful ``download`` call; if the
        page could not be processed the value reached so far is returned.
    """
    edge_driver_path = "D:\\programming\\Machine learning\\ml_projects\\google image scraper\\msedgedriver.exe"
    browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge Beta\\Application\\msedge.exe"

    option = EdgeOptions()
    option.binary_location = browser_path
    driver = Edge(executable_path=edge_driver_path, options=option)
    try:
        driver.get(url)
        # Scroll far down repeatedly so lazily loaded images get rendered.
        for _ in range(10):
            driver.execute_script("window.scrollBy(0, 1000000)")
            time.sleep(.2)
        for img in get_all_images(url, driver):
            count = download(img, path, count)
    except WebDriverException:
        print("page down")
    finally:
        # Always shut the browser down: this function is called once per
        # page, and without this every call leaked an Edge window plus its
        # msedgedriver process.
        try:
            driver.quit()
        except WebDriverException:
            # Best-effort cleanup; the session may already be gone.
            pass
    return count
from msedge.selenium_tools import Edge, EdgeOptions from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException import os, datetime, csv import pickle driverOptions = EdgeOptions() driverOptions.use_chromium = True driverOptions.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe" driverOptions.add_argument("--headless") driverOptions.add_experimental_option("excludeSwitches", ["enable-logging"]) browser = Edge(options=driverOptions) try: try: browser.get('https://www.netztest.at/') cookies = pickle.load( open(os.path.join(os.path.dirname(__file__), 'cookies.pkl'), "rb")) for cookie in cookies: if cookie['name'] == 'RMBTuuid': cookie['expiry'] = int( (datetime.datetime.utcnow().replace( hour=0, minute=0, second=0, microsecond=0) - datetime.datetime(1970, 1, 1)).total_seconds() + 1209600) browser.add_cookie(cookie) print("added cookie ", cookie) except Exception:
def get_browser(_config):
    """Build the Selenium browser described by ``_config``.

    Reads ``_config['selenium']`` (keys ``browserType``, ``headless`` and
    ``binary``) and the first entry of ``_config['user-agent']``.  Supported
    browser types are ``'Chrome'``, ``'Edge'`` and ``'Firefox'``; the driver
    executables are resolved through ``get_file`` under ``./drivers``.  The
    window is resized to 500x700 once the browser is running.

    :return: the started browser.  When a ``WebDriverException`` occurs
        (including an unknown browser type) an error is printed and ``None``
        is returned implicitly.
    """
    selenium_settings = _config['selenium']
    browser_type = selenium_settings['browserType']
    headless = selenium_settings['headless']
    binary = selenium_settings['binary']
    user_agent = _config['user-agent'][0]

    # NOTE(review): on any other ``sys.platform`` no driver is created and the
    # following ``set_window_size`` call fails on an unbound local.
    all_platforms = ('linux', 'darwin', 'win32')

    try:
        if browser_type == 'Chrome':
            chrome_options = webdriver.ChromeOptions()
            # These two flags avoid start-up errors in some environments.
            for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
                chrome_options.add_argument(flag)
            chrome_options.add_experimental_option(
                "excludeSwitches", ['enable-automation', 'enable-logging'])
            # The configured user agent is only applied to Chrome.
            chrome_options.add_argument(f'user-agent={user_agent}')
            if binary != "":
                # Needed when the browser cannot be located automatically;
                # the path comes from the config.
                chrome_options.binary_location = binary
            if headless:
                for flag in ('--headless', '--disable-gpu'):
                    chrome_options.add_argument(flag)
            # The driver is created the same way on every supported platform.
            if sys.platform in all_platforms:
                _browser = webdriver.Chrome(
                    executable_path=get_file("./drivers/chromedriver"),
                    desired_capabilities={},
                    options=chrome_options)
            _browser.set_window_size(500, 700)

        elif browser_type == 'Edge':
            from msedge.selenium_tools import Edge, EdgeOptions

            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
                edge_options.add_argument(flag)
            edge_options.add_experimental_option(
                "excludeSwitches", ['enable-automation', 'enable-logging'])
            if binary != "":
                edge_options.binary_location = binary
            if headless:
                for flag in ('--headless', '--disable-gpu'):
                    edge_options.add_argument(flag)
            if sys.platform in all_platforms:
                _browser = Edge(
                    executable_path=get_file("./drivers/msedgedriver"),
                    capabilities={},
                    options=edge_options)
            _browser.set_window_size(500, 700)

        elif browser_type == 'Firefox':
            # Start with an empty geckodriver log on every run.
            if not os.path.exists(get_file("./logs")):
                os.mkdir(get_file("./logs/"))
            open(get_file("./logs/geckodriver.log"), "w").close()

            firefox_options = webdriver.FirefoxOptions()
            firefox_options.log.level = "fatal"
            if binary != "":
                firefox_options.binary_location = binary
            if headless:
                for flag in ('--headless', '--disable-gpu'):
                    firefox_options.add_argument(flag)
            if sys.platform == 'linux':
                # Only the Linux branch redirects the driver log.
                _browser = webdriver.Firefox(
                    executable_path=get_file('./drivers/geckodriver'),
                    options=firefox_options,
                    service_log_path=get_file("./logs/geckodriver.log"))
            elif sys.platform in ('darwin', 'win32'):
                _browser = webdriver.Firefox(
                    executable_path=get_file('./drivers/geckodriver'),
                    options=firefox_options)
            _browser.set_window_size(500, 700)

        else:
            # Unknown browser type: reported through the handler below.
            raise WebDriverException
        return _browser
    except WebDriverException:
        # Driver problem: ask the user to check the drivers directory.
        print("ERROR", "浏览器错误", "请检查你下载并解压好的驱动是否放在drivers目录下")
def main():
    """Interactively download Google Images results for a search term.

    Reads the search text and the number of images wanted from standard
    input, opens the image search in Edge (Beta), scrolls and clicks
    "Show more results" to load thumbnails, then:

    1. downloads every direct image URL found in the page source into
       ``download_path + <search text with '_' for spaces>`` as
       ``<n>.jpeg``, stopping once the requested number is reached;
    2. writes the remaining candidate page links to ``page_source.txt``;
    3. hands each of those links to ``img_scp.img_download``.

    Relies on the module-level names ``download_path``, ``Diff`` and
    ``img_scp``.
    """
    # Imported locally so this function brings its own dependency into scope.
    from urllib.parse import quote_plus

    searchtext = input()
    num_requested = int(input())
    # number_of_scrolls * 400 images will be opened in the browser.
    number_of_scrolls = num_requested // 400 + 1

    target_dir = download_path + searchtext.replace(" ", "_")
    os.makedirs(target_dir, exist_ok=True)

    # Encode the user's text so characters such as '&' or '#' cannot break
    # the query string.
    url = ("https://www.google.co.in/search?q=" + quote_plus(searchtext)
           + "&source=lnms&tbm=isch")

    edge_driver_path = "msedgedriver.exe"
    browser_path = "C:\\Program Files (x86)\\Microsoft\\Edge Beta\\Application\\msedge.exe"
    option = EdgeOptions()
    option.binary_location = browser_path
    driver = Edge(executable_path=edge_driver_path, options=option)
    try:
        driver.get(url)
        for _ in range(number_of_scrolls):
            for __ in range(15):
                driver.execute_script("window.scrollBy(0, 1000000)")
                time.sleep(0.2)
            time.sleep(0.5)
            try:
                driver.find_element_by_xpath(
                    "//input[@value='Show more results']").click()
            except Exception as e:
                # Deliberate best effort: no button means no more results.
                print("Less images found: {}".format(e))
                break
        # Splitting on quotes isolates every quoted attribute value / URL.
        html = driver.page_source.split('"')
    finally:
        # The browser is only needed for the page source; close it even when
        # navigation fails so no Edge process is left behind.
        driver.quit()

    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    }
    extensions = {"jpg", "jpeg", "png", "gif"}

    # Candidate result pages: https links that are not Google's own.
    links = list({
        piece for piece in html
        if piece.startswith('https:')
        and 'gstatic' not in piece
        and 'google' not in piece
    })
    # Direct images: thumbnail URLs (marked with usqp=CAU) or URLs that end
    # in a known image extension.
    imges = list({
        piece for piece in html
        if piece.startswith('http')
        and ('usqp=CAU' in piece.split('.')[-1]
             or piece.split('.')[-1] in extensions)
    })
    print(imges)

    links_left = Diff(links, imges)
    # Remove duplicates while keeping the first occurrence of each link.
    urls_new = []
    for link in links_left:
        if link not in urls_new:
            urls_new.append(link)

    # NOTE(review): the links are written back to back with no separator, as
    # before -- confirm whether one link per line was intended.
    with open("page_source.txt", "w", encoding='utf8') as link_dump:
        link_dump.writelines(urls_new)

    img_count = 0
    downloaded_img_count = 0
    print("Total images: {}\n".format(len(imges)))
    for img in imges:
        img_count += 1
        print("Downloading image {}:{}".format(img_count, img))
        try:
            req = Request(img, headers=headers)
            with urlopen(req) as response:
                raw_img = response.read()
            out_path = target_dir + "/" + str(downloaded_img_count) + "." + "jpeg"
            with open(out_path, "wb") as f:
                f.write(raw_img)
            downloaded_img_count += 1
        except Exception as e:
            # One failed image must not abort the whole run.
            print("Download failed: {}".format(e))
        if downloaded_img_count >= num_requested:
            break

    print("Total downloaded: {}/{}".format(downloaded_img_count, img_count))
    print("Total images: {}\n".format(len(urls_new)))
    for page_url in urls_new:
        img_count = img_scp.img_download(page_url, target_dir + "/", img_count)
def get_browser(_config_, path_prefix=""):
    """Build the Selenium browser described by ``_config_``.

    Reads ``_config_['selenium']`` (keys ``browserType``, ``headless`` and
    ``binary``) and the first entry of ``_config_['user-agent']``.  Supported
    browser types are ``'Chrome'``, ``'Edge'`` and ``'Firefox'``.  The window
    is resized to 500x700 once the browser is running.

    :param _config_: configuration mapping as described above.
    :param path_prefix: string prepended to the Chrome and Edge driver paths
        before they are resolved with ``get_file``.
    :return: the started browser.  On ``WebDriverException`` (including an
        unknown browser type) a timestamped error is printed and the process
        exits with status 1.
    """
    selenium_settings = _config_['selenium']
    browser_type = selenium_settings['browserType']
    headless = selenium_settings['headless']
    binary = selenium_settings['binary']
    user_agent = _config_['user-agent'][0]

    # NOTE(review): on any other ``sys.platform`` the browser stays ``None``
    # and the following ``set_window_size`` call fails.
    all_platforms = ('linux', 'darwin', 'win32')

    _browser_ = None
    try:
        if browser_type == 'Chrome':
            chrome_options = webdriver.ChromeOptions()
            # These two flags avoid start-up errors in some environments.
            for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
                chrome_options.add_argument(flag)
            chrome_options.add_experimental_option(
                "excludeSwitches", ['enable-automation', 'enable-logging'])
            # The configured user agent is only applied to Chrome.
            chrome_options.add_argument(f'user-agent={user_agent}')
            if binary != "":
                # Needed when the browser cannot be located automatically;
                # the path comes from the config.
                chrome_options.binary_location = binary
            if headless:
                for flag in ('--headless', '--disable-gpu'):
                    chrome_options.add_argument(flag)
            # The driver is created the same way on every supported platform.
            if sys.platform in all_platforms:
                _browser_ = webdriver.Chrome(
                    executable_path=get_file(path_prefix + "./drivers/chromedriver"),
                    desired_capabilities={},
                    options=chrome_options)
            _browser_.set_window_size(500, 700)

        elif browser_type == 'Edge':
            from msedge.selenium_tools import Edge, EdgeOptions

            edge_options = EdgeOptions()
            edge_options.use_chromium = True
            for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
                edge_options.add_argument(flag)
            edge_options.add_experimental_option(
                "excludeSwitches", ['enable-automation', 'enable-logging'])
            if binary != "":
                edge_options.binary_location = binary
            if headless:
                for flag in ('--headless', '--disable-gpu'):
                    edge_options.add_argument(flag)
            if sys.platform in all_platforms:
                _browser_ = Edge(
                    executable_path=get_file(path_prefix + "./drivers/msedgedriver"),
                    capabilities={},
                    options=edge_options)
            _browser_.set_window_size(500, 700)

        elif browser_type == 'Firefox':
            # NOTE(review): unlike Chrome and Edge, the Firefox driver and log
            # paths do not use ``path_prefix`` -- confirm this is intended.
            # Start with an empty geckodriver log on every run.
            if not os.path.exists(get_file("./logs")):
                os.mkdir(get_file("./logs/"))
            open(get_file("./logs/geckodriver.log"), "w").close()

            firefox_options = webdriver.FirefoxOptions()
            firefox_options.log.level = "fatal"
            if binary != "":
                firefox_options.binary_location = binary
            if headless:
                for flag in ('--headless', '--disable-gpu'):
                    firefox_options.add_argument(flag)
            if sys.platform == 'linux':
                # Only the Linux branch redirects the driver log.
                _browser_ = webdriver.Firefox(
                    executable_path=get_file('./drivers/geckodriver'),
                    options=firefox_options,
                    service_log_path=get_file("./logs/geckodriver.log"))
            elif sys.platform in ('darwin', 'win32'):
                _browser_ = webdriver.Firefox(
                    executable_path=get_file('./drivers/geckodriver'),
                    options=firefox_options)
            _browser_.set_window_size(500, 700)

        else:
            # Unknown browser type: reported through the handler below.
            raise WebDriverException
        return _browser_
    except WebDriverException as e:
        # Driver problem: pick the most specific hint for the user.
        detail = str(e.args)
        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        if "This version of ChromeDriver only supports Chrome version" in detail:
            # chromedriver / Chrome version mismatch.
            print("\r[%s] [ERROR] 浏览器错误(chromedriver版本错误),请比对前三位版本号"
                  % stamp)
        elif "'chromedriver' executable needs to be in PATH" in detail:
            # Driver executable not found.
            print("\r[%s] [ERROR] 浏览器错误,请检查你下载并解压好的驱动是否放在drivers目录下"
                  % stamp)
        elif "unknown error: cannot find Chrome binary" in detail:
            # Browser binary not found; selenium.binary must be configured.
            print(
                "\r[%s] [ERROR] 浏览器错误(Chrome浏览器可执行文件路径未成功识别),请在配置文件中修改selenium.binary为浏览器可执行文件绝对路径"
                % stamp)
        else:
            # Anything else: generic hint plus the raw error arguments.
            print(
                "\r[%s] [ERROR] 浏览器错误, 请检查你下载并解压好的驱动是否放在drivers目录下,如需帮助请及时反馈; err: %s"
                % (stamp, detail))
        sys.exit(1)
def open_browser(executable_path="msedgedriver",
                 edge_args=None,
                 desired_capabilities=None,
                 **kwargs):
    """Open a Chromium-based Edge browser instance and cache the driver.

    Parameters
    ----------
    executable_path : str (Default "msedgedriver")
        Path to the driver executable.  Only used when the Robot Framework
        variable ``${EDGEDRIVER_PATH}`` is empty or unset.
    edge_args : iterable of str (Default None)
        Extra command line arguments for the browser.  Leading whitespace is
        stripped from each one.  If any of them contains ``--headless`` the
        ``Headless`` config value is set to True.
    desired_capabilities : dict (Default None)
        Passed through unchanged to the ``Edge`` driver constructor.
    **kwargs
        ``edge_path`` : browser binary location; falls back to the Robot
        Framework variable ``${EDGE_PATH}``.
        ``headless`` : if the key is present (whatever its value) the
        ``--headless`` argument is added and ``Headless`` is set to True.
        ``prefs`` : a dict, or a string converted with
        ``util.prefs_to_dict``, installed as the ``prefs`` experimental
        option.

    Returns
    -------
    The created ``Edge`` driver, after registering it with
    ``browser.cache_browser``.
    """
    options = EdgeOptions()
    options.use_chromium = True

    # NOTE(review): the original comment here described re-using an existing
    # session via BROWSER_REUSE_ENABLED / BROWSER_SESSION_ID /
    # BROWSER_EXECUTOR_URL.  Nothing in this function reads those values --
    # confirm whether that is handled elsewhere.

    binary_location = (kwargs.get('edge_path')
                       or BuiltIn().get_variable_value('${EDGE_PATH}'))
    if binary_location:
        options.binary_location = binary_location

    if user.is_root():
        options.add_argument("no-sandbox")

    if edge_args:
        if any('--headless' in arg.lower() for arg in edge_args):
            CONFIG.set_value('Headless', True)
        for arg in edge_args:
            options.add_argument(arg.lstrip())

    for fixed_argument in ("start-maximized", "--disable-notifications"):
        options.add_argument(fixed_argument)

    if 'headless' in kwargs:
        CONFIG.set_value('Headless', True)
        options.add_argument("--headless")

    if 'prefs' in kwargs:
        raw_prefs = kwargs.get('prefs')
        if isinstance(raw_prefs, dict):
            prefs = raw_prefs
        else:
            prefs = util.prefs_to_dict(raw_prefs.strip())
        options.add_experimental_option('prefs', prefs)
        logger.warn("prefs: {}".format(prefs))

    # The Robot Framework variable takes precedence over the argument.
    driver_path = (BuiltIn().get_variable_value('${EDGEDRIVER_PATH}')
                   or executable_path)
    driver = Edge(driver_path,
                  options=options,
                  desired_capabilities=desired_capabilities)
    browser.cache_browser(driver)
    return driver