def Chrome(self):
    """Create a Chrome WebDriver configured from ``self._params``.

    Keys read from ``self._params``:
        headless: truthy -> start Chrome with ``--headless``.
        proxy: when not ``None`` the value is handed to ``add_argument``
            verbatim, so it must already be a complete switch such as
            ``--proxy-server=http://127.0.0.1:10152``.
        local_user_dir: truthy -> reuse the local Chrome user data
            directory (cookies etc.). Only supported on Windows.
        images: falsy -> block image loading.
        js: falsy -> block JavaScript.
        ua: truthy -> use a User-Agent obtained from ``random_ua()``.

    Returns:
        The ``webdriver.Chrome`` instance, started with the driver binary
        at ``self._executable_path``.

    Raises:
        SystemExit: with status 1 when ``local_user_dir`` is requested on
            a non-Windows system.
    """
    params = self._params
    chrome_options = webdriver.ChromeOptions()

    if params['headless']:
        chrome_options.add_argument('--headless')

    if params['proxy'] is not None:
        # Passed through unchanged; the original author marked this path
        # as untested.
        chrome_options.add_argument(params['proxy'])

    if params['local_user_dir']:
        # The default profile location below is Windows-specific.
        if platform.system() != "Windows":
            print('监测到当前系统不为windows,请自行填写profile_dir')
            # Same exception and status as the site-provided ``exit(1)``,
            # but does not depend on the ``site`` module being loaded.
            raise SystemExit(1)
        login_user = os.getlogin()
        profile_dir = r'C:\Users\{}\AppData\Local\Google\Chrome\User Data'.format(
            login_user)
        print("预计本地Chrome浏览器的用户数据文件夹为:", profile_dir)
        chrome_options.add_argument(
            "user-data-dir=" + os.path.abspath(profile_dir))

    # Content settings: the value 2 blocks the corresponding content type.
    content_settings = {}
    if not params['images']:
        content_settings['images'] = 2
    if not params['js']:
        content_settings['javascript'] = 2
    if content_settings:
        chrome_options.add_experimental_option(
            "prefs",
            {'profile.default_content_setting_values': content_settings})

    if params['ua']:
        # The User-Agent is a command-line switch, not a content setting;
        # placing it inside the prefs dict leaves the UA unchanged.
        chrome_options.add_argument('--user-agent=' + random_ua())

    # NOTE(review): ``chrome_options=`` / ``executable_path=`` are deprecated
    # in newer Selenium releases -- confirm against the pinned version.
    driver = webdriver.Chrome(chrome_options=chrome_options,
                              executable_path=self._executable_path)
    return driver
def PhantomJS(self):
    """Create a PhantomJS WebDriver with a random User-Agent and no images.

    Returns:
        The ``webdriver.PhantomJS`` instance, started with the binary at
        ``self._executable_path`` and an empty ``service_args`` list (no
        proxy is configured).
    """
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    # Work on a copy so the shared PHANTOMJS template is left untouched.
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities.update({
        # User-Agent supplied by random_ua()
        "phantomjs.page.settings.userAgent": random_ua(),
        # Do not load images
        "phantomjs.page.settings.loadImages": False,
    })

    # Proxy switches (e.g. '--proxy=127.0.0.1:4860', '--proxy-type=socks5')
    # would be listed here; none are set.
    extra_service_args = list()

    return webdriver.PhantomJS(
        desired_capabilities=capabilities,
        executable_path=self._executable_path,
        service_args=extra_service_args,
    )
def Chrome(self):
    """Create a Chrome WebDriver configured from ``self._params``.

    Keys read from ``self._params``:
        headless: truthy -> start Chrome with ``--headless``.
        proxy: when not ``None`` the value is handed to ``add_argument``
            verbatim, so it must already be a complete switch such as
            ``--proxy-server=http://127.0.0.1:10152``.
        local_config: truthy -> start Chrome with the user data directory
            given by ``profile_dir``.
        profile_dir: path of the user data directory; only read when
            ``local_config`` is truthy.
        images: falsy -> block image loading.
        js: falsy -> block JavaScript.
        ua: truthy -> use a User-Agent obtained from ``random_ua()``.

    Returns:
        The ``webdriver.Chrome`` instance, started with the driver binary
        at ``self._executable_path``.
    """
    params = self._params
    chrome_options = webdriver.ChromeOptions()

    if params['headless']:
        chrome_options.add_argument('--headless')

    if params['proxy'] is not None:
        # Passed through unchanged; the original author marked this path
        # as untested.
        chrome_options.add_argument(params['proxy'])

    if params['local_config']:
        profile_dir = os.path.abspath(params['profile_dir'])
        print("os.path.abspath(params['profile_dir']) = ", profile_dir)
        chrome_options.add_argument("user-data-dir=" + profile_dir)

    # Content settings: the value 2 blocks the corresponding content type.
    content_settings = {}
    if not params['images']:
        content_settings['images'] = 2
    if not params['js']:
        content_settings['javascript'] = 2
    if content_settings:
        chrome_options.add_experimental_option(
            "prefs",
            {'profile.default_content_setting_values': content_settings})

    if params['ua']:
        # The User-Agent is a command-line switch, not a content setting;
        # placing it inside the prefs dict leaves the UA unchanged.
        chrome_options.add_argument('--user-agent=' + random_ua())

    # NOTE(review): ``chrome_options=`` / ``executable_path=`` are deprecated
    # in newer Selenium releases -- confirm against the pinned version.
    driver = webdriver.Chrome(chrome_options=chrome_options,
                              executable_path=self._executable_path)
    return driver
def main():
    """Fetch the Baidu home page and pretty-print its decoded source.

    Sends a GET request through ``atools_crawler.requests`` with a
    Keep-Alive connection header and a User-Agent from ``random_ua()``,
    decodes the response body as UTF-8 and passes the text to ``pprint``.
    """
    # 1. Request settings. ``proxies`` and ``params`` are prepared here but
    #    are not passed to the request issued below.
    proxies = dict(
        http='http://127.0.0.1:54422',
        https='https://127.0.0.1:54422',
    )
    params = dict()
    headers = {}
    headers['Connection'] = 'Keep-Alive'
    headers['User-Agent'] = random_ua()

    # 2. Send the request
    url = "http://www.baidu.com"
    response = atools_crawler.requests.get(url, headers=headers)

    # 3. Decode the reply and print the page source
    raw_body = response.content
    page_source = raw_body.decode('utf8')
    pprint(page_source)