def getDriver():
    """Create a PhantomJS driver with a random user agent and an HTTP proxy.

    The proxy address is read from ``get_proxy().get("proxy")`` and is also
    stored in the module-level ``headers["preProxy"]`` entry.

    Returns:
        The PhantomJS driver, with image loading disabled and both the
        page-load and script timeouts set to 5 seconds.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    # Disguise the browser with a user agent picked at random from the pool.
    # (The earlier assignment of the whole ``headers`` dict to this key was
    # dead code: it was overwritten immediately.)
    dcap["phantomjs.page.settings.userAgent"] = random.choice(user_agent_list)
    # Skipping images makes page crawling considerably faster.
    dcap["phantomjs.page.settings.loadImages"] = False

    headers["preProxy"] = get_proxy().get("proxy")
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': headers["preProxy"],
    })
    # Attach the proxy to the desired capabilities.
    proxy.add_to_capabilities(dcap)

    driver = webdriver.PhantomJS(executable_path='download/phantomjs.exe',
                                 desired_capabilities=dcap)
    # driver.get() has no timeout argument and has been seen to block forever
    # without raising; a page-load timeout (5 s) avoids a stuck crawler.
    driver.set_page_load_timeout(5)
    # Same 5 s limit for asynchronous scripts.
    driver.set_script_timeout(5)
    return driver
def openDriver(self):
    """Start Chrome behind a fixed HTTP proxy and open the CORDIS home page.

    The driver is stored on ``self.driver``; the window is maximised and a
    10 second implicit wait is configured.
    """
    # Only 'httpProxy' is given; the original author noted that setting
    # 'proxyType' explicitly is optional here.
    proxy = Proxy({
        'httpProxy': '117.63.156.123:8118'
    })
    # Build the capabilities for the browser that is actually launched:
    # Chrome, not Firefox as before.
    desired_capabilities = DesiredCapabilities.CHROME.copy()
    # Attach the proxy to the desired capabilities.
    proxy.add_to_capabilities(desired_capabilities)

    host = "https://cordis.europa.eu"
    # Raw string: the backslashes are path separators, not escape sequences.
    chromePath = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    self.driver = webdriver.Chrome(
        executable_path=chromePath,
        desired_capabilities=desired_capabilities)
    self.driver.get(host)
    self.driver.maximize_window()
    self.driver.implicitly_wait(10)
def _create_remote_driver(self, driver, **kwargs):
    """Create a ``webdriver.Remote`` instance for the given browser type.

    Args:
        driver: Key into ``self.DRIVER_CAPABILITIES`` selecting the browser.
        **kwargs:
            remote_url: (required) command executor URL.
            capabilities: optional iterable of dicts merged, in order, into
                the base capabilities.
            proxy: optional proxy address used for HTTP, FTP and SSL.

    Returns:
        The remote driver instance.

    Raises:
        ValueError: if ``remote_url`` is not supplied.
        TypeError: if ``driver`` is not a supported browser type.
    """
    if 'remote_url' not in kwargs:
        raise ValueError('Remote drivers require the declaration of a remote_url')
    remote_url = kwargs['remote_url']
    logger.info('Creating remote driver "%s" (remote_url=%s)', driver, remote_url)

    try:
        # Work on a copy so the shared capability template is not modified.
        capabilities = self.DRIVER_CAPABILITIES[driver].copy()
    except KeyError:
        raise TypeError("Unsupported Browser Type {0}".format(driver))

    # ``or ()`` tolerates an explicit capabilities=None.
    for extra in kwargs.get('capabilities') or ():
        capabilities.update(extra)

    proxy_url = kwargs.get('proxy')
    if proxy_url:
        # A MANUAL proxy without any host is meaningless, so an empty or
        # None value is treated as "no proxy".
        proxy = Proxy({
            'httpProxy': proxy_url,
            'ftpProxy': proxy_url,
            'sslProxy': proxy_url,
            'noProxy': None,
            'proxyType': ProxyType.MANUAL,
            'autodetect': False
        })
        proxy.add_to_capabilities(capabilities)

    return webdriver.Remote(
        desired_capabilities=capabilities,
        command_executor=remote_url
    )
def __open_browser(use_proxy: bool = False):
    """Open a Chrome browser, optionally behind a random proxy.

    Args:
        use_proxy: When true, a proxy from ``Proxies.get_random_proxy()`` is
            used. A ``user:password@ip:port`` proxy is configured through a
            generated Chrome extension written to ``plugin_file``; a plain
            ``ip:port`` proxy is passed through the desired capabilities.

    Returns:
        The Chrome driver.
    """
    # TODO: add user agent
    chrome_options = webdriver.ChromeOptions()
    # Copy: add_to_capabilities() mutates the dict and the library-level
    # template must not keep a stale proxy for later drivers.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()

    if use_proxy:
        random_proxy = Proxies.get_random_proxy()

        if '@' in random_proxy:
            # Split from the right for the host part and once from the left
            # for the credentials, so a password containing '@' or ':' works.
            auth, _, ip_port = random_proxy.rpartition('@')
            user, _, pwd = auth.partition(':')
            ip, _, port = ip_port.rpartition(':')

            with zipfile.ZipFile(plugin_file, 'w') as zp:
                zp.writestr("manifest.json", manifest_json)
                zp.writestr("background.js", background_js % (ip, port, user, pwd))
            chrome_options.add_extension(plugin_file)
        else:
            prox = Proxy()
            prox.proxy_type = ProxyType.MANUAL
            prox.http_proxy = random_proxy
            prox.ssl_proxy = random_proxy
            # socks_proxy is intentionally not set: the address is an HTTP
            # proxy, and chromedriver rejects socksProxy without socksVersion.
            prox.add_to_capabilities(capabilities)

    # The capabilities were previously built but never handed to the driver,
    # so the unauthenticated proxy was silently ignored.
    return webdriver.Chrome(chrome_options=chrome_options,
                            desired_capabilities=capabilities)
def __init__(self, *args, seleniumwire_options=None, **kwargs):
    """Initialise a new Firefox WebDriver instance.

    Args:
        seleniumwire_options: The seleniumwire options dictionary.
    """
    seleniumwire_options = {} if seleniumwire_options is None else seleniumwire_options

    # Reuse the caller's options object when given, otherwise create one.
    if 'options' not in kwargs:
        kwargs['options'] = FirefoxOptions()
    firefox_options = kwargs['options']

    # Stop Firefox from bypassing the Selenium Wire proxy for localhost.
    firefox_options.set_preference(
        'network.proxy.allow_hijacking_localhost', True)
    firefox_options.accept_insecure_certs = True

    config = self._setup_backend(seleniumwire_options)
    auto_config = seleniumwire_options.get('auto_config', True)

    if auto_config and SELENIUM_V4:
        # Selenium v4.0.0 onwards: proxy settings can no longer travel in the
        # desired capabilities, so they go on the options object.
        proxy_settings = config['proxy']
        proxy = Proxy()
        proxy.http_proxy = proxy_settings['httpProxy']
        proxy.ssl_proxy = proxy_settings['sslProxy']
        if 'noProxy' in proxy_settings:
            proxy.no_proxy = proxy_settings['noProxy']
        firefox_options.proxy = proxy
    elif auto_config:
        # Older Selenium versions take the settings through capabilities.
        capabilities = kwargs.get('capabilities', kwargs.get('desired_capabilities'))
        if capabilities is None:
            capabilities = DesiredCapabilities.FIREFOX
        capabilities = capabilities.copy()
        capabilities.update(config)
        kwargs['capabilities'] = capabilities

    super().__init__(*args, **kwargs)
def openDriver(self):
    """Start Chrome behind a fixed proxy, open the search page and click the tab.

    The driver is stored on ``self.driver``. After ``self.searchUrl`` loads,
    the first link inside the element with id ``1_3`` is clicked and
    ``self.listSearch()`` is called. If the tab or its link is missing, a
    message is printed instead.
    """
    # Proxy takes a dict of settings; the same address serves every scheme.
    myProxy = "120.83.106.27:9999"
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': myProxy,
        'ftpProxy': myProxy,
        'sslProxy': myProxy
    })
    # Private copy of the Chrome capabilities with the proxy attached.
    myCapabilities = DesiredCapabilities.CHROME.copy()
    proxy.add_to_capabilities(myCapabilities)
    self.driver = webdriver.Chrome(executable_path=self.chromePath,
                                   desired_capabilities=myCapabilities)
    self.driver.maximize_window()
    self.driver.get(self.searchUrl)
    try:
        # Session-wide setting: every later lookup waits up to 10 s. It only
        # needs to be set once; the former second call was redundant.
        self.driver.implicitly_wait(10)
    except Exception as e:
        print(e)

    # find_element* raises when nothing matches and never returns None, so
    # the old "None != ..." checks could not reach the fallback message.
    # The plural lookups return an empty list instead.
    tabs = self.driver.find_elements_by_id("1_3")
    links = tabs[0].find_elements(By.TAG_NAME, "a") if tabs else []
    if not links:
        print("没有点击 专业检索!")
        return

    links[0].click()
    self.listSearch()
def prepare_desired_capabilities(self):
    """Return a copy of the Firefox capabilities with JavaScript enabled.

    A manual HTTP/SSL proxy is built from ``configs.RANDOM_PROXY`` but is
    NOT attached to the returned capabilities.
    """
    firefox_caps = dict(DesiredCapabilities.FIREFOX, javascriptEnabled=True)

    # Proxy preparation; attaching it to the capabilities is currently
    # disabled, so the object below is discarded.
    address = configs.RANDOM_PROXY(return_tuple=False)
    manual_proxy = Proxy()
    manual_proxy.proxy_type = ProxyType.MANUAL
    manual_proxy.http_proxy = address
    manual_proxy.ssl_proxy = address

    return firefox_caps
def _create_firefox_driver(proxy_url):
    """Return a Firefox driver, routed through ``proxy_url`` when it is truthy."""
    if not proxy_url:
        return webdriver.Firefox(proxy=None)

    # One address serves HTTP, FTP and SSL traffic.
    manual_proxy = Proxy({
        'proxyType': 'MANUAL',
        'httpProxy': proxy_url,
        'ftpProxy': proxy_url,
        'sslProxy': proxy_url
    })
    return webdriver.Firefox(proxy=manual_proxy)
def get_proxy(proxy_url: str):
    """Build a manual Selenium ``Proxy`` using ``proxy_url`` for HTTP, FTP and SSL.

    ``localhost`` is excluded from proxying.
    """
    settings = dict.fromkeys(('httpProxy', 'ftpProxy', 'sslProxy'), proxy_url)
    settings['proxyType'] = ProxyType.MANUAL
    settings['noProxy'] = 'localhost'  # set this value as desired
    return Proxy(settings)
def setUpClass(cls):
    """Run the parent class setup, then start the shared proxied browser.

    The parent setup runs first so that a failure there cannot leave an
    orphaned browser process behind. If the browser itself fails to start,
    the parent setup is rolled back before the error propagates.
    """
    super(MySeleniumTests, cls).setUpClass()

    myproxy = "http://rproxy.mcp.com:3128"
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': myproxy,
        'ftpProxy': myproxy,
        'sslProxy': myproxy,
        'noProxy': ''}
    )
    try:
        cls.selenium = WebDriver(proxy=proxy)
    except Exception:
        # tearDownClass is not invoked when setUpClass raises, so undo the
        # parent setup by hand.
        super(MySeleniumTests, cls).tearDownClass()
        raise
def webDriverPhantomJS(url, **kwargs):
    """Fetch ``url`` with PhantomJS through a random proxy and return the HTML.

    Args:
        url: Page to load.
        **kwargs: ``action`` is forwarded to ``__doAction`` together with the
            driver once the page has loaded.

    Returns:
        The page source after the action has run. A screenshot is written to
        ``test2.png`` and the driver is always quit.
    """
    # NOTE(review): this disables HTTPS certificate verification for the
    # whole process, not just this call - confirm that this is intended.
    ssl._create_default_https_context = ssl._create_unverified_context
    print("[info]webDriver:设置Header/代理IP")
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 "
    )
    ipDict = getRandomOneIP()
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        # format() also accepts a numeric port, unlike string concatenation.
        'httpProxy': '{}:{}'.format(ipDict['ip'], ipDict['port'])
    })
    proxy.add_to_capabilities(dcap)
    print("[info]webDriver:获取代理IP成功{}".format(str(ipDict)))

    # Only the binary location differs between platforms.
    if 'Linux' in platform.system():
        phantomjs_path = '/root/xiaociwei_download/zywa_crawl_platform/plug/geckodriver/phantomjs-2.1.1-linux-x86_64/bin/phantomjs'
    else:
        phantomjs_path = '/Users/magic/PycharmProjects/zywa-spider-xiaociwei/plug/geckodriver/phantomjs-2.1.1-macosx/bin/phantomjs'
    driver = webdriver.PhantomJS(
        executable_path=phantomjs_path,
        service_args=['--ssl-protocol=any'],
        desired_capabilities=dcap)

    try:
        print("[info]webDriver:初始化webDriver成功")
        driver.get(url)
        print("[info]webDriver:访问成功")
        __doAction(kwargs.get('action'), driver)
        print("[info]执行操作码成功")
        driver.save_screenshot('test2.png')
        return driver.page_source
    finally:
        # Report success only after the driver has really been shut down.
        driver.quit()
        print("[info]关闭driver成功")
def process_request(self, request, spider):
    """Load the request URL in ``spider.chrome`` and return the rendered page.

    Args:
        request: The request being processed; only its ``url`` is read.
        spider: The running spider; must expose a Selenium driver as
            ``spider.chrome``.

    Returns:
        A ``TextResponse`` whose body is the UTF-8 encoded content returned
        by ``self.get_response_content``.
    """
    # TODO: proxy support. The previous placeholder proxy ('ip:port') and
    # capabilities were built but never applied, so they were removed.
    spider.chrome.get(request.url)
    content = self.get_response_content(spider.chrome)

    # Close the current window when the page opened additional ones.
    windows = spider.chrome.window_handles
    if len(windows) > 1:
        spider.chrome.close()

    body = content.encode('utf-8')
    # Declare the encoding used above so it is not guessed from the bytes.
    return TextResponse(url=request.url, request=request, body=body,
                        encoding='utf-8')
def openDriver(self):
    """Start Firefox (no proxy) and open the CORDIS home page.

    The driver is stored on ``self.driver``; the window is maximised and a
    10 second implicit wait is configured.
    """
    # The proxy object and the Chrome driver path that used to be built here
    # were never used (the proxied launch is disabled) and the path literal
    # contained invalid escape sequences, so both were removed.
    host = "https://cordis.europa.eu"
    firePath = "D:\\software\\Firefox\\geckodriver.exe"
    self.driver = webdriver.Firefox(executable_path=firePath)
    self.driver.get(host)
    # Maximise the window.
    self.driver.maximize_window()
    # Implicit wait of 10 s for element lookups.
    self.driver.implicitly_wait(10)
def setup_driver():
    """Create a Firefox driver with spoofed browser preferences and a manual proxy.

    The proxy address comes from ``get_proxy()`` and is used for HTTP, FTP
    and SSL traffic.
    """
    address = get_proxy()
    manual_proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': address,
        'ftpProxy': address,
        'sslProxy': address,
        'noProxy': ''
    })

    profile = webdriver.FirefoxProfile()
    options = Options()
    # All preference overrides applied in a single update.
    options.preferences.update({
        "general.useragent.override": "Mozilla/5.0 Gecko/20100101 Firefox/66",
        "extensions.lastPlatformVersion": "66",
        "distribution.abut": "Mozilla Firefox",
        "intl.accept_languages": "en,en_US",
    })

    return webdriver.Firefox(firefox_profile=profile,
                             options=options,
                             proxy=manual_proxy)
# Demo script: start Chrome with a local chromedriver, open Google and submit
# a search query.
import os
import time
from selenium import webdriver
from selenium.webdriver import Proxy, FirefoxProfile
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options as firefox_options

# Proxy and Firefox profile are configured but not passed to any driver below.
p = Proxy()
p.http_proxy = "1.1.1.1"
fp = FirefoxProfile()
fp.accept_untrusted_certs = True
fp.assume_untrusted_cert_issuer = False
# fp.set_proxy(proxy)
chrome_options = Options()
# NOTE: this rebinds the imported class alias to an instance of itself, so
# the Firefox options class is no longer reachable under this name afterwards.
firefox_options = firefox_options()
firefox_options.headless = True
chrome_options.headless = False
# driver = webdriver.firefox(executable_path=os.getcwd() + os.path.sep + "geckodriver_mac")
# The chromedriver binary is expected in the current working directory.
driver = webdriver.Chrome(options=chrome_options, executable_path=os.getcwd() + os.path.sep + "chromedriver_2.45")
driver.implicitly_wait(30)
driver.maximize_window()
driver.get("https://www.google.com")
print(driver.title)
# Type the query into the element named "q", then press ENTER in it.
driver.find_element_by_name("q").send_keys("ramnath gokul", Keys.SHIFT)
driver.find_element_by_name("q").send_keys(Keys.ENTER)
# driver.find_element_by_name("btnK").click()
# Setup fragment: prepares a manual proxy and Firefox options. No driver is
# started in the visible part of this script.
from pathlib import Path
from selenium import webdriver
from selenium.webdriver import FirefoxProfile, Proxy
import os, time
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.firefox.options import Options

p = Proxy()
p.proxy_type = ProxyType.MANUAL
# NOTE(review): this assigns the attribute `httpProxy` directly rather than
# the `http_proxy` property used by other snippets - confirm it is intended.
p.httpProxy = "1.1.1.1:8080"
x = Options()
x.accept_insecure_certs = True
# x.headless = True
# x.proxy = p
# x.accept_insecure_certs = True
# x.set_preference("browser.download.defaultFolder", str(Path(os.getcwd()).parent) + os.path.sep + "AutomationDownloads")
# myProxy = "86.111.144.194:3128"
# proxy = Proxy({
#     'proxyType': ProxyType.MANUAL,
#     'httpProxy': myProxy,
#     'ftpProxy': myProxy,
#     'sslProxy': myProxy,
#     'noProxy':''})
def __init__(self,
             src,
             dest=None,
             unknown=None,
             chromedriver_location=None,
             proxy_server=None,
             fast_proxy=False):
    """Prepare folders, the title log and two Chrome drivers.

    Args:
        src: Folder the images are read from.
        dest: Folder for sorted images. Defaults to ``sorted_images`` next
            to the running script (created if missing). The process
            changes its working directory to this folder.
        unknown: Folder for images without a source. Defaults to
            ``unknown`` inside the destination folder.
        chromedriver_location: Path to chromedriver. Defaults to
            ``chromedriver_win32\\chromedriver.exe`` next to the script.
        proxy_server: ``host:port`` used for HTTP and SSL by the second
            (proxied) driver. A built-in default is used when omitted.
        fast_proxy: Stored as ``self.fast_proxy``.

    Exits the process with status 1 when ``dest`` or ``src`` does not exist.
    """
    script_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

    if chromedriver_location is None:
        # Raw string: the backslashes are Windows path separators.
        self.chromedriver_location = (
            script_dir + r"\chromedriver_win32\chromedriver.exe")
    else:
        self.chromedriver_location = chromedriver_location
    if debug:
        print("Chrome location:", self.chromedriver_location)
        print("src:", src)
        print("dest:", dest)

    # NOTE(review): the title log path is hard-coded to one machine.
    self.f = open(r'C:\Python34\Projects\pimp-my-collection\text.txt', 'a')
    self.f.write('\n' + str(datetime.today()) + '\n')
    self.titles = []

    # Where sorted images go.
    if dest is None:
        try:
            self.dest = script_dir
            os.chdir(self.dest)
            os.mkdir("sorted_images")
        except OSError:
            if debug:
                print("dest folder already exists")
        finally:
            self.dest = self.dest + r"\sorted_images"
            os.chdir(self.dest)
    else:
        self.dest = dest
        try:
            os.chdir(self.dest)
        except FileNotFoundError:
            print("No such directory:", self.dest)
            exit(1)

    # Folder for images without a source.
    if unknown is None:
        try:
            os.mkdir("unknown")
        except OSError:
            if debug:
                print("unkn folder already exists")
        finally:
            self.unknown = self.dest + r"\unknown"
    else:
        self.unknown = unknown
        try:
            os.mkdir(self.unknown)
        except OSError:
            if debug:
                print("unknown folder already exists")

    if debug:
        print("dest:", self.dest)
        print("unknown:", self.unknown)

    # Where the images are taken from.
    self.folder = src
    try:
        self.images = os.listdir(path=self.folder)
    except FileNotFoundError:
        print("No such directory:", self.folder)
        exit(1)
    if debug:
        for i in self.images:
            try:
                print(i)
            except UnicodeEncodeError:
                i = i.encode('ascii', 'ignore')
                print("bad unicode:", i)

    self.sleep_time = 3
    self.proxy_sleep_time = 3
    self.waiting_time = 15
    self.fast_proxy = fast_proxy

    # Proxy for the second driver; both former branches built the same
    # settings, only the default address differed.
    from selenium.webdriver import Proxy
    if proxy_server is None:
        proxy_server = "163.172.175.210:3128"  # https://free-proxy-list.net/
    settings = {"httpProxy": proxy_server, "sslProxy": proxy_server}
    self.proxy_server = Proxy(settings)

    from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    cap = DesiredCapabilities.CHROME.copy()
    cap['platform'] = "WINDOWS"
    cap['version'] = "10"

    # Without proxy.
    self.driver = ChromeDriver(desired_capabilities=cap,
                               executable_path=self.chromedriver_location)
    # With proxy.
    self.proxy_server.add_to_capabilities(cap)
    self.driver2 = ChromeDriver(desired_capabilities=cap,
                                executable_path=self.chromedriver_location)
'security.warn_entering_secure.show_once': True, 'security.warn_entering_weak': False, 'security.warn_entering_weak._show_once': True, 'security.warn_leaving_secure': False, 'security.warn_leaving_secure.show_once': True, 'security.warn_leaving_weak': False, 'security.warn_leaving_weak._show_once': True, 'security.warn_submit_insecure': False, 'security.warn_viewing_mixed': False, 'security.warn_viewing_mixed.show_once': True, } profile = FirefoxProfile() for name, value in default_profile.items(): profile.set_preference(name, value) proxy = Proxy() proxy.ftp_proxy = proxy.ssl_proxy = proxy.http_proxy = None browser = Firefox(firefox_profile=profile, proxy=proxy) tokyo_url = 'https://ticketcamp.net/venue/tokyo/' #東京に接続 browser.get(tokyo_url) ''' ''' url_list = queue.Queue() def get_url_from_this_page(): tags = browser.find_elements_by_class_name('name')
def get_tuned_driver(parser_name: str,
                     logger: 'Logger',
                     proxy_ip: Optional[str] = None,
                     proxy_port: Optional[str] = None,
                     headless: bool = True) -> 'WebDriver':
    """Create a Chrome driver tuned to look less like an automated browser.

    Args:
        parser_name: Name used in the proxy log message.
        logger: Logger for proxy status messages.
        proxy_ip: Proxy host; the proxy is used only if the port is set too.
        proxy_port: Proxy port.
        headless: Adds the headless/sandbox related Chrome flags.

    Returns:
        The driver, with ``navigator.webdriver`` hidden, a fixed user agent
        override and a 5 second implicit wait.

    Raises:
        requests.RequestException: if the proxy probe request fails. The
            proxy is marked as failed first.

    Exits the process with status -1 if the probe returns a non-200 status.
    """
    os.environ["DISPLAY"] = ':99'
    chrome_options = Options()
    # Copy: the library-level template must not be mutated, otherwise the
    # proxy and logging prefs leak into every driver created afterwards.
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities['goog:loggingPrefs'] = {'browser': 'ALL'}

    if proxy_ip and proxy_port:
        proxy_address = f"{proxy_ip}:{proxy_port}"
        prox = Proxy()
        prox.proxy_type = ProxyType.MANUAL
        prox.http_proxy = proxy_address
        prox.ssl_proxy = proxy_address
        try:
            # Probe the proxy before use. Without a timeout a dead proxy
            # could block this call forever.
            response = requests.get('https://google.com',
                                    proxies={
                                        'http': proxy_address,
                                        'https': proxy_address,
                                    },
                                    timeout=10)
        except requests.RequestException:
            update_proxy_status(proxy_ip, AccessStatus.fail)
            raise
        if response.status_code != 200:
            update_proxy_status(proxy_ip, AccessStatus.fail)
            logger.critical(f'proxy {proxy_ip}:{proxy_port} not work')
            exit(-1)
        update_proxy_status(proxy_ip, AccessStatus.success)
        prox.add_to_capabilities(capabilities)
        logger.info(f'{parser_name} use proxy: {proxy_ip}:{proxy_port}')

    if headless:
        for flag in ("--no-sandbox",
                     "--headless",
                     "--remote-debugging-port=9222",
                     "--disable-infobars",
                     "--disable-extensions",
                     "--disable-dev-shm-usage"):
            chrome_options.add_argument(flag)

    # These options must be set BEFORE the driver is created; they used to
    # be added afterwards and therefore never reached the browser.
    prefs = {"profile.default_content_setting_values.notifications": 2}
    chrome_options.add_experimental_option('prefs', prefs)
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_experimental_option('excludeSwitches',
                                           ['enable-automation'])
    chrome_options.add_argument('start-maximized')
    chrome_options.add_argument('incognito')

    driver = webdriver.Chrome(options=chrome_options,
                              desired_capabilities=capabilities)

    # Hide the navigator.webdriver flag on every new document.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument", {
            "source":
            """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined,
                enumerable: false,
                configurable: true
            });
            const newProto = navigator.__proto__;
            delete newProto.webdriver;
            navigator.__proto__ = newProto;
            delete navigator.webdriver;
            """
        })
    driver.execute_cdp_cmd(
        'Network.setUserAgentOverride', {
            "userAgent":
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/83.0.4103.53 Safari/537.36'
        })
    driver.implicitly_wait(5)
    return driver
class Pimper:
    """Sort a folder of images into per-title folders via iqdb.org.

    Each image is uploaded to iqdb.org, the best matching result site is
    opened (directly, or through a proxied second browser) and the title
    found there becomes the destination folder name. Images without a match
    go to the "unknown" folder.
    """

    # Result sites in the order their URL fragment is tested:
    # (URL fragment, debug label, rank). A lower rank is preferred.
    _SITES = (
        ("danbooru", "danbooru", 3),
        ("sankaku", "sankaku", 4),
        ("gelbooru", "gelbooru", 5),
        ("shuushuu", "shuushuu", 2),
        ("yande", "yandere", 1),
    )
    # Sites that end the scan of further results once they are selected.
    _STOP_LABELS = ("shuushuu", "yandere")

    def __init__(self,
                 src,
                 dest=None,
                 unknown=None,
                 chromedriver_location=None,
                 proxy_server=None,
                 fast_proxy=False):
        """Prepare folders, the title log and two Chrome drivers.

        Args:
            src: Folder the images are read from.
            dest: Folder for sorted images. Defaults to ``sorted_images``
                next to the running script (created if missing). The
                process changes its working directory to this folder.
            unknown: Folder for images without a source. Defaults to
                ``unknown`` inside the destination folder.
            chromedriver_location: Path to chromedriver. Defaults to
                ``chromedriver_win32\\chromedriver.exe`` next to the script.
            proxy_server: ``host:port`` used for HTTP and SSL by the second
                (proxied) driver. A built-in default is used when omitted.
            fast_proxy: When true, most pauses and the reload-and-retry
                after a missing element are skipped for the proxied driver.

        Exits the process with status 1 when ``dest`` or ``src`` is missing.
        """
        script_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

        if chromedriver_location is None:
            # Raw string: the backslashes are Windows path separators.
            self.chromedriver_location = (
                script_dir + r"\chromedriver_win32\chromedriver.exe")
        else:
            self.chromedriver_location = chromedriver_location
        if debug:
            print("Chrome location:", self.chromedriver_location)
            print("src:", src)
            print("dest:", dest)

        # NOTE(review): the title log path is hard-coded to one machine.
        self.f = open(r'C:\Python34\Projects\pimp-my-collection\text.txt', 'a')
        self.f.write('\n' + str(datetime.today()) + '\n')
        self.titles = []

        # Where sorted images go.
        if dest is None:
            try:
                self.dest = script_dir
                os.chdir(self.dest)
                os.mkdir("sorted_images")
            except OSError:
                if debug:
                    print("dest folder already exists")
            finally:
                self.dest = self.dest + r"\sorted_images"
                os.chdir(self.dest)
        else:
            self.dest = dest
            try:
                os.chdir(self.dest)
            except FileNotFoundError:
                print("No such directory:", self.dest)
                exit(1)

        # Folder for images without a source.
        if unknown is None:
            try:
                os.mkdir("unknown")
            except OSError:
                if debug:
                    print("unkn folder already exists")
            finally:
                self.unknown = self.dest + r"\unknown"
        else:
            self.unknown = unknown
            try:
                os.mkdir(self.unknown)
            except OSError:
                if debug:
                    print("unknown folder already exists")

        if debug:
            print("dest:", self.dest)
            print("unknown:", self.unknown)

        # Where the images are taken from.
        self.folder = src
        try:
            self.images = os.listdir(path=self.folder)
        except FileNotFoundError:
            print("No such directory:", self.folder)
            exit(1)
        if debug:
            for i in self.images:
                try:
                    print(i)
                except UnicodeEncodeError:
                    i = i.encode('ascii', 'ignore')
                    print("bad unicode:", i)

        self.sleep_time = 3
        self.proxy_sleep_time = 3
        self.waiting_time = 15
        self.fast_proxy = fast_proxy

        # Proxy for the second driver; both former branches built the same
        # settings, only the default address differed.
        from selenium.webdriver import Proxy
        if proxy_server is None:
            proxy_server = "163.172.175.210:3128"  # https://free-proxy-list.net/
        settings = {"httpProxy": proxy_server, "sslProxy": proxy_server}
        self.proxy_server = Proxy(settings)

        from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
        cap = DesiredCapabilities.CHROME.copy()
        cap['platform'] = "WINDOWS"
        cap['version'] = "10"

        # Without proxy.
        self.driver = ChromeDriver(desired_capabilities=cap,
                                   executable_path=self.chromedriver_location)
        # With proxy.
        self.proxy_server.add_to_capabilities(cap)
        self.driver2 = ChromeDriver(desired_capabilities=cap,
                                    executable_path=self.chromedriver_location)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _pick_link_text(links, index, debug_label=None):
        """Return ``links[index].text``, or None if that link does not exist."""
        if not links or len(links) <= index:
            return None
        if debug:
            print(links)
            for link in links:
                print(link.text)
                print(link.get_attribute('href'))
            if debug_label is None:
                print(links[index].text)
            else:
                print(debug_label, links[index].text)
        return links[index].text

    def _locate_via_proxy(self, addr, class_name):
        """Find the first ``class_name`` element in the proxied browser.

        On a timeout, or on a missing element in slow-proxy mode, ``addr`` is
        reloaded once and the lookup is retried. Returns the element, or
        None when it cannot be found.
        """
        try:
            source = self.driver2.find_element_by_class_name(class_name)
        except TimeoutException:
            if debug:
                print("time out")
        except NoSuchElementException:
            if self.fast_proxy:
                return None
            if debug:
                print("no element")
        else:
            if not self.fast_proxy:
                sleep(self.proxy_sleep_time)
            return source

        # One reload-and-retry.
        self.driver2.get(addr)
        sleep(self.proxy_sleep_time)
        try:
            source = self.driver2.find_element_by_class_name(class_name)
        except NoSuchElementException:
            if debug:
                print("actually no element")
            return None
        if not self.fast_proxy:
            sleep(self.proxy_sleep_time)
        return source

    def _links_via_proxy(self, source, timeout_note=None):
        """Return the ``<a>`` children of ``source``, or None on a timeout."""
        try:
            links = source.find_elements_by_css_selector('a')
        except TimeoutException:
            if debug and timeout_note:
                print(timeout_note)
            sleep(self.proxy_sleep_time)
            # Previously execution continued with an unbound variable here.
            return None
        if not self.fast_proxy:
            sleep(self.proxy_sleep_time)
        return links

    def _classify(self, url):
        """Return ``(label, rank)`` of the first known site in ``url``, else None."""
        for fragment, label, rank in self._SITES:
            if fragment in url:
                return label, rank
        return None

    def _open_direct(self, url):
        """Load ``url`` in the direct browser; exit(1) on any driver error."""
        try:
            self.driver.get(url)
        except WebDriverException as inst:
            if debug:
                print(inst)
            exit(1)

    def _open_via_proxy(self, url):
        """Load ``url`` in the proxied browser.

        A timeout is tolerated (after a pause); any other driver error
        exits the process with status 1.
        """
        try:
            self.driver2.get(url)
            sleep(self.proxy_sleep_time)
        except TimeoutException:
            if debug:
                print("time out in if")
            sleep(self.proxy_sleep_time)
        except WebDriverException as inst:
            if debug:
                print(inst)
            exit(1)

    # ------------------------------------------------------------------
    # Per-site title lookup
    # ------------------------------------------------------------------
    def find_on_yandere(self):
        """Return the title from the page open in the direct browser, or None."""
        try:
            source = self.driver.find_element_by_class_name(
                'tag-type-copyright')
        except NoSuchElementException:
            if debug:
                print("no source")
            return None
        if debug:
            print(source)
            print(source.text)
        links = source.find_elements_by_css_selector('a')
        return self._pick_link_text(links, 1)

    def find_on_sankaku(self, addr):
        """Return the title from the sankaku page at ``addr`` (proxied), or None."""
        source = self._locate_via_proxy(addr, 'tag-type-copyright')
        if source is None:
            return None
        if debug:
            print(source)
            print(source.text)
        if not self.fast_proxy:
            sleep(self.proxy_sleep_time)
        links = self._links_via_proxy(source)
        return self._pick_link_text(links, 0)

    def find_on_eshuushuu(self):
        """Return the source title from the e-shuushuu page, or None.

        The title is the text of the second ``quicktag`` element without its
        first and last character. It is returned only if some ``dt`` element
        mentions "Source".
        """
        source = self.driver.find_elements_by_class_name('quicktag')
        check = self.driver.find_elements_by_tag_name('dt')
        candidate = source[1].text[1:-1] if len(source) > 1 else None
        if debug:
            for i in source:
                it = i.text
                try:
                    print(it)
                    print(i.get_attribute('span'))
                except UnicodeEncodeError:
                    it = it.encode('ascii', 'ignore')
                    print("bad unicode:", it)
            print(check)
            print("possible source:", candidate)
        for i in check:
            if debug:
                print(i.text)
            if i.text.find("Source") != -1:
                return candidate
        return None

    def find_on_danbooru(self, addr):
        """Return the title from the danbooru page at ``addr`` (proxied), or None."""
        source = self._locate_via_proxy(addr, 'category-3')
        if source is None:
            return None
        if debug:
            print(source)
        links = self._links_via_proxy(source, "time out source 2")
        return self._pick_link_text(links, 1, "source:")

    def find_on_gelbooru(self, addr):
        """Return the title from the gelbooru page at ``addr`` (proxied), or None."""
        source = self._locate_via_proxy(addr, 'tag-type-copyright')
        if source is None:
            return None
        if debug:
            print(source)
        links = self._links_via_proxy(source, "time out source 2")
        return self._pick_link_text(links, 1, "source:")

    # ------------------------------------------------------------------
    # File handling
    # ------------------------------------------------------------------
    def move_image(self, folder_name):
        """Move the current image into ``folder_name`` or the unknown folder.

        Args:
            folder_name: Title to use as folder name, or None when no source
                was found. Characters not allowed in folder names are
                removed; a name that needed cleaning is also lower-cased.
                The folder is created relative to the current working
                directory.
        """
        img = self.img_name[1:].encode('ascii', 'ignore')
        src_path = self.folder + self.img_name

        if folder_name is None:
            # No source found.
            dest = self.unknown.encode('ascii', 'ignore')
            try:
                if debug:
                    print("src:", src_path)
                    print("dst:", dest)
                shutil.copy(src_path, self.unknown)
                os.remove(src_path)
                print("image", img, "successfully moved in", dest)
            except Exception:
                # Not a bare except: Ctrl-C must still stop the run.
                print("Error while moving image", img)
        else:
            # Source found: strip characters forbidden in folder names.
            for symb in re.findall(r'[*|:"<>?/]', folder_name):
                if debug:
                    print(symb)
                folder_name = folder_name.replace(symb, "").lower()
            if debug:
                print("new folder name:", folder_name)
                print("writing...")
            # Computed after cleaning so the message shows the real folder.
            dest = (self.dest + '\\' + folder_name).encode('ascii', 'ignore')

            if folder_name not in self.titles:
                try:
                    self.f.write(folder_name + '\n')
                    self.titles.append(folder_name)
                except UnicodeEncodeError:
                    pass
            try:
                os.mkdir(folder_name)
            except OSError:
                if debug:
                    print("folder", folder_name, "already exists")
            try:
                shutil.copy(src_path, folder_name)
                os.remove(src_path)
                print("image", img, "successfully moved in", dest)
            except OSError:
                print("Error while moving image", img)
        sleep(self.sleep_time)

    # ------------------------------------------------------------------
    # Result ranking
    # ------------------------------------------------------------------
    def sort_addresses(self, pic_addr):
        """Pick the preferred result link by site priority.

        Args:
            pic_addr: Non-empty list of result link elements from iqdb.

        Returns:
            ``(best_addr, priority)`` where priority is 1 (yandere),
            2 (e-shuushuu), 3 (danbooru), 4 (sankaku), 5 (gelbooru) or
            6 when the first result is on no known site.
        """
        variants = self.driver.find_element_by_id(
            'pages').find_elements_by_tag_name('td')
        if debug:
            print("find %")
            for i in variants:
                try:
                    print(i.text)
                except UnicodeEncodeError:
                    new_i = i.text.encode('ascii', 'ignore')
                    print("bad unicode:", new_i)
            for addr in pic_addr:
                print("trying", addr.get_attribute('href'))
            print("1st variant:", variants[6].text, "len =", len(variants),
                  "len var = ", len(variants[6].text))

        # Table cell holding the similarity of the second result.
        pos = 9 if len(variants[6].text) == 0 else 10

        priority = 6
        best_addr = pic_addr[0].get_attribute('href')
        site = self._classify(best_addr)
        if site is not None:
            label, priority = site
            if debug:
                print(label + "[0]")

        if priority > 1:
            for addr in pic_addr[1:]:
                addr2 = addr.get_attribute('href')
                # ">=": index len(variants) is already out of range.
                if pos >= len(variants):
                    break
                if debug:
                    # Similarity is informational only; tolerate a cell
                    # that contains no number.
                    match = re.search(r'\d+', variants[pos].text)
                    print("similarity =",
                          int(match.group()) if match else None)
                site = self._classify(addr2)
                if site is not None:
                    label, rank = site
                    if debug:
                        print(label, priority)
                    if priority > rank:
                        best_addr = addr2
                        priority = rank
                        if label in self._STOP_LABELS:
                            break
                pos += 4  # Next similarity cell.

        if debug:
            print("best_addr:", best_addr)
        return best_addr, priority

    def search_for_source(self, pic_addr):
        """Open the best result for the current image and file the image."""
        best_addr, priority = self.sort_addresses(pic_addr)
        folder_name = None
        if debug:
            print("trying", best_addr)

        if priority == 1:
            print("searching on yandere")
            self._open_direct(best_addr)
            folder_name = self.find_on_yandere()
        elif priority == 4:
            print("searching on sankaku")
            self._open_via_proxy(best_addr)
            folder_name = self.find_on_sankaku(best_addr)
        elif priority == 2:
            print("searching on e-shuushuu")
            self._open_direct(best_addr)
            folder_name = self.find_on_eshuushuu()
        elif priority == 3:
            print("searching on danbooru")
            self._open_via_proxy(best_addr)
            folder_name = self.find_on_danbooru(best_addr)
        elif priority == 5:
            print("searching on gelbooru")
            self._open_via_proxy(best_addr)
            folder_name = self.find_on_gelbooru(best_addr)

        if folder_name is None:
            print("No relevant match for", self.img_name[1:])
        self.move_image(folder_name)

    # ------------------------------------------------------------------
    # Main loop
    # ------------------------------------------------------------------
    def iqdb_actions(self):
        """Upload every supported image to iqdb.org and sort it by result."""
        total = len(self.images)
        for number, image in enumerate(self.images, start=1):
            print("\nprocessing", number, "of", total)
            self.img_name = '\\' + image
            if debug:
                try:
                    print(self.folder + self.img_name)
                except UnicodeEncodeError:
                    print("bad unicode")
            sleep(self.sleep_time)

            # Case-insensitive, so ".JPG" etc. are accepted as well.
            if not image.lower().endswith((".jpg", ".png", ".jpeg")):
                try:
                    print("Unsupported format:", image)
                except UnicodeEncodeError:
                    image = image.encode('ascii', 'ignore')
                    print(image)
            else:
                self.driver.get("http://iqdb.org/")
                # Insert the image.
                element = ui.WebDriverWait(
                    self.driver, self.waiting_time).until(
                        lambda driver: self.driver.find_element_by_id("file"))
                if debug:
                    print(element)
                element.send_keys(self.folder + self.img_name)
                # Submit.
                element = ui.WebDriverWait(
                    self.driver, self.waiting_time).until(
                        lambda driver: self.driver.find_element_by_xpath(
                            "//input[@value='submit']"))
                if debug:
                    print(element)
                try:
                    element.click()
                except TimeoutException:
                    sleep(self.sleep_time)
                sleep(self.sleep_time)
                # Look for the best match.
                try:
                    pic_addr = ui.WebDriverWait(
                        self.driver, self.waiting_time).until(
                            lambda driver: self.driver.
                            find_elements_by_css_selector('.image a'))
                except TimeoutException:
                    print("Image", image, "is to o large")
                    self.move_image(None)
                else:
                    if debug:
                        print(pic_addr)
                    matches = ui.WebDriverWait(
                        self.driver, self.waiting_time
                    ).until(lambda driver: self.driver.find_element_by_xpath(
                        '//*[@id="pages"]/div[2]/table/tbody/tr[1]/th'))
                    if debug:
                        print("matches:", matches)
                        print(matches.text)
                    if matches.text.find("No") != -1:
                        print(matches.text, "for", image)
                        self.move_image(None)
                    else:
                        self.search_for_source(pic_addr)
            sleep(self.sleep_time)

    def pimp(self):
        """Run the whole sorting job and always release browsers and log file."""
        if debug:
            print("proxy mode:", self.fast_proxy)
        try:
            self.iqdb_actions()
        except KeyboardInterrupt:
            print("Stop working...")
        finally:
            # Each resource is released independently, so one failing quit()
            # cannot leak the other browser or the log file.
            for release in (self.driver.quit, self.driver2.quit, self.f.close):
                try:
                    release()
                except Exception as err:
                    print("Cleanup step failed:", err)
            print("Job's done")