def set_proxy(self):
    if self.proxy_list:
        self.proxy_list.select_proxy()
        self.actual_proxy = self.proxy_list.selected_proxy
        logger.info("Setting {}:{} from {} as proxy".format(
            self.actual_proxy.host, self.actual_proxy.port, self.actual_proxy.geo))
        # Switch Marionette to the chrome context so Services.prefs is reachable
        self.driver.execute("SET_CONTEXT", {"context": "chrome"})
        try:
            self.driver.execute_script("""
                Services.prefs.setIntPref('network.proxy.type', 1);
                Services.prefs.setCharPref("network.proxy.http", arguments[0]);
                Services.prefs.setIntPref("network.proxy.http_port", arguments[1]);
                Services.prefs.setCharPref("network.proxy.ssl", arguments[0]);
                Services.prefs.setIntPref("network.proxy.ssl_port", arguments[1]);""",
                self.actual_proxy.host, self.actual_proxy.port)
        finally:
            # Always restore the content context, even if setting the prefs failed
            self.driver.execute("SET_CONTEXT", {"context": "content"})
        self.test_proxy()
    else:
        logger.info("------------------------ No proxy list available, cannot set a proxy")
def get_proxies(self):
    self.consults += 1
    if self.consults < 10:
        try:
            logger.info("ProxyGetter: ---> Starting to get proxies")
            proxies = asyncio.Queue()
            broker = Broker(proxies)
            # Run the Broker search and the queue consumer concurrently
            tasks = asyncio.gather(
                broker.find(types=self.types, limit=self.limit,
                            countries=self.countries_list),
                self.append_proxies(proxies))
            loop = asyncio.get_event_loop()
            loop.run_until_complete(tasks)
            self.retries = 0
        except RuntimeError:
            self.retries += 1
            logger.info(
                "ProxyGetter: ---> Getting proxies failed, waiting {}s before the next try"
                .format(5 * self.retries))
            sleep(5 * self.retries)
            self.get_proxies()
    else:
        sleep(5)
        self.get_proxies()
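# `append_proxies` is called above but not shown in this snippet. A minimal sketch
# of what such a consumer coroutine could look like, assuming the collected proxies
# end up in `self.proxy_list` and that Broker signals the end of a search by putting
# None on the queue (as in the proxybroker examples). Illustrative only, not the
# project's confirmed implementation:
#
# async def append_proxies(self, proxies):
#     while True:
#         proxy = await proxies.get()
#         if proxy is None:
#             # end of the Broker search
#             break
#         self.proxy_list.append(proxy)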
def __init__(self, name, urls, str_file,
             test_url="http://ipv4.plain-text-ip.com/",
             # test_url="https://www.york.ac.uk/teaching/cws/wws/webpage1.html",
             data_folder="{}/scrapped".format(thispath),
             proxy=False, headless=False, wait=1, max_nexts=0):
    logger.info("")
    logger.info("********************************************************************")
    logger.info("***                  Initializing the scraper                   ***")
    logger.info("********************************************************************")
    logger.info("")
    self.name = name
    self.urls = urls
    self.str_file = str_file
    self.test_url = test_url
    self.data_folder = "{}/{}".format(data_folder, self.name)
    self.proxy = proxy
    if self.proxy:
        self.actual_proxy = False
        if isinstance(self.proxy, dict):
            try:
                self.proxy_list = proxies(typ=self.proxy["Types"],
                                          lim=self.proxy["Limit"],
                                          countries_list=self.proxy["Countries"])
            except:
                raise
        else:
            self.proxy_list = proxies()
    else:
        self.proxy_list = False
    self.headless = headless
    self.wait = wait
    self.max_nexts = max_nexts
    self.data = []
    self.previus_data = []
    self.tread = False
    self.set_dom()
    self.configure_driver()
    self.crawl()
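# A minimal usage sketch, assuming the class above is named `Scrapper` (the class
# name is not shown in this snippet) and that each entry in `urls` carries the
# "URL" and "PARAM" keys that crawl() expects; all values here are illustrative:
#
# scrapper = Scrapper(
#     name="books",
#     urls=[{"URL": "http://example.com/catalogue", "PARAM": "page 1"}],
#     str_file="structure.json",
#     proxy={"Types": ["HTTP", "HTTPS"], "Limit": 20, "Countries": ["US", "DE"]},
#     headless=True)
# # __init__ configures the driver and starts crawling immediately.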
def configure_driver(self):
    self.firefox_capabilities = DesiredCapabilities.FIREFOX
    UserAgent = random.choice(self.UserAgentList)
    logger.info("User agent: {}".format(UserAgent))
    if self.headless:
        self.options = Options()
        self.options.add_argument('--headless')
        self.firefox_profile = FirefoxProfile()
        self.firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
        self.firefox_profile.set_preference("general.useragent.override", UserAgent)
        self.firefox_profile.update_preferences()
    else:
        self.options = Options()
        self.firefox_profile = FirefoxProfile()
        self.firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
        # These lines do not work for now, look for alternatives:
        # self.firefox_profile.set_preference('permissions.default.image', 2)
        # self.firefox_profile.set_preference("permissions.default.stylesheet", 2)
        self.firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
        self.firefox_profile.set_preference('media.navigator.video.enabled', False)
        self.firefox_profile.set_preference('media.encoder.webm.enabled', False)
        self.firefox_profile.set_preference('media.ffmpeg.enabled', False)
        self.firefox_profile.set_preference('media.flac.enabled', False)
        self.firefox_profile.set_preference("general.useragent.override", UserAgent)
        self.firefox_profile.update_preferences()
    try:
        self.driver = Firefox(capabilities=self.firefox_capabilities,
                              options=self.options,
                              firefox_profile=self.firefox_profile,
                              executable_path='geckodriver')
    except:
        raise
def test_proxy(self, timeout=10):
    if self.actual_proxy:
        logger.info("Testing {}:{} from {} as proxy".format(
            self.actual_proxy.host, self.actual_proxy.port, self.actual_proxy.geo))
        try:
            self.driver.set_page_load_timeout(timeout)
            self.driver.get(self.test_url)
            logger.info("{} worked in test page".format(self.actual_proxy.host))
        except TimeoutException:
            if timeout == 10:
                # Retry once with a longer timeout before discarding the proxy
                self.test_proxy(timeout=20)
            else:
                logger.info("{} timed out".format(self.actual_proxy.host))
                self.proxy_list.proxy_notwork(self.actual_proxy.host,
                                              self.actual_proxy.port)
                self.set_proxy()
        except WebDriverException:
            logger.info("{} something went wrong".format(self.actual_proxy.host))
            self.proxy_list.proxy_notwork(self.actual_proxy.host,
                                          self.actual_proxy.port)
            self.set_proxy()
def proxy_notwork(self, host, port):
    logger.info("ProxyGetter: -----> Proxies in list before : {} ".format(
        len(self.proxy_list)))
    if len(self.proxy_list) > 0:
        # Iterate over a copy so removing entries does not skip elements
        for proxy in list(self.proxy_list):
            if proxy.host == host and proxy.port == port:
                logger.info(
                    "ProxyGetter: ---> {} doesn't work, deleting".format(host))
                self.proxy_list.remove(proxy)
    logger.info("ProxyGetter: -----> Proxies in list after : {} ".format(
        len(self.proxy_list)))
def crawl(self):
    logger.info("Starting crawler")
    try:
        for enum, url in enumerate(self.urls):
            logger.info("Crawling {}".format(url["URL"]))
            filename = url["PARAM"].replace(" ", "")
            self.data_file = "{}/{}.csv".format(self.data_folder, filename)
            self.createDataFile()
            self.url = url
            self.navigate()
        self.driver.close()
        logger.info("Finishing crawler")
    except:
        raise
def select_proxy(self):
    if len(self.proxy_list) > 0:
        if self.proxy_list[0].is_working:
            self.selected_proxy = self.proxy_list[0]
            logger.info("ProxyGetter: ---> {} selected".format(
                self.selected_proxy.host))
        else:
            logger.info(
                "ProxyGetter: ---> {} reported as not working, deleting".format(
                    self.proxy_list[0].host))
            self.proxy_list.remove(self.proxy_list[0])
            self.select_proxy()
    else:
        logger.info("ProxyGetter: ---> Proxies list exhausted")
        self.get_proxies()
        self.select_proxy()
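# The constructor of the proxy helper ("proxies"/ProxyGetter) is not shown in this
# snippet. A minimal sketch of the state its methods above rely on, assuming the
# keyword names used when it is instantiated in __init__ (typ, lim, countries_list);
# default values and the auto-fetch on construction are assumptions:
#
# class proxies:
#     def __init__(self, typ=["HTTP", "HTTPS"], lim=10, countries_list=None):
#         self.types = typ                  # proxy types passed to Broker.find()
#         self.limit = lim                  # how many proxies to collect per search
#         self.countries_list = countries_list
#         self.proxy_list = []              # filled as proxies are found
#         self.selected_proxy = None        # set by select_proxy()
#         self.consults = 0                 # get_proxies() call counter
#         self.retries = 0                  # back-off counter for failed searches
#         self.get_proxies()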