class Scrapper():
    """Firefox/Selenium crawler.

    Walks a list of URLs, extracts the fields described by a DOM-structure
    file (parsed by ``dom``) and appends each scraped item as a row of a
    per-URL tab-separated file. Optionally routes traffic through rotating
    proxies supplied by ``proxies``.

    The constructor configures the driver and starts crawling immediately;
    there is no separate "run" step.
    """

    # User agents rotated per driver instance to reduce fingerprinting.
    UserAgentList = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:63.0) Gecko/20100101 Firefox/63.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Mozilla/5.0 (Windows NT 10.0; rv:68.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    ]

    def __init__(self, name, urls, str_file,
                 test_url="http://ipv4.plain-text-ip.com/",
                 # test_url = "https://www.york.ac.uk/teaching/cws/wws/webpage1.html",
                 data_folder="{}/scrapped".format(thispath),
                 proxy=False, headless=False, wait=1, max_nexts=0):
        """
        :param name: scraper name; also the output sub-folder name.
        :param urls: iterable of dicts with "URL" and "PARAM" keys.
        :param str_file: path to the structure file parsed by ``dom``.
        :param test_url: page used to verify that a selected proxy works.
        :param data_folder: base output folder (a ``name`` sub-dir is added).
        :param proxy: False for no proxy, True for defaults, or a dict with
            "Types"/"Limit"/"Countries" forwarded to ``proxies``.
        :param headless: run Firefox headless.
        :param wait: base wait in seconds (stored; not used directly here).
        :param max_nexts: pagination limit per URL; 0 means unlimited.
        """
        logger.info("")
        logger.info("********************************************************************")
        logger.info("*** Inicializando el scrapper ***")
        logger.info("********************************************************************")
        logger.info("")
        self.name = name
        self.urls = urls
        self.str_file = str_file
        self.test_url = test_url
        self.data_folder = "{}/{}".format(data_folder, self.name)
        self.proxy = proxy
        # Fix: always initialise actual_proxy — check_containers()/test_proxy()
        # read it unconditionally, which raised AttributeError without proxies.
        self.actual_proxy = False
        if self.proxy:
            if isinstance(self.proxy, dict):
                self.proxy_list = proxies(typ=self.proxy["Types"],
                                          lim=self.proxy["Limit"],
                                          countries_list=self.proxy["Countries"])
            else:
                self.proxy_list = proxies()
        else:
            self.proxy_list = False
        self.headless = headless
        self.wait = wait
        self.max_nexts = max_nexts
        self.data = []
        # Names kept as-is (including the misspelling) — external code may
        # reference these attributes.
        self.previus_data = []
        self.tread = False
        self.set_dom()
        self.configure_driver()
        self.crawl()

    def set_dom(self):
        """Parse the structure file and keep container/field selectors."""
        dom_strc = dom(self.str_file)
        self.main_strc = dom_strc.main_strc  # container + pagination selectors
        self.data_strc = dom_strc.data_strc  # per-field selectors (output columns)

    def configure_driver(self):
        """Build the Firefox driver with a random user agent and a private
        browsing profile.

        NOTE(review): the media/flash prefs are only applied in the
        non-headless branch, exactly as in the original flow — presumably
        they were meant for both modes; confirm before unifying.
        """
        self.firefox_capabilities = DesiredCapabilities.FIREFOX
        UserAgent = random.choice(self.UserAgentList)
        logger.info("User agent: {}".format(UserAgent))
        self.options = Options()
        self.firefox_profile = FirefoxProfile()
        self.firefox_profile.set_preference("browser.privatebrowsing.autostart", True)
        if self.headless:
            self.options.add_argument('--headless')
        else:
            # Image/stylesheet blocking disabled for now; look for alternatives:
            #   permissions.default.image = 2
            #   permissions.default.stylesheet = 2
            self.firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
            self.firefox_profile.set_preference('media.navigator.video.enabled', False)
            self.firefox_profile.set_preference('media.encoder.webm.enabled', False)
            self.firefox_profile.set_preference('media.ffmpeg.enabled', False)
            self.firefox_profile.set_preference('media.flac.enabled', False)
        self.firefox_profile.set_preference("general.useragent.override", UserAgent)
        self.firefox_profile.update_preferences()
        self.driver = Firefox(capabilities=self.firefox_capabilities,
                              options=self.options,
                              firefox_profile=self.firefox_profile,
                              executable_path='geckodriver')

    def createDataFile(self):
        """Create the output folder and, if missing, the TSV file + header."""
        if not path.isdir(self.data_folder):
            makedirs(self.data_folder)
        if not path.isfile(self.data_file):
            # newline="" as required by the csv module; utf-8 so scraped
            # non-ASCII text never crashes the write on narrow locales.
            with open(self.data_file, "w", newline="", encoding="utf-8") as datafile:
                writer = csv.DictWriter(datafile, delimiter="\t",
                                        fieldnames=list(self.data_strc.keys()))
                writer.writeheader()

    def crawl(self):
        """Visit every configured URL, scraping each into its own file."""
        logger.info("Starting crawler")
        for url in self.urls:
            logger.info("Crawling {}".format(url["URL"]))
            filename = url["PARAM"].replace(" ", "")
            self.data_file = "{}/{}.csv".format(self.data_folder, filename)
            self.createDataFile()
            self.url = url
            self.navigate()
        self.driver.close()
        logger.info("Finishing crawler")

    def navigate(self):
        """Load the current URL (through the selected proxy, if any), scrape
        it and follow pagination.

        NOTE(review): the retry on failure is unbounded recursion — a
        permanently failing URL loops forever; consider a retry cap.
        """
        try:
            self.set_proxy()
            self.driver.get(self.url["URL"])
            self.check_containers()
            self.get_data()
            self.nexts = 0
            self.next()
        except Exception:
            self.navigate()

    def check_containers(self):
        """Wait for the list container and collect the per-item elements.

        On Timeout/WebDriver errors the current proxy is reported as broken,
        a new one is selected and navigation restarts.
        """
        try:
            print("---> Check containers")
            WebDriverWait(self.driver, 30).until(
                expected.visibility_of_element_located(
                    (By.CSS_SELECTOR, '{}'.format(self.main_strc["LiCont"]["Selector"]))))
            self.main_strc["LiCont"]["Elements"] = self.driver.find_elements_by_css_selector(
                '{}'.format(self.main_strc["LiCont"]["Selector"]))
            self.main_strc["ElCont"]["Elements"] = []
            if self.main_strc["LiCont"]["Elements"] != []:
                for element in self.main_strc["LiCont"]["Elements"]:
                    self.main_strc["ElCont"]["Elements"] += element.find_elements_by_css_selector(
                        '{}'.format(self.main_strc["ElCont"]["Selector"]))
                if self.actual_proxy:
                    # NOTE(review): this reports the proxy as NOT working even
                    # on success — presumably to force rotation on every page;
                    # confirm the intent. (.ip normalized to .host — the proxy
                    # attribute used everywhere else in this class.)
                    self.proxy_list.proxy_notwork(self.actual_proxy.host,
                                                  self.actual_proxy.port)
        except TimeoutException:
            print("---> Nos cacharon")
            self._retry_with_new_proxy()
        except WebDriverException:
            print("---> Ni idea que pasó")
            self._retry_with_new_proxy()

    def _retry_with_new_proxy(self):
        """Discard the current proxy, pick a new one and restart navigation.

        Fix: the original branches called ``self.navigate(self)`` (TypeError:
        navigate takes no extra argument) and mixed ``.ip``/``.host``.
        """
        if self.actual_proxy:
            self.proxy_list.proxy_notwork(self.actual_proxy.host,
                                          self.actual_proxy.port)
        self.set_proxy()
        try:
            self.driver.close()
        except Exception:
            # Best effort: the window may already be gone.
            pass
        self.navigate()

    def get_data(self):
        """Extract every configured field from each item element and append
        non-empty rows to the current TSV file."""
        if self.main_strc["ElCont"]["Elements"] != []:
            for element in self.main_strc["ElCont"]["Elements"]:
                data_dict = {}
                for key, selector in self.data_strc.items():
                    elements = element.find_elements_by_css_selector(
                        '{}'.format(selector["Selector"]))
                    data_dict[key] = ""
                    if key == "Link":
                        # Guard against a missing link (was IndexError, which
                        # aborted and restarted the whole page navigation).
                        if elements:
                            data_dict[key] += elements[0].get_attribute('href')
                    elif len(elements) > 1:
                        # Multi-valued fields are wrapped in <e>...</e> markers.
                        for e in elements:
                            data_dict[key] += "<e>" + e.text + "</e>"
                    elif len(elements) == 1:
                        data_dict[key] = elements[0].text
                if "".join(list(data_dict.values())) != "":
                    with open(self.data_file, "a", newline="", encoding="utf-8") as datafile:
                        writer = csv.DictWriter(datafile, delimiter="\t",
                                                fieldnames=list(self.data_strc.keys()))
                        writer.writerow(data_dict)

    def next(self):
        """Follow the pagination link until none is left or max_nexts is hit."""
        if self.main_strc["Next"]["Selector"]:
            self.nexts += 1
            if self.max_nexts == 0 or self.nexts < self.max_nexts:
                try:
                    self.main_strc["Next"]["Elements"] = WebDriverWait(self.driver, 2).until(
                        expected.element_to_be_clickable(
                            (By.CSS_SELECTOR, '{}'.format(self.main_strc["Next"]["Selector"]))))
                except TimeoutException:
                    # Fix: no clickable "next" element means pagination is
                    # over — the original re-raised, which made navigate()
                    # restart the whole page in an endless loop.
                    return
                sleep(random.randrange(5, 10))  # human-ish pause before clicking
                self.main_strc["Next"]["Elements"].click()
                self.check_containers()
                self.get_data()
                self.next()

    def set_proxy(self):
        """Select a proxy from the pool and install it in the running Firefox
        instance via chrome-context preference writes, then test it."""
        if self.proxy_list:
            self.proxy_list.select_proxy()
            self.actual_proxy = self.proxy_list.selected_proxy
            logger.info("Setting {}:{} from {} as proxy".format(
                self.actual_proxy.host, self.actual_proxy.port, self.actual_proxy.geo))
            self.driver.execute("SET_CONTEXT", {"context": "chrome"})
            try:
                self.driver.execute_script("""
                        Services.prefs.setIntPref('network.proxy.type', 1);
                        Services.prefs.setCharPref("network.proxy.http", arguments[0]);
                        Services.prefs.setIntPref("network.proxy.http_port", arguments[1]);
                        Services.prefs.setCharPref("network.proxy.ssl", arguments[0]);
                        Services.prefs.setIntPref("network.proxy.ssl_port", arguments[1]);""",
                    self.actual_proxy.host, self.actual_proxy.port)
            finally:
                # Always restore the normal content context.
                self.driver.execute("SET_CONTEXT", {"context": "content"})
            self.test_proxy()
        else:
            logger.info("------------------------La cagó pedazo e bola!!!!")

    def test_proxy(self, timeout=10):
        """Load ``test_url`` through the current proxy; on failure report the
        proxy as broken and pick another. Retries once with a longer timeout.
        """
        if self.actual_proxy:
            # Fix: the format string expected host, port and geo but only
            # received host, which raised IndexError at runtime.
            logger.info("Testing {}:{} from {} as proxy".format(
                self.actual_proxy.host, self.actual_proxy.port, self.actual_proxy.geo))
            try:
                self.driver.set_page_load_timeout(timeout)
                self.driver.get(self.test_url)
                logger.info("{} worked in test page".format(self.actual_proxy.host))
            except TimeoutException:
                if timeout == 10:
                    # Fix: was `test_proxy(self, timeout=20)` — an unbound
                    # (undefined) name, not a method call.
                    self.test_proxy(timeout=20)
                else:
                    logger.info("{} Time out".format(self.actual_proxy.host))
                    # Fix: missing comma between the two arguments (SyntaxError
                    # that prevented the module from importing at all).
                    self.proxy_list.proxy_notwork(self.actual_proxy.host,
                                                  self.actual_proxy.port)
                    self.set_proxy()
            except WebDriverException:
                logger.info("{} Something goes wrong".format(self.actual_proxy.host))
                self.proxy_list.proxy_notwork(self.actual_proxy.host,
                                              self.actual_proxy.port)
                self.set_proxy()