def check_spiders_status(uuid):
    """
    Checks the status integrity between the launched scrapy sub-processes and their status in DB.
    It prevents zombie/defunct processes that never update their status in DB from remaining in
    ONGOING forever. Differences between the status in DB and the spider processes that are
    actually alive tell us which spiders crashed.

    :param uuid: str - Crawling process UUID
    """
    with db_session:
        ongoing_db_sites = dbutils.get_sites_names_by_processing_status(
            dbsettings.Status.ONGOING, uuid)
        logging.debug(
            "There are %s ongoing sites in db and %s alive spider processes.",
            len(ongoing_db_sites), len(alive_spiders))
        for site in ongoing_db_sites:
            logging.debug("Current alive spiders %s", list(alive_spiders.keys()))
            if site in alive_spiders:
                p_status = psutil.Process(alive_spiders[site].pid).status()
                logging.debug("Spider/Site %s is %s.", site, p_status)
            # Is it not running?
            if (site not in alive_spiders) or (alive_spiders[site].poll() is not None):
                dbutils.set_site_current_processing_status(
                    s_status=dbsettings.Status.ERROR_DEFUNC, s_url=site)
                alive_spiders.pop(site, None)
                logging.debug("Site %s has been set up to ERROR_DEFUNC", site)
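# Illustrative sketch, not part of the original module: a defunct child could also be detected
# explicitly with psutil (already imported for check_spiders_status above). is_defunct() is a
# hypothetical helper; psutil.STATUS_ZOMBIE and psutil.NoSuchProcess are real psutil names.
def is_defunct(popen_process):
    """Returns True when the child process has exited or become a zombie/defunct process."""
    try:
        return psutil.Process(popen_process.pid).status() == psutil.STATUS_ZOMBIE
    except psutil.NoSuchProcess:
        # The process is already gone, so it is certainly not running anymore.
        return True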
def set_seeds(n_seeds):
    """
    Tries to assign to this manager a number of seeds in PRE_DISCOVERING status.

    :param n_seeds: int - Number of seeds to be assigned
    """
    with db_session:
        # Gets all initial seeds
        seed_sites = dbutils.get_sites_names_by_processing_status(
            dbsettings.Status.PRE_DISCOVERING, uuid='')
        logging.debug("There are %s seed sites.", len(seed_sites))
        # Get the first n seeds
        seed_sites = seed_sites[:n_seeds]

    # Create all sites in DISCOVERING status. Note that if the site exists, it will not be created
    for site in seed_sites:
        try:
            with db_session:
                # Is it a new site? Create it and set up the status to pending.
                if dbutils.update_seed_site(s_url=site, s_uuid=uuid):
                    dbutils.set_site_current_processing_status(
                        s_url=site, s_status=dbsettings.Status.DISCOVERING)
        except Exception:
            logging.exception(
                "ERROR: site %s could not be assigned to me. Maybe it is already managed by"
                " another manager.", site)
def error_to_pending(error_sites, pending_sites):
    """
    ERROR sites are set back to PENDING if the maximum number of crawling tries has not been exceeded.

    :param error_sites: list - List of current ERROR sites
    :param pending_sites: list - List of current PENDING sites
    """
    # Error sites should be tagged as pending sites.
    with db_session:
        for site in error_sites:
            if dbutils.get_site(s_url=site).error_tries < settings.MAX_CRAWLING_ATTEMPTS_ON_ERROR:
                logging.debug("The site %s has been restored. New status PENDING.", site)
                pending_sites.insert(0, site)
                # Sets up the error site to pending status
                dbutils.set_site_current_processing_status(
                    s_url=site, s_status=dbsettings.Status.PENDING)
            else:
                logging.debug(
                    "The site %s cannot be crawled because the number of max_tries on ERROR"
                    " status has been reached.", site)
                logging.debug("Setting up the DISCOVERING status to %s", site)
                # The site cannot be crawled
                dbutils.set_site_current_processing_status(
                    s_url=site, s_status=dbsettings.Status.DISCOVERING)
                dbutils.reset_tries_on_error(s_url=site)
def get_sites_from_floodfill():
    """
    Creates new sites from floodfill sites.
    """
    # Gets initial seeds
    seed_sites = siteutils.get_seeds_from_file(darknetsettings.PATH_DATA + "floodfill_seeds.txt")
    logging.debug("There are %s floodfill sites.", len(seed_sites))
    # Create all sites in DISCOVERING status. Note that if the site exists, it will not be created
    for site in seed_sites:
        try:
            with db_session:
                # Is it a new site? Create it and set up the status to pending.
                site_type = siteutils.get_type_site(site)
                if dbutils.create_site(s_url=site, s_uuid=uuid, s_type=site_type,
                                       s_source=dbsettings.Source.FLOODFILL):
                    dbutils.set_site_current_processing_status(
                        s_url=site, s_status=dbsettings.Status.DISCOVERING)
        except Exception:
            logging.exception("ERROR: site %s could not be created.", site)
def process_fail():
    '''
    It processes the files with the ".fail" extension.

    It deletes the files with the ".fail" extension from the /finished directory and adds the
    failed site to the pending_sites list so that the site can be crawled again.
    '''
    logging.debug("Inside process_fail()")
    global fail_files
    files_to_remove = []
    logging.debug("fail_files before the loop: " + str(fail_files))
    try:
        for fil in fail_files:
            files_to_remove.append(fil)
            eliminar = "i2p/spiders/ongoing/" + fil.replace(".fail", ".json")
            os.remove(eliminar)
            eliminar = "i2p/spiders/finished/" + fil
            os.remove(eliminar)
            # If the crawling process failed, there was an ERROR
            site = fil.replace(".fail", "")
            with db_session:
                dbutils.set_site_current_processing_status(s_url=site,
                                                           s_status=settings.Status.ERROR)
                logging.debug("Setting the ERROR status to site %s", site)
    except Exception as e:
        logging.error("There has been some error with the files: %s", e)
    finally:
        for i in files_to_remove:
            fail_files.remove(i)
        logging.debug("fail_files after the loop: " + str(fail_files))
def add_to_database(site, targeted_sites):
    '''
    It adds the data extracted by the crawler to the database.

    :param site: site to be added to the database
    :param targeted_sites: sites to which the site points
    '''
    logging.debug("Inside add_to_database()")
    try:
        with db_session:
            # Creates the src site, if needed
            dbutils.create_site(site)
            dbutils.set_site_current_processing_status(s_url=site,
                                                       s_status=settings.Status.FINISHED)
            for eepsite in targeted_sites:
                # Is it a new site? Create it and set up the status to pending.
                if dbutils.create_site(s_url=eepsite):
                    dbutils.set_site_current_processing_status(s_url=eepsite,
                                                               s_status=settings.Status.PENDING)
                # Linking
                dbutils.create_link(site, eepsite)
    except Exception as e:
        logging.error("Something went wrong with the database")
        raise e
def link_darksites(site, targeted_sites):
    '''
    It links the crawled site to the darksites it points to, creating any new destination sites
    in the database.

    :param site: source site to be linked
    :param targeted_sites: sites to which the site points
    '''
    logging.debug("Linking %s to %s ", site, targeted_sites)
    try:
        with db_session:
            dbutils.set_site_current_processing_status(
                s_url=site, s_status=dbsettings.Status.FINISHED)
            logging.debug("Site %s was set up to FINISHED.", site)
        for darksite in targeted_sites:
            try:
                with db_session:
                    site_type = siteutils.get_type_site(darksite)
                    # Is it a new site? Create it and set up the status to pending.
                    site_exists = False
                    if site_type.name == "FREENET" and ("USK@" in darksite or "SSK@" in darksite):
                        site_exists = siteutils.compare_freesite(darksite)
                    if not site_exists:
                        if dbutils.create_site(s_url=darksite, s_uuid=uuid, s_type=site_type,
                                               s_source=dbsettings.Source.DISCOVERED):
                            dbutils.set_site_current_processing_status(
                                s_url=darksite, s_status=dbsettings.Status.DISCOVERING)
            except Exception:
                logging.exception(
                    "ERROR: destination darksite %s is already created ", darksite)
            with db_session:
                # Linking
                dbutils.create_link(site, darksite)
                logging.debug("New link: %s --> %s", site, darksite)
    except Exception:
        logging.exception("ERROR: linking site %s", site)

    # This process should not be alive
    if site in list(alive_spiders.keys()):
        alive_spiders.pop(site)
        logging.debug("Removing %s from alive spiders.", site)
def process_fail(fail_spiders):
    '''
    It processes the files with the ".fail" extension.

    It deletes the files with the ".fail" extension from the /finished directory and adds the
    failed site to the pending_sites list so that the site can be crawled again.
    '''
    logging.info("Processing FAILED spiders ... ")
    files_to_remove = []
    logging.debug("Starting to process FAILED spiders #%s: %s", len(fail_spiders),
                  str(fail_spiders))
    try:
        for fil in fail_spiders:
            files_to_remove.append(fil)
            eliminar = darknetsettings.PATH_ONGOING_SPIDERS + fil.replace(".fail", ".json")
            #eliminar = eliminar.replace("__", "/") #Freenet Sites
            os.remove(eliminar)
            eliminar = darknetsettings.PATH_FINISHED_SPIDERS + fil
            #eliminar = eliminar.replace("__", "/") #Freenet Sites
            os.remove(eliminar)
    except Exception as e:
        logging.error("ERROR processing FAILED file - %s", e)
        logging.exception("ERROR:")
    finally:
        with db_session:
            for fil in files_to_remove:
                # If the crawling process failed, there was an ERROR
                site = fil.replace(".fail", "")
                site = site.replace("__", "/")
                dbutils.set_site_current_processing_status(
                    s_url=site, s_status=dbsettings.Status.ERROR)
                logging.debug("Setting the ERROR status to site %s", site)
                # This process should not be alive
                if site in list(alive_spiders.keys()):
                    alive_spiders.pop(site)
                    logging.debug("Removing %s from alive spiders.", site)
        logging.debug("Ending to process FAILED spiders #%s: %s", len(fail_spiders),
                      str(fail_spiders))
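# Illustrative sketch, not part of the original module: the ".fail"/".ok" result filenames encode
# the site URL with "__" in place of "/" (used for Freenet sites), which is why process_fail()
# strips the extension and reverses that substitution. This hypothetical helper captures the same
# mapping in one place.
def site_from_result_filename(filename, extension=".fail"):
    """Recovers the site URL encoded in a spider result filename."""
    return filename.replace(extension, "").replace("__", "/")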
def add_prediscovering_sites():
    # Gets initial seeds
    seed_sites = siteutils.get_seeds_from_file(darknetsettings.PATH_DATA + settings.INITIAL_SEEDS)
    # Create all sites in DISCOVERING status. Note that if the site exists, it will not be created
    for site in seed_sites:
        site_type = siteutils.get_type_site(site)
        # If it is a freesite, clean up the url
        if site_type.name == "FREENET":
            site = site.replace('https://', '')
            site = site.replace('http://', '')
            site = site.replace('freenet:', '')
            if site[-1] == '/':
                site = site[:-1]
        # Is it a new site? Create it and set up the status to pending.
        if dbutils.create_site(s_url=site, s_type=site_type, s_uuid=''):
            dbutils.set_site_current_processing_status(s_url=site,
                                                       s_status=dbsettings.Status.PRE_DISCOVERING,
                                                       add_processing_log=False)
def run_spider(site):
    """
    Runs a spider.

    :param site: str - the name of the site to be crawled
    :return: p: Popen - The subprocess status
    """
    # TODO each spider process should be better monitored. Maybe launching them in separate threads.
    p = None
    try:
        with db_session:
            # Setting up the correct status
            dbutils.set_site_current_processing_status(
                s_url=site, s_status=dbsettings.Status.ONGOING)
            # Increasing tries
            siteEntity = dbutils.increase_tries_on_error(s_url=site)
            # Get the type of site
            siteType = siteutils.get_type_site(site=site)
            # Try running a spider
            #command = 'scrapy crawl i2p -a url="http://' + site + '"'
            #command = 'scrapy crawl freenet -a url="http://' + site + '"'
            command = 'scrapy crawl ' + siteType.name + ' -a url="http://' + site + '"'
            p = subprocess.Popen(shlex.split(command))
            logging.debug("Command launched %s", shlex.split(command))
            logging.debug("Process launched for %s with PID=%s, tries=%s",
                          site, p.pid, siteEntity.error_tries)
    except Exception:
        logging.exception(
            "Spider of site %s could not be launched. Maybe it has already been launched.", site)
    return p
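# Illustrative sketch, not part of the original module: the command above is built by string
# concatenation and then re-parsed with shlex.split, so a site name containing quotes or spaces
# could break the parsing. Building the argv list directly avoids that step. build_spider_command
# is a hypothetical helper; it assumes the same spider names returned by siteutils.get_type_site.
def build_spider_command(spider_name, site):
    """Returns the scrapy invocation for a site as an argv list for subprocess.Popen."""
    return ["scrapy", "crawl", spider_name, "-a", "url=http://" + site]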
def run_spider(site):
    """
    Runs a spider.

    :param site: str - the name of the site to be crawled
    :return: p: Popen - The subprocess status
    """
    # Try running a spider
    param1 = "url=http://" + site
    param2 = "./i2p/spiders/ongoing/" + site + ".json"
    p = subprocess.Popen(["scrapy", "crawl", "i2p", "-a", param1, "-o", param2], shell=False)
    with db_session:
        # Create site if needed.
        dbutils.create_site(s_url=site)
        # Setting up the correct status
        dbutils.set_site_current_processing_status(s_url=site, s_status=settings.Status.ONGOING)
        # Increasing tries
        siteEntity = dbutils.increase_tries(s_url=site)
        logging.debug("Running %s, tries=%s", site, siteEntity.crawling_tries)
    return p
def add_fake_discovery_info():
    """
    Adds default discovery info just for testing the discovering procedure.
    """
    valid_site = 'no.i2p'
    #dbutils.create_site(valid_site)
    #dbutils.set_site_current_processing_status(s_url=valid_site, s_status=dbsettings.Status.DISCOVERING)

    not_valid_site = 'fake.i2p'
    dbutils.create_site(not_valid_site)
    dbutils.set_site_current_processing_status(s_url=not_valid_site,
                                               s_status=dbsettings.Status.DISCOVERING)

    not_valid_site_2 = 'fake_2.i2p'
    dbutils.create_site(not_valid_site_2)
    dbutils.set_site_current_processing_status(s_url=not_valid_site_2,
                                               s_status=dbsettings.Status.DISCOVERING)

    not_valid_site_3 = 'fake_3.i2p'
    dbutils.create_site(not_valid_site_3)
    dbutils.set_site_current_processing_status(s_url=not_valid_site_3,
                                               s_status=dbsettings.Status.DISCOVERING)
def main():
    '''
    It controls the whole crawling process through a loop that repeats every second.

    Every second it enters the main loop (as long as there are still sites to visit or sites
    being visited) to crawl all the sites. Finally, the extracted info is added to the database
    and the json file that will be used for the web visualization of the node map is generated.
    '''
    log = logging.getLogger('')
    log.setLevel(logging.DEBUG)
    log_format = logging.Formatter(
        '%(asctime)s %(levelname)s - %(threadName)s - mod: %(module)s, method: %(funcName)s, msg: %(message)s'
    )
    fhall = RotatingFileHandler(
        darknetsettings.PATH_LOG + "darknetcrawler.log", maxBytes=0,
        backupCount=0)  # NO rotation, neither by size, nor by number of files
    fhall.setFormatter(log_format)
    fhall.setLevel(logging.DEBUG)
    fherror = RotatingFileHandler(
        darknetsettings.PATH_LOG + "darkneterror.log", maxBytes=0,
        backupCount=0)  # NO rotation, neither by size, nor by number of files
    fherror.setFormatter(log_format)
    fherror.setLevel(logging.ERROR)
    log.addHandler(fhall)
    log.addHandler(fherror)

    logging.info("Starting Darknet crawling ... ")

    # Generating UUID for the crawling process
    global uuid
    uuid = set_uuid('uuid.txt')

    try:
        # Try to assign N seed sites to me
        set_seeds(settings.INITIAL_SEEDS_BACH_SIZE)
        # Create all sites in DISCOVERING status obtained from floodfill seeds.
        get_sites_from_floodfill()

        # Restoring the crawling status
        status = siteutils.get_crawling_status(uuid)
        # restored pending sites
        pending_sites = status[dbsettings.Status.PENDING.name]
        # restored ongoing sites
        ongoing_sites = status[dbsettings.Status.ONGOING.name]
        # restored error sites
        error_sites = status[dbsettings.Status.ERROR.name]
        # restored discovering sites
        discovering_sites = status[dbsettings.Status.DISCOVERING.name]

        logging.debug("Restoring %s ERROR sites.", len(error_sites))
        # Getting error sites and setting them back to pending to be crawled again.
        error_to_pending(error_sites, pending_sites)

        logging.debug("Restoring %s PENDING sites.", len(pending_sites))
        logging.debug("Restoring %s ONGOING sites.", len(ongoing_sites))
        logging.debug("Restoring %s DISCOVERING sites.", len(discovering_sites))

        # restored ONGOING SITES should be launched
        for site in ongoing_sites:
            if len(ongoing_sites) <= settings.MAX_ONGOING_SPIDERS:
                logging.debug("Starting spider for %s.", site)
                p = run_spider(site)
                # To monitor all the running spiders
                if p and not p.returncode:  # if successfully launched
                    alive_spiders[site] = p

        # discovering thread
        logging.debug("Running discovering process ...")
        dThread = discoverythread.DiscoveringThread(
            settings.MAX_CRAWLING_ATTEMPTS_ON_DISCOVERING,
            settings.MAX_DURATION_ON_DISCOVERING,
            settings.MAX_SINGLE_THREADS_ON_DISCOVERING,
            settings.HTTP_TIMEOUT, uuid)
        dThread.setName('DiscoveryThread')
        dThread.start()

        # Timestamp to compute time to next seed self-assignment
        initial = datetime.now()

        # main loop
        while pending_sites or ongoing_sites or discovering_sites:
            # Try to run another site
            if len(ongoing_sites) < settings.MAX_ONGOING_SPIDERS:
                if pending_sites:
                    with db_session:
                        site = pending_sites.pop()
                        if dbutils.get_site(
                                s_url=site
                        ).error_tries < settings.MAX_CRAWLING_ATTEMPTS_ON_ERROR:
                            logging.debug("Starting spider for %s.", site)
                            p = run_spider(site)
                            # To monitor all the running spiders
                            if p and not p.returncode:  # if successfully launched
                                alive_spiders[site] = p
                        else:
                            logging.debug(
                                "The site %s cannot be crawled because the number of max_tries on ERROR "
                                "status has been reached.", site)
                            logging.debug("Setting up the DISCOVERING status to %s", site)
                            # The site cannot be crawled
                            dbutils.set_site_current_processing_status(
                                s_url=site, s_status=dbsettings.Status.DISCOVERING)
                            dbutils.reset_tries_on_error(s_url=site)

            # Polling how the crawling of spiders is going ...
            check_crawling_status()
            # Checking spiders status coherence between DB and the launched processes.
            check_spiders_status(uuid)

            # Every settings.SEEDS_ASSIGNMENT_PERIOD seconds, try to self-assign seeds
            if (datetime.now() - initial) > timedelta(
                    seconds=settings.SEEDS_ASSIGNMENT_PERIOD):
                set_seeds(settings.INITIAL_SEEDS_BACH_SIZE)
                initial = datetime.now()
                # Adding new sites in DISCOVERING status obtained from floodfill seeds.
                get_sites_from_floodfill()

            # Get current status
            status = siteutils.get_crawling_status(uuid)
            pending_sites = status[dbsettings.Status.PENDING.name]
            ongoing_sites = status[dbsettings.Status.ONGOING.name]
            error_sites = status[dbsettings.Status.ERROR.name]
            error_defunc = status[dbsettings.Status.ERROR_DEFUNC.name]
            discarded_sites = status[dbsettings.Status.DISCARDED.name]
            finished_sites = status[dbsettings.Status.FINISHED.name]
            discovering_sites = status[dbsettings.Status.DISCOVERING.name]

            # Getting error sites and setting them back to pending to be crawled again.
            error_to_pending(error_sites, pending_sites)

            logging.debug(
                "Stats --> ONGOING %s, PENDING %s, FINISHED %s, ERROR %s, ERROR_DEFUNC %s, DISCOVERING %s,"
                " DISCARDED %s", len(ongoing_sites), len(pending_sites),
                len(finished_sites), len(error_sites), len(error_defunc),
                len(discovering_sites), len(discarded_sites))

            time.sleep(1)

    except KeyboardInterrupt:
        logging.exception("KeyboardInterrupt received ...")
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type, exc_value, exc_traceback, limit=5, file=sys.stdout)
        logging.exception("ERROR: not controlled exception.")
    finally:
        logging.info("Stopping all services ...")
        try:
            if isinstance(dThread, discoverythread.DiscoveringThread):
                dThread.stop()
        except UnboundLocalError:
            logging.warning("DiscoveringThread is not running, so it will not be stopped.")
        for i in threading.enumerate():
            if i is not threading.currentThread():
                logging.debug("Waiting for %s thread ...", i.name)
                i.join()
        logging.info("Exiting ...")
        sys.exit(1)
def process_ok(ok_spiders):
    '''
    It processes the files with the ".ok" extension.

    It moves the ".json" files of the sites that have been crawled correctly (.ok) from the
    /ongoing directory to the /finished directory, opens those ".json" files, calls the
    link_darksites() function in order to add the pertinent data to the database, adds the sites
    that have not been visited yet to pending_sites and deletes the ".ok" files once processed.
    '''
    logging.info("Processing OK spiders ...")
    logging.debug("Starting to process OK spiders #%s: %s", len(ok_spiders), str(ok_spiders))

    # Used in case of error to set the site to ERROR in the DB
    current_site_name = None
    for fil in ok_spiders:
        try:
            current_site_name = fil.replace(".ok", "")
            current_site_name = current_site_name.replace("__", "/")  # Freenet Sites
            fil_json_extension = fil.replace(".ok", ".json")
            source = darknetsettings.PATH_ONGOING_SPIDERS + fil_json_extension
            target = darknetsettings.PATH_FINISHED_SPIDERS + fil_json_extension
            shutil.move(source, target)

            # Once a site has been crawled, all we need are the extracted darksites, which are
            # at the end of the json file
            #last_lines = siteutils.tail(target, n=2)
            #last_lines = last_lines.replace('\n]','')
            with open(target, 'r') as f:
                crawled_items = json.loads(f.readline())

            crawled_darksites = crawled_items["extracted_darksites"]
            logging.debug("Extracted darksites from %s: %s", fil, str(crawled_darksites))

            with db_session:
                # setting up the language
                set_site_language(current_site_name, crawled_items["language"])
                # setting up the home site info
                text = ' '.join(crawled_items["main_page_tokenized_words"])
                set_site_home_info(current_site_name, crawled_items["size_main_page"],
                                   crawled_items["title"][0], text)
                # moved here to handle the status of crawled darksites
                link_darksites(current_site_name, crawled_darksites)
                # setting up connectivity summary
                # TODO this method should be called separately once the crawling process has
                # finished to get real values of in, out and degree
                set_site_connectivity_summary(current_site_name,
                                              crawled_items["total_darksite_pages"])
                # setting up the number of pages of the site.
                set_site_number_pages(current_site_name, crawled_items["total_darksite_pages"])
        except Exception as e:
            logging.error("ERROR processing OK file %s - %s", current_site_name, e)
            logging.exception("ERROR:")
            # If an error is raised, this site should be tagged as ERROR
            with db_session:
                dbutils.set_site_current_processing_status(s_url=current_site_name,
                                                           s_status=dbsettings.Status.ERROR)
            # This process should not be alive
            if current_site_name in list(alive_spiders.keys()):
                alive_spiders.pop(current_site_name)
                logging.debug("Removing %s from alive spiders.", current_site_name)
            # removing the JSON file for the site which caused the error.
            eliminar = darknetsettings.PATH_FINISHED_SPIDERS + fil_json_extension
            os.remove(eliminar)

    # Delete *.ok files in the finished folder
    for fil in ok_spiders:
        eliminar = darknetsettings.PATH_FINISHED_SPIDERS + fil
        os.remove(eliminar)
        logging.debug("Deleting OK file %s", fil)
    logging.debug("Ending to process OK spiders")
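# Illustrative only: a hypothetical example of the single JSON line process_ok() expects in each
# finished .json file. The field names come from the dictionary accesses above; the values shown
# here are made up and not taken from any real crawl.
#
#   {"extracted_darksites": ["example2.i2p", "example3.i2p"],
#    "language": "en",
#    "main_page_tokenized_words": ["welcome", "to", "the", "site"],
#    "size_main_page": 2048,
#    "title": ["Example eepsite"],
#    "total_darksite_pages": 12}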
def run(self):
    try:
        with db_session:
            # Get next site
            # darksite = self._sites_to_discover.pop()
            logging.debug("Trying to discover site %s", self._darksite)
            proc_log = dbutils.get_processing_logs_by_site_status(
                s_url=self._darksite, s_status=dbsettings.Status.DISCOVERING)
            # Computes the time spent since the first discovering status of the darksite
            time_spent = datetime.now() - proc_log[0].timestamp
            # Have I reached the scheduled time to start the discovering process?
            time_to_next_try = proc_log[-1].next_time_to_try - datetime.now()
            discovering_tries = dbutils.get_site(s_url=self._darksite).discovering_tries

            logging.debug("Time spent for site %s: %s ", self._darksite, time_spent)
            logging.debug("Next discovering try for site %s: %s. Time to it: %s",
                          self._darksite, proc_log[-1].next_time_to_try, time_to_next_try)
            logging.debug("Current tries for site %s: %s ", self._darksite, discovering_tries)

            if time_to_next_try <= timedelta(minutes=0):
                # Checking maximum discovering tries and period of time for trying
                if discovering_tries < self._max_tries \
                        and time_spent <= timedelta(minutes=self._duration):
                    darksite_http = "http://" + self._darksite
                    logging.debug("DISCOVERING: %s", self._darksite)
                    if not connection_settings.PROXY:
                        response = request_conn.connectThroughProxy(
                            darksite_http, proxies=None,
                            timeout=self._http_request_timeout)
                    else:
                        response = request_conn.connectThroughProxy(
                            darksite_http,
                            proxies={'http': 'http://' + connection_settings.PROXY},
                            timeout=self._http_request_timeout)

                    response_code = str(response.status_code)
                    response_time = str(response.elapsed.total_seconds())

                    # Print CSV Line
                    csv_line = ""
                    csv_line += self._darksite + "|" + response_code + "|"
                    csv_line += response_time + "|" + str(discovering_tries)
                    logging.debug("RESPONSE: %s", csv_line)

                    logging.debug("Increasing discovering tries to site %s.", self._darksite)
                    dbutils.increase_tries_on_discovering(s_url=self._darksite)

                    # HTTP 2XX or 3XX
                    if reg_http.match(response_code):
                        dbutils.set_site_current_processing_status(
                            s_url=self._darksite,
                            s_http_status=response_code,
                            s_http_response_time=response_time,
                            s_status=dbsettings.Status.PENDING)
                        logging.debug("Site %s was set up to PENDING.", self._darksite)
                    # HTTP 4XX or 5XX
                    else:
                        dbutils.set_site_current_processing_status(
                            s_url=self._darksite,
                            s_http_status=response_code,
                            s_http_response_time=response_time,
                            s_status=dbsettings.Status.DISCOVERING)
                        logging.debug(
                            "Site %s was set up to DISCOVERING because the response code %s was received.",
                            self._darksite, response_code)
                else:
                    dbutils.set_site_current_processing_status(
                        s_url=self._darksite, s_status=dbsettings.Status.DISCARDED)
                    logging.debug(
                        "Site %s was set up to DISCARDED because tries were %s (max %s) "
                        "or duration was %s (max %s mins).", self._darksite,
                        discovering_tries, self._max_tries, time_spent, self._duration)
            else:
                logging.debug(
                    "Site %s is not ready to be discovered until %s. Time to it: %s",
                    self._darksite, proc_log[-1].next_time_to_try, time_to_next_try)
    except Exception as e:
        logging.error("ERROR on discovering %s: %s", self._darksite, e)
        logging.exception("ERROR:")
        logging.debug("Increasing discovering tries to site %s.", self._darksite)
        with db_session:
            dbutils.increase_tries_on_discovering(s_url=self._darksite)
            dbutils.set_site_current_processing_status(
                s_url=self._darksite,
                s_http_status='',
                s_http_response_time='',
                s_status=dbsettings.Status.DISCOVERING)
            logging.debug(
                "Site %s was set up to DISCOVERING because there was an HTTP error.",
                self._darksite)
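# Illustrative only: reg_http is defined elsewhere in the module. Based on how it is used in
# run() above (2XX/3XX responses move the site to PENDING, everything else stays in
# DISCOVERING), it could plausibly be a pre-compiled pattern along these lines. This is an
# assumption, not the original definition.
#
#   import re
#   reg_http = re.compile(r'^[23]\d{2}$')  # matches "200", "301", ..., but not "404" or "500"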
def main():
    '''
    It controls the whole crawling process through a loop that repeats every second.

    Every second it enters the main loop (as long as there are still sites to visit or sites
    being visited) to crawl all the sites. Finally, the extracted info is added to the database
    and the json file that will be used for the web visualization of the node map is generated.
    '''
    log = logging.getLogger('')
    log.setLevel(logging.DEBUG)
    log_format = logging.Formatter(
        '%(asctime)s %(levelname)s - %(threadName)s - mod: %(module)s, method: %(funcName)s, msg: %(message)s')
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(log_format)
    log.addHandler(ch)
    fh = RotatingFileHandler("registro.log", maxBytes=0,
                             backupCount=0)  # NO rotation, neither by size, nor by number of files
    fh.setFormatter(log_format)
    log.addHandler(fh)

    logging.debug("Inside main()")

    # run_spider("stats.i2p")
    # time.sleep(60)
    # exit()

    # Gets initial seeds
    seed_sites = siteutils.get_initial_seeds("../../data/seed_urls.txt")

    # Create all sites with PENDING status. Note that if the site exists, it will not be created
    with db_session:
        for site in seed_sites:
            # Is it a new site? Create it and set up the status to pending.
            if dbutils.create_site(s_url=site):
                dbutils.set_site_current_processing_status(s_url=site,
                                                           s_status=settings.Status.PENDING)

    # Restore processing status
    with db_session:
        # Restore previous crawling process status
        # restored pending sites
        pending_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.PENDING)
        # restored ongoing sites
        ongoing_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.ONGOING)
        # restored error sites
        error_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.ERROR)

    logging.debug("Restoring %s ERROR sites.", len(error_sites))
    # Error sites should be tagged as pending sites.
    for site in error_sites:
        if site not in pending_sites:  # they should not be in PENDING status ;)
            with db_session:
                if dbutils.get_site(s_url=site).crawling_tries <= max_crawling_tries:
                    logging.debug("The site %s has been restored. New status PENDING.", site)
                    pending_sites.insert(0, site)
                    # sets up the error site to pending status
                    dbutils.set_site_current_processing_status(s_url=site,
                                                               s_status=settings.Status.PENDING)
                else:
                    logging.debug(
                        "The site %s cannot be crawled because the number of max_tries has been reached.",
                        site)
                    # The site cannot be crawled
                    dbutils.set_site_current_processing_status(s_url=site,
                                                               s_status=settings.Status.UNKNOWN)

    logging.debug("Restoring %s PENDING sites.", len(pending_sites))
    logging.debug("Restoring %s ONGOING sites.", len(ongoing_sites))

    # restored ONGOING SITES should be launched
    for site in ongoing_sites:
        if len(ongoing_sites) <= max_ongoing_spiders:
            logging.debug("Starting spider for %s.", site)
            run_spider(site)

    # Monitoring time
    stime = time.time()
    etime = time.time()

    # main loop
    while pending_sites or ongoing_sites:
        # Try to run another site
        if len(ongoing_sites) <= max_ongoing_spiders:
            if pending_sites:
                with db_session:
                    site = pending_sites.pop()
                    if dbutils.get_site(s_url=site).crawling_tries <= max_crawling_tries:
                        logging.debug("Starting spider for %s.", site)
                        run_spider(site)
                    else:
                        logging.debug("The site %s cannot be crawled.", site)
                        # The site cannot be crawled
                        dbutils.set_site_current_processing_status(s_url=site,
                                                                   s_status=settings.Status.UNKNOWN)

        # Polling spiders status
        check()

        time.sleep(1)

        if (etime - stime) < 60:
            etime = time.time()
        else:
            stime = time.time()
            etime = time.time()

            # Update the status
            with db_session:
                pending_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.PENDING)
                ongoing_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.ONGOING)
                error_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.ERROR)
                unknown_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.UNKNOWN)
                finished_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.FINISHED)

            # Error sites should be tagged as pending sites.
            for site in error_sites:
                if site not in pending_sites:
                    with db_session:
                        if dbutils.get_site(s_url=site).crawling_tries <= max_crawling_tries:
                            logging.debug("The site %s has been restored. New status PENDING.", site)
                            pending_sites.insert(0, site)
                            # sets up the error site to pending status
                            dbutils.set_site_current_processing_status(s_url=site,
                                                                       s_status=settings.Status.PENDING)
                        else:
                            logging.debug(
                                "The site %s cannot be crawled because the number of max_tries has been reached.",
                                site)
                            # The site cannot be crawled
                            dbutils.set_site_current_processing_status(s_url=site,
                                                                       s_status=settings.Status.UNKNOWN)

            logging.debug("Stats --> ONGOING %s, PENDING %s, FINISHED %s, ERROR %s, UNKNOWN %s",
                          len(ongoing_sites), len(pending_sites), len(finished_sites),
                          len(error_sites), len(unknown_sites))