Example no. 1
def check_spiders_status(uuid):
    """
    Checks the status integrity between the launched scrapy sub-processes and their status in the DB. It prevents
    zombie/defunct processes that never update their status in the DB from remaining in ONGOING forever.

    Differences between the status in the DB and the spider processes actually alive tell us which spiders crashed.

    :param uuid: str - Crawling process UUID
    """

    with db_session:
        ongoing_db_sites = dbutils.get_sites_names_by_processing_status(
            dbsettings.Status.ONGOING, uuid)

        logging.debug(
            "There are %s ongoing sites in db and %s alive spider processes.",
            len(ongoing_db_sites), len(alive_spiders))

        for site in ongoing_db_sites:
            logging.debug("Current alive spiders %s",
                          list(alive_spiders.keys()))
            if site in alive_spiders:
                p_status = psutil.Process(alive_spiders[site].pid).status()
                logging.debug("Spider/Site %s is %s.", site, p_status)
            # Is it not running?
            if (site not in alive_spiders) or (alive_spiders[site].poll()
                                               is not None):
                dbutils.set_site_current_processing_status(
                    s_status=dbsettings.Status.ERROR_DEFUNC, s_url=site)
                # pop with a default so a missing entry does not raise KeyError
                alive_spiders.pop(site, None)
                logging.debug("Site %s has been set to ERROR_DEFUNC", site)
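
Note: check_spiders_status (and several functions below) rely on module-level state that is not shown in these examples. A minimal sketch of the assumed imports and globals, inferred from how the names are used (the ORM providing db_session is assumed to be Pony ORM, given the "with db_session:" usage):

import logging
import subprocess                  # Popen handles are stored in alive_spiders

import psutil                      # used to inspect spider process status
from pony.orm import db_session    # assumed ORM; db_session is used as a context manager

# Assumed module-level registry: site name -> subprocess.Popen of its scrapy spider
alive_spiders = {}

# Assumed module-level crawling-process identifier, set in main() via set_uuid()
uuid = ''
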
Example no. 2
def set_seeds(n_seeds):
    """
    Tries to assign a batch of seeds in PRE_DISCOVERING status to this manager.

    :param n_seeds: int - Number of seeds to be assigned
    """

    with db_session:
        # Gets all initial seeds
        seed_sites = dbutils.get_sites_names_by_processing_status(
            dbsettings.Status.PRE_DISCOVERING, uuid='')

    logging.debug("There are %s seeds sites.", len(seed_sites))

    # Get the first n seeds
    seed_sites = seed_sites[:n_seeds]

    # Create all sites in DISCOVERING status. Note that if the site exists, it will not be created
    for site in seed_sites:
        try:
            with db_session:
                # is it a new seed site? Assign it to this manager and set the status to DISCOVERING.
                if dbutils.update_seed_site(s_url=site, s_uuid=uuid):
                    dbutils.set_site_current_processing_status(
                        s_url=site, s_status=dbsettings.Status.DISCOVERING)
        except Exception:
            logging.exception(
                "ERROR: site %s could not be assigned to me. Maybe it is already managed by"
                " another manager.", site)
Example no. 3
def error_to_pending(error_sites, pending_sites):
    """
    ERROR sites are set back to PENDING if the maximum number of crawling tries has not been exceeded.

    :param error_sites: list - List of current ERROR sites
    :param pending_sites: list - List of current PENDING sites
    """

    # Error sites should be tagged as pending sites.
    with db_session:
        for site in error_sites:
            if dbutils.get_site(
                    s_url=site
            ).error_tries < settings.MAX_CRAWLING_ATTEMPTS_ON_ERROR:
                logging.debug(
                    "The site %s has been restored. New status PENDING.", site)
                pending_sites.insert(0, site)
                # sets up the error site to pending status
                dbutils.set_site_current_processing_status(
                    s_url=site, s_status=dbsettings.Status.PENDING)
            else:
                logging.debug(
                    "The site %s cannot be crawled because the number of max_tries on ERROR status has been reached.",
                    site)
                logging.debug("Setting up the DISCOVERING status to %s", site)
                # The site cannot be crawled
                dbutils.set_site_current_processing_status(
                    s_url=site, s_status=dbsettings.Status.DISCOVERING)
                dbutils.reset_tries_on_error(s_url=site)
Example no. 4
def get_sites_from_floodfill():
    """
    Creates new sites from floodfill sites

    """

    # Gets initial seeds
    seed_sites = siteutils.get_seeds_from_file(darknetsettings.PATH_DATA +
                                               "floodfill_seeds.txt")

    logging.debug("There are %s floodfill sites.", len(seed_sites))

    # Create all sites in DISCOVERING status. Note that if the site exists, it will not be created
    for site in seed_sites:
        try:
            with db_session:
                # is it a new site? Create it and set the status to DISCOVERING.
                site_type = siteutils.get_type_site(site)
                if dbutils.create_site(s_url=site,
                                       s_uuid=uuid,
                                       s_type=site_type,
                                       s_source=dbsettings.Source.FLOODFILL):
                    dbutils.set_site_current_processing_status(
                        s_url=site, s_status=dbsettings.Status.DISCOVERING)
        except Exception:
            logging.exception("ERROR: site %s could not be created.", site)
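
siteutils.get_seeds_from_file is not included in these examples. A minimal sketch of what it is assumed to do (one seed URL per non-empty line), for illustration only:

def get_seeds_from_file(path):
    # Hypothetical sketch; the real siteutils.get_seeds_from_file may differ.
    # Reads one seed per line, ignoring blank lines and surrounding whitespace.
    with open(path, 'r') as f:
        return [line.strip() for line in f if line.strip()]
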
Example no. 5
def process_fail():
    '''
    Processes the files with the ".fail" extension.

    It deletes the ".json" file of the failed site from the /ongoing directory and the ".fail" file from the
    /finished directory, and sets the failed site to ERROR status so that it can be crawled again.
    '''
    logging.debug("Inside process_fail()")
    global fail_files
    files_to_remove = []
    logging.debug("Fail_files before the loop: " + str(fail_files))
    try:
        for fil in fail_files:
            files_to_remove.append(fil)
            eliminar = "i2p/spiders/ongoing/" + fil.replace(".fail", ".json")
            os.remove(eliminar)
            eliminar = "i2p/spiders/finished/" + fil
            os.remove(eliminar)

            # If the crawling process failed, there was an ERROR
            site = fil.replace(".fail","")
            with db_session:
                dbutils.set_site_current_processing_status(s_url=site, s_status=settings.Status.ERROR)

            logging.debug("Setting the ERROR status to site %s",site)

    except Exception as e:
        logging.error("There has been some error with the files: %s", e)
    finally:
        for i in files_to_remove:
            fail_files.remove(i)
        logging.debug("Fail_files after the loop: " + str(fail_files))
Example no. 6
def add_to_database(site, targeted_sites):
    '''
    Adds the data extracted by the crawler to the database.

    :param site: site to add to the database
    :param targeted_sites: sites that the site points to
    '''
    logging.debug("Inside add_to_database()")

    try:
        with db_session:

            # Creates the src site, if needed
            dbutils.create_site(site)
            dbutils.set_site_current_processing_status(s_url=site,s_status=settings.Status.FINISHED)

            for eepsite in targeted_sites:

                # is it a new site? Create it and set up the status to pending.
                if dbutils.create_site(s_url=eepsite):
                    dbutils.set_site_current_processing_status(s_url=eepsite, s_status=settings.Status.PENDING)

                # Linking
                dbutils.create_link(site, eepsite)

    except Exception as e:
        logging.error("Something went wrong with the database: %s", e)
        raise e
Example no. 7
def link_darksites(site, targeted_sites):
    '''
    Adds the data extracted by the crawler to the database and links the source site to its destinations.

    :param site: site to add to the database
    :param targeted_sites: sites that the site points to
    '''
    logging.debug("Linking %s to %s ", site, targeted_sites)

    try:
        with db_session:

            dbutils.set_site_current_processing_status(
                s_url=site, s_status=dbsettings.Status.FINISHED)
            logging.debug("Site %s was set up to FINISHED.", site)

        for darksite in targeted_sites:
            try:
                with db_session:
                    site_type = siteutils.get_type_site(darksite)
                    # is it a new site? Create it and set the status to DISCOVERING.
                    site_exists = False
                    if site_type.name == "FREENET" and ("USK@" in darksite
                                                        or "SSK@" in darksite):
                        site_exists = siteutils.compare_freesite(darksite)
                    if not site_exists:
                        if dbutils.create_site(
                                s_url=darksite,
                                s_uuid=uuid,
                                s_type=site_type,
                                s_source=dbsettings.Source.DISCOVERED):
                            dbutils.set_site_current_processing_status(
                                s_url=darksite,
                                s_status=dbsettings.Status.DISCOVERING)
            except Exception:
                logging.exception(
                    "ERROR: destination darksite %s could not be created. Maybe it already exists.",
                    darksite)

            with db_session:
                # Linking
                dbutils.create_link(site, darksite)

            logging.debug("New link: %s --> %s", site, darksite)

    except Exception:
        logging.exception("ERROR: linking site %s", site)

    # This process should not be alive
    if site in list(alive_spiders.keys()):
        alive_spiders.pop(site)
        logging.debug("Removing %s from alive spiders.", site)
Example no. 8
def process_fail(fail_spiders):
    '''
    Processes the files with the ".fail" extension.

    It deletes the ".json" file of the failed spider from the /ongoing directory and the ".fail" file from the
    /finished directory, and sets the failed site to ERROR status so that it can be crawled again.

    :param fail_spiders: list - Names of the ".fail" files to process
    '''
    logging.info("Processing FAILED spiders ... ")

    files_to_remove = []
    logging.debug("Starting to process FAILED spiders #%s: %s",
                  len(fail_spiders), str(fail_spiders))
    try:
        for fil in fail_spiders:
            files_to_remove.append(fil)
            eliminar = darknetsettings.PATH_ONGOING_SPIDERS + fil.replace(
                ".fail", ".json")
            #eliminar = eliminar.replace("__", "/") #Freenet Sites
            os.remove(eliminar)
            eliminar = darknetsettings.PATH_FINISHED_SPIDERS + fil
            #eliminar = eliminar.replace("__", "/") #Freenet Sites
            os.remove(eliminar)

    except Exception as e:
        logging.error("ERROR processing FAILED file - %s", e)
        logging.exception("ERROR:")

    finally:
        with db_session:
            for fil in files_to_remove:
                # If the crawling process failed, there was an ERROR
                site = fil.replace(".fail", "")
                site = site.replace("__", "/")
                dbutils.set_site_current_processing_status(
                    s_url=site, s_status=dbsettings.Status.ERROR)
                logging.debug("Setting the ERROR status to site %s", site)
                # This process should not be alive
                if site in list(alive_spiders.keys()):
                    alive_spiders.pop(site)
                    logging.debug("Removing %s from alive spiders.", site)

        logging.debug("Finished processing FAILED spiders #%s: %s",
                      len(fail_spiders), str(fail_spiders))
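
The replacement of "__" with "/" above reverses the filename encoding apparently used for Freenet site paths, since slashes cannot appear in file names. A small illustration under that assumption:

# Assumed encoding: a spider saves the site 'USK@key/site/1' as 'USK@key__site__1.fail'
fil = "USK@key__site__1.fail"
site = fil.replace(".fail", "").replace("__", "/")  # -> 'USK@key/site/1'
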
Example no. 9
def add_prediscovering_sites():
    """
    Creates the initial seed sites in PRE_DISCOVERING status so that a manager can later claim them.
    """

    # Gets initial seeds
    seed_sites = siteutils.get_seeds_from_file(darknetsettings.PATH_DATA + settings.INITIAL_SEEDS)

    # Create all sites in PRE_DISCOVERING status. Note that if a site already exists, it will not be created
    for site in seed_sites:
        site_type = siteutils.get_type_site(site)

        # if it is a freesite, clean the URL
        if site_type.name == "FREENET":
            site = site.replace('https://', '')
            site = site.replace('http://', '')
            site = site.replace('freenet:', '')
            if site.endswith('/'):
                site = site[:-1]

        # is it a new site? Create it and set the status to PRE_DISCOVERING.
        if dbutils.create_site(s_url=site, s_type=site_type, s_uuid=''):
            dbutils.set_site_current_processing_status(s_url=site, s_status=dbsettings.Status.PRE_DISCOVERING,
                                                       add_processing_log=False)
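
The scheme-stripping applied to FREENET seeds above can be read as a small normalization helper. This sketch simply mirrors those replacements; the helper name is illustrative and not part of the project:

def normalize_freesite_url(site):
    # Mirrors the cleanup applied to FREENET seeds in add_prediscovering_sites.
    site = site.replace('https://', '')
    site = site.replace('http://', '')
    site = site.replace('freenet:', '')
    if site.endswith('/'):
        site = site[:-1]
    return site

# Example: normalize_freesite_url('http://USK@key/site/1/') -> 'USK@key/site/1'
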
Example no. 10
def run_spider(site):
    """
    Runs a spider

    :param site: str - the name of the site to be crawled
    :return: Popen - The spider subprocess handle, or None if it could not be launched
    """

    # TODO each spider process should be better monitored. Maybe launching them in separated threads.

    p = None

    try:

        with db_session:
            # Setting up the correct status
            dbutils.set_site_current_processing_status(
                s_url=site, s_status=dbsettings.Status.ONGOING)
            # Increasing tries
            siteEntity = dbutils.increase_tries_on_error(s_url=site)
            # Get Type of site
            siteType = siteutils.get_type_site(site=site)

        # Try running a spider
        #command = 'scrapy crawl i2p -a url="http://' + site + '"'
        #command = 'scrapy crawl freenet -a url="http://' + site + '"'
        command = 'scrapy crawl ' + siteType.name + ' -a url="http://' + site + '"'
        p = subprocess.Popen(shlex.split(command))

        logging.debug("Command launched %s", shlex.split(command))
        logging.debug("Process launched for %s with PID=%s, tries=%s", site,
                      p.pid, siteEntity.error_tries)

    except Exception:
        logging.exception(
            "Spider of site %s could not be launched. Maybe it has already been launched.",
            site)

    return p
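
A minimal usage sketch of run_spider, following how main() registers the returned handle for later monitoring (the site name is illustrative):

site = "example.i2p"          # hypothetical site name
p = run_spider(site)
if p and not p.returncode:    # successfully launched and not already terminated
    alive_spiders[site] = p   # keep the Popen handle so check_spiders_status() can poll it
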
Example no. 11
def run_spider(site):
    """
    Runs a spider

    :param site: str - the name of the site to be crawled
    :return: p: Popen - The subprocess status
    """

    # Try running a spider
    param1 = "url=http://" + site
    param2 = "./i2p/spiders/ongoing/" + site + ".json"
    p = subprocess.Popen(["scrapy", "crawl", "i2p", "-a", param1, "-o", param2], shell=False)

    with db_session:
        # Create site if needed.
        dbutils.create_site(s_url=site)
        # Setting up the correct status
        dbutils.set_site_current_processing_status(s_url=site, s_status=settings.Status.ONGOING)
        # Increasing tries
        siteEntity = dbutils.increase_tries(s_url=site)

        logging.debug("Running %s, tries=%s",site,siteEntity.crawling_tries)

    return p
Example no. 12
def add_fake_discovery_info():
    """
    Adds default discovery info just for testing the discovering procedure
    """

    valid_site = 'no.i2p'
    #dbutils.create_site(valid_site)
    #dbutils.set_site_current_processing_status(s_url=valid_site, s_status=dbsettings.Status.DISCOVERING)

    not_valid_site = 'fake.i2p'
    dbutils.create_site(not_valid_site)
    dbutils.set_site_current_processing_status(s_url=not_valid_site, s_status=dbsettings.Status.DISCOVERING)

    not_valid_site_2 = 'fake_2.i2p'
    dbutils.create_site(not_valid_site_2)
    dbutils.set_site_current_processing_status(s_url=not_valid_site_2, s_status=dbsettings.Status.DISCOVERING)

    not_valid_site_3 = 'fake_3.i2p'
    dbutils.create_site(not_valid_site_3)
    dbutils.set_site_current_processing_status(s_url=not_valid_site_3, s_status=dbsettings.Status.DISCOVERING)
Example no. 13
def main():
    '''
    Controls the whole crawling process through a loop that runs every second.

    Every second it enters the main loop (while there are still sites to visit or sites being visited) to crawl all
    the sites. Finally, the extracted info is added to the database and the JSON file that will be used for the web
    visualization of the node map is generated.
    '''

    log = logging.getLogger('')
    log.setLevel(logging.DEBUG)
    format = logging.Formatter(
        '%(asctime)s %(levelname)s - %(threadName)s - mod: %(module)s, method: %(funcName)s, msg: %(message)s'
    )

    fhall = RotatingFileHandler(
        darknetsettings.PATH_LOG + "darknetcrawler.log",
        maxBytes=0,
        backupCount=0)  # NO rotation, neither by size, nor by number of files
    fhall.setFormatter(format)
    fhall.setLevel(logging.DEBUG)

    fherror = RotatingFileHandler(
        darknetsettings.PATH_LOG + "darkneterror.log",
        maxBytes=0,
        backupCount=0)  # NO rotation, neither by size, nor by number of files
    fherror.setFormatter(format)
    fherror.setLevel(logging.ERROR)

    log.addHandler(fhall)
    log.addHandler(fherror)

    logging.info("Starting Darknet crawling ... ")

    # Generating UUID for the crawling process
    global uuid
    uuid = set_uuid('uuid.txt')

    try:

        # Try to assign N seeds sites to me
        set_seeds(settings.INITIAL_SEEDS_BACH_SIZE)

        # Create all sites in DISCOVERING status obtained from floodfill seeds.
        get_sites_from_floodfill()

        # Restoring the crawling status
        status = siteutils.get_crawling_status(uuid)
        # restored pending sites
        pending_sites = status[dbsettings.Status.PENDING.name]
        # restored ongoing sites
        ongoing_sites = status[dbsettings.Status.ONGOING.name]
        # restored error sites
        error_sites = status[dbsettings.Status.ERROR.name]
        # restored discovering sites
        discovering_sites = status[dbsettings.Status.DISCOVERING.name]

        logging.debug("Restoring %s ERROR sites.", len(error_sites))

        # Getting error sites and setting up them to pending to be crawled again.
        error_to_pending(error_sites, pending_sites)

        logging.debug("Restoring %s PENDING sites.", len(pending_sites))
        logging.debug("Restoring %s ONGOING sites.", len(ongoing_sites))
        logging.debug("Restoring %s DISCOVERING sites.",
                      len(discovering_sites))

        # restored ONGOING SITES should be launched
        for site in ongoing_sites:
            if len(ongoing_sites) <= settings.MAX_ONGOING_SPIDERS:
                logging.debug("Starting spider for %s.", site)
                p = run_spider(site)
                # To monitor all the running spiders
                if p and not p.returncode:  # if successfully launched
                    alive_spiders[site] = p

        # discoverying thread
        logging.debug("Running discovering process ...")
        dThread = discoverythread.DiscoveringThread(
            settings.MAX_CRAWLING_ATTEMPTS_ON_DISCOVERING,
            settings.MAX_DURATION_ON_DISCOVERING,
            settings.MAX_SINGLE_THREADS_ON_DISCOVERING, settings.HTTP_TIMEOUT,
            uuid)
        dThread.setName('DiscoveryThread')
        dThread.start()

        # Timestamp to compute time to next seed self-assignment
        initial = datetime.now()

        # main loop
        while pending_sites or ongoing_sites or discovering_sites:

            # Try to run another site
            if len(ongoing_sites) < settings.MAX_ONGOING_SPIDERS:
                if pending_sites:
                    with db_session:
                        site = pending_sites.pop()
                        if dbutils.get_site(
                                s_url=site
                        ).error_tries < settings.MAX_CRAWLING_ATTEMPTS_ON_ERROR:
                            logging.debug("Starting spider for %s.", site)
                            p = run_spider(site)
                            # To monitor all the running spiders
                            if p and not p.returncode:  # if successfully launched
                                alive_spiders[site] = p
                        else:
                            logging.debug(
                                "The site %s cannot be crawled because the number of max_tries on ERROR "
                                "status has been reached.", site)
                            logging.debug(
                                "Setting up the DISCOVERING status to %s",
                                site)
                            # The site cannot be crawled; set it back to DISCOVERING
                            dbutils.set_site_current_processing_status(
                                s_url=site,
                                s_status=dbsettings.Status.DISCOVERING)
                            dbutils.reset_tries_on_error(s_url=site)

            # Polling how the crawling of spiders is going ...
            check_crawling_status()

            # Checking spiders status coherence between DB and the launched processes.
            check_spiders_status(uuid)

            # Each settings.SEEDS_ASSIGNMENT_PERIOD I try to self-assign seeds
            if (datetime.now() - initial) > timedelta(
                    seconds=settings.SEEDS_ASSIGNMENT_PERIOD):
                set_seeds(settings.INITIAL_SEEDS_BACH_SIZE)
                initial = datetime.now()

            # Adding new sites to DISCOVERING status obtained from floodfill seeds.
            get_sites_from_floodfill()

            # Get current status
            status = siteutils.get_crawling_status(uuid)
            pending_sites = status[dbsettings.Status.PENDING.name]
            ongoing_sites = status[dbsettings.Status.ONGOING.name]
            error_sites = status[dbsettings.Status.ERROR.name]
            error_defunc = status[dbsettings.Status.ERROR_DEFUNC.name]
            discarded_sites = status[dbsettings.Status.DISCARDED.name]
            finished_sites = status[dbsettings.Status.FINISHED.name]
            discovering_sites = status[dbsettings.Status.DISCOVERING.name]

            # Getting error sites and setting up them to pending to be crawled again.
            error_to_pending(error_sites, pending_sites)

            logging.debug(
                "Stats --> ONGOING %s, PENDING %s, FINISHED %s, ERROR %s, ERROR_DEFUNC %s, DISCOVERING %s,"
                " DISCARDED %s", len(ongoing_sites), len(pending_sites),
                len(finished_sites), len(error_sites), len(error_defunc),
                len(discovering_sites), len(discarded_sites))

            time.sleep(1)

    except KeyboardInterrupt:
        logging.exception("KeyboardInterrupt received ...")
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type,
                                  exc_value,
                                  exc_traceback,
                                  limit=5,
                                  file=sys.stdout)
        logging.exception("ERROR: not controlled exception.")
    finally:
        logging.info("Stopping all services ...")

        try:
            if isinstance(dThread, discoverythread.DiscoveringThread):
                dThread.stop()
        except UnboundLocalError:
            logging.warning(
                "DiscoveringThread is not running, so it will not be stopped.")

        for i in threading.enumerate():
            if i is not threading.currentThread():
                logging.debug("Waiting for %s thread ...", i.name)
                i.join()
        logging.info("Exiting ...")
        sys.exit(1)
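
set_uuid is not shown in these examples. A hypothetical sketch consistent with how it is called above (reuse the UUID stored in the file if present, otherwise generate and persist a new one):

import os
import uuid as uuidlib   # aliased to avoid clashing with the module-level uuid variable

def set_uuid(path):
    # Hypothetical sketch; the project's set_uuid may behave differently.
    if os.path.exists(path):
        with open(path, 'r') as f:
            return f.read().strip()
    new_uuid = str(uuidlib.uuid4())
    with open(path, 'w') as f:
        f.write(new_uuid)
    return new_uuid
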
Example no. 14
def process_ok(ok_spiders):
    '''
    Processes the files with the ".ok" extension.

    It moves the ".json" files of the sites that have been crawled correctly (.ok) from the /ongoing directory to
    the /finished directory, opens those ".json" files, calls link_darksites() to add the pertinent data to the
    database, and deletes the ".ok" files once processed.

    :param ok_spiders: list - Names of the ".ok" files to process
    '''
    logging.info("Processing OK spiders ...")

    logging.debug("Starting to process OK spiders #%s: %s", len(ok_spiders),
                  str(ok_spiders))

    # Used in case of error to set the site status to ERROR in the DB
    current_site_name = None

    for fil in ok_spiders:

        try:

            current_site_name = fil.replace(".ok", "")
            current_site_name = current_site_name.replace("__",
                                                          "/")  #Freenet Sites
            fil_json_extension = fil.replace(".ok", ".json")
            source = darknetsettings.PATH_ONGOING_SPIDERS + fil_json_extension
            target = darknetsettings.PATH_FINISHED_SPIDERS + fil_json_extension
            shutil.move(source, target)

            # Once a site has been crawled, the only thing we need is the extracted darksites, which are at the end
            # of the JSON file
            #last_lines = siteutils.tail(target, n=2)
            #last_lines = last_lines.replace('\n]','')

            with open(target, 'r') as f:
                crawled_items = json.loads(f.readline())

            crawled_darksites = crawled_items["extracted_darksites"]
            logging.debug("Extracted darksites from %s: %s", fil,
                          str(crawled_darksites))

            with db_session:
                # setting up the language
                set_site_language(current_site_name, crawled_items["language"])

                # setting up the home site info
                text = ' '.join(crawled_items["main_page_tokenized_words"])
                set_site_home_info(current_site_name,
                                   crawled_items["size_main_page"],
                                   crawled_items["title"][0], text)

                # moved here to handle the status of crawled darksites
                link_darksites(current_site_name, crawled_darksites)

                # setting up connectivity summary
                # TODO this method should be called separately once the crawling process finished to get real values of in, out and degree
                set_site_connectivity_summary(
                    current_site_name, crawled_items["total_darksite_pages"])

                # setting up the number of pages to the site.
                set_site_number_pages(current_site_name,
                                      crawled_items["total_darksite_pages"])

        except Exception as e:
            logging.error("ERROR processing OK file %s - %s",
                          current_site_name, e)
            logging.exception("ERROR:")
            # If an error is raised, this site should be tagged as ERROR
            with db_session:
                dbutils.set_site_current_processing_status(
                    s_url=current_site_name, s_status=dbsettings.Status.ERROR)
                # This process should not be alive
                if current_site_name in list(alive_spiders.keys()):
                    alive_spiders.pop(current_site_name)
                    logging.debug("Removing %s from alive spiders.",
                                  current_site_name)

            # removing the JSON file for the site which causes the error.
            eliminar = darknetsettings.PATH_FINISHED_SPIDERS + fil_json_extension
            os.remove(eliminar)

    # Delete *.ok files in finished folder
    for fil in ok_spiders:
        eliminar = darknetsettings.PATH_FINISHED_SPIDERS + fil
        os.remove(eliminar)
        logging.debug("Deleting OK file %s", fil)

    logging.debug("Finished processing OK spiders")
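
For reference, the fields read from the spider's JSON output in process_ok imply a per-site structure along these lines (the values are illustrative only):

crawled_items = {
    "extracted_darksites": ["other-site.i2p"],        # outgoing links found while crawling
    "language": "en",
    "main_page_tokenized_words": ["example", "words"],
    "size_main_page": 12345,
    "title": ["Example title"],                       # accessed as title[0] above
    "total_darksite_pages": 42,
}
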
Example no. 15
    def run(self):

        try:
            with db_session:
                # Get next site
                # darksite = self._sites_to_discover.pop()

                logging.debug("Trying to discover site %s", self._darksite)

                proc_log = dbutils.get_processing_logs_by_site_status(
                    s_url=self._darksite,
                    s_status=dbsettings.Status.DISCOVERING)
                # Computes the time spent from the first discovering status of the darksite
                time_spent = datetime.now() - proc_log[0].timestamp
                # Has the scheduled time to start the discovering process been reached?
                time_to_next_try = proc_log[
                    -1].next_time_to_try - datetime.now()

                discovering_tries = dbutils.get_site(
                    s_url=self._darksite).discovering_tries

                logging.debug("Time spent for site %s: %s ", self._darksite,
                              time_spent)
                logging.debug(
                    "Next discovering try for site %s: %s. Time to it: %s",
                    self._darksite, proc_log[-1].next_time_to_try,
                    time_to_next_try)
                logging.debug("Current tries for site %s: %s ", self._darksite,
                              discovering_tries)

                if time_to_next_try <= timedelta(minutes=0):
                    # Checking maximum discovering tries and period of time for trying
                    if discovering_tries < self._max_tries \
                            and time_spent <= timedelta(minutes=self._duration):

                        darksite_http = "http://" + self._darksite
                        logging.debug("DISCOVERING: %s", self._darksite)
                        if not connection_settings.PROXY:
                            response = request_conn.connectThroughProxy(
                                darksite_http,
                                proxies=None,
                                timeout=self._http_request_timeout)
                        else:
                            response = request_conn.connectThroughProxy(
                                darksite_http,
                                proxies={
                                    'http':
                                    'http://' + connection_settings.PROXY
                                },
                                timeout=self._http_request_timeout)

                        response_code = str(response.status_code)
                        response_time = str(response.elapsed.total_seconds())
                        # Print CSV Line
                        csv_line = ""
                        csv_line += self._darksite + "|" + response_code + "|"
                        csv_line += response_time + "|" + str(
                            discovering_tries)
                        logging.debug("RESPONSE: %s", csv_line)

                        logging.debug(
                            "Increasing discovering tries to site %s.",
                            self._darksite)
                        dbutils.increase_tries_on_discovering(
                            s_url=self._darksite)

                        # HTTP 2XX or 3XX
                        if reg_http.match(response_code):
                            dbutils.set_site_current_processing_status(
                                s_url=self._darksite,
                                s_http_status=response_code,
                                s_http_response_time=response_time,
                                s_status=dbsettings.Status.PENDING)
                            logging.debug("Site %s was set up to PENDING.",
                                          self._darksite)
                        # HTTP 4XX or 5XX
                        else:
                            dbutils.set_site_current_processing_status(
                                s_url=self._darksite,
                                s_http_status=response_code,
                                s_http_response_time=response_time,
                                s_status=dbsettings.Status.DISCOVERING)
                            logging.debug(
                                "Site %s was set up to DISCOVERING because response code %s was received.",
                                self._darksite, response_code)
                    else:
                        dbutils.set_site_current_processing_status(
                            s_url=self._darksite,
                            s_status=dbsettings.Status.DISCARDED)
                        logging.debug(
                            "Site %s was set up to DISCARDED because tries were %s (max %s) "
                            "or duration was %s (max %s mins).",
                            self._darksite, discovering_tries, self._max_tries,
                            time_spent, self._duration)
                else:
                    logging.debug(
                        "Site %s is not ready to be discovered until %s. Time to it: %s",
                        self._darksite, proc_log[-1].next_time_to_try,
                        time_to_next_try)

        except Exception as e:
            logging.error("ERROR on discovering %s: %s", self._darksite, e)
            logging.exception("ERROR:")
            logging.debug("Increasing discovering tries to site %s.",
                          self._darksite)
            with db_session:
                dbutils.increase_tries_on_discovering(s_url=self._darksite)
                dbutils.set_site_current_processing_status(
                    s_url=self._darksite,
                    s_http_status='',
                    s_http_response_time='',
                    s_status=dbsettings.Status.DISCOVERING)
            logging.debug(
                "Site %s was set up to DISCOVERING because there was an HTTP error.",
                self._darksite)
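
reg_http is not defined in this excerpt. Since it is used to accept HTTP 2XX and 3XX response codes, it is assumed to be a compiled pattern along these lines:

import re

# Assumed pattern: matches three-digit status codes starting with 2 or 3 (2XX/3XX).
reg_http = re.compile(r'^[23]\d{2}$')
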
Example no. 16
def main():
    '''
    Controls the whole crawling process through a loop that runs every second.

    Every second it enters the main loop (while there are still sites to visit or sites being visited) to crawl all
    the sites. Finally, the extracted info is added to the database and the JSON file that will be used for the web
    visualization of the node map is generated.
    '''

    log = logging.getLogger('')
    log.setLevel(logging.DEBUG)
    format = logging.Formatter('%(asctime)s %(levelname)s - %(threadName)s - mod: %(module)s, method: %(funcName)s, msg: %(message)s')

    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(format)
    log.addHandler(ch)

    fh = RotatingFileHandler("registro.log", maxBytes=0, backupCount=0) # NO rotation, neither by size, nor by number of files
    fh.setFormatter(format)
    log.addHandler(fh)

    logging.debug("Inside main()")

    # run_spider("stats.i2p")
    # time.sleep(60)
    # exit()

    # Gets initial seeds
    seed_sites = siteutils.get_initial_seeds("../../data/seed_urls.txt")

    # Create all sites with PENDING status. Note that if the site exists, it will not be created
    with db_session:
        for site in seed_sites:
            # is it a new site? Create it and set up the status to pending.
            if dbutils.create_site(s_url=site):
                dbutils.set_site_current_processing_status(s_url=site,s_status=settings.Status.PENDING)

    # Restore processing status
    with db_session:
        # Restore previous crawling process status
        # restored pending sites
        pending_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.PENDING)
        # restored ongoing sites
        ongoing_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.ONGOING)
        # restored error sites
        error_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.ERROR)

    logging.debug("Restoring %s ERROR sites.", len(error_sites))

    # Error sites should be tagged as pending sites.
    for site in error_sites:
        if site not in pending_sites: # they should not be in PENDING status;)
            with db_session:
                if dbutils.get_site(s_url=site).crawling_tries <= max_crawling_tries:
                    logging.debug("The site %s has been restored. New status PENDING.", site)
                    pending_sites.insert(0, site)
                    # sets up the error site to pending status
                    dbutils.set_site_current_processing_status(s_url=site, s_status=settings.Status.PENDING)
                else:
                    logging.debug("The site %s cannot be crawled because the number of max_tries has been reached.", site)
                    # The site cannot be crawled
                    dbutils.set_site_current_processing_status(s_url=site, s_status=settings.Status.UNKNOWN)

    logging.debug("Restoring %s PENDING sites.", len(pending_sites))
    logging.debug("Restoring %s ONGOING sites.", len(ongoing_sites))

    # restored ONGOING SITES should be launched
    for site in ongoing_sites:
        if len(ongoing_sites) <= max_ongoing_spiders:
            logging.debug("Starting spider for %s.", site)
            run_spider(site)

    # Monitoring time
    stime = time.time()
    etime = time.time()

    # main loop
    while pending_sites or ongoing_sites:

        # Try to run another site
        if len(ongoing_sites) <= max_ongoing_spiders:
            if pending_sites:
                with db_session:
                    site = pending_sites.pop()
                    if dbutils.get_site(s_url=site).crawling_tries <= max_crawling_tries:
                        logging.debug("Starting spider for %s.", site)
                        run_spider(site)
                    else:
                        logging.debug("The site %s cannot be crawled.", site)
                        # The site cannot be crawled
                        dbutils.set_site_current_processing_status(s_url=site, s_status=settings.Status.UNKNOWN)

        # Polling spiders status
        check()

        time.sleep(1)
        if (etime - stime) < 60:
            etime = time.time()
        else:
            stime = time.time()
            etime = time.time()

        # Update the status
        with db_session:
            pending_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.PENDING)
            ongoing_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.ONGOING)
            error_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.ERROR)
            unknown_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.UNKNOWN)
            finished_sites = dbutils.get_sites_by_processing_status(s_status=settings.Status.FINISHED)

        # Error sites should be tagged as pending sites.
        for site in error_sites:
            if site not in pending_sites:
                with db_session:
                    if dbutils.get_site(s_url=site).crawling_tries <= max_crawling_tries:
                        logging.debug("The site %s has been restored. New status PENDING.", site)
                        pending_sites.insert(0, site)
                        # sets up the error site to pending status
                        dbutils.set_site_current_processing_status(s_url=site, s_status=settings.Status.PENDING)
                    else:
                        logging.debug("The site %s cannot be crawled because the number of max_tries has been reached.",
                                      site)
                        # The site cannot be crawled
                        dbutils.set_site_current_processing_status(s_url=site, s_status=settings.Status.UNKNOWN)

        logging.debug("Stats --> ONGOING %s, PENDING %s, FINISHED %s, ERROR %s, UNKNOWN %s", \
                      len(ongoing_sites), len(pending_sites), len(finished_sites), len(error_sites),
                      len(unknown_sites))