def _write_data_files_to_db(data_dir_name):
    '''
    Reads all the files of a specified directory and writes the content
    to the memory cache and from there to the database.

    @param data_dir_name: Directory where to look for the files
    @type data_dir_name: string
    '''

    if data_dir_name.endswith("/"):
        data_dir_name = data_dir_name[0:-1]

    if not data_dir_name:
        bibtask.write_message("Data directory not specified. Task failed.",
                              stream=sys.stdout, verbose=0)
        return False

    if not osp.isdir(data_dir_name):
        bibtask.write_message("Specified Data directory is not a directory. "
                              "Task failed.",
                              stream=sys.stdout, verbose=0)
        return False

    job_dirs = os.listdir(data_dir_name)

    total = len(job_dirs)
    status = 0

    for job_dir in job_dirs:
        status += 1
        job_dir = "%s/%s" % (data_dir_name, job_dir)

        if not osp.isdir(job_dir):
            bibtask.write_message("This is not a directory and therefore "
                                  "skipped: %s." % job_dir,
                              stream=sys.stdout, verbose=0)
            continue

        results_dir = "%s/results/" % (job_dir,)

        if not osp.isdir(results_dir):
            bibtask.write_message("No result set found in %s"
                                  % (results_dir,), stream=sys.stdout,
                                  verbose=0)
            continue

        log_name = osp.abspath(job_dir).split("/")
        logfile = "%s/%s.log" % (job_dir, log_name[-1])
        logfile_lastline = ""

        if not osp.isfile(logfile):
            bibtask.write_message("No log file found in %s" % (job_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        try:
            logfile_lastline = tail(logfile)
        except IOError:
            logfile_lastline = ""

        if logfile_lastline.count("Finish! The computation finished in") < 1:
            bibtask.write_message("Log file indicates broken results for %s"
                                  % (job_dir,), stream=sys.stdout, verbose=0)
            continue

        correct_files = set(['realauthors.dat',
                             'ids.dat',
                             'virtual_author_clusters.dat',
                             'virtual_authors.dat',
                             'doclist.dat',
                             'virtual_author_data.dat',
                             'authornames.dat',
                             'virtual_author_cluster_cache.dat',
                             'realauthor_data.dat',
                             'ra_va_cache.dat']
                            )
        result_files = os.listdir(results_dir)

        if not correct_files.issubset(set(result_files)):
            bibtask.write_message("Results folder does not hold the "
                                  "correct files: %s" % (results_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        bibtask.task_update_progress('Loading job %s of %s: %s'
                                     % (status, total, log_name[-1]))

        if (populate_structs_from_files(results_dir, results=True) and
            write_mem_cache_to_tables(sanity_checks=True)):
            bibtask.write_message("All Done.",
                                  stream=sys.stdout, verbose=0)
        else:
            bibtask.write_message("Could not write data to the tables from %s"
                                  % (results_dir,),
                                  stream=sys.stdout, verbose=0)
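

# _write_data_files_to_db() above relies on a tail() helper that is not part
# of this snippet. A minimal sketch of what it might look like, assuming it
# simply returns the last non-empty line of a text file (the real helper may
# well read the file block-wise from the end instead):
def tail(file_name):
    '''Returns the last non-empty line of a text file, or "" if none.'''
    last_line = ""
    log = open(file_name)
    try:
        for line in log:
            if line.strip():
                last_line = line
    finally:
        log.close()
    return last_line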
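

# computation_process_starter() below logs through a debugmsg() helper that
# is also not shown here. Judging from the debug prints inlined in the second
# variant further down, a plausible sketch (an assumption, not the verified
# original) is a timestamped print guarded by bconfig.TABLES_UTILS_DEBUG:
def debugmsg(process_id, message):
    '''Prints a timestamped debug message tagged with the worker's ID.'''
    if bconfig.TABLES_UTILS_DEBUG:
        print time.strftime('%H:%M:%S') + ' ' + str(process_id) + \
            ': ' + message

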
def computation_process_starter(i, mp_termination_queue, job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token for the processes
        to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent
        accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list w/ the authors
    @type populate_doclist: boolean
    @param process_doclist: shall the doclist be processed in the computation?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param print_stats: print statistics about the computation at the end?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''
    while True:
        debugmsg(i, "getting name from queue")
        if job_mp_queue.qsize() > 0:
            job_last_names = job_mp_queue.get()
            debugmsg(i, "got queue item! %s items left in queue"
                        % job_mp_queue.qsize())

        else:
            debugmsg(i, "Queue is currently empty...")
            if not mp_termination_queue.empty():
                debugmsg(i, "Exit token there, Process %s salutes to quit!" % i)
                return
            else:
                debugmsg(i, "Exit token not present, continuing in 15s!")
                time.sleep(15)
                continue

        last_name_queue = Queue.Queue()

        last_name_queue.put(sorted(job_last_names))

        gc.collect()

        while True:

            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break
            debugmsg(i, "starting with queue: " + str(last_name_queue.queue))

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                lname = lname_list[0]
                del lname_list[0]
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue
    #        bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
    #                                % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)
            post_remove_names = set()

            # The following snippet finds additionally processed last names and
            # removes them from the processing queue. E.g. 't hooft and t'hooft
            for name in [row['name'] for row in dat.AUTHOR_NAMES
                         if not row['processed']]:
                potential_removal = name.split(',')[0]

                if potential_removal != lname:
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(25, "-> Removed %s entries from the "
                                        "computation list: %s"
                                        % (removed, removed_names))

            if lname_list:
                last_name_queue.put(lname_list)

            if write_to_db:
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while writing "
                                             "to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                         "Nothing will be written to the "
                                         "database from this job. That's ok, "
                                         "when excluding collections. Last "
                                         "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                dat.reset_mem_cache(True)
                gc.collect()
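

# Neither variant of computation_process_starter() shows the driver side of
# the protocol. A minimal sketch of how the workers above might be launched
# and fed; start_worker_processes and name_blocks are hypothetical names and
# the real driver lives elsewhere in the bibauthorid module:
import multiprocessing


def start_worker_processes(name_blocks, n_processes):
    '''Spawns workers, feeds one last-name block per queue element and
    then signals termination via the exit token.'''
    job_mp_queue = multiprocessing.Queue()
    mp_termination_queue = multiprocessing.Queue()
    db_write_lock = multiprocessing.Lock()

    # One block (list) of last names per queue element, as expected by the
    # variant above.
    for block in name_blocks:
        job_mp_queue.put(block)

    workers = [multiprocessing.Process(
        target=computation_process_starter,
        args=(i, mp_termination_queue, job_mp_queue, db_write_lock))
        for i in range(n_processes)]

    for worker in workers:
        worker.start()

    # The exit token: a worker keeps polling the job queue and only returns
    # once the queue is empty and this token is present.
    mp_termination_queue.put(True)

    for worker in workers:
        worker.join()

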
def computation_process_starter(i,
                                mp_termination_queue,
                                job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token for the processes
        to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent
        accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list w/ the authors
    @type populate_doclist: boolean
    @param process_doclist: shall the doclist be processed in the computation?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param print_stats: print statistics about the computation at the end?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''

    while True:
        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + str(i) + \
                ': getting name from queue'
        job_last_names_list = job_mp_queue.get()
        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + str(i) + ': got queue'
        if len(job_last_names_list) > 0:
            job_last_names = job_last_names_list[0]

            if len(job_last_names_list) > 1:
                job_mp_queue.put(job_last_names_list[1:])
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(i) + \
                        ': put non empty list'
            else:
                job_mp_queue.put([])
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(i) + \
                        ': put empty list'

        else:
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(i) + \
                    ': we got an empty list...'
            job_mp_queue.put([])
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(i) + \
                    ': put empty list'
            if not mp_termination_queue.empty():
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(i) + \
                        ': token there, exiting!'
                return
            else:
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(i) + \
                        ': token not there, continuing!'
                time.sleep(15)
                continue

        last_name_queue = Queue.Queue()

        last_name_queue.put(sorted(job_last_names))
        del job_last_names_list

        gc.collect()

        while True:

            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(i) + \
                    ': starting with queue: ' + str(last_name_queue.queue)

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                lname = lname_list[0]
                del lname_list[0]
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue
    #        bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
    #                                % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)
            post_remove_names = set()

            # The following snippet finds additionally processed last names and
            # removes them from the processing queue. E.g. 't hooft and t'hooft
            for name in [
                    row['name'] for row in dat.AUTHOR_NAMES
                    if not row['processed']
            ]:
                potential_removal = name.split(',')[0]

                if potential_removal != lname:
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(
                    25, "-> Removed %s entries from the "
                    "computation list: %s" % (removed, removed_names))

            if lname_list:
                last_name_queue.put(lname_list)

            if write_to_db:
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while writing "
                                             "to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                        "Nothing will be written to the "
                                        "database from this job. That's ok, "
                                        "when excluding collections. Last "
                                        "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                dat.reset_mem_cache(True)
                gc.collect()
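

# The second variant above uses a different queue protocol: the whole list of
# last-name blocks travels as a single queue element, and each worker pops
# the head and puts the tail back before processing. A sketch of the matching
# producer side; feed_job_list and name_blocks are hypothetical names
# (an assumption, not the verified original driver):
def feed_job_list(name_blocks, job_mp_queue, mp_termination_queue):
    '''Queues all name blocks as one shared list, then signals termination.'''
    job_mp_queue.put(name_blocks)
    # Once a worker finds the shared list empty and sees the exit token,
    # it returns instead of sleeping and retrying.
    mp_termination_queue.put(True)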