def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares the data files for grid jobs. Runs as a bibsched task.
    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first names)
            - if the number of records loaded into memory exceeds max_records,
              write the memory cache to files (cf. the Files section below).
              Each write-back goes into a newly created directory. The prefix
              for these job directories and the name of the data directory
              that holds them can both be specified.
    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        max_records = 4000

    bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records,),
                          stream=sys.stdout, verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names."
                          % (total), stream=sys.stdout, verbose=0)
    job_id = 0
    data_dir = ""

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir,)

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                    stream=sys.stdout, verbose=0)
            break

        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list[0]
            del lname_list[0]
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.'
                                     % (status, total, lname))
        bibtask.write_message(("Processing: %s (%d/%d).")
                                    % (lname, status, total),
                                    stream=sys.stdout, verbose=0)

        populate_doclist_for_author_surname(lname)

        post_remove_names = set()

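        # The following snippet finds last names that were processed along
        # the way (e.g. 't hooft and t'hooft) and removes them from the
        # processing queue later on.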
        for name in [row['name'] for row in dat.AUTHOR_NAMES
                     if not row['processed']]:
            potential_removal = "%s," % (name.split(',')[0],)

            if not potential_removal == "%s" % (lname,):
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message(("-> Removed %s entries from the "
                                    + "computation list: %s")
                                    % (removed, removed_names),
                                    stream=sys.stdout, verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

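        # Flush the in-memory cache to a new job directory once the record
        # threshold is reached (cf. max_records in the docstring).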
        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

            _write_to_files(work_dir, job_lnames)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

        _write_to_files(work_dir, job_lnames)

    return True
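
A minimal, self-contained sketch (not part of the original module) of the flush-on-threshold batching the docstring above describes: records are accumulated per surname, and the cache is written into a fresh job directory whenever it grows past max_records. fetch_records_for_surname and flush_cache are hypothetical stand-ins for populate_doclist_for_author_surname and _write_to_files.

import os


def batch_surnames_into_jobs(surnames, data_dir, workdir_prefix="job",
                             max_records=4000):
    cache = {}          # surname -> list of record ids held in memory
    cached_records = 0  # total number of records currently cached
    job_id = 0

    for surname in sorted(surnames):
        records = fetch_records_for_surname(surname)  # hypothetical helper
        cache[surname] = records
        cached_records += len(records)

        if cached_records >= max_records:
            # Threshold reached: write the cache into a new job directory.
            work_dir = os.path.join(data_dir, "%s%d" % (workdir_prefix, job_id))
            flush_cache(cache, work_dir)              # hypothetical helper
            cache, cached_records = {}, 0
            job_id += 1

    if cache:
        # Write whatever is left over after the last surname.
        work_dir = os.path.join(data_dir, "%s%d" % (workdir_prefix, job_id))
        flush_cache(cache, work_dir)

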
def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.
    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first names)
            - if number of documents loaded into memory exceeds max_records,
              write the memory cache into files (cf. Files section).
              Each write back procedure will happen into a newly created
              directory. The prefix for the respective job directory may
              be specified as well as the name of the data directory where
              these job directories will be created.
    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        max_records = 4000

    bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records, ),
                          stream=sys.stdout,
                          verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names." % (total),
                          stream=sys.stdout,
                          verbose=0)
    job_id = 0
    data_dir = ""

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir, )

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                  stream=sys.stdout,
                                  verbose=0)
            break

        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list[0]
            del (lname_list[0])
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.' %
                                     (status, total, lname))
        bibtask.write_message(
            ("Processing: %s (%d/%d).") % (lname, status, total),
            stream=sys.stdout,
            verbose=0)

        populate_doclist_for_author_surname(lname)

        post_remove_names = set()

        for name in [
                row['name'] for row in dat.AUTHOR_NAMES if not row['processed']
        ]:
            potential_removal = "%s," % (name.split(',')[0], )

            if not potential_removal == "%s" % (lname, ):
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message(
                ("-> Removed %s entries from the " + "computation list: %s") %
                (removed, removed_names),
                stream=sys.stdout,
                verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

            _write_to_files(work_dir, job_lnames)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

        _write_to_files(work_dir, job_lnames)

    return True
def computation_process_starter(i, mp_termination_queue, job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token that tells the
        processes to terminate once all queue elements have been processed
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list with the
        documents of the authors?
    @type populate_doclist: boolean
    @param process_doclist: shall the populated document list be processed?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left over after the first pass?
    @type process_orphans: boolean
    @param print_stats: print statistics about the computation?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''
    while True:
        debugmsg(i, "getting name from queue")
        if job_mp_queue.qsize() > 0:
            job_last_names = job_mp_queue.get()
            debugmsg(i, "got queue item! %s items left in queue"
                        % job_mp_queue.qsize())

        else:
            debugmsg(i, "Queue is currently empty...")
            if not mp_termination_queue.empty():
                debugmsg(i, "Exit token there, Process %s salutes to quit!" % i)
                return
            else:
                debugmsg(i, "Exit token not present, continuing in 15s!")
                time.sleep(15)
                continue

        last_name_queue = Queue.Queue()

        last_name_queue.put(sorted(job_last_names))

        gc.collect()

        while True:

            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break
            debugmsg(i, "starting with queue: " + str(last_name_queue.queue))

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                lname = lname_list[0]
                del lname_list[0]
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue
    #        bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
    #                                % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)
            post_remove_names = set()

            # The following snippet finds additionally processed last names and
            # removes them from the processing queue. E.g. 't hooft and t'hooft
            for name in [row['name'] for row in dat.AUTHOR_NAMES
                         if not row['processed']]:
                potential_removal = "%s" % (name.split(',')[0])

                if not potential_removal == "%s" % (lname):
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(25, "-> Removed %s entries from the "
                                        "computation list: %s"
                                        % (removed, removed_names))

            if lname_list:
                last_name_queue.put(lname_list)

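            # Serialize the write-back: db_write_lock shields the database
            # from concurrent writes by the other worker processes.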
            if write_to_db:
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while writing "
                                             "to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                         "Nothing will be written to the "
                                         "database from this job. That's ok, "
                                         "when excluding collections. Last "
                                         "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                dat.reset_mem_cache(True)
                gc.collect()
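
The debugmsg() helper used above is not shown in this listing. A plausible minimal definition, modelled on the inline debug prints of the variant below and guarded by the same bconfig.TABLES_UTILS_DEBUG flag (an assumption, not the original code; it presumes bconfig is imported at module level as elsewhere in this file):

import sys
import time


def debugmsg(process_id, message):
    # Timestamped per-process debug output, mirroring the inline prints below.
    if bconfig.TABLES_UTILS_DEBUG:
        sys.stdout.write("%s %s: %s\n"
                         % (time.strftime('%H:%M:%S'), process_id, message))
        sys.stdout.flush()

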
def computation_process_starter(i,
                                mp_termination_queue,
                                job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token that tells the
        processes to terminate once all queue elements have been processed
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list with the
        documents of the authors?
    @type populate_doclist: boolean
    @param process_doclist: shall the populated document list be processed?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left over after the first pass?
    @type process_orphans: boolean
    @param print_stats: print statistics about the computation?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''

    while True:
        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + str(i) \
                + ': getting name from queue'
        job_last_names_list = job_mp_queue.get()
        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + str(i) + ': got queue'
        if len(job_last_names_list) > 0:
            job_last_names = job_last_names_list[0]

            if len(job_last_names_list) > 1:
                job_mp_queue.put(job_last_names_list[1:])
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(i) \
                        + ': put non empty list'
            else:
                job_mp_queue.put([])
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(i) \
                        + ': put empty list'

        else:
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(i) \
                    + ': we got an empty list...'
            job_mp_queue.put([])
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(i) \
                    + ': put empty list'
            if not mp_termination_queue.empty():
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(i) \
                        + ': token there, exiting!'
                return
            else:
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(i) \
                        + ': token not there, continuing!'
                time.sleep(15)
                continue

        last_name_queue = Queue.Queue()

        last_name_queue.put(sorted(job_last_names))
        del job_last_names_list

        gc.collect()

        while True:

            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(i) \
                    + ': starting with queue: ' + str(last_name_queue.queue)

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                lname = lname_list[0]
                del lname_list[0]
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue
    #        bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
    #                                % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)
            post_remove_names = set()

            # The following snippet finds additionally processed last names and
            # removes them from the processing queue. E.g. 't hooft and t'hooft
            for name in [row['name'] for row in dat.AUTHOR_NAMES
                         if not row['processed']]:
                potential_removal = "%s" % (name.split(',')[0])

                if not potential_removal == "%s" % (lname):
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(
                    25, "-> Removed %s entries from the "
                    "computation list: %s" % (removed, removed_names))

            if lname_list:
                last_name_queue.put(lname_list)

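            # As above, take the write lock so that only one worker writes
            # its memory cache to the database at a time.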
            if write_to_db:
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while writing "
                                             "to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                        "Nothing will be written to the "
                                        "database from this job. That's ok, "
                                        "when excluding collections. Last "
                                        "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                dat.reset_mem_cache(True)
                gc.collect()
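

For context, a hedged sketch (illustrative names, not part of the original module) of how a coordinator might drive the first worker variant above: a multiprocessing queue holding one last-name block per element, a shared write lock, and a termination queue that receives an exit token so idle workers know when to stop polling. Note that the second variant instead expects the full list of blocks as a single queue element and re-puts the tail itself.

import multiprocessing


def start_workers(name_blocks, worker, max_processes=4):
    # Shared primitives expected by computation_process_starter.
    job_mp_queue = multiprocessing.Queue()
    mp_termination_queue = multiprocessing.Queue()
    db_write_lock = multiprocessing.Lock()

    for block in name_blocks:       # each block is a list of last names
        job_mp_queue.put(block)

    # Exit token: workers drain the remaining blocks, then terminate.
    mp_termination_queue.put(True)

    processes = [multiprocessing.Process(target=worker,
                                         args=(pid, mp_termination_queue,
                                               job_mp_queue, db_write_lock))
                 for pid in range(max_processes)]

    for process in processes:
        process.start()
    for process in processes:
        process.join()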