def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Runs as a task in bibsched.
    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first names)
            - if the number of documents loaded into memory exceeds
              max_records, write the memory cache into files (cf. Files
              section). Each write-back goes into a newly created
              directory. The prefix for the job directories may be
              specified, as well as the name of the data directory in
              which these job directories will be created.
    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except (ValueError, TypeError):
        max_records = 4000

    bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records,),
                          stream=sys.stdout, verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names."
                          % (total), stream=sys.stdout, verbose=0)
    job_id = 0
    data_dir = ""

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir,)

    job_lnames = []

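    # NOTE (added for clarity): each element taken from the queue is treated
    # as a list of surnames. The head of the list is processed; surnames that
    # turn out to be covered along the way are dropped from the tail, and the
    # non-empty remainder is put back onto the queue.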
    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                    stream=sys.stdout, verbose=0)
            break

        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list.pop(0)
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.'
                                     % (status, total, lname))
        bibtask.write_message(("Processing: %s (%d/%d).")
                                    % (lname, status, total),
                                    stream=sys.stdout, verbose=0)

        populate_doclist_for_author_surname(lname)

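        # NOTE (added for clarity): collect the surnames that were pulled in
        # while loading documents for `lname`; they no longer need jobs of
        # their own and are removed from the remaining work list below.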
        post_remove_names = set()

        for name in [row['name'] for row in dat.AUTHOR_NAMES
                     if not row['processed']]:
            potential_removal = "%s," % (name.split(',')[0],)

            if potential_removal != lname:
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message(("-> Removed %s entries from the "
                                    + "computation list: %s")
                                    % (removed, removed_names),
                                    stream=sys.stdout, verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

            _write_to_files(work_dir, job_lnames)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

        _write_to_files(work_dir, job_lnames)

    return True
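

# Usage sketch (an illustrative addition, not part of the original module):
# how the task above might be invoked. The wrapper name
# _example_prepare_jobs is hypothetical; everything it calls is defined or
# imported above.
def _example_prepare_jobs():
    # Flush the caches into job directories such as
    # <bconfig.FILE_PATH>/grid_data/job0, job1, ..., each holding at most
    # 2000 records.
    return _prepare_data_files_from_db(data_dir_name="grid_data",
                                       workdir_prefix="job",
                                       max_records=2000)

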
def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.
    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first names)
            - if number of documents loaded into memory exceeds max_records,
              write the memory cache into files (cf. Files section).
              Each write back procedure will happen into a newly created
              directory. The prefix for the respective job directory may
              be specified as well as the name of the data directory where
              these job directories will be created.
    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        max_records = 4000

    bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records, ),
                          stream=sys.stdout,
                          verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names." % (total),
                          stream=sys.stdout,
                          verbose=0)
    job_id = 0
    data_dir = ""

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir, )

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                  stream=sys.stdout,
                                  verbose=0)
            break

        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list[0]
            del (lname_list[0])
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.' %
                                     (status, total, lname))
        bibtask.write_message(
            ("Processing: %s (%d/%d).") % (lname, status, total),
            stream=sys.stdout,
            verbose=0)

        populate_doclist_for_author_surname(lname)

        post_remove_names = set()

        for name in [
                row['name'] for row in dat.AUTHOR_NAMES if not row['processed']
        ]:
            potential_removal = "%s," % (name.split(',')[0], )

            if not potential_removal == "%s" % (lname, ):
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message(
                ("-> Removed %s entries from the " + "computation list: %s") %
                (removed, removed_names),
                stream=sys.stdout,
                verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

            _write_to_files(work_dir, job_lnames)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)

        _write_to_files(work_dir, job_lnames)

    return True
def start_full_disambiguation(last_names="all",
                         process_orphans=False,
                         db_exists=False,
                         populate_doclist=True,
                         write_to_db=True,
                         populate_aid_from_personid=False):
    '''
    Starts the disambiguation process on a specified set of authors, or on
    all authors, while respecting the specified preconditions.

    @param last_names: "all" to process all authors or a specific last name
    @type last_names: string
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param db_exists: is there a data representation already in memory?
    @type db_exists: boolean
    @param populate_doclist: shall we populate the document list with the
        authors?
    @type populate_doclist: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    @param populate_aid_from_personid: Populate all AID tables as a backtrack
        from aidPERSONID?
    @type populate_aid_from_personid: boolean

    @return: True if the process went through smoothly, False if it didn't
    @rtype: boolean
    '''

    if bconfig.STANDALONE:
        bconfig.LOGGER.critical("This method is not available in "
                                "standalone mode.")
        return False

    if isinstance(last_names, str) and last_names != "all":
        job_last_names = [last_names]

    elif last_names == "all":
        job_last_names = find_all_last_names()

    elif isinstance(last_names, list):
        job_last_names = last_names

    else:
        bconfig.LOGGER.error("Failed to detect parameter type. Exiting.")
        return False

    if populate_aid_from_personid:
        dat.RUNTIME_CONFIG['populate_aid_from_personid'] = True

    if db_exists:
        db_lnames = get_existing_last_names()
        removed = 0

        for db_lname in db_lnames:
            if db_lname in job_last_names:
                job_last_names.remove(db_lname)
                removed += 1

        bconfig.LOGGER.log(25, "Removed %s entries from the computation "
                           "list, since they have already been processed "
                           "and written to the db" % (removed,))

        del db_lnames

    totale = len(job_last_names)

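    # NOTE (added for clarity): with multiprocessing available, one producer
    # process feeds surname lists into mp_queue while up to
    # BIBAUTHORID_MAX_PROCESSES worker processes consume them; the
    # single-slot mp_termination_queue lets the producer signal that the
    # list is complete. Without multiprocessing, the same two steps run
    # sequentially in this process.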
    if MP_ENABLED:
        mp_queue = multiprocessing.Queue()
        mp_termination_queue = multiprocessing.Queue(1)
        db_write_lock = multiprocessing.Lock()

        process_list = []

        lc = multiprocessing.Process(target=list_creation_process,
                                     name='baid-listgen',
                                     args=(mp_queue, job_last_names,
                                           mp_termination_queue))
        lc.start()
        del job_last_names

        for i in range(bconfig.BIBAUTHORID_MAX_PROCESSES):
            p = multiprocessing.Process(target=computation_process_starter,
                                        name='baid-worker-' + str(i),
                                        args=(i, mp_termination_queue,
                                              mp_queue, db_write_lock,
                                              populate_doclist, True,
                                              process_orphans, True,
                                              write_to_db))
            process_list.append(p)
            p.start()
        for p in process_list:
            p.join()

        lc.join()
    else:
        mp_queue = Queue.Queue()
        mp_termination_queue = Queue.Queue()
        db_write_lock = None
        list_creation_process(mp_queue, job_last_names, mp_termination_queue)
        del job_last_names
        computation_process_starter(0, mp_termination_queue, mp_queue,
                                    db_write_lock, populate_doclist, True,
                                    process_orphans, True, write_to_db)

    bconfig.LOGGER.log(25, "Done. Processed %s last names." % (totale,))

    return True
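

# Usage sketch (an illustrative addition, not part of the original module):
# two hypothetical call sites for start_full_disambiguation; the surname
# "Ellis" is an arbitrary example value.
def _example_run_disambiguation():
    # Disambiguate a single surname, then clean up orphans and write the
    # results back to the database.
    start_full_disambiguation(last_names="Ellis",
                              process_orphans=True,
                              write_to_db=True)

    # Re-run over all last names, skipping those already stored in the db.
    start_full_disambiguation(last_names="all", db_exists=True)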
def start_full_disambiguation(last_names="all",
                              process_orphans=False,
                              db_exists=False,
                              populate_doclist=True,
                              write_to_db=True):
    '''
    Starts the disambiguation process on a specified set of authors or on all
    authors while respecting specified preconditions

    @param last_names: "all" to process all authors or a specific last name
    @type last_names: string
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param db_exists: is there a data representation already in memory?
    @type db_exists: boolean
    @param populate_doclist: shall we populate the document list w/ the authors
    @type populate_doclist: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean

    @return: True if the process went through smoothly, False if it didn't
    @rtype: boolean
    '''

    if bconfig.STANDALONE:
        bconfig.LOGGER.critical("This method is not available in "
                                "standalone mode.")
        return False

    if isinstance(last_names, str) and last_names != "all":
        job_last_names = [last_names]

    elif last_names == "all":
        job_last_names = find_all_last_names()

    elif isinstance(last_names, list):
        job_last_names = last_names

    else:
        bconfig.LOGGER.error("Failed to detect parameter type. Exiting.")
        return False

    if db_exists:
        db_lnames = get_existing_last_names()

        for db_lname in db_lnames:
            if db_lname in job_last_names:
                job_last_names.remove(db_lname)

        bconfig.LOGGER.log(
            25, "Removed %s entries from the computation list, "
            " since they've already been processed and written "
            " to the db" % (len(db_lnames)))

        del (db_lnames)

    totale = len(job_last_names)

    if MP_ENABLED:
        mp_queue = multiprocessing.Queue()
        mp_termination_queue = multiprocessing.Queue(1)
        mp_queue.put([])
        db_write_lock = multiprocessing.Lock()

        process_list = []

        lc = multiprocessing.Process(target=list_creation_process,
                                     name='baid-listgen',
                                     args=(mp_queue, job_last_names,
                                           mp_termination_queue))
        lc.start()
        del (job_last_names)

        for i in range(bconfig.BIBAUTHORID_MAX_PROCESSES):
            p = multiprocessing.Process(
                target=computation_process_starter,
                name='baid-worker-' + str(i),
                args=(i, mp_termination_queue, mp_queue, db_write_lock,
                      populate_doclist, True, process_orphans, True,
                      write_to_db))
            process_list.append(p)
            p.start()
        for p in process_list:
            p.join()

        lc.join()
    else:
        mp_queue = Queue.Queue()
        mp_termination_queue = Queue.Queue()
        mp_queue.put([])
        db_write_lock = None
        list_creation_process(mp_queue, job_last_names, mp_termination_queue)
        del (job_last_names)
        computation_process_starter(0, mp_termination_queue, mp_queue,
                                    db_write_lock, populate_doclist, True,
                                    process_orphans, True, write_to_db)


#    status = 1
    bconfig.LOGGER.log(25, "Done. Loaded %s last names." % (totale))