def nameset_comp(c, lname, mp_variation_set):
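    '''
    Computes the set of last name variations found for 'lname' and pushes the
    result on the shared queue as [lname, list_of_variations].

    @param c: identifier of the calling worker (used in debug messages only)
    @type c: int
    @param lname: last name to compute the variation set for
    @type lname: string
    @param mp_variation_set: queue receiving the [lname, variations] results
    @type mp_variation_set: queue
    '''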
    try:
        debugmsg(c, "List_creator: %s" % str(c)
                 + "  working on " + str(lname.encode('UTF-8')))
    except UnicodeError:
        debugmsg(c, "List_creator: %s" % str(c) + "  working on "
                 + "<error encoding name>")

#    old_variations_set = mp_variaton_set.get()
#    mp_variaton_set.put(old_variations_set)
#    if lname in old_variations_set:
#        if bconfig.TABLES_UTILS_DEBUG:
#            print time.strftime('%H:%M:%S') + ' ' + "List_creator: %s" % str(c) + "  computation DROPPED."
#        return

    dat.reset_mem_cache(True)
    init_authornames(lname)
    nameset = set([x['name'].split(",")[0] for x in dat.AUTHOR_NAMES])

    debugmsg(c, "List_creator: %s" % str(c) + "  computation finished, pushing varset")

    mp_variation_set.put([lname, list(nameset)])

    debugmsg(c, "List_creator: %s" % str(c) + " current variations pushed")
def list_creation_process(mp_queue, job_last_names, mp_termination_queue):
    '''
    Sub process to build the pre-clustered last name blocks

    @param mp_queue: queue holding the last name blocks
    @type mp_queue: queue
    @param job_last_names: list of all last names in the db
    @type job_last_names: list of string
    @param mp_termination_queue: queue holding the exit token for the processes
        to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    '''
    #job_last_names = sorted(job_last_names, key=lambda k: len(k))
    variations_set = set()
    jl = []

    for lname in job_last_names:
        if lname in variations_set:
            continue

        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + "List_creator: working on " + str(lname.encode('UTF-8'))

        dat.reset_mem_cache(True)
        init_authornames(lname)
        nameset = set([x['name'].split(",")[0] for x in dat.AUTHOR_NAMES])

        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + "List_creator: computation finished, getting queue"

        jl[:] = mp_queue.get()

        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + "List_creator: appending " + str(nameset) + ' with still ' + str(len(jl)) + ' elements in queue'

        jl.append(list(nameset))
        mp_queue.put(jl)

        for n in nameset:
            variations_set.add(n)

    if bconfig.TABLES_UTILS_DEBUG:
        print time.strftime('%H:%M:%S') + ' ' + "List_creator: putting exit token"

    mp_termination_queue.put(True)
    return
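
# list_creation_process keeps exactly one list on mp_queue: it gets the
# current list, appends the freshly computed block and puts the list back,
# so the queue has to be seeded before the process starts.  A minimal wiring
# sketch (hypothetical driver, not part of the original module; it reuses the
# multiprocessing import from the sketch above):
def _start_list_creator(job_last_names):
    mp_queue = multiprocessing.Queue()
    mp_termination_queue = multiprocessing.Queue()
    # Seed with an empty block list, as required by 'jl[:] = mp_queue.get()'.
    mp_queue.put([])

    producer = multiprocessing.Process(
        target=list_creation_process,
        args=(mp_queue, job_last_names, mp_termination_queue))
    producer.start()
    return producer, mp_queue, mp_termination_queue
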
def _write_to_files(work_dir, job_lnames):
    '''
    Wrapper function around this internal write process.
    Triggers the write-back of the mem cache to the files.

    @param work_dir: where shall the files be stored?
    @type work_dir: string
    @param job_lnames: list of names
    @type job_lnames: list
    '''
    bibtask.task_update_progress('Writing to files in %s' % (work_dir))
    bibtask.write_message("Writing cluster with %s entries to "
                          "files in %s"
                          % (len(dat.RELEVANT_RECORDS), work_dir,),
                          stream=sys.stdout, verbose=0)

    if not os.path.exists(work_dir):
        os.mkdir(work_dir)

    write_mem_cache_to_files(work_dir, job_lnames)
    dat.reset_mem_cache(True)
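
# A minimal usage sketch for _write_to_files (a hypothetical driver; the
# directory naming and the per-block split are assumptions): the caller is
# expected to have filled the mem cache for one block of last names before
# flushing it to its own sub-directory.
def _dump_block(base_work_dir, block_index, job_lnames):
    work_dir = "%s/lnames_batch_%05d" % (base_work_dir, block_index)
    _write_to_files(work_dir, job_lnames)
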
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.

    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids])

        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                                "all loaded docs (%s)"
                                % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    print "SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!"
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
                                                        date=last_log[0][2])
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                    comment='bibauthorid_daemon, update_authorid_universe',
                    timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)), stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])

    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                                stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing cluster %s of %s: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing cluster %s of %s: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames, author_last_name,
                                 total_authors), stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(va_id,
                                                        "orig_authorname_id")

                        for an_list in [row['authornameids'] for row in
                                    dat.DOC_LIST if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This name's id is not in the list; ignore it
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id,))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                       "changes visible also on the front end and to "
                       "create person IDs for %s newly created and changed "
                       "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                       "with bibauthorid!")
def computation_process_starter(i, mp_termination_queue, job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token for the processes
        to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list with the authors?
    @type populate_doclist: boolean
    @param process_doclist: process the document list during the computation?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param print_stats: print statistics about the computation?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''
    while True:
        debugmsg(i, "getting name from queue")
        if job_mp_queue.qsize() > 0:
            job_last_names = job_mp_queue.get()
            debugmsg(i, "got queue item! %s items left in queue"
                        % job_mp_queue.qsize())

        else:
            debugmsg(i, "Queue is currently empty...")
            if not mp_termination_queue.empty():
                debugmsg(i, "Exit token there, Process %s salutes to quit!" % i)
                return
            else:
                debugmsg(i, "Exit token not present, continuing in 15s!")
                time.sleep(15)
                continue

        last_name_queue = Queue.Queue()

        last_name_queue.put(sorted(job_last_names))

        gc.collect()

        while True:

            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break
            debugmsg(i, "starting with queue: " + str(last_name_queue.queue))

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                lname = lname_list[0]
                del(lname_list[0])
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue
    #        bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
    #                                % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)
            post_remove_names = set()

            # The following snippet finds additionally processed last names and
            # removes them from the processing queue. E.g. 't hooft and t'hooft
            for name in [row['name'] for row in dat.AUTHOR_NAMES
                         if not row['processed']]:
                potential_removal = "%s" % (name.split(',')[0])

                if not potential_removal == "%s" % (lname):
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(25, "-> Removed %s entries from the "
                                        "computation list: %s"
                                        % (removed, removed_names))

            if lname_list:
                last_name_queue.put(lname_list)

            if write_to_db:
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while writing "
                                             "to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                         "Nothing will be written to the "
                                         "database from this job. That's ok, "
                                         "when excluding collections. Last "
                                         "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                dat.reset_mem_cache(True)
                gc.collect()
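
# 'debugmsg' is used by nameset_comp and by the variant of
# computation_process_starter above, but its definition is not part of this
# excerpt.  A sketch of what it presumably does, mirroring the gated,
# timestamped prints of the variant below (an assumption, not the original
# implementation):
def debugmsg(identity, msg):
    if bconfig.TABLES_UTILS_DEBUG:
        print time.strftime('%H:%M:%S') + ' ' + str(identity) + ': ' + str(msg)
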
def computation_process_starter(i,
                                mp_termination_queue,
                                job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token for the processes
        to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list with the authors?
    @type populate_doclist: boolean
    @param process_doclist: process the document list during the computation?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param print_stats: print statistics about the computation?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''

    while True:
        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + str(
                i) + ': getting name from queue'
        job_last_names_list = job_mp_queue.get()
        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + str(i) + ': got queue'
        if len(job_last_names_list) > 0:
            job_last_names = job_last_names_list[0]

            if len(job_last_names_list) > 1:
                job_mp_queue.put(job_last_names_list[1:])
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(
                        i) + ': put non empty list'
            else:
                job_mp_queue.put([])
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(
                        i) + ': put empty list'

        else:
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(
                    i) + ': we got an empty list...'
            job_mp_queue.put([])
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(
                    i) + ': put empty list'
            if not mp_termination_queue.empty():
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(
                        i) + ': token there, exiting!'
                return
            else:
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(
                        i) + ': token not there, continuing!'
                time.sleep(15)
                continue

        last_name_queue = Queue.Queue()

        last_name_queue.put(sorted(job_last_names))
        del (job_last_names_list)

        gc.collect()

        while True:

            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(
                    i) + ': starting with queue: ' + str(last_name_queue.queue)

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                lname = lname_list[0]
                del (lname_list[0])
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue
    #        bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
    #                                % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)
            post_remove_names = set()

            # The following snippet finds additionally processed last names and
            # removes them from the processing queue. E.g. 't hooft and t'hooft
            for name in [
                    row['name'] for row in dat.AUTHOR_NAMES
                    if not row['processed']
            ]:
                potential_removal = "%s" % (name.split(',')[0])

                if not potential_removal == "%s" % (lname):
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(
                    25, "-> Removed %s entries from the "
                    "computation list: %s" % (removed, removed_names))

            if lname_list:
                last_name_queue.put(lname_list)

            if write_to_db:
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while writing "
                                             "to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                        "Nothing will be written to the "
                                        "database from this job. That's ok, "
                                        "when excluding collections. Last "
                                        "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                dat.reset_mem_cache(True)
                gc.collect()
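
# Continuing the hypothetical wiring from the _start_list_creator sketch
# above: the computation workers can be attached to the same queues.  The
# worker count and the keyword choices are assumptions, and error handling
# is omitted.
def _start_disambiguation(job_last_names, n_workers=4):
    producer, job_mp_queue, mp_termination_queue = _start_list_creator(
        job_last_names)
    db_write_lock = multiprocessing.Lock()
    workers = []

    for i in range(n_workers):
        worker = multiprocessing.Process(
            target=computation_process_starter,
            args=(i, mp_termination_queue, job_mp_queue, db_write_lock),
            kwargs={'write_to_db': True})
        workers.append(worker)
        worker.start()

    producer.join()

    for worker in workers:
        worker.join()
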
def populate_structs_from_files(work_dir, results=False):
    '''
    Reads the content of the files in 'work_dir' and tries to load the
    contained data into the respective memory caches. These files are created
    by the daemon's -G or --prepare-grid function.

    The files to be read are:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param work_dir: the directory to read the files from
    @type work_dir: string
    @return: True if the process finished without errors. If reading any of
        the files fails, an exception is raised.
    @rtype: boolean
    '''
    if work_dir.endswith("/"):
        work_dir = work_dir[:-1]

    bconfig.LOGGER.log(25, "Reading files from %s to mem cache" % (work_dir, ))

    if not os.path.exists(work_dir):
        bconfig.LOGGER.critical("Job directory does not exist. Aborting.")
        raise IOError

    dat.reset_mem_cache(True)

    try:

        dfile = open("%s/authornames.dat" % (work_dir), "r")
        dat.AUTHOR_NAMES = loads(decompress(dfile.read()))
        dfile.close()

        dfile = open("%s/virtual_authors.dat" % (work_dir), "r")
        dat.VIRTUALAUTHORS = loads(decompress(dfile.read()))
        dfile.close()

        dfile = open("%s/virtual_author_data.dat" % (work_dir), "r")
        dat.VIRTUALAUTHOR_DATA = loads(decompress(dfile.read()))
        dfile.close()

        dfile = open("%s/virtual_author_clusters.dat" % (work_dir), "r")
        dat.VIRTUALAUTHOR_CLUSTERS = loads(decompress(dfile.read()))
        dfile.close()

        dfile = open("%s/virtual_author_cluster_cache.dat" % (work_dir), "r")
        dat.VIRTUALAUTHOR_CLUSTER_CACHE = loads(decompress(dfile.read()))
        dfile.close()

        dfile = open("%s/realauthors.dat" % (work_dir), "r")
        dat.REALAUTHORS = loads(decompress(dfile.read()))
        dfile.close()

        dfile = open("%s/realauthor_data.dat" % (work_dir), "r")
        dat.REALAUTHOR_DATA = loads(decompress(dfile.read()))
        dfile.close()

        dfile = open("%s/doclist.dat" % (work_dir), "r")
        dat.DOC_LIST = loads(decompress(dfile.read()))
        dfile.close()

        if not results:
            dfile = open("%s/records.dat" % (work_dir), "r")
            dat.RELEVANT_RECORDS = loads(decompress(dfile.read()))
            dfile.close()

        dfile = open("%s/ids.dat" % (work_dir), "r")
        dat.ID_TRACKER = loads(decompress(dfile.read()))
        dfile.close()

        dfile = open("%s/ra_va_cache.dat" % (work_dir), "r")
        dat.RA_VA_CACHE = loads(decompress(dfile.read()))
        dfile.close()

    except IOError, message:
        bconfig.LOGGER.exception("IOError while trying to read the data "
                                 "files: %s" % (message, ))
        raise Exception()

    return True
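
# The read pattern above repeats open / decompress / loads for every file.  A
# compact helper expressing the same pattern (a sketch; 'loads' and
# 'decompress' are the same functions the module already uses above):
def _load_struct(work_dir, filename):
    dfile = open("%s/%s" % (work_dir, filename), "r")
    try:
        return loads(decompress(dfile.read()))
    finally:
        dfile.close()

# Example: dat.AUTHOR_NAMES = _load_struct(work_dir, "authornames.dat")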