def update_virtualauthor_record(va_id, tag, value):
    '''
    Change the value associated to the given tag for a certain
    virtual author.

    If no record with this (va_id, tag) combination exists yet, a new
    one is created via add_virtualauthor_record; otherwise every
    matching row is updated in place and the touch is logged.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param tag: tag to be updated
    @type tag: string
    @param value: value to be written for the tag
    @type value: string
    '''
    current_tag_value = [row for row in dat.VIRTUALAUTHOR_DATA
                         if ((row['virtualauthorid'] == va_id)
                             and (row['tag'] == tag))]

    if current_tag_value:
        # Reuse the rows found above instead of re-scanning
        # dat.VIRTUALAUTHOR_DATA with an identical comprehension.
        # Re-assigning row['tag'] = tag would be a no-op (the filter
        # already guarantees equality), so only the value is written.
        for tagupdate in current_tag_value:
            tagupdate['value'] = value

        dat.update_log("touched_vas", va_id)
    else:
        add_virtualauthor_record(va_id, tag, value)
def delete_virtual_author(va_id):
    '''
    Delete a virtual author while cascading the change through the
    different storages and instances: ra_data, ras, va_data and vas.

    @param va_id: the virtual author to be deleted
    @type va_id: int
    @return: success or failure of the process
    @rtype: boolean
    '''
    if not isinstance(va_id, int):
        try:
            va_id = int(va_id)
        except (ValueError, TypeError):
            raise ValueError("Expecting the va id to be an int.")

    # Drop every data record attached to this virtual author first.
    for record_tag in get_virtualauthor_record_tags():
        delete_virtualauthor_record(va_id, record_tag)

    # Take a snapshot of the matching entries so removals do not
    # disturb the iteration over dat.VIRTUALAUTHORS.
    doomed = [entry for entry in dat.VIRTUALAUTHORS
              if entry['virtualauthorid'] == va_id]

    for entry in doomed:
        dat.VIRTUALAUTHORS.remove(entry)

    dat.update_log("deleted_vas", va_id)

    return True
def update_virtualauthor_record(va_id, tag, value):
    '''
    Change the value associated to the given tag for a certain
    virtual author.

    Creates the record through add_virtualauthor_record when no row with
    this (va_id, tag) combination exists; otherwise updates every
    matching row in place and logs the touch.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param tag: tag to be updated
    @type tag: string
    @param value: value to be written for the tag
    @type value: string
    '''
    current_tag_value = [row for row in dat.VIRTUALAUTHOR_DATA
                         if ((row['virtualauthorid'] == va_id)
                             and (row['tag'] == tag))]

    if current_tag_value:
        # The original re-ran the identical comprehension here and also
        # re-assigned row['tag'] = tag, which is a no-op given the
        # filter; both redundancies are removed.
        for tagupdate in current_tag_value:
            tagupdate['value'] = value

        dat.update_log("touched_vas", va_id)
    else:
        add_virtualauthor_record(va_id, tag, value)
def delete_virtual_author(va_id):
    '''
    Remove a virtual author, cascading the deletion through all related
    storages and instances: ra_data, ras, va_data and vas.

    @param va_id: the virtual author to be deleted
    @type va_id: int
    @return: success or failure of the process
    @rtype: boolean
    '''
    if not isinstance(va_id, int):
        try:
            va_id = int(va_id)
        except (ValueError, TypeError):
            raise ValueError("Expecting the va id to be an int.")

    tags = get_virtualauthor_record_tags()

    # Cascade: first clear the per-tag data records of this author.
    for tag in tags:
        delete_virtualauthor_record(va_id, tag)

    # Materialize the hit list before removing, since removing while
    # scanning the same list would skip elements.
    stale_entries = [row for row in dat.VIRTUALAUTHORS
                     if row['virtualauthorid'] == va_id]

    for stale in stale_entries:
        dat.VIRTUALAUTHORS.remove(stale)

    dat.update_log("deleted_vas", va_id)

    return True
def delete_virtualauthor_record(va_id, tag):
    '''
    Remove a tag field from the virtualauthor_data table.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param tag: tag of the record to be deleted
    @type tag: string
    '''
    # The comprehension already materializes an independent list, so the
    # original's extra list() wrapper was redundant; iterating over this
    # snapshot keeps removals from dat.VIRTUALAUTHOR_DATA safe.
    matches = [row for row in dat.VIRTUALAUTHOR_DATA
               if ((row['virtualauthorid'] == va_id)
                   and (row['tag'] == tag))]

    for doomed in matches:
        dat.VIRTUALAUTHOR_DATA.remove(doomed)

    # NOTE(review): the log entry is written even when nothing matched,
    # mirroring the original behavior.
    dat.update_log("touched_vas", va_id)
def delete_virtualauthor_record(va_id, tag):
    '''
    Remove a tag field from the virtualauthor_data table.

    @param va_id: ID of the virtual author
    @type va_id: int
    @param tag: tag of the record to be deleted
    @type tag: string
    '''
    # Snapshot the matching rows so that removing them from
    # dat.VIRTUALAUTHOR_DATA cannot disturb the iteration. The list
    # comprehension already builds a new list, so no list() copy is
    # needed (the original wrapped it redundantly).
    to_remove = [row for row in dat.VIRTUALAUTHOR_DATA
                 if ((row['virtualauthorid'] == va_id)
                     and (row['tag'] == tag))]

    for row in to_remove:
        dat.VIRTUALAUTHOR_DATA.remove(row)

    # Logged unconditionally, as in the original.
    dat.update_log("touched_vas", va_id)
def create_new_realauthor(va_id):
    """
    Create a new real author connected to the given virtual author; the
    trust value for the virtual author is obviously 1 because this
    virtual author is incompatible with every other real author and is
    the first one in this new real author.

    RETURNS: the newly created realauthorid
    """
    new_ra_id = dat.increment_tracker("raid_counter")

    # First membership entry: the founding virtual author joins with
    # full confidence.
    membership = {"realauthorid": new_ra_id,
                  "virtualauthorid": va_id,
                  "p": 1}
    dat.REALAUTHORS.append(membership)
    dat.update_log("new_ras", new_ra_id)
    update_realauthor_data_by_vid(new_ra_id, va_id)

    return new_ra_id
def create_new_realauthor(va_id):
    """
    Create a fresh real author seeded with the given virtual author.
    The founding virtual author gets trust value 1, since it is the
    first (and only) member of the new real author.

    RETURNS: the newly created realauthorid
    """
    # Reserve a new real-author ID from the shared counter.
    ra_id = dat.increment_tracker("raid_counter")

    dat.REALAUTHORS.append({'realauthorid': ra_id,
                            'virtualauthorid': va_id,
                            'p': 1})
    dat.update_log("new_ras", ra_id)
    update_realauthor_data_by_vid(ra_id, va_id)

    return ra_id
def add_virtualauthor_record(va_id, tag, value):
    '''
    Adds a record to the virtualauthor_data table.

    @param va_id: id of the virtual author to attach the attribute to
    @type va_id: int
    @param tag: tag to alter the value of
    @type tag: string
    @param value: the new value of the tag
    @type value: string
    '''
    new_row = {'virtualauthorid': va_id,
               'tag': tag,
               'value': value}
    dat.VIRTUALAUTHOR_DATA.append(new_row)

    # Flag the virtual author as updated for any other tag; guarding on
    # the 'updated' tag itself avoids infinite mutual recursion with
    # update_virtualauthor_record.
    if tag != "updated":
        update_virtualauthor_record(va_id, 'updated', 'True')

    dat.update_log("touched_vas", va_id)
def add_virtualauthor_record(va_id, tag, value):
    '''
    Append one (tag, value) attribute row for a virtual author to the
    virtualauthor_data table.

    @param va_id: id of the virtual author to attach the attribute to
    @type va_id: int
    @param tag: tag to alter the value of
    @type tag: string
    @param value: the new value of the tag
    @type value: string
    '''
    dat.VIRTUALAUTHOR_DATA.append({'virtualauthorid': va_id,
                                   'tag': tag,
                                   'value': value})

    # Writing any ordinary tag also stamps the 'updated' marker; the
    # marker itself must not re-trigger this, or the two functions
    # would recurse into each other forever.
    if tag != "updated":
        update_virtualauthor_record(va_id, 'updated', 'True')

    dat.update_log("touched_vas", va_id)
def add_minimum_virtualauthor(orig_authornames_id, orig_name_string,
                              bibrec_id, author_index,
                              authorname_p_list, refrec=""):
    '''
    Adds a complete virtual author to the virtual authors table.

    @param orig_authornames_id: ID of the name in authornames table
    @type orig_authornames_id: int
    @param orig_name_string: String of the author name
    @type orig_name_string: string
    @param bibrec_id: ID of the record
    @type bibrec_id: int
    @param author_index: number of the author
    @type author_index: int
    @param authorname_p_list: list of authornamesID with confidence index
        associated [[id1,p], [id2,p], .., [idn,p]].  Currently unused;
        kept in the signature for backwards compatibility with callers.
    @type authorname_p_list: list of lists
    @param refrec: The bibref-bibrec pair of this author on a paper
        in format "100:14424,12441"
    @type refrec: string
    '''
    # The original contained a dead "if authorname_p_list: pass"
    # statement here; it had no effect and has been removed.
    current_va = create_new_virtualauthor(orig_authornames_id,
                                          orig_name_string)
    add_virtualauthor_record(current_va, 'bibrec_id', bibrec_id)
    add_virtualauthor_record(current_va, 'author_index', author_index)

    if refrec:
        pair = "%s,%s" % (refrec, bibrec_id)
        add_virtualauthor_record(current_va, 'bibrefrecpair', pair)

    dat.update_log("new_vas", current_va)
    update_virtualauthor_cluster(current_va)
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.
    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                        if row['bibrecid'] in bibrec_ids])
        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                               "all loaded docs (%s)" % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                # Resolve display name and the bibref(s) this author id
                # carries on the current record.
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    # More than one bibref for one author on one paper is
                    # unexpected; the first one is used anyway.
                    print "SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!"
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                # NOTE(review): when no bibref was found, refrec stays -1,
                # which is truthy, so this branch still passes -1 as refrec
                # to add_minimum_virtualauthor and the elif below is only
                # reached when author_name is empty-safe — confirm intended.
                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)

    # Determine the time window to process from the daemon's own log.
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
                                                        date=last_log[0][2])
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)), stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])
    else:
        # No previous run logged: nothing to diff against, so bail out.
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            # Group updated records by exact author name string; one
            # entry per distinct db_name.
            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    # Process one last-name cluster at a time, rebuilding the memory
    # cache for each cluster.
    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames,
                                 author_last_name, total_authors),
                              stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                # NOTE(review): 'rec' here is left over from the record
                # reading loop above, not one of this author's records —
                # the logged record id may be misleading.
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        # Detach the virtual author from any real author
                        # before deleting it.
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(va_id,
                                                        "orig_authorname_id")

                        for an_list in [row['authornameids']
                                        for row in dat.DOC_LIST
                                        if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            # Collect the real-author ids touched by this cluster so the
            # personid update below can be restricted to them.
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # Propagate the changed real authors to the person-ID layer.
    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id,))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                           "changes visible also on the front end and to "
                           "create person IDs for %s newly created and changed "
                           "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                           "with bibauthorid!")
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.
    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''

    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                        if row['bibrecid'] in bibrec_ids])
        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                               "all loaded docs (%s)" % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                # Resolve display name and the bibref(s) this author id
                # carries on the current record.
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    # More than one bibref for one author on one paper is
                    # unexpected; the first one is used anyway.
                    print "SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!"
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                # NOTE(review): when no bibref was found, refrec stays -1,
                # which is truthy, so this branch still passes -1 as refrec
                # to add_minimum_virtualauthor — confirm this is intended.
                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)

    # Determine the time window to process from the daemon's own log.
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        #select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
                                                        date=last_log[0][2])
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)), stream=sys.stdout,
                              verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])
    else:
        # No previous run logged: nothing to diff against, so bail out.
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s"
                                     % (rec))
                continue

            # Group updated records by exact author name string; one
            # entry per distinct db_name.
            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    # Process one last-name cluster at a time, rebuilding the memory
    # cache for each cluster.
    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames,
                                 author_last_name, total_authors),
                              stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                # NOTE(review): 'rec' here is left over from the record
                # reading loop above, not one of this author's records —
                # the logged record id may be misleading.
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in authornames "
                                     "and will be skipped. You might want "
                                     "to run authornames update before?"
                                     % (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        # Detach the virtual author from any real author
                        # before deleting it.
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(va_id,
                                                        "orig_authorname_id")

                        for an_list in [row['authornameids']
                                        for row in dat.DOC_LIST
                                        if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            # Collect the real-author ids touched by this cluster so the
            # personid update below can be restricted to them.
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    # Propagate the changed real authors to the person-ID layer.
    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id, ))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                           "changes visible also on the front end and to "
                           "create person IDs for %s newly created and changed "
                           "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                           "with bibauthorid!")