def find_and_process_updates(process_initials):
    """
    Works through the virtual author process queue and hands every eligible
    virtual author to add_virtualauthor(), the function responsible for
    assigning it to a real author.

    If the queue is empty on entry, init_va_process_queue() is called first.
    For every processed virtual author the 'updated' record is deleted
    before add_virtualauthor() is invoked. Virtual authors that are not
    eligible are taken off the queue and otherwise left untouched.

    @param process_initials: If names with initials only shall be processed
        or not
    @type process_initials: boolean
    """
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    # The queue is always looked up through `dat` (no local alias) so that
    # the object in use is whatever `dat` currently holds.
    while not dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
        name_records = (
            bibauthorid_virtualauthor_utils.get_virtualauthor_records(
                va_id, tag="orig_name_string"))
        va_name = name_records[0]["value"]

        # Short-circuit: the name is only split when initials-only names
        # are to be left out. Element [2] of the split result decides;
        # presumably it holds the full first names -- TODO confirm.
        if not (process_initials
                or bibauthorid_utils.split_name_parts(va_name)[2]):
            continue

        bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
            va_id, "updated")
        bconfig.LOGGER.log(
            25, "|> Inserting VA: %s Orig. name: %s" % (va_id, va_name))
        add_virtualauthor(va_id)

    bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
def remove_va_from_ra(ra_id, va_id):
    """
    Detaches a virtual author from a real author.

    Every row of dat.REALAUTHORS that links the given real author to the
    given virtual author is removed; afterwards the 'connected' record of
    the virtual author is deleted.

    @param ra_id: id of the real author to be altered
    @type ra_id: int
    @param va_id: id of the virtual author to be removed from the real
        author
    @type va_id: int
    """
    # Collect the matches first: removing from the list while scanning it
    # would skip rows.
    attachments = [
        entry for entry in dat.REALAUTHORS
        if entry["realauthorid"] == ra_id
        and entry["virtualauthorid"] == va_id
    ]

    for entry in attachments:
        dat.REALAUTHORS.remove(entry)

    bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
        va_id, "connected")
def remove_va_from_ra(ra_id, va_id):
    '''
    Removes the attachment of a virtual author to a real author.

    All rows of dat.REALAUTHORS whose 'realauthorid' and 'virtualauthorid'
    equal the given ids are dropped, then the 'connected' record of the
    virtual author is deleted.

    @param ra_id: id of the real author to be altered
    @type ra_id: int
    @param va_id: id of the virtual author to be removed from the real
        author
    @type va_id: int
    '''
    def is_attachment(row):
        # True for rows linking exactly this real/virtual author pair
        return (row['realauthorid'] == ra_id
                and row['virtualauthorid'] == va_id)

    # Snapshot the matching rows before mutating the list they come from.
    doomed_rows = [row for row in dat.REALAUTHORS if is_attachment(row)]

    for doomed in doomed_rows:
        dat.REALAUTHORS.remove(doomed)

    bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
        va_id, 'connected')
def find_and_process_updates(process_initials):
    '''
    Takes virtual authors off the process queue one by one and passes each
    eligible one to add_virtualauthor(), which is responsible for assigning
    the virtual author to a real author.

    An empty queue is (re)initialised through init_va_process_queue()
    before processing starts. The 'updated' record of a virtual author is
    deleted right before it is handed over. Virtual authors that are not
    eligible are dequeued without any further action.

    @param process_initials: If names with initials only shall be processed
        or not
    @type process_initials: boolean
    '''
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    while True:
        if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
            return

        va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()
        records = bibauthorid_virtualauthor_utils.get_virtualauthor_records(
            va_id, tag='orig_name_string')
        va_name = records[0]['value']

        if process_initials:
            eligible = True
        else:
            # Element [2] of the split name decides; presumably the full
            # first names -- TODO confirm against split_name_parts().
            eligible = bibauthorid_utils.split_name_parts(va_name)[2]

        if eligible:
            bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
                va_id, 'updated')
            bconfig.LOGGER.log(
                25, "|> Inserting VA: %s Orig. name: %s" % (va_id, va_name))
            add_virtualauthor(va_id)
def add_virtualauthor(va_id, multi_va_to_ra=False):
    """
    Adds a new virtual author to the real authors system.

    If the virtual author is not attached to any real author yet, the real
    authors recorded for its cluster (taken from dat.RA_VA_CACHE or, on a
    cache miss, from update_ralist_cache()) are compared against it with
    cmp_virtual_to_real_author():

      - no candidate real author, or the best compatibility is below the
        adding threshold: a new real author is created;
      - exactly one candidate reaches the best compatibility: the virtual
        author is added to that real author;
      - several candidates share the best compatibility: the virtual
        author is added to all of them if multi_va_to_ra is set;
        otherwise it is marked as not connected and skipped.

    Unless the virtual author was skipped, its 'connected' record is set
    to 'True'. In every case its 'updated' record is deleted.

    @param va_id: Virtualauthor ID
    @type va_id: int
    @param multi_va_to_ra: attach the virtual author to every real author
        that shares the best compatibility instead of skipping it
    @type multi_va_to_ra: boolean
    """
    addstart = time.time()
    adding_threshold = bconfig.REALAUTHOR_VA_ADD_THERSHOLD

    # The configuration value ["-1"] is replaced by the default of 0.7.
    if adding_threshold == ["-1"]:
        adding_threshold = 0.7

    already_existing = get_realauthors_by_virtuala_id(va_id)
    ralist = []

    if len(already_existing) == 0:
        start = time.time()
        va_cluster = (
            bibauthorid_virtualauthor_utils.get_cluster_va_ids_from_va_id(
                va_id))
        va_hash = hash(str(va_cluster))

        if va_hash in dat.RA_VA_CACHE:
            ralist_raw = dat.RA_VA_CACHE[va_hash]
            bconfig.LOGGER.debug("|-> Cache Hit for va cluster")
        else:
            bconfig.LOGGER.debug("|-> Cache Fail--Generating new hash")
            ralist_raw = update_ralist_cache(va_cluster, va_hash)

        # Candidate real authors: everything listed for the cluster except
        # entries that belong to this very virtual author, de-duplicated.
        ralist = list(set([ids["ra_id"] for ids in ralist_raw
                           if ids["va_id"] != va_id]))

        if ralist:
            compatibilities = [cmp_virtual_to_real_author(va_id, ra_id)
                               for ra_id in ralist]
            best_compatibility = max(compatibilities)

            if best_compatibility < adding_threshold:
                bconfig.LOGGER.log(
                    25,
                    "|-> Creating NEW real author for this"
                    " virtual author (compatibility below adding threshold"
                    " of other RAs).")
                create_new_realauthor(va_id)
                update_ralist_cache(va_cluster, va_hash)
            else:
                tie_count = compatibilities.count(best_compatibility)

                if tie_count == 1:
                    index = compatibilities.index(best_compatibility)
                    add_realauthor_va(ralist[index], va_id,
                                      best_compatibility)
                    bconfig.LOGGER.log(
                        25,
                        "|-> Adding to real author #%s"
                        " with a compatability of %.2f"
                        % (ralist[index], best_compatibility))
                elif tie_count > 1:
                    if multi_va_to_ra:
                        bconfig.LOGGER.log(
                            25,
                            "|-> virtual author"
                            " comaptible with more than one realauthor.")
                        # Positions of all tied candidates. Probing with
                        # list.index(value, start) for every start raises
                        # ValueError once start is past the last match,
                        # so the positions are gathered with enumerate().
                        indexes = [
                            position for position, value
                            in enumerate(compatibilities)
                            if value == best_compatibility
                        ]
                        bconfig.LOGGER.log(
                            25,
                            "|-> virtual author"
                            " will be attached to %s real authors"
                            % (len(indexes)))

                        for position in indexes:
                            add_realauthor_va(ralist[position], va_id,
                                              best_compatibility)
                            bconfig.LOGGER.log(
                                25,
                                "|--> Adding to real author"
                                " #%s with a compatability of %.2f"
                                % (ralist[position], best_compatibility))
                    else:
                        bconfig.LOGGER.log(
                            25,
                            "|-> virtual author"
                            " comaptible with more than one realauthor..."
                            "skipped for now.")
                        bconfig.LOGGER.log(
                            25,
                            "|> The (skipped) comparison "
                            "with %s real authors took %.2fs"
                            % (len(ralist), time.time() - start))
                        # Skipped virtual authors are flagged as not
                        # connected and leave the function right here.
                        bibauthorid_virtualauthor_utils.update_virtualauthor_record(
                            va_id, "connected", "False")
                        bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
                            va_id, "updated")
                        return
        else:
            bconfig.LOGGER.log(
                25,
                "|-> Creating NEW real author for this"
                " Virtual Author (currently, no real author exists)")
            create_new_realauthor(va_id)
            update_ralist_cache(va_cluster, va_hash)

    bibauthorid_virtualauthor_utils.update_virtualauthor_record(
        va_id, "connected", "True")
    bibauthorid_virtualauthor_utils.delete_virtualauthor_record(
        va_id, "updated")
    bconfig.LOGGER.log(
        25,
        "|> The comparison with %s real authors took %.2fs"
        % (len(ralist), time.time() - addstart))
def add_virtualauthor(va_id, multi_va_to_ra=False):
    '''
    Adds a new virtual author to the real authors system.

    A virtual author that is not attached to any real author yet is
    compared, through cmp_virtual_to_real_author(), against the real
    authors recorded for its cluster (read from dat.RA_VA_CACHE or, on a
    cache miss, obtained from update_ralist_cache()):

      - no candidate real author, or the best compatibility is below the
        adding threshold: a new real author is created;
      - exactly one candidate reaches the best compatibility: the virtual
        author is added to that real author;
      - several candidates share the best compatibility: the virtual
        author is added to all of them if multi_va_to_ra is set;
        otherwise it is marked as not connected and skipped.

    Unless the virtual author was skipped, its 'connected' record is set
    to 'True'. In every case its 'updated' record is deleted.

    @param va_id: Virtualauthor ID
    @type va_id: int
    @param multi_va_to_ra: attach the virtual author to every real author
        that shares the best compatibility instead of skipping it
    @type multi_va_to_ra: boolean
    '''
    addstart = time.time()
    adding_threshold = bconfig.REALAUTHOR_VA_ADD_THERSHOLD

    # The configuration value ["-1"] is replaced by the default of 0.7.
    if adding_threshold == ["-1"]:
        adding_threshold = 0.7

    already_existing = get_realauthors_by_virtuala_id(va_id)
    ralist = []

    if len(already_existing) == 0:
        start = time.time()
        va_cluster = (bibauthorid_virtualauthor_utils.
                      get_cluster_va_ids_from_va_id(va_id))
        va_hash = hash(str(va_cluster))

        if va_hash in dat.RA_VA_CACHE:
            ralist_raw = dat.RA_VA_CACHE[va_hash]
            bconfig.LOGGER.debug("|-> Cache Hit for va cluster")
        else:
            bconfig.LOGGER.debug("|-> Cache Fail--Generating new hash")
            ralist_raw = update_ralist_cache(va_cluster, va_hash)

        # Candidate real authors: everything listed for the cluster except
        # entries that belong to this very virtual author, de-duplicated.
        ralist = [ids['ra_id'] for ids in ralist_raw
                  if ids['va_id'] != va_id]
        ralist = list(set(ralist))

        if ralist:
            scores = [cmp_virtual_to_real_author(va_id, ra_id)
                      for ra_id in ralist]
            top_score = max(scores)

            if top_score < adding_threshold:
                bconfig.LOGGER.log(
                    25,
                    "|-> Creating NEW real author for this"
                    " virtual author (compatibility below adding threshold"
                    " of other RAs).")
                create_new_realauthor(va_id)
                update_ralist_cache(va_cluster, va_hash)
            else:
                top_count = scores.count(top_score)

                if top_count == 1:
                    index = scores.index(top_score)
                    add_realauthor_va(ralist[index], va_id, top_score)
                    bconfig.LOGGER.log(
                        25,
                        "|-> Adding to real author #%s"
                        " with a compatability of %.2f"
                        % (ralist[index], top_score))
                elif top_count > 1:
                    if multi_va_to_ra:
                        bconfig.LOGGER.log(
                            25,
                            "|-> virtual author"
                            " comaptible with more than one realauthor.")
                        # Positions of all tied candidates. Probing with
                        # list.index(value, start) for every start raises
                        # ValueError once start is past the last match,
                        # so the positions are gathered with enumerate().
                        indexes = [pos for pos, score in enumerate(scores)
                                   if score == top_score]
                        bconfig.LOGGER.log(
                            25,
                            "|-> virtual author"
                            " will be attached to %s real authors"
                            % (len(indexes)))

                        for pos in indexes:
                            add_realauthor_va(ralist[pos], va_id, top_score)
                            bconfig.LOGGER.log(
                                25,
                                "|--> Adding to real author"
                                " #%s with a compatability of %.2f"
                                % (ralist[pos], top_score))
                    else:
                        bconfig.LOGGER.log(
                            25,
                            "|-> virtual author"
                            " comaptible with more than one realauthor..."
                            "skipped for now.")
                        bconfig.LOGGER.log(
                            25,
                            "|> The (skipped) comparison "
                            "with %s real authors took %.2fs"
                            % (len(ralist), time.time() - start))
                        # Skipped virtual authors are flagged as not
                        # connected and leave the function right here.
                        (bibauthorid_virtualauthor_utils.
                         update_virtualauthor_record(va_id, 'connected',
                                                     'False'))
                        (bibauthorid_virtualauthor_utils.
                         delete_virtualauthor_record(va_id, 'updated'))
                        return
        else:
            bconfig.LOGGER.log(
                25,
                "|-> Creating NEW real author for this"
                " Virtual Author (currently, no real author exists)")
            create_new_realauthor(va_id)
            update_ralist_cache(va_cluster, va_hash)

    (bibauthorid_virtualauthor_utils.
     update_virtualauthor_record(va_id, 'connected', 'True'))
    (bibauthorid_virtualauthor_utils.
     delete_virtualauthor_record(va_id, 'updated'))
    bconfig.LOGGER.log(
        25,
        "|> The comparison with %s real authors took %.2fs"
        % (len(ralist), time.time() - addstart))