def test_create_unified_name(self):
    """bibauthorid - test creation of unified name strings"""
    # Each pair is (raw input, expected unified form). Every expected value
    # ends with a single trailing space, which is part of the contract
    # being pinned here.
    expectations = (
        ('this, isa fullname', 'this, I. F. '),
        ('this isa fullname', 'fullname, T. I. '),
        ('', ', '),
        ('Strange$![+{&]+)= Chars, Twonames',
         'Strange$![+{&]+)= Chars, T. '),
    )
    for raw_name, unified in expectations:
        self.assertEqual(unified, baidu.create_unified_name(raw_name))
def hash_coauthor_set(coauthors):
    '''
    Condenses a list of coauthor names into a single hash value.

    Intended for the case where a collaboration is not tagged as such in
    the appropriate MARC21 field: every name is converted to its unified
    representation, surrounding whitespace is stripped, the results are
    sorted, and the string form of that sorted list is hashed.

    A collaboration is defined as a group of authors larger than the value
    MAX_COAUTHORS defined in the configuration file. MAX_COAUTHORS
    defaults to 60 people.

    @param coauthors: a list of names
    @type coauthors: list of strings

    @return: the built-in hash() of the sorted, unified list's string form
    @rtype: int
    '''
    # NOTE(review): built-in hash() of a str is salted per process on
    # Python 3 unless PYTHONHASHSEED is pinned, so the value may not be
    # comparable across interpreter runs - confirm the target runtime.
    unified_names = sorted(create_unified_name(entry).strip()
                           for entry in coauthors)
    return hash(str(unified_names))
def get_clusterids_from_name(name, return_matching=False):
    '''
    Collects the cluster IDs that fit the parameter 'name'.

    The name is unified, cleaned and cut to at most 150 characters. If the
    part after the first comma splits into three to five dot-separated
    pieces, the first piece is kept in place and every ordering of the
    remaining pieces is looked up; the IDs of all lookups are merged into
    one set. Otherwise a single lookup with the cleaned name is done and
    its result is handed back as is. The lookups themselves are delegated
    to _get_clusterids_from_name.

    @param name: The name to be on the lookout for.
    @type name: string
    @param return_matching: also return the reference name's matching
        cluster
    @type return_matching: boolean

    @return:
        if return_matching: list of 1) the cluster IDs
                                    2) the cluster ID matching the name
        if not return_matching: the cluster IDs
    @rtype:
        if return_matching: list of (collection of int, int)
        if not return_matching: collection of int (a set when several
            orderings were looked up, otherwise whatever the lookup
            helper returned)
    '''
    # Slicing is a no-op for names that are already short enough.
    lookup_name = clean_name_string(create_unified_name(name))[:150]

    found_ids = set()
    matched_id = -1
    initials = ""

    # Ignoring the final character and all blanks, whatever follows the
    # first comma is treated as the dot-separated initials.
    if "," in lookup_name[:-1]:
        name_parts = lookup_name[:-1].replace(' ', '').split(',')
        if name_parts[1]:
            initials = name_parts[1].split('.')

    if 2 < len(initials) <= 5:
        # Surname and first initial stay fixed; the rest is permuted.
        stem = "%s, %s." % (lookup_name.split(',')[0], initials[0])

        for ordering in permutations(initials[1:]):
            candidate = "%s %s." % (stem, ". ".join(ordering))
            result = _get_clusterids_from_name(candidate, return_matching)

            if return_matching:
                # The match reported by the last lookup wins.
                matched_id = result[1]
                found_ids.update(result[0])
            else:
                found_ids.update(result)
    else:
        result = _get_clusterids_from_name(lookup_name, return_matching)

        if return_matching:
            matched_id = result[1]
            found_ids = result[0]
        else:
            found_ids = result

    if return_matching:
        return [found_ids, matched_id]
    return found_ids
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Reads the coauthors / collaboration of a virtual author's record and
    either stores them on a real author or hands them back.

    With a real author ID greater than -1, the information is written to
    that real author as "coauthor" data of the form "<name>;;<value>":
    the unified collaboration name if there is a collaboration, otherwise
    one entry per unified coauthor name, or a single hash of the coauthor
    list when there are more than MAX_COAUTHORS coauthors. With the
    default ra_id of -1 nothing is stored and the data is returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True if ra_id > -1; otherwise the collaboration values if
        there are any, else the coauthor values
    @rtype: True or the collection returned by
        get_field_values_on_condition
    '''
    bibrec_id = ""
    authorname_id = -1

    # If a tag occurs more than once, the last occurrence wins.
    for record in get_virtualauthor_records(va_id):
        tag = record['tag']
        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    names = get_name_and_db_name_strings(authorname_id)
    author_name = names["name"]
    logger = bconfig.LOGGER

    logger.info("| Reading coauthors for va %s: %s recid %s"
                % (va_id, author_name, bibrec_id))

    # Author fields 100/700, subfield a, excluding the author's own name.
    coauthors = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a', names["db_name"], "!=")
    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    if not (coauthors or collaboration):
        logger.info("|-> No coauthors and no collaboration found "
                    "for this author on this record")
    elif not ra_id:
        # NOTE(review): with the default of -1 this branch is only taken
        # for ra_id == 0; it may have been meant for "no ra_id given" -
        # confirm before changing.
        if collaboration:
            first_collaboration = list(collaboration)[0]
            logger.info("|-> Collaboration found: %s" % first_collaboration)
        else:
            logger.info("|-> Coauthors found: %s" % len(coauthors))

    coauthor_limit = MAX_COAUTHORS

    # Read-only mode: nothing is stored, the data is handed back.
    if not ra_id > -1:
        return collaboration or coauthors

    def _store(value):
        # Attach one "<author name>;;<value>" coauthor entry to ra_id.
        set_realauthor_data(ra_id, "coauthor",
                            "%s;;%s" % (author_name, value))

    if collaboration:
        # Only the first collaboration value is stored.
        _store(create_unified_name(list(collaboration)[0].lower()))
    elif len(coauthors) <= coauthor_limit:
        for coauthor in coauthors:
            _store(create_unified_name(coauthor.lower()))
    else:
        digest = hash_coauthor_set(coauthors)
        logger.info("|--> Coauthor # > %s. To preserve"
                    " information, a hash will be stored: %s"
                    % (coauthor_limit, digest))
        _store(digest)

    return True
def compare_va_to_ra(va_id, ra_id):
    '''
    Scores how well the coauthors of a virtual author match the coauthor
    data stored for a real author.

    If the virtual author's record has more than MAX_COAUTHORS entries,
    only the hash of that list is compared against the stored values. In
    the regular case the unified names of both sides are intersected; a
    shared name containing "ollaboration" counts as a certain match,
    otherwise the score grows with the number of shared names.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real
        author
    @rtype: float (the no-information paths return the int 0)
    '''
    logger = bconfig.LOGGER
    logger.info("|-> Start of coauthorship comparison (va %s : ra %s)"
                % (va_id, ra_id))

    stored_rows = get_realauthor_data(ra_id, "coauthor")
    paper_coauthors = get_information_from_dataset(va_id)
    coauthor_limit = MAX_COAUTHORS

    if len(stored_rows) == 0 and len(paper_coauthors) == 0:
        logger.info("|-> End of coauthorship comparison (Sets empty)")
        return 0

    if len(paper_coauthors) > coauthor_limit:
        logger.info("|--> Many coauthors found. Will try hash"
                    " values for collaboration testing.")
        digest = str(hash_coauthor_set(paper_coauthors))

        # Stored values look like "<name>;;<payload>"; compare the payload.
        if any(row['value'].split(";;")[1] == digest
               for row in stored_rows):
            logger.info("|---> Hash found! Assuming "
                        "collaboration attachment.")
            return 1.0

        logger.info("|---> Hash NOT found. Skipping metric.")
        return 0

    ra_names = set(row['value'].split(";;")[1] for row in stored_rows)
    va_names = set(create_unified_name(entry.lower())
                   for entry in paper_coauthors)
    shared = ra_names & va_names

    # A collaboration name present on both sides settles the comparison.
    for common_name in shared:
        if "ollaboration" in common_name:
            logger.info("|--> Found matching collaboration: %s"
                        % common_name)
            return 1.0

    # Saturating score: 0 for no shared names, approaching 1 as more match.
    if len(paper_coauthors) > 0:
        certainty = 1 - exp(-.8 * pow(len(shared), .7))
    else:
        certainty = 0

    # NOTE(review): certainty is a fraction below 1 but is logged with a
    # percent sign - confirm whether a scaled value was intended.
    logger.info("|--> Found %s matching coauthors out of %s "
                "on the paper. Result: %s%% similarity"
                % (len(shared), len(paper_coauthors), certainty))

    return certainty
def get_information_from_dataset(va_id, ra_id=-1):
    '''
    Reads the coauthors / collaboration of a virtual author's record and
    either stores them on a real author or hands them back.

    With a real author ID greater than -1, the information is written to
    that real author as "coauthor" data of the form "<name>;;<value>":
    the unified collaboration name if there is a collaboration, otherwise
    one entry per unified coauthor name, or a single hash of the coauthor
    list when there are more than MAX_COAUTHORS coauthors. With the
    default ra_id of -1 nothing is stored and the data is returned.

    @param va_id: Virtual author ID to get the information from
    @type va_id: int
    @param ra_id: Real author ID to set information for.
    @type ra_id: int

    @return: True if ra_id > -1; otherwise the collaboration values if
        there are any, else the coauthor values
    @rtype: True or the collection returned by
        get_field_values_on_condition
    '''
    bibrec_id = ""
    authorname_id = -1

    # If a tag occurs more than once, the last occurrence wins.
    for record in get_virtualauthor_records(va_id):
        tag = record['tag']
        if tag == "bibrec_id":
            bibrec_id = record['value']
        elif tag == "orig_authorname_id":
            authorname_id = record['value']

    names = get_name_and_db_name_strings(authorname_id)
    author_name = names["name"]
    logger = bconfig.LOGGER

    logger.info("| Reading coauthors for va %s: %s recid %s"
                % (va_id, author_name, bibrec_id))

    # Author fields 100/700, subfield a, excluding the author's own name.
    coauthors = get_field_values_on_condition(
        bibrec_id, ['100', '700'], 'a', 'a', names["db_name"], "!=")
    collaboration = get_field_values_on_condition(bibrec_id, "710", "g")

    if not (coauthors or collaboration):
        logger.info("|-> No coauthors and no collaboration found "
                    "for this author on this record")
    elif not ra_id:
        # NOTE(review): with the default of -1 this branch is only taken
        # for ra_id == 0; it may have been meant for "no ra_id given" -
        # confirm before changing.
        if collaboration:
            first_collaboration = list(collaboration)[0]
            logger.info("|-> Collaboration found: %s" % first_collaboration)
        else:
            logger.info("|-> Coauthors found: %s" % len(coauthors))

    coauthor_limit = MAX_COAUTHORS

    # Read-only mode: nothing is stored, the data is handed back.
    if not ra_id > -1:
        return collaboration or coauthors

    def _store(value):
        # Attach one "<author name>;;<value>" coauthor entry to ra_id.
        set_realauthor_data(ra_id, "coauthor",
                            "%s;;%s" % (author_name, value))

    if collaboration:
        # Only the first collaboration value is stored.
        _store(create_unified_name(list(collaboration)[0].lower()))
    elif len(coauthors) <= coauthor_limit:
        for coauthor in coauthors:
            _store(create_unified_name(coauthor.lower()))
    else:
        digest = hash_coauthor_set(coauthors)
        logger.info("|--> Coauthor # > %s. To preserve"
                    " information, a hash will be stored: %s"
                    % (coauthor_limit, digest))
        _store(digest)

    return True
def compare_va_to_ra(va_id, ra_id):
    '''
    Scores how well the coauthors of a virtual author match the coauthor
    data stored for a real author.

    If the virtual author's record has more than MAX_COAUTHORS entries,
    only the hash of that list is compared against the stored values. In
    the regular case the unified names of both sides are intersected; a
    shared name containing "ollaboration" counts as a certain match,
    otherwise the score grows with the number of shared names.

    @param va_id: Virtual author ID
    @type va_id: int
    @param ra_id: Real author ID
    @type ra_id: int

    @return: the probability of the virtual author belonging to the real
        author
    @rtype: float (the no-information paths return the int 0)
    '''
    logger = bconfig.LOGGER
    logger.info("|-> Start of coauthorship comparison (va %s : ra %s)"
                % (va_id, ra_id))

    stored_rows = get_realauthor_data(ra_id, "coauthor")
    paper_coauthors = get_information_from_dataset(va_id)
    coauthor_limit = MAX_COAUTHORS

    if len(stored_rows) == 0 and len(paper_coauthors) == 0:
        logger.info("|-> End of coauthorship comparison (Sets empty)")
        return 0

    if len(paper_coauthors) > coauthor_limit:
        logger.info("|--> Many coauthors found. Will try hash"
                    " values for collaboration testing.")
        digest = str(hash_coauthor_set(paper_coauthors))

        # Stored values look like "<name>;;<payload>"; compare the payload.
        if any(row['value'].split(";;")[1] == digest
               for row in stored_rows):
            logger.info("|---> Hash found! Assuming "
                        "collaboration attachment.")
            return 1.0

        logger.info("|---> Hash NOT found. Skipping metric.")
        return 0

    ra_names = set(row['value'].split(";;")[1] for row in stored_rows)
    va_names = set(create_unified_name(entry.lower())
                   for entry in paper_coauthors)
    shared = ra_names & va_names

    # A collaboration name present on both sides settles the comparison.
    for common_name in shared:
        if "ollaboration" in common_name:
            logger.info("|--> Found matching collaboration: %s"
                        % common_name)
            return 1.0

    # Saturating score: 0 for no shared names, approaching 1 as more match.
    if len(paper_coauthors) > 0:
        certainty = 1 - exp(-.8 * pow(len(shared), .7))
    else:
        certainty = 0

    # NOTE(review): certainty is a fraction below 1 but is logged with a
    # percent sign - confirm whether a scaled value was intended.
    logger.info("|--> Found %s matching coauthors out of %s "
                "on the paper. Result: %s%% similarity"
                % (len(shared), len(paper_coauthors), certainty))

    return certainty