def do_upgrade(): logger = Logger("Rabbit m_name upgrade script") warnings.filterwarnings('ignore') run_sql( "alter table aidPERSONIDPAPERS add `m_name` VARCHAR(255) not null after name" ) run_sql("alter table aidPERSONIDPAPERS add INDEX `m_name-b` (`m_name`)") present_bibrefs = set( run_sql("select bibref_table, bibref_value from aidPERSONIDPAPERS")) total_updates = len(present_bibrefs) records_for_rabbit = set() for i, bibref in enumerate(present_bibrefs): logger.update_status( float(i) / total_updates, '%s out of %s (%s)' % (str(i), str(total_updates), str(bibref))) try: name = get_name_by_bibref(bibref) except AssertionError, error: if "A bibref must have exactly one name" in error.message: records_for_rabbit.add(bibref[1]) else: raise error else: m_name = create_matchable_name(name) run_sql( "update aidPERSONIDPAPERS set name=%s, m_name=%s where bibref_table=%s " "and bibref_value=%s ", (name, m_name, bibref[0], bibref[1]))
class BibauthoridBaseMergerTestCase(InvenioTestCase):

    def setUp(self):
        self.verbose = 0
        self.logger = Logger(self.__class__.__name__)  # TODO check
        self.logger.log("Setting up regression tests...")
        self.first_author_name = "Testsurname, Firstperson"
        self.second_author_name = "Testsurname, Secondperson"
        self.cluster = self.first_author_name.split(',')[0].lower()
        self.sigs = list()
        self.author_id_one = get_free_author_id()
        self.author_id_two = self.author_id_one + 1
        self.query = """insert into aidRESULTS
                        (personid, bibref_table, bibref_value, bibrec)
                        values (%s, %s, %s, %s)"""
        self.merge_func_to_use = merge_dynamic  # TODO: abstract in main

    def merge_func(self):
        @patch('invenio.bibauthorid_merge.get_cluster_names')
        def mocked_merge(mocked_func):
            mocked_func.return_value = self.get_test_cluster_names()
            self.merge_func_to_use()
        mocked_merge()

    def get_test_cluster_names(self):
        '''
        Mock function replacing get_cluster_names.
        We only need our test names.
        '''
        return set(run_sql("""select personid from aidRESULTS
                              where personid like '%s%%'""" % self.cluster))

    def assertMergeResults(self, recs_one, recs_two, non_deterministic=False):
        author_one_res = run_sql("select bibrec from aidPERSONIDPAPERS "
                                 "where personid = %s", (self.author_id_one,))
        author_one_res = set([rec[0] for rec in author_one_res])
        author_two_res = run_sql("select bibrec from aidPERSONIDPAPERS "
                                 "where personid = %s", (self.author_id_two,))
        author_two_res = set([rec[0] for rec in author_two_res])
        try:
            self.assertEquals(author_one_res, recs_one)
        except AssertionError, e:
            if non_deterministic:
                self.assertEquals(author_two_res, recs_one)
                self.assertEquals(author_one_res, recs_two)
            else:
                raise e
        else:
            self.assertEquals(author_two_res, recs_two)
def do_upgrade(): logger = Logger("aidPERSONIDPAPERS_duplicates") logger.log('Removing duplicate entries in aidPERSONIDPAPERS...') duplicates = 0 while True: # Needed because there may be >1 duplicates. duplicate_entries = run_sql('select * ' 'from aidPERSONIDPAPERS ' 'group by personid, bibref_table, ' 'bibref_value, bibrec, flag, ' 'lcul, last_updated ' 'having count(*) > 1') if not duplicate_entries: break for entry in duplicate_entries: run_sql('delete from aidPERSONIDPAPERS ' 'where personid = %s and ' 'bibref_table = %s and ' 'bibref_value = %s and ' 'bibrec = %s and ' 'name = %s and ' 'm_name = %s and ' 'flag = %s and ' 'lcul = %s and ' 'last_updated = %s ' 'limit 1', entry) duplicates += len(duplicate_entries) logger.log("""%s duplicate entries removed in aidPERSONIDPAPERS.""" % duplicates)
class Bib_matrix(object):
    '''
    Contains the sparse matrix and encapsulates it.
    '''
    # please increment this value every time you
    # change the output of the comparison functions
    current_comparison_version = 0

    __special_items = ((None, -3.), ('+', -2.), ('-', -1.))
    special_symbols = dict((x[0], x[1]) for x in __special_items)
    special_numbers = dict((x[1], x[0]) for x in __special_items)

    def __init__(self, name, cluster_set=None, storage_dir_override=None):
        self.name = name
        self._f = None
        self._matrix = None
        self._use_temporary_file = True
        self._size = None
        self._storage_dir_override = storage_dir_override
        if cluster_set:
            self._bibmap = dict((b[1], b[0]) for b in enumerate(cluster_set.all_bibs()))
            width = len(self._bibmap)
            self._size = ((width + 1) * width) / 2
        else:
            self._bibmap = dict()
            self._matrix = None
        self.creation_time = get_db_time()
        self.logger = Logger("bib_matrix")

    def _initialize_matrix(self):
        self.open_h5py_file()
        self._matrix = self._f.create_dataset("array", (self._size, 2), 'f')
        self._matrix[...] = self.special_symbols[None]

    def _resolve_entry(self, bibs):
        first, second = bibs
        first, second = self._bibmap[first], self._bibmap[second]
        if first > second:
            first, second = second, first
        return first + (second * second + second) / 2

    def __setitem__(self, bibs, val):
        entry = self._resolve_entry(bibs)
        try:
            self._matrix[entry] = Bib_matrix.special_symbols.get(val, val)
        except TypeError:
            self._initialize_matrix()
            self._matrix[entry] = Bib_matrix.special_symbols.get(val, val)

    def __getitem__(self, bibs):
        entry = self._resolve_entry(bibs)
        try:
            ret = self._matrix[entry]
        except TypeError:
            self._initialize_matrix()
            ret = self._matrix[entry]
        return Bib_matrix.special_numbers.get(ret[0], tuple(ret))

    def getitem_numeric(self, bibs):
        return self._matrix[self._resolve_entry(bibs)]

    def __contains__(self, bib):
        return bib in self._bibmap

    def get_keys(self):
        return self._bibmap.keys()

    def get_file_dir(self):
        if self._storage_dir_override:
            return self._storage_dir_override
        sub_dir = self.name[:2]
        if not sub_dir:
            sub_dir = "empty_last_name"
        return "%s%s/" % (bconfig.TORTOISE_FILES_PATH, sub_dir)

    def get_map_path(self):
        return "%s%s-bibmap.pickle" % (self.get_file_dir(), self.name)

    def get_matrix_path(self):
        path = "%s%s.hdf5" % (self.get_file_dir(), self.name)
        if self._use_temporary_file:
            path = path + '.tmp'
        return path

    def open_h5py_file(self, create_empty_on_failure=True):
        self._prepare_destination_directory()
        path = self.get_matrix_path()
        try:
            self._f = h5py.File(path)
        except IOError as e:
            # If the file is corrupted h5py fails with IOError.
            # Give it a second try with an empty file before raising.
            if create_empty_on_failure:
                os.remove(path)
                self._f = h5py.File(path)
            else:
                raise e

    def load(self):
        self._use_temporary_file = False
        files_dir = self.get_file_dir()
        if not os.path.isdir(files_dir):
            self._bibmap = dict()
            self._matrix = None
            return False
        try:
            with open(self.get_map_path(), 'r') as fp:
                bibmap_v = load(fp)
            rec_v, self.creation_time, self._bibmap = bibmap_v  # pylint: disable=W0612
            # if (rec_v != Bib_matrix.current_comparison_version or
            #         # you can use negative version to recalculate
            #         Bib_matrix.current_comparison_version < 0):
            #     self._bibmap = dict()
            self._use_temporary_file = False
            if self._f:
                self._f.close()
            self.open_h5py_file(create_empty_on_failure=False)
            self._matrix = self._f['array']
        except (IOError, UnpicklingError, KeyError, OSError) as e:
            if e.errno == errno.ENOENT:
                # The file has not been created yet. If this is the first
                # time bib_matrix runs, it is fine.
                self.logger.log("Warning: The bibmap serialized file ",
                                self.get_map_path(),
                                "is not present. Will not load bibmap.")
            else:
                self.logger.log('Unexpected error occurred while loading bibmap, cleaning... ',
                                str(type(e)), str(e))
            self._bibmap = dict()
            self._matrix = None
            try:
                os.remove(self.get_map_path())
            except OSError:
                pass
            try:
                os.remove(self.get_matrix_path())
            except OSError:
                pass
            self._use_temporary_file = True
            # With _use_temporary_file set, get_matrix_path() now points at
            # the '.tmp' file, so this removes the temporary copy as well.
            try:
                os.remove(self.get_matrix_path())
            except OSError:
                pass
            self._bibmap = dict()
            self._matrix = None
            self._use_temporary_file = True
            return False
        return True

    def _prepare_destination_directory(self):
        files_dir = self.get_file_dir()
        if not os.path.isdir(files_dir):
            try:
                os.mkdir(files_dir)
            except OSError as e:
                if e.errno == 17 or 'file exists' in str(e.strerror).lower():
                    pass
                else:
                    raise e

    def store(self):
        # save only if we are not completely empty:
        if self._bibmap:
            self._prepare_destination_directory()
            bibmap_v = (Bib_matrix.current_comparison_version,
                        self.creation_time, self._bibmap)
            with open(self.get_map_path(), 'w') as fp:
                dump(bibmap_v, fp)
            if not self._matrix:
                self._initialize_matrix()
            if self._f:
                self._f.close()
            if self._use_temporary_file:
                curpath = self.get_matrix_path()
                self._use_temporary_file = False
                finalpath = self.get_matrix_path()
                try:
                    os.rename(curpath, finalpath)
                except OSError as e:
                    raise e

    def duplicate_existing(self, name, newname):
        '''
        Make sure the original Bib_matrix has been store()-ed
        before calling this!
        '''
        self._use_temporary_file = False
        self.name = name
        srcmap = self.get_map_path()
        srcmat = self.get_matrix_path()
        self.name = newname
        dstmap = self.get_map_path()
        dstmat = self.get_matrix_path()
        shutil.copy(srcmap, dstmap)
        shutil.copy(srcmat, dstmat)

    def destroy(self):
        if self._f:
            self._f.close()
        try:
            os.remove(self.get_map_path())
        except OSError:
            pass
        try:
            os.remove(self.get_matrix_path())
        except OSError:
            pass
        self._use_temporary_file = True
        # As in load(): with the flag set, this targets the '.tmp' copy.
        try:
            os.remove(self.get_matrix_path())
        except OSError:
            pass
        self._bibmap = dict()
        self._matrix = None
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#
# This has been temporarily deprecated,
# please use schedule_workers from general utils instead.
#

import re
import os
import sys
from itertools import dropwhile, chain

from invenio.bibauthorid_general_utils import print_tortoise_memory_log
from invenio import bibauthorid_config as bconfig
from invenio.bibauthorid_general_utils import is_eq
from invenio.bibauthorid_logutils import Logger

logger = Logger("scheduler")

# python2.4 compatibility
from invenio.bibauthorid_general_utils import bai_all as all


def to_number(stry):
    return int(re.sub("\D", "", stry))


def dict_by_file(fpath):
    fp = open(fpath)
    content = fp.read()
    fp.close()
    return dict(x.split(':') for x in content.split("\n")[:-1])
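# Sketch of the input dict_by_file expects (illustration only, with a
# hypothetical stats file): newline-terminated 'key:value' lines with
# exactly one colon per line; the trailing empty element produced by the
# final newline is dropped by [:-1].
#
#     with open('/tmp/stats.txt', 'w') as fp:
#         fp.write("pid:1234\nvm_peak_kb:123456\n")
#     dict_by_file('/tmp/stats.txt')
#     # -> {'pid': '1234', 'vm_peak_kb': '123456'}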
def rabbit(bibrecs=None, check_invalid_papers=False,
           personids_to_update_extids=None, verbose=False):

    logger = Logger("Rabbit")
    if verbose:
        logger.verbose = True

    if not bibrecs:
        logger.log("Running on all records")
    else:
        logger.log("Running on %s " % (str(bibrecs)))

    populate_mnames_pids_cache()
    global M_NAME_PIDS_CACHE

    memoized_compare_names = memoized(comp_names)
    compare_names = lambda x, y: memoized_compare_names(*sorted((x, y)))

    def find_pids_by_matchable_name_with_cache(matchable_name):
        try:
            matched_pids = [M_NAME_PIDS_CACHE[matchable_name]]
        except KeyError:
            matched_pids = get_authors_by_name(matchable_name,
                                               use_matchable_name=True)
            if matched_pids:
                M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0]
        return matched_pids

    if USE_EXT_IDS:

        def get_matched_pids_by_external_ids(sig, rec, pids_having_rec):
            '''
            This function returns all the matched pids after iterating
            through all available external IDs of the system.
            '''
            for get_external_id_of_signature in external_id_getters:
                external_id = get_external_id_of_signature(sig + (rec,))
                if external_id:
                    matched_pids = list(get_author_by_external_id(external_id[0]))
                    if matched_pids and int(matched_pids[0][0]) in pids_having_rec:
                        matched_pids = list()
                    return matched_pids

    threshold = 0.8

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()
        if not bibrecs:
            bibrecs = all_bibrecs
        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    bibrecs = list(bibrecs)
    for idx, rec in enumerate(bibrecs):

        logger.log("Considering %s" % str(rec))

        if idx % 100 == 0:
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if idx % 1000 == 0:
            destroy_partial_marc_caches()
            populate_partial_marc_caches(bibrecs[idx:idx + 1000])

        logger.log(float(idx) / len(bibrecs), "%d/%d" % (idx, len(bibrecs)))

        if rec in deleted:
            remove_papers([rec])
            continue

        author_refs = get_author_refs_of_paper(rec)
        coauthor_refs = get_coauthor_refs_of_paper(rec)

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), author_refs)),
                                   izip(cycle([700]), imap(itemgetter(0), coauthor_refs))))

        personid_rows = [map(int, row[:3]) + [row[4]]
                         for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3])
                                  for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, get_name_by_bibref(new))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                   for old in old_signatures]
                  for new in new_signatures]

        logger.log(" - Deleted signatures: %s" % str(old_signatures))
        logger.log(" - Added signatures: %s" % str(new_signatures))
        logger.log(" - Matrix: %s" % str(matrix))

        # [new_signatures, old_signatures]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logger.log(" - Best match: %s " % str(best_match))

        for new, old in best_match:
            logger.log(" - - Moving signature: %s on %s to %s as %s"
                       % (old, rec, new, new_signatures_names[new]))
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        remaining_personid_rows = ([x for x in personid_rows
                                    if x[1:3] in old_signatures])
        pids_having_rec = set([int(row[0]) for row in remaining_personid_rows])
        logger.log(" - Not matched: %s" % str(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matchable_name = create_matchable_name(name)

            matched_pids = list()
            if USE_EXT_IDS:
                matched_pids = get_matched_pids_by_external_ids(sig, rec, pids_having_rec)
                if matched_pids:
                    add_signature(list(sig) + [rec], name,
                                  matched_pids[0][0], m_name=matchable_name)
                    M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0][0]
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_matchable_name_with_cache(matchable_name)
            if not matched_pids:
                for matching_function in M_NAME_FUNCTIONS[1:]:
                    matchable_name = matching_function(name)
                    matched_pids = find_pids_by_matchable_name_with_cache(matchable_name)
                    if matched_pids:
                        break

            matched_pids = [p for p in matched_pids if int(p) not in used_pids]

            best_matched_pid = None
            for matched_pid in matched_pids:
                # Because of the wrongly labeled data in the db, all
                # of the possible choices have to be checked. If one of the
                # coauthors, who had his signature already considered, claimed
                # in the past one of the signatures of currently considered
                # author, the algorithm will think that two signatures belong
                # to the same person, and, will create an unnecessary new
                # profile.
                if not int(matched_pid) in pids_having_rec:
                    best_matched_pid = matched_pid
                    break

            if not best_matched_pid:
                new_pid = new_person_from_signature(list(sig) + [rec],
                                                    name, matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = new_pid
                used_pids.add(new_pid)
                updated_pids.add(new_pid)
            else:
                add_signature(list(sig) + [rec], name,
                              best_matched_pid, m_name=matchable_name)
                M_NAME_PIDS_CACHE[matchable_name] = best_matched_pid
                used_pids.add(best_matched_pid)
                updated_pids.add(best_matched_pid)
                pids_having_rec.add(best_matched_pid)

        logger.log('Finished with %s' % str(rec))

    logger.update_status_final()

    destroy_partial_marc_caches()

    if personids_to_update_extids:
        updated_pids |= set(personids_to_update_extids)
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS,
            force_cache_tables=True)

    destroy_partial_marc_caches()
    destroy_mnames_pids_cache()

    remove_empty_authors()

    task_update_progress("Done!")
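# Sketch of the matching step above (illustration only): maximized_mapping
# finds the best one-to-one assignment between new and old signatures from
# the name-similarity matrix, and pairs scoring below `threshold` are
# dropped. A brute-force stand-in for small matrices, NOT the real
# invenio.bibauthorid_matrix_optimization implementation:

from itertools import permutations

def toy_maximized_mapping(matrix):
    # matrix[new][old] -> similarity score in [0, 1]
    if not matrix or not matrix[0]:
        return []
    n_new, n_old = len(matrix), len(matrix[0])
    best, best_score = [], -1.0
    for perm in permutations(range(n_old), min(n_new, n_old)):
        pairs = [(new, old, matrix[new][old]) for new, old in enumerate(perm)]
        score = sum(p[2] for p in pairs)
        if score > best_score:
            best, best_score = pairs, score
    return best

matrix = [[0.9, 0.2],
          [0.3, 0.1]]
threshold = 0.8
best_match = [(new, old) for new, old, score in toy_maximized_mapping(matrix)
              if score > threshold]
assert best_match == [(0, 0)]  # only the confident pair survives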
from invenio.bibauthorid_backinterface import update_canonical_names_of_authors
from invenio.bibauthorid_backinterface import get_cluster_names
from invenio.bibauthorid_backinterface import get_clusters_by_surname
from invenio.bibauthorid_backinterface import get_author_info_of_confirmed_paper
from invenio.bibauthorid_backinterface import get_author_and_status_of_confirmed_paper
from invenio.bibauthorid_backinterface import move_signature
from invenio.bibauthorid_backinterface import get_claimed_papers_of_author
from invenio.bibauthorid_backinterface import get_free_author_id
from invenio.bibauthorid_backinterface import get_signatures_of_paper_and_author
from invenio.bibauthorid_backinterface import get_free_author_ids as backinterface_get_free_pids
from invenio.bibauthorid_backinterface import get_ordered_author_and_status_of_signature
from invenio.bibauthorid_backinterface import remove_empty_authors
from invenio.bibauthorid_backinterface import get_paper_to_author_and_status_mapping
from invenio.bibauthorid_backinterface import get_authors_by_surname

logger = Logger("merge")


def merge_static_classy():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is static: if aid* tables are changed while it's running,
    probably everything will crash and a black hole will open, eating all
    your data.

    NOTE: this is more elegant than merge_static but much slower.
    Will have to be improved before it can replace it.
    '''
    class Sig(object):

        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag))
from invenio.bibauthorid_dbinterface import get_name_by_bibref
from invenio.bibauthorid_dbinterface import get_grouped_records
from invenio.bibauthorid_dbinterface import get_authors_of_paper
from invenio.bibauthorid_dbinterface import get_collaborations_for_paper
from invenio.bibauthorid_dbinterface import get_resolved_affiliation
from invenio.bibauthorid_backinterface import get_keywords_for_paper
from invenio.bibrank_citation_searcher import get_cited_by, get_refers_to
# metadata_comparison_print is commented out everywhere to improve
# performance; the import and calls are left here to make future
# debugging easier.
from invenio.bibauthorid_logutils import Logger

import gc
import random

CFG_MEMOIZE_DICT_SIZE = 1000000

logger = Logger('metadata_comparison',
                verbose=bconfig.DEBUG_METADATA_COMPARISON_OUTPUT)

# This module is not thread safe!
# Be sure to use processes instead of
# threads if you need parallel
# computation!

use_refrec = itemgetter(slice(None))
use_ref = itemgetter(0, 1)
use_rec = itemgetter(2)
use_string = lambda x: x

CACHES = list()


def create_new_cache():
from math import sqrt

from invenio.textutils import translate_to_ascii as original_translate_to_ascii

translate_to_ascii = memoized(original_translate_to_ascii)

SQRT2 = sqrt(2)

try:
    from invenio.config import CFG_ETCDIR
    NO_CFG_ETCDIR = False
except ImportError:
    NO_CFG_ETCDIR = True

from Levenshtein import distance

logger = Logger("name comparison",
                verbose=bconfig.DEBUG_NAME_COMPARISON_OUTPUT)

artifact_removal = re.compile("[^a-zA-Z0-9]")
surname_cleaning = re.compile("-([a-z])")
name_additions_chars = re.compile("\([.]*[^\)]*\)")

name_separators = bconfig.NAMES_SEPARATOR_CHARACTER_LIST
if name_separators == "-1":
    name_separators = ',;.=\-\(\)'
substitution_regexp = re.compile('[%s]' % (name_separators))

# Gender names and name variation files are loaded upon module import
# to improve performance.


@memoized
def split_name_parts(name_string,
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

import warnings

from invenio.dbquery import run_sql
from invenio.bibauthorid_logutils import Logger
from invenio.bibauthorid_dbinterface import get_name_by_bibref, create_matchable_name
from invenio.bibauthorid_rabbit import rabbit

depends_on = ['invenio_2013_11_28_bibauthorid_search_engine_column_changes']

Logger.override_verbosity(True)


def info():
    return ("Updates the columns of aidPERSONIDPAPERS, adds the rabbit "
            "matchable name, assigns default value to columns.")
# import pyximport; pyximport.install()

from invenio.bibauthorid_bib_matrix import Bib_matrix

if bconfig.DEBUG_CHECKS:
    def _debug_is_eq_v(vl1, vl2):
        if isinstance(vl1, str) and isinstance(vl2, str):
            return vl1 == vl2
        if isinstance(vl1, tuple) and isinstance(vl2, tuple):
            return is_eq(vl1[0], vl2[0]) and is_eq(vl1[1], vl2[1])
        return False

logger = Logger("prob_matrix")


class ProbabilityMatrix(object):
    '''
    This class contains and maintains the comparison between all virtual
    authors. It is able to write and read from the database and update
    the results.
    '''

    def __init__(self, name):
        self._bib_matrix = Bib_matrix(name)

    def load(self, load_map=True, load_matrix=True):
        logger.update_status(0., "Loading probability matrix...")
        self._bib_matrix.load()
        logger.update_status_final("Probability matrix loaded.")
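# Minimal usage sketch (illustration only): the probability matrix is keyed
# by the surname cluster it was built for, so loading the matrix of a
# hypothetical "smith" cluster from disk would look like:
#
#     pm = ProbabilityMatrix('smith')
#     pm.load()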
import gc
import cPickle

SP_NUMBERS = Bib_matrix.special_numbers
SP_SYMBOLS = Bib_matrix.special_symbols
SP_CONFIRM = Bib_matrix.special_symbols['+']
SP_QUARREL = Bib_matrix.special_symbols['-']

eps = 0.01
edge_cut_prob = ''
wedge_thrsh = ''
h5file = None

logger = Logger("wedge", verbose=bconfig.DEBUG_WEDGE_OUTPUT)

import os
PID = lambda: str(os.getpid())

import pyximport
pyximport.install()
from invenio.bibauthorid_meld_edges import meld_edges


def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    # The lower bound of the edges being processed by the wedge algorithm.
    global edge_cut_prob
    global wedge_thrsh

    if not force_wedge_thrsh:
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

from itertools import chain, groupby, izip, cycle
from operator import itemgetter

from invenio.bibauthorid_matrix_optimization import maximized_mapping
from invenio.bibauthorid_backinterface import save_cluster
from invenio.bibauthorid_backinterface import get_confirmed_papers_of_authors
from invenio.bibauthorid_backinterface import get_bib10x, get_bib70x
from invenio.bibauthorid_backinterface import get_author_to_confirmed_names_mapping
from invenio.bibauthorid_backinterface import get_signatures_from_bibrefs
from invenio.bibauthorid_name_utils import generate_last_name_cluster_str
from invenio.bibauthorid_logutils import Logger

logger = Logger("cluster_set")


class Blob(object):

    def __init__(self, personid_records):
        '''
        @param personid_records:
            A list of tuples: (personid, bibrefrec, flag).
            Notice that all bibrefrecs should be the same
            since the Blob represents only one bibrefrec.
        '''
        self.bib = personid_records[0][1]
        assert all(p[1] == self.bib for p in personid_records), \
            "All cluster sets should share the bibrefrec"
        self.claimed = set()
        self.assigned = set()
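# Sketch of the input Blob expects (illustration only, with made-up values):
# every tuple carries the same bibrefrec, here ('100', 42, 7), while the
# personid and claim flag vary per row:
#
#     records = [(1, ('100', 42, 7), 2),   # flag 2: claimed by author 1
#                (2, ('100', 42, 7), 0)]   # flag 0: merely assigned to author 2
#     blob = Blob(records)
#     blob.bib  # -> ('100', 42, 7)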
from invenio.bibauthorid_logutils import Logger

from invenio.bibauthorid_cluster_set import delayed_cluster_sets_from_marktables
from invenio.bibauthorid_cluster_set import delayed_cluster_sets_from_personid
from invenio.bibauthorid_wedge import wedge
from invenio.bibauthorid_name_utils import generate_last_name_cluster_str
from invenio.bibauthorid_backinterface import empty_tortoise_results_table
from invenio.bibauthorid_backinterface import remove_clusters_by_name
from invenio.bibauthorid_prob_matrix import prepare_matrix
# Scheduler is [temporarily] deprecated in favour of the much simpler
# schedule_workers.
# from invenio.bibauthorid_scheduler import schedule, matrix_coefs
from invenio.bibauthorid_general_utils import schedule_workers

logger = Logger("tortoise")

'''
    There are three main entry points to tortoise

    i) tortoise
        Performs disambiguation iteration.
        The argument pure indicates whether to use
        the claims and the rejections or not.
        Use pure=True only to test the accuracy of tortoise.

    ii) tortoise_from_scratch
        NOT RECOMMENDED!
        Use this function only if you have just installed
        invenio and this is your first disambiguation or
        if personid is broken.