def group_edges(cs):
    plus = []
    minus = []
    pairs = []

    gc.disable()
    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")
    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                    % (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
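
# Minimal sketch of the edge partitioning performed by group_edges, with a
# stand-in for Bib_matrix.special_symbols / special_numbers.  The sentinel
# values (-1, -2, -3) and the 0.3 cut-off are illustrative assumptions, not
# the real configuration of the module.
SPECIAL_SYMBOLS_SKETCH = {'+': -2.0, '-': -3.0, None: -1.0}
EDGE_CUT_PROB_SKETCH = 0.3

def partition_edge_sketch(val):
    """Classify one (probability, certainty) pair the way group_edges does."""
    if val[0] not in SPECIAL_SYMBOLS_SKETCH.values():
        return 'value' if val[0] > EDGE_CUT_PROB_SKETCH else 'dropped'
    if val[0] == SPECIAL_SYMBOLS_SKETCH['+']:
        return 'plus'
    if val[0] == SPECIAL_SYMBOLS_SKETCH['-']:
        return 'minus'
    return 'unknown'   # corresponds to the special_symbols[None] assert above

assert partition_edge_sketch((0.8, 0.5)) == 'value'
assert partition_edge_sketch((-2.0, 0.0)) == 'plus'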
def convert_cluster_set(cs, prob_matr):
    '''
    Converts a normal cluster set to a wedge cluster set.
    @param cs: a cluster set to be converted
    @param type: cluster set
    @return: a mapping from a number to a bibrefrec.
    '''
    gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = []
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(set(result_mapping)), PID()+"Cluster set conversion failed"
    assert len(result_mapping) == cs.num_all_bibs, PID()+"Cluster set conversion failed"
    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix create a vector of values to all other bibs.
    #    + Meld those vectors into one for each cluster.
    special_symbols = Bib_matrix.special_symbols  # locality optimization
    interval = 10000
    for current, c1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

        assert len(c1.bibs) > 0, PID()+"Empty cluster sent to wedge"
        pointers = []

        for v1 in c1.bibs:
            pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C')
            pointer.fill(special_symbols[None])
            rm = result_mapping[v1]  # locality optimization
            for c2 in cs.clusters:
                if c1 != c2 and not c1.hates(c2):
                    for v2 in c2.bibs:
                        val = prob_matr[rm, result_mapping[v2]]
                        try:
                            numb = special_symbols[val]
                            val = (numb, numb)
                        except KeyError:
                            pass
                        assert len(val) == 2, "Edge coding failed"
                        pointer[v2] = val
            pointers.append((pointer, 1))
        c1.out_edges = reduce(meld_edges, pointers)[0]

    update_status_final("Converting the cluster set done.")
    gc.enable()
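
# convert_cluster_set folds the per-bib edge vectors together with
# reduce(meld_edges, pointers).  meld_edges itself is defined elsewhere in the
# module; the weighted-average rule below is only a plausible sketch of what a
# meld-style reduction over (edge_array, weight) pairs could look like, not
# the real implementation.
import numpy

def meld_edges_sketch(first, second):
    edges1, weight1 = first
    edges2, weight2 = second
    total = float(weight1 + weight2)
    merged = (edges1 * weight1 + edges2 * weight2) / total
    return merged, total

_a = (numpy.array([[0.2, 0.5], [0.8, 0.5]]), 1)
_b = (numpy.array([[0.6, 0.5], [0.4, 0.5]]), 1)
_merged, _weight = reduce(meld_edges_sketch, [_a, _b])
# _merged[0] is [0.4, 0.5] and _weight is 2: the two vectors were averaged.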
def recalculate(self, cluster_set):
    '''
    Constructs probability matrix. If use_cache is true, it will
    try to load old computations from the database. If save cache
    is true it will save the current results into the database.
    @param cluster_set: A cluster set object, used to initialize
    the matrix.
    '''
    last_cleaned = 0

    old_matrix = self._bib_matrix
    cached_bibs = self.__get_up_to_date_bibs()
    have_cached_bibs = bool(cached_bibs)
    self._bib_matrix = Bib_matrix(cluster_set)

    ncl = cluster_set.num_all_bibs
    expected = ((ncl * (ncl - 1)) / 2)
    if expected == 0:
        expected = 1

    cur_calc, opti, prints_counter = 0, 0, 0
    for cl1 in cluster_set.clusters:
        if cur_calc + opti - prints_counter > 100000:
            update_status((float(opti) + cur_calc) / expected,
                          "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
            prints_counter = cur_calc + opti

        #clean caches
        if cur_calc - last_cleaned > 2000000:
            clear_comparison_caches()
            last_cleaned = cur_calc

        for cl2 in cluster_set.clusters:
            if id(cl1) < id(cl2) and not cl1.hates(cl2):
                for bib1 in cl1.bibs:
                    for bib2 in cl2.bibs:
                        if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                            val = old_matrix[bib1, bib2]
                            if not val:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            else:
                                opti += 1
                                if bconfig.DEBUG_CHECKS:
                                    assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                        else:
                            cur_calc += 1
                            val = compare_bibrefrecs(bib1, bib2)

                        self._bib_matrix[bib1, bib2] = val

    clear_comparison_caches()
    update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
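
# Tiny check of the pairing logic used in recalculate: comparing ids (or
# indices) with "<" visits each unordered pair exactly once, which is why the
# progress estimate above is n * (n - 1) / 2 (applied to the total number of
# bibs).  Purely illustrative.
_clusters = ['a', 'b', 'c', 'd']
_pairs = [(c1, c2)
          for i, c1 in enumerate(_clusters)
          for j, c2 in enumerate(_clusters)
          if i < j]
assert len(_pairs) == len(_clusters) * (len(_clusters) - 1) / 2   # 6 pairs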
def merge_static_classy():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.

        NOTE: this is more elegant than merge_static but much slower. Will have to
        be improved before it can replace it.
    '''
    class Sig(object):
        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag))
            self.assigned = filter(lambda p: -2 < p[1] and p[1] < 2, pid_flag)
            self.claimed = filter(lambda p: 2 <= p[1], pid_flag)
            self.bibrefrec = bibrefrec

            assert self.invariant()

        def invariant(self):
            return len(self.assigned) + len(self.claimed) <= 1

        def empty(self):
            return not self.isclaimed() and not self.isassigned()

        def isclaimed(self):
            return len(self.claimed) == 1

        def get_claimed(self):
            return self.claimed[0][0]

        def get_assigned(self):
            return self.assigned[0][0]

        def isassigned(self):
            return len(self.assigned) == 1

        def isrejected(self, pid):
            return pid in self.rejected

        def change_pid(self, pid):
            assert self.invariant()
            assert self.isassigned()
            self.assigned = [(pid, 0)]
            move_signature(self.bibrefrec, pid)

    class Cluster(object):
        def __init__(self, pid, sigs):
            self.pid = pid
            self.sigs = dict((sig.bibrefrec[2], sig) for sig in sigs if not sig.empty())

        def send_sig(self, other, sig):
            paper = sig.bibrefrec[2]
            assert paper in self.sigs and paper not in other.sigs

            del self.sigs[paper]
            other.sigs[paper] = sig

            if sig.isassigned():
                sig.change_pid(other.pid)

    last_names = frozenset(name[0].split('.')[0] for name in get_existing_result_clusters())

    personid = get_bibrefrec_to_pid_flag_mapping()
    free_pids = backinterface_get_free_pids()

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names),
                      "Merging, %d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            for d in ds:
                pid_flag = filter(lambda x: x[1] > -2, personid.get(d, []))
                if pid_flag:
                    assert len(pid_flag) == 1
                    pid = pid_flag[0][0]
                    pids.append(pid)
                    old_pids.add(pid)

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        # [[bibrefrecs] -> pid]
        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        # pid -> Cluster
        clusters = dict((pid, Cluster(pid, [Sig(bib, personid.get(bib, [])) for bib in sigs]))
                        for sigs, pid in chain(matched_clusters, not_matched_clusters))

        todo = clusters.items()
        for pid, clus in todo:
            assert clus.pid == pid

            for paper, sig in clus.sigs.items():
                if sig.isclaimed():
                    if sig.get_claimed() != pid:
                        target_clus = clusters[sig.get_claimed()]

                        if paper in target_clus.sigs:
                            new_clus = Cluster(free_pids.next(), [])
                            target_clus.send_sig(new_clus, target_clus.sigs[paper])
                            todo.append(new_clus)
                            clusters[new_clus.pid] = new_clus

                        assert paper not in target_clus.sigs
                        clus.send_sig(target_clus, sig)
                elif sig.get_assigned() != pid:
                    if not sig.isrejected(pid):
                        move_signature(sig.bibrefrec, pid)
                    else:
                        move_signature(sig.bibrefrec, free_pids.next())
                else:
                    assert not sig.isrejected(pid)

    update_status_final("Merging done.")
    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
def store(self, name):
    update_status(0., "Saving probability matrix...")
    self._bib_matrix.store(name)
    update_status_final("Probability matrix saved.")
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/')
             if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1/fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            if i % 1000 == 0:
                gen_graphs(True)

        f = open(fi, 'r')
        status = i/fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname')+9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]
        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status + 0.2*quanta, ' Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, \
                "Error, found log with strange clustnumber! %s %s %s %s" % \
                (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber))

            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, \
                    "Error, there is a cluster without stuff? %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), \
                    "Error, a coefficient is wrong here! Check me! %s %s %s" % \
                    (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen

                update_status(status + 0.4*quanta, ' cumulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

                update_status(status + 0.6*quanta, ' cumulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        SER.dump({'cluster_stats': dict((x, dict(cluster_stats[x])) for x in cluster_stats.iterkeys()),
                  'coeff_stats': dict((coeff_stats))}, f)
        f.close()
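
# The statistics above maintain running averages in place.  A quick sanity
# check of the incremental update used there: avg_{n+1} = (avg_n * n + x) / (n + 1).
def running_average_sketch(values):
    avg, n = 0.0, 0
    for x in values:
        avg = (avg * n + x) / (n + 1)
        n += 1
    return avg

_vals = [0.2, 0.4, 0.9]
assert abs(running_average_sketch(_vals) - sum(_vals) / len(_vals)) < 1e-9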
def group_sort_edges(cs, original_process_id):
    bibauthor_print("group_sort_edges spawned by %s" % original_process_id)

    plus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id), 'w')
    minus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id), 'w')
    pairs_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id), 'w')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id), 'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0
    default_val = [0., 0.]

    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:
                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                    % (plus_count, minus_count, pairs_count))
    #gc.enable()

    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)), reverse=True)
    os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id))

    bibauthor_print("Dumping edges data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
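
# The edge caches above are written with _pack_vals and read back with
# _unpack_vals, both defined elsewhere in the module.  Since do_wedge iterates
# the cache files line by line, a newline-terminated text record is one
# plausible encoding; the real format may well differ.  These are stand-ins,
# for illustration only.
def pack_vals_sketch(triple):
    bib1, bib2, val = triple
    return "%d %d %.6f %.6f\n" % (bib1, bib2, val[0], val[1])

def unpack_vals_sketch(line):
    bib1, bib2, v0, v1 = line.split()
    return int(bib1), int(bib2), (float(v0), float(v1))

_line = pack_vals_sketch((7, 42, (0.83, 0.5)))
assert unpack_vals_sketch(_line)[:2] == (7, 42)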
                    raise Exception("""Error happened in convert_cluster_set with
                                    v1: %s,
                                    real_pointer: %s,
                                    pointer: %s,
                                    pointers: %s,
                                    result_mapping: %s,
                                    index: %s,
                                    len(real_pointer): %s,
                                    len(pointer): %s,
                                    len(pointers): %s,
                                    original_exception: %s """
                                    % (str(v1), str(real_pointer), str(pointer), str(pointers),
                                       str(result_mapping), str(index), str(len(real_pointer)),
                                       str(len(pointer)), str(len(pointers)), str(e)))

    update_status_final("Converting the cluster set done.")
    #gc.enable()


def restore_cluster_set(cs):
    for cl in cs.clusters:
        cl.bibs = set(cs.new2old[b] for b in cl.bibs)
    cs.update_bibs()


def create_bib_2_cluster_dict(cs):
    '''
    Creates and returns a dictionary bibrefrec -> cluster.
    The cluster set must be converted!
    '''
    size = sum(len(cl.bibs) for cl in cs.clusters)
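
# create_bib_2_cluster_dict above is truncated after computing `size`.
# A minimal sketch of a possible completion, assuming the goal is simply to
# map every (converted) bib number to the cluster that contains it; the real
# function may differ.
def create_bib_2_cluster_dict_sketch(cs):
    size = sum(len(cl.bibs) for cl in cs.clusters)
    ret = {}
    for cl in cs.clusters:
        for bib in cl.bibs:
            ret[bib] = cl
    # every converted bib should appear exactly once
    assert len(ret) == size
    return ret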
def merge_dynamic():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is dynamic: it allows aid* tables to be changed while it
        is still running, hence the claiming facility, for example, can stay
        online during the merge. This comfort, however, is paid for in terms
        of speed.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names())

    def get_free_pids():
        while True:
            yield get_free_author_id()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        """
        paps = get_ordered_author_and_status_of_signature(sig)
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if int(target_pid) in [int(x[0]) for x in rejected]:
            move_signature(sig, free_pids.next())
        else:
            conflicts = get_signatures_of_paper_and_author(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, free_pids.next())
                else:
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = list()
            for d in ds:
                pid_flag = get_author_and_status_of_confirmed_paper(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)

        #best_match = cluster, pid_idx, n
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, score in best_match if score > 0]
        not_matched_clusters = frozenset(xrange(len(results))) - \
            frozenset(imap(itemgetter(0), [x for x in best_match if x[2] > 0]))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
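
# Minimal illustration of the sorted + groupby pattern used by the merge
# functions above to turn (cluster number, bibrefrec) rows into
# [(cluster number, [bibrefrecs])].  The row values here are made up.
from itertools import groupby
from operator import itemgetter

_rows = [(1, 'sigA'), (0, 'sigB'), (1, 'sigC'), (0, 'sigD')]
_grouped = [(k, map(itemgetter(1), d))
            for k, d in groupby(sorted(_rows, key=itemgetter(0)), key=itemgetter(0))]
# _grouped == [(0, ['sigB', 'sigD']), (1, ['sigA', 'sigC'])]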
            pid = os.fork()
            if pid == 0:  # child
                os.nice(int((float(sizs[idx]) * 20.0 / biggest)))
                run_job(job_idx)
            else:  # parent
                pid_2_idx[pid] = job_idx
                assert free > sizs[job_idx]
                free -= sizs[job_idx]
                del free_idxs[idx]
        else:
            break

        pid, status = os.wait()
        assert pid in pid_2_idx
        idx = pid_2_idx[pid]
        freed = sizs[idx]
        done += freed
        ret_status[idx] = status
        free += freed
        del pid_2_idx[pid]
        update_status(done / total, "%d / %d" % (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs)))

    update_status_final("%d / %d" % (len(jobs), len(jobs)))

    assert is_eq(free, initial)
    assert not pid_2_idx
    assert not free_idxs

    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs)
    assert all(stat != None for stat in ret_status)

    return ret_status
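
# Tiny, self-contained illustration of the fork/wait bookkeeping used by the
# scheduler fragment above: fork one child per job, remember pid -> job index,
# then collect exit codes.  The jobs are dummies and the memory-budget logic
# of the real scheduler is left out; requires a POSIX system.
import os
import sys

_jobs = [0, 1, 2]
_pid_to_job = {}
for _job in _jobs:
    _pid = os.fork()
    if _pid == 0:                      # child: do the work, then exit
        os._exit(_job % 2)             # pretend exit code
    _pid_to_job[_pid] = _job           # parent: remember which job this is

_statuses = {}
while _pid_to_job:
    _pid, _status = os.wait()
    _statuses[_pid_to_job.pop(_pid)] = os.WEXITSTATUS(_status)
# _statuses now maps job index -> exit code (e.g. {0: 0, 1: 1, 2: 0})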
def load(self, load_map=True, load_matrix=True):
    update_status(0., "Loading probability matrix...")
    self._bib_matrix.load()
    update_status_final("Probability matrix loaded.")
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''
    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
                    % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                              bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" % (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
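
# Tiny illustration of the join-and-remap step used in do_wedge: after merging
# two clusters, every member of the absorbed cluster must be re-pointed in
# bib_map, otherwise later edges would resolve to a stale object.  Plain sets
# stand in for the real cluster objects.
_cluster_a = set([0, 1])
_cluster_b = set([2, 3])
_bib_map = {0: _cluster_a, 1: _cluster_a, 2: _cluster_b, 3: _cluster_b}

_cluster_a |= _cluster_b                # join(cl1, cl2)
for _bib in _cluster_b:                 # re-point the absorbed members
    _bib_map[_bib] = _cluster_a

assert all(_bib_map[b] is _cluster_a for b in (0, 1, 2, 3))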
def do_wedge(cluster_set, deep_debug=False): ''' Rearranges the cluster_set acoarding to be values in the probability_matrix. The deep debug option will produce a lot of output. Avoid using it with more than 20 bibs in the cluster set. ''' bib_map = create_bib_2_cluster_dict(cluster_set) original_process_id = PID() #remember to close the files! #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set) p = Process(target=group_sort_edges, args=(cluster_set, original_process_id)) p.start() p.join() plus_edges_fp = open( bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' + str(original_process_id), 'r') minus_edges_fp = open( bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' + str(original_process_id), 'r') edges_fp = open( bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' + str(original_process_id), 'r') data_fp = open( bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' + str(original_process_id), 'r') len_plus, len_minus, len_edges = cPickle.load(data_fp) data_fp.close() interval = 1000 for i, s in enumerate(plus_edges_fp): bib1, bib2, unused = _unpack_vals(s) if (i % interval) == 0: update_status( float(i) / len_plus, "Agglomerating obvious clusters...") cl1 = bib_map[bib1] cl2 = bib_map[bib2] if cl1 != cl2 and not cl1.hates(cl2): join(cl1, cl2) cluster_set.clusters.remove(cl2) for v in cl2.bibs: bib_map[v] = cl1 update_status_final("Agglomerating obvious clusters done.") interval = 1000 for i, s in enumerate(minus_edges_fp): bib1, bib2, unused = _unpack_vals(s) if (i % interval) == 0: update_status(float(i) / len_minus, "Dividing obvious clusters...") cl1 = bib_map[bib1] cl2 = bib_map[bib2] if cl1 != cl2 and not cl1.hates(cl2): cl1.quarrel(cl2) update_status_final("Dividing obvious clusters done.") interval = 50000 wedge_print("Wedge: New wedge, %d edges." % len_edges) current = -1 for s in edges_fp: v1, v2, unused = _unpack_vals(s) current += 1 if (current % interval) == 0: update_status(float(current) / len_edges, "Wedge...") assert unused != '+' and unused != '-', PID( ) + "Signed edge after filter!" cl1 = bib_map[v1] cl2 = bib_map[v2] #try using object ids instead of index to boost performances #idcl1 = cluster_set.clusters.index(cl1) #idcl2 = cluster_set.clusters.index(cl2) idcl1 = id(cl1) idcl2 = id(cl2) #keep the ids low! if idcl1 > idcl2: idcl1, idcl2 = idcl2, idcl1 cl1, cl2 = cl2, cl1 wedge_print( "Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1])) if cl1 != cl2 and not cl1.hates(cl2): if deep_debug: export_to_dot( cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused)) decision, value = _decide(cl1, cl2) if decision: wedge_print("Wedge: Joined %s to %s with %s" % (idcl1, idcl2, value)) join(cl1, cl2) cluster_set.clusters.remove(cl2) for v in cl2.bibs: bib_map[v] = cl1 else: wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value)) cl1.quarrel(cl2) elif cl1 == cl2: wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2)) else: wedge_print("Wedge: Clusters hate each other! 
(%s,%s)" % (idcl1, idcl2)) update_status_final("Wedge done.") bibauthor_print("") if deep_debug: export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map) plus_edges_fp.close() minus_edges_fp.close() edges_fp.close() data_fp.close() try: os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' + str(original_process_id)) os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' + str(original_process_id)) os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' + str(original_process_id)) os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' + str(original_process_id)) except: pass
"""Error happened in convert_cluster_set with v1: %s, real_pointer: %s, pointer: %s, pointers: %s, result_mapping: %s, index: %s, len(real_pointer): %s, len(pointer): %s, len(pointers): %s, original_exception: %s """ % (str(v1), str(real_pointer), str(pointer), str(pointers), str(result_mapping), str(index), str(len(real_pointer)), str(len(pointer)), str(len(pointers)), str(e))) update_status_final("Converting the cluster set done.") #gc.enable() def restore_cluster_set(cs): for cl in cs.clusters: cl.bibs = set(cs.new2old[b] for b in cl.bibs) cs.update_bibs() def create_bib_2_cluster_dict(cs): ''' Creates and returns a dictionary bibrefrec -> cluster. The cluster set must be converted! ''' size = sum(len(cl.bibs) for cl in cs.clusters)
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True): import matplotlib.pyplot as plt plt.ioff() def _gen_plot(data, filename): plt.clf() ax = plt.subplot(111) ax.grid(visible=True) x = sorted(data.keys()) w = [data[k][0] for k in x] try: wscf = max(w) except: wscf = 0 w = [float(i)/wscf for i in w] y = [data[k][1] for k in x] maxi = [data[k][3] for k in x] mini = [data[k][2] for k in x] lengs = [data[k][4] for k in x] try: ml = float(max(lengs)) except: ml = 1 lengs = [k/ml for k in lengs] normalengs = [data[k][5] for k in x] ax.plot(x,y,'-o',label='avg') ax.plot(x,maxi,'-o', label='max') ax.plot(x,mini,'-o', label='min') ax.plot(x,w, '-x', label='norm %s' % str(wscf)) ax.plot(x,lengs,'-o',label='acl %s' % str(int(ml))) ax.plot(x,normalengs, '-o', label='ncl') plt.ylim(ymax = 1., ymin = -0.01) plt.xlim(xmax = 1., xmin = -0.01) ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=6, mode="expand", borderaxespad=0.) plt.savefig(filename) override_stdout_config(stdout=True) files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')] fnum = float(len(files)) quanta = .1/fnum total_stats = 0 used_coeffs = set() used_clusters = set() #av_counter, avg, min, max, nclus, normalized_avg cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.])) coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.]) def gen_graphs(only_synthetic=False): update_status(0, 'Generating coefficients graph...') _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg') if not only_synthetic: cn = cluster_stats.keys() l = float(len(cn)) for i,c in enumerate(cn): update_status(i/l, 'Generating name graphs... %s' % str(c)) _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c)) for i,fi in enumerate(files): if generate_graphs: if i%1000 ==0: gen_graphs(True) f = filehandler.open(fi,'r') status = i/fnum update_status(status, 'Loading '+ fi[fi.find('lastname')+9:]) contents = SER.load(f) f.close() cur_coef = contents[0] cur_clust = contents[1] cur_maxlen = float(contents[3]) if cur_coef: total_stats += 1 used_coeffs.add(cur_coef) used_clusters.add(cur_clust) update_status(status+0.2*quanta, ' Computing averages...') cur_clen = len(contents[2]) cur_coeffs = [x[2] for x in contents[2]] cur_clustnumber = float(len(set([x[0] for x in contents[2]]))) assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber)) if cur_coeffs: assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s"% (str(cur_clust), str(cur_coef), str(cur_coeffs)) assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! 
%s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs)) cur_min = min(cur_coeffs) cur_max = max(cur_coeffs) cur_avg = sum(cur_coeffs)/cur_clen update_status(status+0.4*quanta, ' comulative per coeff...') avi = coeff_stats[cur_coef][0] #number of points coeff_stats[cur_coef][0] = avi+1 #average of coefficients coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1) #min coeff coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min) #max coeff coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max) #avg number of clusters coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1) #normalized avg number of clusters coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1) update_status(status+0.6*quanta, ' comulative per cluster per coeff...') avi = cluster_stats[cur_clust][cur_coef][0] cluster_stats[cur_clust][cur_coef][0] = avi+1 cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1) cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min) cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max) cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1) cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1) update_status_final('Done!') if generate_graphs: gen_graphs() if pickle_output: update_status(0,'Dumping to file...') f = open(pickle_output,'w') SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f) f.close()
class ProbabilityMatrix(object): ''' This class contains and maintains the comparison between all virtual authors. It is able to write and read from the database and update the results. ''' def __init__(self, name): self._bib_matrix = Bib_matrix(name) def load(self, load_map=True, load_matrix=True): update_status(0., "Loading probability matrix...") self._bib_matrix.load() update_status_final("Probability matrix loaded.") def store(self): update_status(0., "Saving probability matrix...") self._bib_matrix.store() update_status_final("Probability matrix saved.") def __getitem__(self, bibs): return self._bib_matrix[bibs[0], bibs[1]] def getitem_numeric(self, bibs): return self._bib_matrix.getitem_numeric(bibs) def __get_up_to_date_bibs(self, bib_matrix): return frozenset( get_modified_papers_before(bib_matrix.get_keys(), bib_matrix.creation_time)) def is_up_to_date(self, cluster_set): return self.__get_up_to_date_bibs(self._bib_matrix) >= frozenset( cluster_set.all_bibs()) def recalculate(self, cluster_set): ''' Constructs probability matrix. If use_cache is true, it will try to load old computations from the database. If save cache is true it will save the current results into the database. @param cluster_set: A cluster set object, used to initialize the matrix. ''' last_cleaned = 0 self._bib_matrix.store() try: old_matrix = Bib_matrix(self._bib_matrix.name + 'copy') old_matrix.duplicate_existing(self._bib_matrix.name, self._bib_matrix.name + 'copy') old_matrix.load() cached_bibs = self.__get_up_to_date_bibs(old_matrix) have_cached_bibs = bool(cached_bibs) except IOError: old_matrix.destroy() cached_bibs = None have_cached_bibs = False self._bib_matrix.destroy() self._bib_matrix = Bib_matrix(cluster_set.last_name, cluster_set=cluster_set) ncl = cluster_set.num_all_bibs expected = ((ncl * (ncl - 1)) / 2) if expected == 0: expected = 1 try: cur_calc, opti, prints_counter = 0, 0, 0 for cl1 in cluster_set.clusters: if cur_calc + opti - prints_counter > 100000 or cur_calc == 0: update_status( (float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti)) prints_counter = cur_calc + opti # #clean caches if cur_calc - last_cleaned > 20000000: gc.collect() # clear_comparison_caches() last_cleaned = cur_calc for cl2 in cluster_set.clusters: if id(cl1) < id(cl2) and not cl1.hates(cl2): for bib1 in cl1.bibs: for bib2 in cl2.bibs: if have_cached_bibs: try: val = old_matrix[bib1, bib2] opti += 1 if bconfig.DEBUG_CHECKS: assert _debug_is_eq_v( val, compare_bibrefrecs(bib1, bib2)) except KeyError: cur_calc += 1 val = compare_bibrefrecs(bib1, bib2) if not val: cur_calc += 1 val = compare_bibrefrecs(bib1, bib2) else: cur_calc += 1 val = compare_bibrefrecs(bib1, bib2) self._bib_matrix[bib1, bib2] = val except Exception, e: raise Exception("""Error happened in prob_matrix.recalculate with val:%s original_exception: %s """ % (str(val), str(e))) clear_comparison_caches() update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
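
# Sketch of the cache-reuse idea in ProbabilityMatrix.recalculate above, with
# a plain dict standing in for the copied Bib_matrix: reuse the stored
# comparison when the pair is present, recompute (and count it) when the
# lookup raises KeyError.  compare_sketch is a made-up stand-in for
# compare_bibrefrecs.
def compare_sketch(bib1, bib2):
    return (0.5, 0.5)

_old_matrix = {(1, 2): (0.9, 0.7)}
_new_matrix = {}
_reused = _recomputed = 0

for _pair in [(1, 2), (1, 3)]:
    try:
        _val = _old_matrix[_pair]
        _reused += 1
    except KeyError:
        _val = compare_sketch(*_pair)
        _recomputed += 1
    _new_matrix[_pair] = _val

assert (_reused, _recomputed) == (1, 1)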
def store(self):
    update_status(0., "Saving probability matrix...")
    self._bib_matrix.store()
    update_status_final("Probability matrix saved.")
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None): ''' @param bibrecs: an iterable full of bibrecs @type bibrecs: an iterable of ints @return: none ''' if bconfig.RABBIT_USE_CACHED_PID: PID_NAMES_CACHE = get_name_string_to_pid_dictionary() def find_pids_by_exact_names_cache(name): try: return zip(PID_NAMES_CACHE[name]) except KeyError: return [] def add_signature_using_names_cache(sig, name, pid): try: PID_NAMES_CACHE[name].add(pid) except KeyError: PID_NAMES_CACHE[name] = set([pid]) _add_signature(sig, name, pid) def new_person_from_signature_using_names_cache(sig, name): pid = get_new_personid() add_signature_using_names_cache(sig, name, pid) return pid add_signature = add_signature_using_names_cache new_person_from_signature = new_person_from_signature_using_names_cache find_pids_by_exact_name = find_pids_by_exact_names_cache else: add_signature = _add_signature new_person_from_signature = _new_person_from_signature find_pids_by_exact_name = _find_pids_by_exact_name compare_names = cached_sym(lambda x: x)(comp_names) # fast assign threshold threshold = 0.80 if not bibrecs or check_invalid_papers: all_bibrecs = get_all_valid_bibrecs() if not bibrecs: bibrecs = all_bibrecs if check_invalid_papers: filter_bibrecs_outside(all_bibrecs) if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD): populate_partial_marc_caches() SWAPPED_GET_GROUPED_RECORDS = True else: SWAPPED_GET_GROUPED_RECORDS = False updated_pids = set() deleted = frozenset(p[0] for p in get_deleted_papers()) for idx, rec in enumerate(bibrecs): task_sleep_now_if_required(True) update_status( float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec)) if rec in deleted: delete_paper_from_personid(rec) continue markrefs = frozenset( chain( izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))), izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec))))) personid_rows = [ map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec) ] personidrefs_names = dict( ((row[1], row[2]), row[3]) for row in personid_rows) personidrefs = frozenset(personidrefs_names.keys()) new_signatures = list(markrefs - personidrefs) old_signatures = list(personidrefs - markrefs) new_signatures_names = dict( (new, create_normalized_name( split_name_parts(get_name_by_bibrecref(new)))) for new in new_signatures) # matrix |new_signatures| X |old_signatures| matrix = [[ compare_names(new_signatures_names[new], personidrefs_names[old]) for old in old_signatures ] for new in new_signatures] # [(new_signatures, old_signatures)] best_match = [(new_signatures[new], old_signatures[old]) for new, old, score in maximized_mapping(matrix) if score > threshold] for new, old in best_match: modify_signature(old, rec, new, new_signatures_names[new]) remove_sigs(tuple(list(old) + [rec]) for old in old_signatures) not_matched = frozenset(new_signatures) - frozenset( map(itemgetter(0), best_match)) if not_matched: used_pids = set(r[0] for r in personid_rows) for sig in not_matched: name = new_signatures_names[sig] matched_pids = [] if USE_EXT_IDS: if USE_INSPIREID: inspire_id = get_inspire_id(sig + (rec, )) if inspire_id: matched_pids = list( get_person_with_extid(inspire_id[0])) if matched_pids: add_signature(list(sig) + [rec], name, matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) continue matched_pids = find_pids_by_exact_name(name) matched_pids = [ p for p in matched_pids if int(p[0]) not in used_pids ] if not 
matched_pids: new_pid = new_person_from_signature(list(sig) + [rec], name) used_pids.add(new_pid) updated_pids.add(new_pid) else: add_signature(list(sig) + [rec], name, matched_pids[0][0]) used_pids.add(matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) update_status_final() if personids_to_update_extids: updated_pids |= personids_to_update_extids if updated_pids: # an empty set will update all canonical_names update_personID_canonical_names(updated_pids) update_personID_external_ids( updated_pids, limit_to_claimed_papers=bconfig. LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS) if SWAPPED_GET_GROUPED_RECORDS: destroy_partial_marc_caches()
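
# Miniature of the assignment step in rabbit above: score every new signature
# against every old one, keep assignments from a maximum-weight matching, and
# drop anything below the 0.80 threshold.  greedy_matching_sketch is a crude
# stand-in for maximized_mapping, which the code uses as a source of
# (new_idx, old_idx, score) triples; the scores below are invented.
def greedy_matching_sketch(matrix):
    taken_rows, taken_cols, out = set(), set(), []
    cells = [(i, j, s) for i, row in enumerate(matrix) for j, s in enumerate(row)]
    for i, j, s in sorted(cells, key=lambda c: c[2], reverse=True):
        if i not in taken_rows and j not in taken_cols:
            taken_rows.add(i)
            taken_cols.add(j)
            out.append((i, j, s))
    return out

_threshold = 0.80
_matrix = [[0.95, 0.10],    # new signature 0 vs old signatures 0, 1
           [0.20, 0.55]]    # new signature 1 vs old signatures 0, 1
_best_match = [(new, old) for new, old, score in greedy_matching_sketch(_matrix)
               if score > _threshold]
# _best_match == [(0, 0)]; new signature 1 stays unmatched and gets a new author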
def merge_static(): ''' This function merges aidPERSONIDPAPERS with aidRESULTS. Use it after tortoise. This function is static: if aid* tables are changed while it's running, probably everything will crash and a black hole will open, eating all your data. ''' last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names()) def get_free_pids(): while True: yield get_free_author_id() free_pids = get_free_pids() current_mapping = get_paper_to_author_and_status_mapping() def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag): move_signature(sig, new_pid_flag[0]) current_mapping[sig].remove(old_pid_flag) current_mapping[sig].append(new_pid_flag) def try_move_signature(sig, target_pid): """ """ paps = current_mapping[sig] rejected = filter(lambda p: p[1] <= -2, paps) assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps) claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps) if claimed or not assigned or assigned[0] == target_pid: return assert len(assigned) == 1 if rejected: newpid = free_pids.next() move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1])) else: conflicts = get_signatures_of_paper_and_author(sig, target_pid) if not conflicts: move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1])) else: assert len(conflicts) == 1 if conflicts[0][3] == 2: newpid = free_pids.next() move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1])) else: newpid = free_pids.next() csig = tuple(conflicts[0][:3]) move_sig_and_update_mapping(csig, (target_pid, conflicts[0][3]), (newpid, conflicts[0][3])) move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1])) for idx, last in enumerate(last_names): update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last)) results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last)) # [(last name number, [bibrefrecs])] results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))] # List of dictionaries. # [{new_pid -> N}] matr = [] # Set of all old pids. old_pids = set() for k, ds in results: pids = [] claim = [] for d in ds: pid_flag = current_mapping.get(d, []) if pid_flag: pid, flag = pid_flag[0] pids.append(pid) old_pids.add(pid) if flag > 1: claim.append((d, pid)) matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids)))) # We cast it to list in order to ensure the order persistence. old_pids = list(old_pids) best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr]) matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match] not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match)) not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids) for sigs, pid in chain(matched_clusters, not_matched_clusters): for sig in sigs: if sig in current_mapping: if not pid in map(itemgetter(0), filter(lambda x: x[1] > -2, current_mapping[sig])): try_move_signature(sig, pid) update_status_final() remove_empty_authors() update_canonical_names_of_authors()
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False): ''' @param bibrecs: an iterable full of bibrecs @type bibrecs: an iterable of ints @return: none ''' logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w') logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs))) def logwrite(msg, is_error): verb = 9 if is_error or verbose: verb = 1 write_message(msg, verbose=verb) if bconfig.RABBIT_USE_CACHED_PID: PID_NAMES_CACHE = get_name_to_authors_mapping() def find_pids_by_exact_names_cache(name): try: return zip(PID_NAMES_CACHE[name]) except KeyError: return [] def add_signature_using_names_cache(sig, name, pid): try: PID_NAMES_CACHE[name].add(pid) except KeyError: PID_NAMES_CACHE[name] = set([pid]) _add_signature(sig, name, pid) def new_person_from_signature_using_names_cache(sig, name): pid = get_free_author_id() add_signature_using_names_cache(sig, name, pid) return pid add_signature = add_signature_using_names_cache new_person_from_signature = new_person_from_signature_using_names_cache find_pids_by_exact_name = find_pids_by_exact_names_cache else: add_signature = _add_signature new_person_from_signature = _new_person_from_signature find_pids_by_exact_name = _find_pids_by_exact_name compare_names = cached_sym(lambda x: x)(comp_names) # fast assign threshold threshold = 0.80 if not bibrecs or check_invalid_papers: all_bibrecs = get_all_valid_papers() if not bibrecs: bibrecs = all_bibrecs if check_invalid_papers: filter_bibrecs_outside(all_bibrecs) if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD): populate_partial_marc_caches() SWAPPED_GET_GROUPED_RECORDS = True else: SWAPPED_GET_GROUPED_RECORDS = False updated_pids = set() deleted = frozenset(p[0] for p in get_deleted_papers()) for idx, rec in enumerate(bibrecs): logwrite("\nConsidering %s" % str(rec), False) if idx%200 == 0: task_sleep_now_if_required(True) update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec)) task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec)) if rec in deleted: logwrite(" - Record was deleted, removing from pid and continuing with next record", True) remove_papers([rec]) continue markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))), izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec))))) personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_of_paper(rec)] personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows) personidrefs = frozenset(personidrefs_names.keys()) new_signatures = list(markrefs - personidrefs) old_signatures = list(personidrefs - markrefs) new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new)))) for new in new_signatures) # matrix |new_signatures| X |old_signatures| matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old]) for old in old_signatures] for new in new_signatures] logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures)) logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures)) logwrite(" - Matrix: %s" % str(matrix), bool(matrix)) # [(new_signatures, old_signatures)] best_match = [(new_signatures[new], old_signatures[old]) for new, old, score in maximized_mapping(matrix) if score > threshold] logwrite(" - Best match: %s " % str(best_match), bool(best_match)) for new, old in best_match: 
logwrite(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True) modify_signature(old, rec, new, new_signatures_names[new]) remove_signatures(tuple(list(old) + [rec]) for old in old_signatures) not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match)) pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)]) logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched)) if not_matched: used_pids = set(r[0] for r in personid_rows) for sig in not_matched: name = new_signatures_names[sig] matched_pids = list() if USE_EXT_IDS: if USE_INSPIREID: inspire_id = get_inspire_id_of_signature(sig + (rec,)) if inspire_id: matched_pids = list(get_author_by_external_id(inspire_id[0])) if matched_pids and int(matched_pids[0][0]) in pids_having_rec: matched_pids = list() if matched_pids: add_signature(list(sig) + [rec], name, matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) pids_having_rec.add(matched_pids[0][0]) continue matched_pids = find_pids_by_exact_name(name) matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids] if not matched_pids or int(matched_pids[0][0]) in pids_having_rec: new_pid = new_person_from_signature(list(sig) + [rec], name) used_pids.add(new_pid) updated_pids.add(new_pid) else: add_signature(list(sig) + [rec], name, matched_pids[0][0]) used_pids.add(matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) pids_having_rec.add(matched_pids[0][0]) logwrite('Finished with %s' % str(rec), False) update_status_final() if personids_to_update_extids: updated_pids |= personids_to_update_extids if updated_pids: # an empty set will update all canonical_names update_canonical_names_of_authors(updated_pids) update_external_ids_of_authors(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS) if SWAPPED_GET_GROUPED_RECORDS: destroy_partial_marc_caches() remove_empty_authors()
def merge_static_classy(): ''' This function merges aidPERSONIDPAPERS with aidRESULTS. Use it after tortoise. This function is static: if aid* tables are changed while it's running, probably everything will crash and a black hole will open, eating all your data. NOTE: this is more elegant that merge_static but much slower. Will have to be improved before it can replace it. ''' class Sig(object): def __init__(self, bibrefrec, pid_flag): self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag)) self.assigned = filter(lambda p:-2 < p[1] and p[1] < 2, pid_flag) self.claimed = filter(lambda p: 2 <= p[1], pid_flag) self.bibrefrec = bibrefrec assert self.invariant() def invariant(self): return len(self.assigned) + len(self.claimed) <= 1 def empty(self): return not self.isclaimed and not self.isassigned def isclaimed(self): return len(self.claimed) == 1 def get_claimed(self): return self.claimed[0][0] def get_assigned(self): return self.assigned[0][0] def isassigned(self): return len(self.assigned) == 1 def isrejected(self, pid): return pid in self.rejected def change_pid(self, pid): assert self.invariant() assert self.isassigned() self.assigned = [(pid, 0)] move_signature(self.bibrefrec, pid) class Cluster(object): def __init__(self, pid, sigs): self.pid = pid self.sigs = dict((sig.bibrefrec[2], sig) for sig in sigs if not sig.empty()) def send_sig(self, other, sig): paper = sig.bibrefrec[2] assert paper in self.sigs and paper not in other.sigs del self.sigs[paper] other.sigs[paper] = sig if sig.isassigned(): sig.change_pid(other.pid) last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names()) personid = get_paper_to_author_and_status_mapping() free_pids = backinterface_get_free_pids() for idx, last in enumerate(last_names): update_status(float(idx) / len(last_names), "Merging, %d/%d current: %s" % (idx, len(last_names), last)) results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last)) # [(last name number, [bibrefrecs])] results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))] # List of dictionaries. # [{new_pid -> N}] matr = [] # Set of all old pids. 
old_pids = set() for k, ds in results: pids = [] for d in ds: pid_flag = filter(lambda x: x[1] > -2, personid.get(d, [])) if pid_flag: assert len(pid_flag) == 1 pid = pid_flag[0][0] pids.append(pid) old_pids.add(pid) matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids)))) old_pids = list(old_pids) best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr]) # [[bibrefrecs] -> pid] matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match] not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match)) not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids) # pid -> Cluster clusters = dict((pid, Cluster(pid, [Sig(bib, personid.get(bib, [])) for bib in sigs])) for sigs, pid in chain(matched_clusters, not_matched_clusters)) todo = clusters.items() for pid, clus in todo: assert clus.pid == pid for paper, sig in clus.sigs.items(): if sig.isclaimed(): if sig.get_claimed() != pid: target_clus = clusters[sig.get_claimed()] if paper in target_clus.sigs: new_clus = Cluster(free_pids.next(), []) target_clus.send_sig(new_clus, target_clus[paper]) todo.append(new_clus) clusters[new_clus.pid] = new_clus assert paper not in target_clus.sigs clus.send_sig(target_clus, sig) elif sig.get_assigned() != pid: if not sig.isrejected(pid): move_signature(sig.bibrefrec, pid) else: move_signature(sig.bibrefrec, free_pids.next()) else: assert not sig.isrejected(pid) update_status_final("Merging done.") update_status_final() remove_empty_authors() update_canonical_names_of_authors()
                    os.nice(int((float(sizs[idx]) * 20.0 / biggest)))
                    run_job(job_idx)
                else: # parent
                    pid_2_idx[pid] = job_idx
                    assert free > sizs[job_idx]
                    free -= sizs[job_idx]
                    del free_idxs[idx]
            else:
                break

        pid, status = os.wait()
        assert pid in pid_2_idx
        idx = pid_2_idx[pid]
        freed = sizs[idx]
        done += freed
        ret_status[idx] = status
        free += freed
        del pid_2_idx[pid]
        update_status(done / total,
                      "%d / %d" % (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs)))

    update_status_final("%d / %d" % (len(jobs), len(jobs)))

    assert is_eq(free, initial)
    assert not pid_2_idx
    assert not free_idxs
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs)
    assert all(stat is not None for stat in ret_status)

    return ret_status
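
# Minimal sketch (not part of the original code) of the fork/wait bookkeeping used
# above: fork one child per job, remember which pid runs which job, then reap
# children with os.wait() and record their exit status. The job payloads are toy
# callables; the real scheduler additionally tracks a memory budget per job.
def _sketch_fork_and_wait(jobs):
    import os

    pid_to_idx = {}
    statuses = [None] * len(jobs)

    for idx, job in enumerate(jobs):
        pid = os.fork()
        if not pid:                 # child: do the work, then exit immediately
            job()
            os._exit(0)
        pid_to_idx[pid] = idx       # parent: remember which child runs which job

    while pid_to_idx:
        pid, status = os.wait()     # blocks until any child terminates
        statuses[pid_to_idx.pop(pid)] = status

    return statuses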
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''
    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()

    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)
    p = Process(target=group_sort_edges, args=(cluster_set, original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' + str(original_process_id), 'r')
    minus_edges_fp = open(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' + str(original_process_id), 'r')
    edges_fp = open(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' + str(original_process_id), 'r')
    data_fp = open(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' + str(original_process_id), 'r')

    len_plus, len_minus, len_edges = cPickle.load(data_fp)
    data_fp.close()

    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")

        assert unused != '+' and unused != '-', PID() + "Signed edge after filter!"

        cl1 = bib_map[v1]
        cl2 = bib_map[v2]

        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
                    % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                              bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)

            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" % (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()

    try:
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' + str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' + str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' + str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' + str(original_process_id))
    except:
        pass
def merge_static():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is static: if aid* tables are changed while it's running,
    probably everything will crash and a black hole will open, eating all your data.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_existing_result_clusters())

    def get_free_pids():
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    current_mapping = get_bibrefrec_to_pid_flag_mapping()

    def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag):
        move_signature(sig, new_pid_flag[0])
        current_mapping[sig].remove(old_pid_flag)
        current_mapping[sig].append(new_pid_flag)

    def try_move_signature(sig, target_pid):
        """
        Try to attach the signature sig to target_pid, respecting claims,
        rejections and conflicting signatures already present on target_pid.
        """
        paps = current_mapping[sig]
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0][0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            newpid = free_pids.next()
            move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1]))
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1]))
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    newpid = free_pids.next()
                    move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1]))
                else:
                    newpid = free_pids.next()
                    csig = tuple(conflicts[0][:3])
                    move_sig_and_update_mapping(csig, (target_pid, conflicts[0][3]),
                                                (newpid, conflicts[0][3]))
                    move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1]))

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names),
                      "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = current_mapping.get(d, [])
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                if sig in current_mapping:
                    if pid not in map(itemgetter(0),
                                      filter(lambda x: x[1] > -2, current_mapping[sig])):
                        try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
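
# Illustrative note (not part of the original code): the expression
#     dict((k, len(list(d))) for k, d in groupby(sorted(pids)))
# used above is a plain frequency count over pids, equivalent to collections.Counter.
def _sketch_pid_frequencies(pids=(7, 9, 7, 7)):
    from itertools import groupby
    counts = dict((k, len(list(d))) for k, d in groupby(sorted(pids)))
    assert counts == {7: 3, 9: 1}     # same result as collections.Counter(pids)
    return counts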
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs),
                      "%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]]
                         for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3])
                                  for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                   for old in old_signatures]
                  for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                matched_pids = []
                if USE_EXT_IDS:
                    if USE_INSPIREID:
                        inspire_id = get_inspire_id(sig + (rec,))
                        if inspire_id:
                            matched_pids = list(get_person_with_extid(inspire_id[0]))
                    if matched_pids:
                        add_signature(list(sig) + [rec], name, matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])
                        continue

                matched_pids = find_pids_by_exact_name(name)
                matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

                if not matched_pids:
                    new_pid = new_person_from_signature(list(sig) + [rec], name)
                    used_pids.add(new_pid)
                    updated_pids.add(new_pid)
                else:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    used_pids.add(matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids

    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(updated_pids,
                                     limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
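
# Minimal sketch (not part of the original code) of the fast-assign step above: score
# every (new, old) signature pair with a name comparator and keep only assignments
# whose score clears the 0.80 threshold. The greedy matcher below is a simplified
# stand-in for maximized_mapping, and toy_score is a stand-in for compare_names.
def _sketch_fast_assign(new_names, old_names, threshold=0.80):
    def toy_score(a, b):                        # stand-in for compare_names
        return 1.0 if a == b else 0.0

    matrix = [[toy_score(n, o) for o in old_names] for n in new_names]

    best_match = []
    used_old = set()
    for i, row in enumerate(matrix):            # greedy: at most one old ref per new ref
        for j, score in sorted(enumerate(row), key=lambda x: -x[1]):
            if j not in used_old and score > threshold:
                best_match.append((i, j, score))
                used_old.add(j)
                break
    return best_match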
def merge_dynamic():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is dynamic: it allows the aid* tables to be changed while it is
    still running, hence the claiming facility, for example, can stay online during
    the merge. This comfort, however, is paid for in terms of speed.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_existing_result_clusters())

    def get_free_pids():
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        Try to attach the signature sig to target_pid, respecting claims,
        rejections and conflicting signatures already present on target_pid.
        """
        paps = get_signature_info(sig)
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0][0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            move_signature(sig, free_pids.next())
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, free_pids.next())
                else:
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names),
                      "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = personid_from_signature(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
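
# Illustrative summary (not part of the original code) of the decision taken by
# try_move_signature above for a signature sig and a target person target_pid:
#
#   * sig is already claimed to target_pid, is not assigned anywhere, or is already
#     assigned to target_pid                                  -> nothing to do;
#   * sig carries a rejection (flag <= -2)                    -> park it on a fresh pid;
#   * no conflicting signature on target_pid's papers         -> move sig to target_pid;
#   * one conflict exists and it is claimed (flag == 2)       -> park sig on a fresh pid;
#   * one conflict exists and it is not claimed               -> move the conflict to a
#     fresh pid, then move sig to target_pid.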
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @param verbose: if True, log all messages, not only errors
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        logwrite("\nConsidering %s" % str(rec), False)

        if idx % 200 == 0:
            task_sleep_now_if_required(True)
            update_status(float(idx) / len(bibrecs),
                          "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(" - Record was deleted, removing from pid and continuing with next record", True)
            remove_papers([rec])
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]]
                         for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3])
                                  for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                   for old in old_signatures]
                  for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(" - - Moving signature: %s on %s to %s as %s"
                     % (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                matched_pids = list()
                if USE_EXT_IDS:
                    if USE_INSPIREID:
                        inspire_id = get_inspire_id_of_signature(sig + (rec,))
                        if inspire_id:
                            matched_pids = list(get_author_by_external_id(inspire_id[0]))
                        if matched_pids and int(matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                    if matched_pids:
                        add_signature(list(sig) + [rec], name, matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])
                        pids_having_rec.add(matched_pids[0][0])
                        continue

                matched_pids = find_pids_by_exact_name(name)
                matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

                if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                    new_pid = new_person_from_signature(list(sig) + [rec], name)
                    used_pids.add(new_pid)
                    updated_pids.add(new_pid)
                else:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    used_pids.add(matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids

    if updated_pids: # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(updated_pids,
                                       limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
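
# Minimal sketch (not part of the original code) of the name cache kept above when
# RABBIT_USE_CACHED_PID is enabled: a plain dict mapping a normalized name string to
# the set of author ids already seen under that name. The names and pids are toys.
def _sketch_names_cache():
    cache = {}

    def remember(name, pid):
        # same effect as the try/except KeyError blocks in rabbit()
        cache.setdefault(name, set()).add(pid)

    remember('Ellis, J.', 7)
    remember('Ellis, J.', 9)
    assert cache == {'Ellis, J.': set([7, 9])}
    return cache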
def load(self, lname, load_map=True, load_matrix=True):
    update_status(0., "Loading probability matrix...")
    self._bib_matrix.load(lname, load_map, load_matrix)
    update_status_final("Probability matrix loaded.")