import gc
import itertools
import multiprocessing
import time

import numpy as np
import tensorflow as tf

# Helper functions such as sim, task_divide, arg_sort, merge_dic, galeshapley
# and calculate_rank are assumed to be defined elsewhere in this module.


def retrieve_topk_alignment(kg1_source_ents, kg1_embeddings, kg2_candidates, kg2_embeddings, session,
                            k=1, metric='inner', normalize=False, csls_k=0, output_path=None):
    """Retrieve the top-k KG2 candidates for each KG1 source entity and optionally write them to disk."""

    def search_nearest_k(sim_mat, k):
        # For each row, collect the indices of the k largest similarity values (unordered).
        assert k > 0
        neighbors = list()
        num = sim_mat.shape[0]
        for i in range(num):
            rank = np.argpartition(-sim_mat[i, :], k)
            pairs = [j for j in itertools.product([i], rank[0:k])]
            neighbors.extend(pairs)
        assert len(neighbors) == num * k
        return neighbors

    def triple_writer(triples, output_path, separator="\t", linebreak="\n"):
        # Write (source entity, candidate entity, similarity) triples, one per line.
        with open(output_path, 'w', encoding='utf8') as file:
            for s, p, o in triples:
                file.write(str(s) + separator + str(p) + separator + str(o) + linebreak)
        print(output_path, "saved")

    # Look up the embeddings of the source and candidate entities (TensorFlow 1.x graph mode).
    embeds1 = tf.nn.embedding_lookup(kg1_embeddings, kg1_source_ents).eval(session=session)
    embeds2 = tf.nn.embedding_lookup(kg2_embeddings, kg2_candidates).eval(session=session)
    sim_mat = sim(embeds1, embeds2, metric=metric, normalize=normalize, csls_k=csls_k)
    topk_neighbors = search_nearest_k(sim_mat, k)
    topk_neighbors_w_sim = [(kg1_source_ents[i], kg2_candidates[j], sim_mat[i, j])
                            for i, j in topk_neighbors]
    if output_path is not None:
        triple_writer(topk_neighbors_w_sim, output_path)
    return topk_neighbors_w_sim
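
# Usage sketch (illustrative only, assuming TensorFlow 1.x as used above and
# the `sim` helper from this module): the entity id lists and tensor shapes
# below are hypothetical.
def _example_retrieve_topk_alignment():
    kg1_emb = tf.constant(np.random.rand(5, 4), dtype=tf.float32)
    kg2_emb = tf.constant(np.random.rand(6, 4), dtype=tf.float32)
    with tf.Session() as sess:
        # Top-2 KG2 candidates for each of the three KG1 source entities.
        return retrieve_topk_alignment([0, 1, 2], kg1_emb,
                                       [0, 1, 2, 3, 4, 5], kg2_emb,
                                       sess, k=2, metric='inner')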
def stable_alignment(embed1, embed2, metric, normalize, csls_k, nums_threads, cut=100, sim_mat=None):
    """Search alignment as a stable matching (Gale-Shapley) between the two embedding sets and print its precision."""
    t = time.time()
    if sim_mat is None:
        sim_mat = sim(embed1, embed2, metric=metric, normalize=normalize, csls_k=csls_k)

    kg1_candidates, kg2_candidates = dict(), dict()

    # Rank KG2 candidates for each KG1 entity in parallel.
    num = sim_mat.shape[0]
    x_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(x_tasks))
    rests = list()
    total = 0
    for task in x_tasks:
        total += len(task)
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'x_', 'y_')))
    assert total == num
    pool.close()
    pool.join()
    for rest in rests:
        kg1_candidates = merge_dic(kg1_candidates, rest.get())

    # Rank KG1 candidates for each KG2 entity using the transposed similarity matrix.
    sim_mat = sim_mat.T
    num = sim_mat.shape[0]
    y_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(y_tasks))
    rests = list()
    for task in y_tasks:
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'y_', 'x_')))
    pool.close()
    pool.join()
    for rest in rests:
        kg2_candidates = merge_dic(kg2_candidates, rest.get())

    # print("kg1_candidates", len(kg1_candidates))
    # print("kg2_candidates", len(kg2_candidates))
    print("generating candidate lists costs time {:.3f} s ".format(time.time() - t))

    t = time.time()
    matching = galeshapley(kg1_candidates, kg2_candidates, cut)
    # A matched pair is counted as correct if the indices encoded in its 'x_i' and 'y_j' keys agree.
    n = 0
    for i, j in matching.items():
        if int(i.split('_')[-1]) == int(j.split('_')[-1]):
            n += 1
    cost = time.time() - t
    print("stable alignment precision = {:.3f}%, time = {:.3f} s ".format(n / len(matching) * 100, cost))
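
# Usage sketch (illustrative only): stable_alignment assumes row i of embed1 and
# row i of embed2 describe the same entity, since precision is measured by
# comparing the indices recovered from the matching keys. The embeddings below
# are hypothetical; run under a __main__ guard when multiprocessing uses spawn.
def _example_stable_alignment():
    rng = np.random.RandomState(0)
    embed1 = rng.rand(100, 16)
    embed2 = embed1 + 0.01 * rng.rand(100, 16)  # noisy copies, aligned by index
    stable_alignment(embed1, embed2, metric='inner', normalize=True,
                     csls_k=0, nums_threads=4, cut=100)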
def greedy_alignment(embed1, embed2, top_k, nums_threads, metric, normalize, csls_k, accurate):
    """
    Search alignment with greedy strategy.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    top_k : list of integers
        Hits@k metrics for evaluating results.
    nums_threads : int
        The number of threads used to search alignment.
    metric : string
        The distance metric to use. It can be 'cosine', 'euclidean' or 'inner'.
    normalize : bool
        Whether to normalize the input embeddings.
    csls_k : int
        K value for csls. If k > 0, enhance the similarity by csls.
    accurate : bool
        If True, also compute and report MR and MRR.

    Returns
    -------
    alignment_rest : set, pairs of aligned entities
    hits1 : float, hits@1 value for alignment results
    mr : float, MR value for alignment results
    mrr : float, MRR value for alignment results
    """
    t = time.time()
    sim_mat = sim(embed1, embed2, metric=metric, normalize=normalize, csls_k=csls_k)
    num = sim_mat.shape[0]
    if nums_threads > 1:
        # Split the rows of the similarity matrix into tasks and rank them in parallel.
        hits = [0] * len(top_k)
        mr, mrr = 0, 0
        alignment_rest = set()
        rests = list()
        search_tasks = task_divide(np.array(range(num)), nums_threads)
        pool = multiprocessing.Pool(processes=len(search_tasks))
        for task in search_tasks:
            mat = sim_mat[task, :]
            rests.append(pool.apply_async(calculate_rank, (task, mat, top_k, accurate, num)))
        pool.close()
        pool.join()
        for rest in rests:
            sub_mr, sub_mrr, sub_hits, sub_hits1_rest = rest.get()
            mr += sub_mr
            mrr += sub_mrr
            hits += np.array(sub_hits)
            alignment_rest |= sub_hits1_rest
    else:
        mr, mrr, hits, alignment_rest = calculate_rank(list(range(num)), sim_mat, top_k, accurate, num)
    assert len(alignment_rest) == num
    hits = np.array(hits) / num * 100
    for i in range(len(hits)):
        hits[i] = round(hits[i], 3)
    cost = time.time() - t
    if accurate:
        if csls_k > 0:
            print("accurate results with csls: csls={}, hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                  .format(csls_k, top_k, hits, mr, mrr, cost))
        else:
            print("accurate results: hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                  .format(top_k, hits, mr, mrr, cost))
    else:
        if csls_k > 0:
            print("quick results with csls: csls={}, hits@{} = {}%, time = {:.3f} s "
                  .format(csls_k, top_k, hits, cost))
        else:
            print("quick results: hits@{} = {}%, time = {:.3f} s ".format(top_k, hits, cost))
    hits1 = hits[0]
    del sim_mat
    gc.collect()
    return alignment_rest, hits1, mr, mrr
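
# Usage sketch (illustrative only): greedy_alignment reports hits@k, MR and MRR
# for index-aligned embedding matrices. Helpers such as `sim`, `task_divide` and
# `calculate_rank` are assumed to come from this module; the data below is
# hypothetical.
def _example_greedy_alignment():
    rng = np.random.RandomState(0)
    embed1 = rng.rand(100, 16)
    embed2 = embed1 + 0.01 * rng.rand(100, 16)  # noisy copies, aligned by index
    alignment_rest, hits1, mr, mrr = greedy_alignment(
        embed1, embed2, top_k=[1, 5, 10], nums_threads=1,
        metric='inner', normalize=True, csls_k=0, accurate=True)
    return alignment_rest, hits1, mr, mrr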