Example 1
def retrieve_topk_alignment(kg1_source_ents,
                            kg1_embeddings,
                            kg2_candidates,
                            kg2_embeddings,
                            session,
                            k=1,
                            metric='inner',
                            normalize=False,
                            csls_k=0,
                            output_path=None):
    def search_nearest_k(sim_mat, k):
        assert k > 0
        neighbors = list()
        num = sim_mat.shape[0]
        for i in range(num):
            # argpartition on the negated scores leaves the indices of the k
            # largest similarities (in arbitrary order) in the first k positions.
            rank = np.argpartition(-sim_mat[i, :], k)
            pairs = list(itertools.product([i], rank[0:k]))
            neighbors.extend(pairs)
        assert len(neighbors) == num * k
        return neighbors

    def triple_writer(triples, output_path, separator="\t", linebreak="\n"):
        # Write one (source entity, candidate entity, similarity) triple per line.
        with open(output_path, 'w', encoding='utf8') as file:
            for s, p, o in triples:
                file.write(
                    str(s) + separator + str(p) + separator + str(o) + linebreak)
        print(output_path, "saved")

    # Look up the embedding vectors of the source and candidate entities
    # (TensorFlow 1.x graph mode, evaluated against the given session).
    embeds1 = tf.nn.embedding_lookup(kg1_embeddings,
                                     kg1_source_ents).eval(session=session)
    embeds2 = tf.nn.embedding_lookup(kg2_embeddings,
                                     kg2_candidates).eval(session=session)
    sim_mat = sim(embeds1,
                  embeds2,
                  metric=metric,
                  normalize=normalize,
                  csls_k=csls_k)
    topk_neighbors = search_nearest_k(sim_mat, k)
    topk_neighbors_w_sim = [(kg1_source_ents[i], kg2_candidates[j], sim_mat[i, j])
                            for i, j in topk_neighbors]

    if output_path is not None:
        triple_writer(topk_neighbors_w_sim, output_path)

    return topk_neighbors_w_sim
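A minimal, self-contained sketch of the retrieval step above (plain NumPy, no TensorFlow session and no OpenEA helpers such as sim; the 5x8 similarity matrix is made up for illustration). It isolates what search_nearest_k does: np.argpartition pulls out the k highest-scoring KG2 candidates per source entity without fully sorting each row, and a full argsort is used only as a sanity check.

import itertools

import numpy as np

rng = np.random.default_rng(0)
sim_mat = rng.random((5, 8))  # toy similarity matrix: 5 source entities x 8 candidates
k = 3

# Same idea as search_nearest_k: argpartition leaves the indices of the k
# largest similarities (in arbitrary order) in the first k positions of each row.
neighbors = []
for i in range(sim_mat.shape[0]):
    rank = np.argpartition(-sim_mat[i, :], k)
    neighbors.extend(itertools.product([i], rank[:k]))

# Sanity check against a full sort: both selections pick the same candidates.
for i in range(sim_mat.shape[0]):
    topk_partition = {j for r, j in neighbors if r == i}
    topk_sorted = set(np.argsort(-sim_mat[i, :])[:k])
    assert topk_partition == topk_sorted

print(neighbors[:k])  # (source index, candidate index) pairs for the first source entity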
Example 2
def stable_alignment(embed1,
                     embed2,
                     metric,
                     normalize,
                     csls_k,
                     nums_threads,
                     cut=100,
                     sim_mat=None):
    t = time.time()
    if sim_mat is None:
        sim_mat = sim(embed1,
                      embed2,
                      metric=metric,
                      normalize=normalize,
                      csls_k=csls_k)

    kg1_candidates, kg2_candidates = dict(), dict()

    # Rank KG2 candidates for every KG1 entity: split the rows of the
    # similarity matrix into chunks and argsort them in parallel workers.
    num = sim_mat.shape[0]
    x_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(x_tasks))
    rests = list()
    total = 0
    for task in x_tasks:
        total += len(task)
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'x_', 'y_')))
    assert total == num
    pool.close()
    pool.join()
    for rest in rests:
        kg1_candidates = merge_dic(kg1_candidates, rest.get())

    # Transpose the similarity matrix and repeat in the other direction to
    # rank KG1 candidates for every KG2 entity.
    sim_mat = sim_mat.T
    num = sim_mat.shape[0]
    y_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(y_tasks))
    rests = list()
    for task in y_tasks:
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'y_', 'x_')))
    pool.close()
    pool.join()
    for rest in rests:
        kg2_candidates = merge_dic(kg2_candidates, rest.get())

    # print("kg1_candidates", len(kg1_candidates))
    # print("kg2_candidates", len(kg2_candidates))

    print("generating candidate lists costs time {:.3f} s ".format(time.time() - t))
    t = time.time()
    # Run Gale-Shapley stable matching over the two candidate (preference) lists.
    matching = galeshapley(kg1_candidates, kg2_candidates, cut)
    # Candidates are keyed 'x_<i>' / 'y_<j>'; a matched pair counts as correct
    # when the numeric suffixes agree, i.e. the gold alignment is the identity
    # mapping between row and column indices.
    n = 0
    for i, j in matching.items():
        if int(i.split('_')[-1]) == int(j.split('_')[-1]):
            n += 1
    cost = time.time() - t
    print("stable alignment precision = {:.3f}%, time = {:.3f} s ".format(
        n / len(matching) * 100, cost))
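The galeshapley and arg_sort helpers used above are not shown in this snippet. The sketch below is a simplified, self-contained stand-in (toy 'x_i'/'y_j' names and a 4x4 random similarity matrix, all made up for illustration): each row and column of the similarity matrix becomes a preference list, a small Gale-Shapley deferred-acceptance routine produces a stable matching, and the result is scored with the same suffix check as stable_alignment.

import numpy as np

def toy_gale_shapley(prefs_a, prefs_b):
    # Deferred acceptance: the 'a' side proposes in preference order; each 'b'
    # keeps its best proposer so far. prefs_* map an entity to candidates from
    # the other side, ordered from most to least preferred.
    rank_b = {b: {a: r for r, a in enumerate(lst)} for b, lst in prefs_b.items()}
    free = list(prefs_a.keys())
    next_choice = {a: 0 for a in prefs_a}   # next candidate each proposer will try
    engaged = {}                            # b -> a
    while free:
        a = free.pop()
        if next_choice[a] >= len(prefs_a[a]):
            continue                        # a has exhausted its preference list
        b = prefs_a[a][next_choice[a]]
        next_choice[a] += 1
        current = engaged.get(b)
        if current is None:
            engaged[b] = a
        elif rank_b[b].get(a, len(rank_b[b])) < rank_b[b].get(current, len(rank_b[b])):
            engaged[b] = a                  # b prefers the new proposer
            free.append(current)
        else:
            free.append(a)                  # b rejects a; a proposes again later
    return {a: b for b, a in engaged.items()}

rng = np.random.default_rng(0)
sim_mat = rng.random((4, 4)) + np.eye(4)    # boost the diagonal so x_i best matches y_i
kg1_prefs = {'x_%d' % i: ['y_%d' % j for j in np.argsort(-sim_mat[i])] for i in range(4)}
kg2_prefs = {'y_%d' % j: ['x_%d' % i for i in np.argsort(-sim_mat[:, j])] for j in range(4)}

matching = toy_gale_shapley(kg1_prefs, kg2_prefs)
n = sum(a.split('_')[-1] == b.split('_')[-1] for a, b in matching.items())
print("stable alignment precision = {:.3f}%".format(n / len(matching) * 100))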
Example 3
def greedy_alignment(embed1, embed2, top_k, nums_threads, metric, normalize,
                     csls_k, accurate):
    """
    Search alignment with greedy strategy.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    top_k : list of integers
        Hits@k metrics for evaluating results.
    nums_threads : int
        The number of threads used to search alignment.
    metric : string
        The distance metric to use. It can be 'cosine', 'euclidean' or 'inner'.
    normalize : bool
        Whether to normalize the input embeddings.
    csls_k : int
        K value for CSLS. If k > 0, enhance the similarities with CSLS.
    accurate : bool
        If True, also report MR and MRR (slower); otherwise only Hits@k is printed.

    Returns
    -------
    alignment_rest : set
        Pairs of aligned entities.
    hits1 : float
        Hits@1 (%) of the alignment results.
    mr : float
        MR (mean rank) of the alignment results.
    mrr : float
        MRR (mean reciprocal rank) of the alignment results.
    """
    t = time.time()
    sim_mat = sim(embed1,
                  embed2,
                  metric=metric,
                  normalize=normalize,
                  csls_k=csls_k)
    num = sim_mat.shape[0]
    if nums_threads > 1:
        # Split the rows of the similarity matrix into chunks, rank each chunk
        # in a separate worker process, and merge the partial metrics.
        hits = [0] * len(top_k)
        mr, mrr = 0, 0
        alignment_rest = set()
        rests = list()
        search_tasks = task_divide(np.array(range(num)), nums_threads)
        pool = multiprocessing.Pool(processes=len(search_tasks))
        for task in search_tasks:
            mat = sim_mat[task, :]
            rests.append(
                pool.apply_async(calculate_rank,
                                 (task, mat, top_k, accurate, num)))
        pool.close()
        pool.join()
        for rest in rests:
            sub_mr, sub_mrr, sub_hits, sub_hits1_rest = rest.get()
            mr += sub_mr
            mrr += sub_mrr
            hits += np.array(sub_hits)
            alignment_rest |= sub_hits1_rest
    else:
        mr, mrr, hits, alignment_rest = calculate_rank(list(range(num)),
                                                       sim_mat, top_k,
                                                       accurate, num)
    assert len(alignment_rest) == num
    hits = np.array(hits) / num * 100
    for i in range(len(hits)):
        hits[i] = round(hits[i], 3)
    cost = time.time() - t
    if accurate:
        if csls_k > 0:
            print(
                "accurate results with csls: csls={}, hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                .format(csls_k, top_k, hits, mr, mrr, cost))
        else:
            print(
                "accurate results: hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                .format(top_k, hits, mr, mrr, cost))
    else:
        if csls_k > 0:
            print(
                "quick results with csls: csls={}, hits@{} = {}%, time = {:.3f} s "
                .format(csls_k, top_k, hits, cost))
        else:
            print("quick results: hits@{} = {}%, time = {:.3f} s ".format(
                top_k, hits, cost))
    hits1 = hits[0]
    del sim_mat
    gc.collect()
    return alignment_rest, hits1, mr, mrr
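The ranking itself is delegated to calculate_rank, which is not shown here. As a rough, self-contained illustration of the reported metrics (toy embeddings, plain NumPy, and the diagonal ground-truth convention suggested by the suffix check in Example 2; this is not the library's implementation):

import numpy as np

def toy_rank_metrics(sim_mat, top_k=(1, 5, 10)):
    # Hits@k, MR and MRR on a similarity matrix, assuming row i's gold match
    # is column i.
    num = sim_mat.shape[0]
    hits = np.zeros(len(top_k))
    mr, mrr = 0.0, 0.0
    for i in range(num):
        order = np.argsort(-sim_mat[i, :])          # candidates, best first
        rank = int(np.where(order == i)[0][0])      # 0-based rank of the gold column
        mr += rank + 1
        mrr += 1.0 / (rank + 1)
        for idx, k in enumerate(top_k):
            if rank < k:
                hits[idx] += 1
    return hits / num * 100, mr / num, mrr / num

rng = np.random.default_rng(0)
embed1 = rng.normal(size=(50, 16))
embed2 = embed1 + 0.1 * rng.normal(size=(50, 16))   # noisy copy, so the diagonal is the gold alignment
sim_mat = embed1 @ embed2.T                          # 'inner' metric, no normalization, no CSLS
hits, mr, mrr = toy_rank_metrics(sim_mat)
print("hits@(1, 5, 10) = {}%, mr = {:.3f}, mrr = {:.6f}".format(hits.round(3), mr, mrr))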