# Example #1 (score: 0)
def generate_neighbours(entity_embeds1, entity_list1, entity_embeds2, entity_list2, neighbors_num, threads_num=4):
    """Rank, for every entity in list 1, its nearest candidates from list 2.

    The first entity list is split into ``threads_num`` fragments and each
    fragment is matched against the full second list; the per-fragment
    result dictionaries are merged into a single mapping. Despite the
    parameter name, fragments are processed sequentially in this process.
    """
    fragments = task_divide(np.array(entity_list1), threads_num)
    index_fragments = task_divide(np.array(range(len(entity_list1))), threads_num)
    neighbours = dict()
    for frag, idx in zip(fragments, index_fragments):
        partial = find_neighbours(frag, entity_embeds1[idx, :], np.array(entity_list2),
                                  entity_embeds2, neighbors_num)
        neighbours = merge_dic(neighbours, partial)
    return neighbours
# Example #2 (score: 0)
def generate_neighbours_single_thread(entity_embeds, entity_list, neighbors_num, threads_num):
    """Build the nearest-neighbour dictionary for all entities, one fragment at a time.

    ``threads_num`` only controls how many fragments ``task_divide``
    produces; the fragments are processed sequentially here, each one
    ranked against the full entity list and merged into the result.
    """
    fragments = task_divide(np.array(entity_list), threads_num)
    index_fragments = task_divide(np.array(range(len(entity_list))), threads_num)
    merged = dict()
    for frag, idx in zip(fragments, index_fragments):
        partial = find_neighbours(frag, np.array(entity_list),
                                  entity_embeds[idx, :],
                                  entity_embeds, neighbors_num)
        merged = merge_dic(merged, partial)
    return merged
# Example #3 (score: 0)
def generate_training_data(kgs: KGs, threshold=1.0):
    """Build (attribute, context_attribute) pairs for attribute-correlation training.

    For every entity, its attribute set (unioned with the attributes of its
    aligned training counterpart, if any) is intersected with the popular
    attributes selected by ``threshold``, and every unordered pair of the
    surviving attributes becomes one training sample.

    :param kgs: the two knowledge graphs plus their training alignment links.
    :param threshold: popularity threshold forwarded to
        ``get_kgs_popular_attributes``.
    :return: list of (attr, context_attr) tuples.
    """
    kg1_selected_attributes, kg2_selected_attributes, selected_attributes = get_kgs_popular_attributes(
        kgs, threshold)
    entity_attributes_dict = merge_dic(kgs.kg1.entity_attributes_dict,
                                       kgs.kg2.entity_attributes_dict)
    print("entity attribute dict", len(entity_attributes_dict))
    training_data_list = list()
    # Bidirectional lookup: entity -> its aligned counterpart in the other KG.
    training_links_dict12 = dict(zip(kgs.train_entities1, kgs.train_entities2))
    training_links_dict21 = dict(zip(kgs.train_entities2, kgs.train_entities1))
    training_links_dict = merge_dic(training_links_dict12,
                                    training_links_dict21)
    for ent, attributes in entity_attributes_dict.items():
        if ent in training_links_dict:
            # Aligned entities share attribute context across the two KGs.
            attributes = attributes | entity_attributes_dict.get(
                training_links_dict[ent], set())
        attributes = attributes & selected_attributes
        # combinations() never pairs an element with itself, so the former
        # `attr != context_attr` guard was always true and is dropped.
        training_data_list.extend(itertools.combinations(attributes, 2))
    print("training data of attribute correlations", len(training_data_list))
    return training_data_list
# Example #4 (score: 0)
def get_ent_embeds_from_attributes(kgs: KGs, attr_embeds, selected_attributes):
    """Derive one embedding per entity by averaging its attribute embeddings.

    Each entity id in ``range(kgs.entities_num)`` gets the mean of the
    embeddings of its selected attributes; entities with no selected
    attribute fall back to a zero vector. The stacked matrix is
    L2-normalized row-wise before being returned.

    :param kgs: knowledge graphs providing per-entity attribute sets.
    :param attr_embeds: (num_attributes, dim) embedding matrix indexed by attribute id.
    :param selected_attributes: set of attribute ids to keep.
    :return: normalized (entities_num, dim) entity embedding matrix.
    """
    print("get entity embeddings from attributes")
    start = time.time()
    entity_attributes_dict = merge_dic(kgs.kg1.entity_attributes_dict,
                                       kgs.kg2.entity_attributes_dict)
    zero_vec = np.zeros([1, attr_embeds.shape[1]], dtype=np.float32)
    rows = []
    for i in range(kgs.entities_num):
        attributes = entity_attributes_dict.get(i, set()) & selected_attributes
        if attributes:
            attr_vecs = attr_embeds[list(attributes), ]
            rows.append(np.mean(attr_vecs, axis=0, keepdims=True))
        else:
            rows.append(zero_vec)
    # Stack once at the end: the previous np.row_stack inside the loop copied
    # the whole accumulated matrix every iteration (O(n^2) in entities_num),
    # and np.row_stack is deprecated in favor of np.vstack anyway.
    ent_mat = np.vstack(rows)
    print('cost time: {:.4f}s'.format(time.time() - start))
    return preprocessing.normalize(ent_mat)
# Example #5 (score: 0)
def generate_neighbours(entity_embeds, entity_list, neighbors_num, threads_num):
    """Find each entity's nearest neighbours in parallel worker processes.

    The entity list is split into fragments; every fragment is ranked
    against the full embedding matrix in its own process, and the
    per-fragment dictionaries are merged into a single result.
    """
    fragments = task_divide(np.array(entity_list), threads_num)
    index_fragments = task_divide(np.array(range(len(entity_list))), threads_num)
    all_entities = np.array(entity_list)

    pool = multiprocessing.Pool(processes=len(fragments))
    async_results = [
        pool.apply_async(find_neighbours,
                         args=(frag, all_entities,
                               entity_embeds[idx, :],
                               entity_embeds, neighbors_num))
        for frag, idx in zip(fragments, index_fragments)
    ]
    pool.close()
    pool.join()

    neighbours = dict()
    for res in async_results:
        neighbours = merge_dic(neighbours, res.get())

    # Drop the worker handles before returning; the results can be large.
    del async_results
    gc.collect()
    return neighbours
# Example #6 (score: 0)
def load_attr(ent_num, kgs):
    """Build a one-hot attribute feature matrix over both KGs' entities.

    Attribute values are ranked by global frequency and only the most
    frequent 70% receive a column; every entity row gets 1.0 in the columns
    of the attributes it carries.

    :param ent_num: number of rows (total entity count, ids index the matrix).
    :param kgs: knowledge graphs providing per-entity attribute sets.
    :return: float32 matrix of shape (ent_num, int(0.7 * num_attributes)).
    """
    from collections import Counter

    entity_attributes_dict = merge_dic(kgs.kg1.entity_attributes_dict, kgs.kg2.entity_attributes_dict)
    # Counter replaces the hand-rolled dict counting; most_common() is a
    # stable sort by count, so tie order matches the previous sorted() call.
    cnt = Counter(v for vs in entity_attributes_dict.values() for v in vs)
    fre = cnt.most_common()
    print(fre)
    # Keep only the most frequent 70% of attribute values.
    num = int(0.7 * len(cnt))
    attr2id = {value: i for i, (value, _) in enumerate(fre[:num])}
    attr = np.zeros((ent_num, num), dtype=np.float32)
    for ent, vs in entity_attributes_dict.items():
        for v in vs:
            if v in attr2id:
                attr[ent][attr2id[v]] = 1.0
    return attr
# Example #7 (score: 0)
def _rank_candidates(sim_mat, nums_threads, row_prefix, col_prefix):
    """Arg-sort every row of ``sim_mat`` in parallel worker processes.

    Returns the merged {prefixed row id: ranked prefixed column ids} dict
    produced by ``arg_sort`` over all row fragments.
    """
    num = sim_mat.shape[0]
    tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(tasks))
    rests = list()
    total = 0
    for task in tasks:
        total += len(task)
        rests.append(pool.apply_async(arg_sort, (task, sim_mat[task, :], row_prefix, col_prefix)))
    # Sanity check: the fragments must cover every row exactly once.
    assert total == num
    pool.close()
    pool.join()
    candidates = dict()
    for rest in rests:
        candidates = merge_dic(candidates, rest.get())
    return candidates


def stable_alignment(embed1,
                     embed2,
                     metric,
                     normalize,
                     csls_k,
                     nums_threads,
                     cut=100,
                     sim_mat=None):
    """Evaluate alignment via Gale-Shapley stable matching on the similarity matrix.

    Both sides' preference lists are built by arg-sorting the similarity
    matrix (row-wise and column-wise) in parallel, truncated to ``cut``
    candidates, and fed to ``galeshapley``. Precision is printed, counting a
    match as correct when the numeric suffixes of the two ids agree.

    :param sim_mat: optional precomputed similarity matrix; computed from
        the embeddings with ``sim(...)`` when None.
    """
    t = time.time()
    if sim_mat is None:
        sim_mat = sim(embed1,
                      embed2,
                      metric=metric,
                      normalize=normalize,
                      csls_k=csls_k)

    # The two directions run the same parallel arg-sort, once on the matrix
    # and once on its transpose (previously duplicated inline).
    kg1_candidates = _rank_candidates(sim_mat, nums_threads, 'x_', 'y_')
    kg2_candidates = _rank_candidates(sim_mat.T, nums_threads, 'y_', 'x_')

    print(
        "generating candidate lists costs time {:.3f} s ".format(time.time() -
                                                                 t))
    t = time.time()
    matching = galeshapley(kg1_candidates, kg2_candidates, cut)
    n = 0
    for i, j in matching.items():
        if int(i.split('_')[-1]) == int(j.split('_')[-1]):
            n += 1
    cost = time.time() - t
    if matching:
        print("stable alignment precision = {:.3f}%, time = {:.3f} s ".format(
            n / len(matching) * 100, cost))
    else:
        # Guard against ZeroDivisionError when no pair is matched.
        print("stable alignment produced no matching, time = {:.3f} s ".format(cost))