Ejemplo n.º 1
0
def sim_multi_blocks(embeds1, embeds2, blocks_num=16):
    """Compute ``embeds1 @ embeds2.T`` one row block at a time.

    The row indices of ``embeds1`` are split with ``task_divide`` into
    (up to) ``blocks_num`` groups; each group is multiplied against
    ``embeds2.T`` separately and the partial products are stacked along
    axis 0 in the order ``task_divide`` returned them.

    Parameters
    ----------
    embeds1 : ndarray
        Left-hand matrix; indexed by rows.
    embeds2 : ndarray
        Right-hand matrix; its transpose is used in the product.
    blocks_num : int
        Number of groups requested from ``task_divide``.

    Returns
    -------
    ndarray
        The stacked block products.
    """
    row_count = embeds1.shape[0]
    row_blocks = task_divide(np.array(range(row_count)), blocks_num)
    partial_products = [
        np.matmul(embeds1[rows, :], embeds2.T) for rows in row_blocks
    ]
    return np.concatenate(partial_products, axis=0)
Ejemplo n.º 2
0
def sim_multi_threads(embeds1, embeds2, threads_num=16):
    """Compute ``embeds1 @ embeds2.T`` with the row blocks spread over a pool.

    The row indices of ``embeds1`` are split with ``task_divide`` into
    (up to) ``threads_num`` groups. Each group is submitted to a
    ``multiprocessing.Pool`` (worker processes, despite the function name)
    as an ``np.dot`` job, and the partial products are stacked along axis 0
    in submission order.

    Parameters
    ----------
    embeds1 : ndarray
        Left-hand matrix; indexed by rows.
    embeds2 : ndarray
        Right-hand matrix; its transpose is used in the product.
    threads_num : int
        Number of groups requested from ``task_divide``; one worker process
        is started per returned group.

    Returns
    -------
    ndarray
        The stacked block products.
    """
    num = embeds1.shape[0]
    idx_list = task_divide(np.array(range(num)), threads_num)
    pool = multiprocessing.Pool(processes=len(idx_list))
    try:
        pending = [
            pool.apply_async(np.dot, (embeds1[idx, :], embeds2.T))
            for idx in idx_list
        ]
        # get() blocks until the job finishes and re-raises worker errors.
        sim_list = [job.get() for job in pending]
    finally:
        # Always release the worker processes; the pool was previously left
        # open, leaking one set of workers per call.
        pool.close()
        pool.join()
    return np.concatenate(sim_list, axis=0)
Ejemplo n.º 3
0
def generate_neighbours(entity_embeds, entity_list, neighbors_num, threads_num):
    """Run ``find_neighbours`` over fragments of ``entity_list`` in a process pool.

    ``entity_list`` and its positional indices are both split with
    ``task_divide``; fragment ``i`` of the entities is paired with the rows
    of ``entity_embeds`` selected by fragment ``i`` of the indices. The
    per-fragment results are folded together with ``merge_dic``.

    Parameters
    ----------
    entity_embeds : ndarray
        Embedding matrix; indexed by the positions of ``entity_list``.
    entity_list : sequence
        Entities to process (converted to an ndarray).
    neighbors_num : int
        Forwarded unchanged to ``find_neighbours``.
    threads_num : int
        Number of fragments requested from ``task_divide``; one worker
        process is started per returned fragment.

    Returns
    -------
    dict
        The result of merging every worker's output with ``merge_dic``.
    """
    entity_list = np.array(entity_list)
    ent_frags = task_divide(entity_list, threads_num)
    frag_rows = task_divide(np.array(range(len(entity_list))), threads_num)

    workers = multiprocessing.Pool(processes=len(ent_frags))
    pending = [
        workers.apply_async(
            find_neighbours,
            args=(frag, entity_list, entity_embeds[frag_rows[pos], :],
                  entity_embeds, neighbors_num))
        for pos, frag in enumerate(ent_frags)
    ]
    workers.close()
    workers.join()

    merged = dict()
    for job in pending:
        merged = merge_dic(merged, job.get())

    # Drop the async handles (and the results they hold) before returning.
    del pending
    gc.collect()
    return merged
Ejemplo n.º 4
0
def csls_sim_multi_threads(sim_mat, k, nums_threads):
    """Apply ``calculate_nearest_k`` to row chunks of ``sim_mat`` in a process pool.

    Row indices are split with ``task_divide``; each chunk of rows is sent
    to a worker together with ``k``. The per-chunk outputs are joined in
    submission order with ``np.append`` (which flattens its inputs when
    more than one chunk is present).

    Parameters
    ----------
    sim_mat : ndarray
        Similarity matrix; processed by rows.
    k : int
        Forwarded unchanged to ``calculate_nearest_k``.
    nums_threads : int
        Number of chunks requested from ``task_divide``; one worker process
        is started per returned chunk.

    Returns
    -------
    ndarray
        Joined worker outputs; asserted to have one entry per row of
        ``sim_mat``.
    """
    row_chunks = task_divide(np.array(range(sim_mat.shape[0])), nums_threads)
    workers = multiprocessing.Pool(processes=len(row_chunks))
    pending = [
        workers.apply_async(calculate_nearest_k, (sim_mat[chunk, :], k))
        for chunk in row_chunks
    ]
    workers.close()
    workers.join()

    sim_values = None
    for job in pending:
        part = job.get()
        sim_values = part if sim_values is None else np.append(sim_values, part)
    assert sim_values.shape[0] == sim_mat.shape[0]
    return sim_values
    def run(self):
        """Run the whole training schedule, then save the model and test it.

        Phase 1 iterates epochs ``1..args.max_epoch``. Each epoch trains the
        relation view and the attribute view, each followed by its cross-KG
        entity-inference step and, once the epoch exceeds
        ``args.start_predicate_soft_alignment``, its cross-KG predicate
        inference step. On evaluation epochs it validates, checks early
        stopping and refreshes the predicate alignment. When
        ``args.neg_sampling == 'truncated'`` the neighbour dictionaries are
        regenerated every ``args.truncated_freq`` epochs.

        Phase 2 iterates ``1..args.shared_learning_max_epoch`` epochs of
        ``train_shared_space_mapping_1epo`` with periodic validation of the
        ``'final'`` embeddings. Afterwards the model is saved and every
        embedding choice is tested.
        """
        # NOTE(review): `t` is not read again within this method.
        t = time.time()
        # Number of batch steps per epoch = ceil(total local triples / batch size).
        relation_triples_num = self.kgs.kg1.local_relation_triples_num + self.kgs.kg2.local_relation_triples_num
        attribute_triples_num = self.kgs.kg1.local_attribute_triples_num + self.kgs.kg2.local_attribute_triples_num
        relation_triple_steps = int(math.ceil(relation_triples_num / self.args.batch_size))
        attribute_triple_steps = int(math.ceil(attribute_triples_num / self.args.batch_size))
        # Step indices grouped by task_divide; presumably one group per
        # batch-generating worker -- confirm against the *_1epo methods.
        relation_step_tasks = task_divide(list(range(relation_triple_steps)), self.args.batch_threads_num)
        attribute_step_tasks = task_divide(list(range(attribute_triple_steps)), self.args.batch_threads_num)
        # Manager-backed queues handed to the per-epoch view-training methods.
        manager = mp.Manager()
        relation_batch_queue = manager.Queue()
        attribute_batch_queue = manager.Queue()
        # Supervision triples of both KGs, concatenated once up front.
        cross_kg_relation_triples = self.kgs.kg1.sup_relation_triples_list + self.kgs.kg2.sup_relation_triples_list
        cross_kg_entity_inference_in_attribute_triples = self.kgs.kg1.sup_attribute_triples_list + \
                                                         self.kgs.kg2.sup_attribute_triples_list
        # Predicate-alignment triples; rebuilt below after each alignment update.
        cross_kg_relation_inference = self.predicate_align_model.sup_relation_alignment_triples1 + \
                                      self.predicate_align_model.sup_relation_alignment_triples2
        cross_kg_attribute_inference = self.predicate_align_model.sup_attribute_alignment_triples1 + \
                                       self.predicate_align_model.sup_attribute_alignment_triples2
        # Stay None unless the truncated negative-sampling branch below fills them.
        neighbors1, neighbors2 = None, None

        entity_list = self.kgs.kg1.entities_list + self.kgs.kg2.entities_list

        # Validate once before any training epoch.
        valid(self, embed_choice='nv')
        valid(self, embed_choice='avg')
        for i in range(1, self.args.max_epoch + 1):
            print('epoch {}:'.format(i))
            # Relation view, then its cross-KG inference steps.
            self.train_relation_view_1epo(i, relation_triple_steps, relation_step_tasks,
                                          relation_batch_queue, neighbors1, neighbors2)
            self.train_cross_kg_entity_inference_relation_view_1epo(i, cross_kg_relation_triples)
            if i > self.args.start_predicate_soft_alignment:
                self.train_cross_kg_relation_inference_1epo(i, cross_kg_relation_inference)

            # Attribute view, then its cross-KG inference steps.
            self.train_attribute_view_1epo(i, attribute_triple_steps, attribute_step_tasks, attribute_batch_queue,
                                           neighbors1, neighbors2)
            self.train_cross_kg_entity_inference_attribute_view_1epo(i, cross_kg_entity_inference_in_attribute_triples)
            if i > self.args.start_predicate_soft_alignment:
                self.train_cross_kg_attribute_inference_1epo(i, cross_kg_attribute_inference)

            # Evaluation epoch: validate each view, then update early-stop state.
            if i >= self.args.start_valid and i % self.args.eval_freq == 0:
                valid(self, embed_choice='rv')
                valid(self, embed_choice='av')
                valid(self, embed_choice='avg')
                flag = valid_WVA(self)
                self.flag1, self.flag2, self.early_stop = eva.early_stop(self.flag1, self.flag2, flag)

                # Leaving here also skips the alignment refresh and the
                # neighbour regeneration for this epoch.
                if self.early_stop or i == self.args.max_epoch:
                    break

                # The predicate alignment is refreshed only on evaluation
                # epochs. Note the `>=` here versus the strict `>` guarding
                # the inference training steps above.
                if i >= self.args.start_predicate_soft_alignment:
                    self.predicate_align_model.update_predicate_alignment(self.rel_embeds.eval(session=self.session))
                    self.predicate_align_model.update_predicate_alignment(self.attr_embeds.eval(session=self.session),
                                                                          predicate_type='attribute')
                    cross_kg_relation_inference = self.predicate_align_model.sup_relation_alignment_triples1 + \
                                                  self.predicate_align_model.sup_relation_alignment_triples2
                    cross_kg_attribute_inference = self.predicate_align_model.sup_attribute_alignment_triples1 + \
                                                   self.predicate_align_model.sup_attribute_alignment_triples2

            # Truncated negative sampling: rebuild the neighbour dicts that
            # are passed to the view-training methods in later epochs.
            if self.args.neg_sampling == 'truncated' and i % self.args.truncated_freq == 0:
                t1 = time.time()
                assert 0.0 < self.args.truncated_epsilon < 1.0
                # Neighbour count per KG = (1 - epsilon) * entity count, truncated to int.
                neighbors_num1 = int((1 - self.args.truncated_epsilon) * self.kgs.kg1.entities_num)
                neighbors_num2 = int((1 - self.args.truncated_epsilon) * self.kgs.kg2.entities_num)
                neighbors1 = bat.generate_neighbours(self.eval_kg1_useful_ent_embeddings(),
                                                     self.kgs.useful_entities_list1,
                                                     neighbors_num1, self.args.batch_threads_num)
                neighbors2 = bat.generate_neighbours(self.eval_kg2_useful_ent_embeddings(),
                                                     self.kgs.useful_entities_list2,
                                                     neighbors_num2, self.args.batch_threads_num)
                # NOTE(review): the count reported below covers all entities of
                # both KGs, while the neighbours above were generated for the
                # "useful" entity lists.
                ent_num = len(self.kgs.kg1.entities_list) + len(self.kgs.kg2.entities_list)
                print('neighbor dict:', len(neighbors1), type(neighbors2))
                print("generating neighbors of {} entities costs {:.3f} s.".format(ent_num, time.time() - t1))
        # Phase 2: shared-space mapping over the entities of both KGs.
        for i in range(1, self.args.shared_learning_max_epoch + 1):
            self.train_shared_space_mapping_1epo(i, entity_list)
            if i >= self.args.start_valid and i % self.args.eval_freq == 0:
                valid(self, embed_choice='final')
        # Persist the model, then test every embedding choice.
        self.save()
        test(self, embed_choice='nv')
        test(self, embed_choice='rv')
        test(self, embed_choice='av')
        test(self, embed_choice='avg')
        test_WVA(self)
        test(self, embed_choice='final')
Ejemplo n.º 6
0
def _collect_candidates(sim_mat, nums_threads, row_prefix, col_prefix):
    """Run ``arg_sort`` over row chunks of ``sim_mat`` in a pool and merge the results."""
    num = sim_mat.shape[0]
    tasks = task_divide(np.array(range(num)), nums_threads)
    # Sanity check: the chunks must cover every row exactly once in total.
    assert sum(len(task) for task in tasks) == num
    pool = multiprocessing.Pool(processes=len(tasks))
    try:
        rests = [
            pool.apply_async(arg_sort,
                             (task, sim_mat[task, :], row_prefix, col_prefix))
            for task in tasks
        ]
    finally:
        # Release the workers even if a submission fails part-way.
        pool.close()
        pool.join()
    candidates = dict()
    for rest in rests:
        candidates = merge_dic(candidates, rest.get())
    return candidates


def stable_alignment(embed1,
                     embed2,
                     metric,
                     normalize,
                     csls_k,
                     nums_threads,
                     cut=100,
                     sim_mat=None):
    """Match the two embedding sets with ``galeshapley`` and print the precision.

    Candidate lists are built for both directions (rows of the similarity
    matrix, then rows of its transpose) with ``arg_sort`` running in a
    process pool, and handed to ``galeshapley``. A matched pair counts as
    correct when the integer suffix after the last ``'_'`` is equal on both
    sides of the pair.

    Parameters
    ----------
    embed1, embed2 : matrix_like
        Embedding matrices; only used when ``sim_mat`` is None.
    metric, normalize, csls_k
        Forwarded unchanged to ``sim`` when ``sim_mat`` is None.
    nums_threads : int
        Number of chunks requested from ``task_divide`` per direction.
    cut : int
        Forwarded unchanged to ``galeshapley``.
    sim_mat : ndarray, optional
        Precomputed similarity matrix; computed with ``sim`` when omitted.

    Returns
    -------
    None
        Timing and precision are printed, not returned.
    """
    t = time.time()
    if sim_mat is None:
        sim_mat = sim(embed1,
                      embed2,
                      metric=metric,
                      normalize=normalize,
                      csls_k=csls_k)

    kg1_candidates = _collect_candidates(sim_mat, nums_threads, 'x_', 'y_')
    kg2_candidates = _collect_candidates(sim_mat.T, nums_threads, 'y_', 'x_')

    print(
        "generating candidate lists costs time {:.3f} s ".format(time.time() -
                                                                 t))
    t = time.time()
    matching = galeshapley(kg1_candidates, kg2_candidates, cut)
    n = sum(1 for i, j in matching.items()
            if int(i.split('_')[-1]) == int(j.split('_')[-1]))
    cost = time.time() - t
    # An empty matching used to raise ZeroDivisionError; report 0% instead.
    precision = n / len(matching) * 100 if matching else 0.0
    print("stable alignment precision = {:.3f}%, time = {:.3f} s ".format(
        precision, cost))
Ejemplo n.º 7
0
def greedy_alignment(embed1, embed2, top_k, nums_threads, metric, normalize,
                     csls_k, accurate):
    """
    Search alignment with greedy strategy.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    top_k : list of integers
        Hits@k metrics for evaluating results.
    nums_threads : int
        The number of threads used to search alignment. Values above 1 run
        ``calculate_rank`` on row chunks in a ``multiprocessing.Pool``.
    metric : string
        The distance metric to use. It can be 'cosine', 'euclidean' or 'inner'.
    normalize : bool, true or false.
        Whether to normalize the input embeddings.
    csls_k : int
        K value for csls. If k > 0, enhance the similarity by csls.
    accurate : bool
        Forwarded to ``calculate_rank``; also selects whether mr and mrr
        are included in the printed summary.

    Returns
    -------
    alignment_rest :  list, pairs of aligned entities
    hits1 : float, hits@1 values for alignment results
        (the first entry of the hits array, as a percentage rounded to 3 places)
    mr : float, MR values for alignment results
    mrr : float, MRR values for alignment results
    """
    t = time.time()
    sim_mat = sim(embed1,
                  embed2,
                  metric=metric,
                  normalize=normalize,
                  csls_k=csls_k)
    num = sim_mat.shape[0]
    if nums_threads > 1:
        # Explicit NumPy accumulator: element-wise addition no longer relies
        # on how `list += ndarray` happens to be dispatched.
        hits = np.zeros(len(top_k))
        mr, mrr = 0, 0
        alignment_rest = set()
        search_tasks = task_divide(np.array(range(num)), nums_threads)
        pool = multiprocessing.Pool(processes=len(search_tasks))
        try:
            rests = [
                pool.apply_async(calculate_rank,
                                 (task, sim_mat[task, :], top_k, accurate, num))
                for task in search_tasks
            ]
        finally:
            # Release the workers even if a submission fails part-way.
            pool.close()
            pool.join()
        for rest in rests:
            sub_mr, sub_mrr, sub_hits, sub_hits1_rest = rest.get()
            mr += sub_mr
            mrr += sub_mrr
            hits += np.array(sub_hits)
            alignment_rest |= sub_hits1_rest
    else:
        mr, mrr, hits, alignment_rest = calculate_rank(list(range(num)),
                                                       sim_mat, top_k,
                                                       accurate, num)
    assert len(alignment_rest) == num
    # Convert hit counts to percentages, rounded to three decimals.
    hits = np.array(hits) / num * 100
    for i in range(len(hits)):
        hits[i] = round(hits[i], 3)
    cost = time.time() - t
    if accurate:
        if csls_k > 0:
            print(
                "accurate results with csls: csls={}, hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                .format(csls_k, top_k, hits, mr, mrr, cost))
        else:
            print(
                "accurate results: hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
                .format(top_k, hits, mr, mrr, cost))
    else:
        if csls_k > 0:
            print(
                "quick results with csls: csls={}, hits@{} = {}%, time = {:.3f} s "
                .format(csls_k, top_k, hits, cost))
        else:
            print("quick results: hits@{} = {}%, time = {:.3f} s ".format(
                top_k, hits, cost))
    hits1 = hits[0]
    # Free the (potentially large) similarity matrix before returning.
    del sim_mat
    gc.collect()
    return alignment_rest, hits1, mr, mrr