def sim_multi_blocks(embeds1, embeds2, blocks_num=16):
    """Compute ``embeds1 @ embeds2.T`` one block of rows at a time.

    The row indices of ``embeds1`` are handed to ``task_divide`` together with
    ``blocks_num``; each returned index group is multiplied against
    ``embeds2.T`` separately and the partial products are stacked along
    axis 0 in the order ``task_divide`` returned them.

    Parameters
    ----------
    embeds1 : array_like
        Left-hand matrix; indexed as ``embeds1[idx, :]``.
    embeds2 : array_like
        Right-hand matrix; its transpose is used in every product.
    blocks_num : int
        Passed to ``task_divide`` (presumably the number of row blocks --
        confirm against that helper).

    Returns
    -------
    sim_mat : ndarray, the concatenated block products.
    """
    row_total = embeds1.shape[0]
    row_blocks = task_divide(np.array(range(row_total)), blocks_num)
    block_products = [np.matmul(embeds1[rows, :], embeds2.T) for rows in row_blocks]
    return np.concatenate(block_products, axis=0)
def sim_multi_threads(embeds1, embeds2, threads_num=16):
    """Compute ``np.dot(embeds1, embeds2.T)`` with a pool of worker processes.

    The row indices of ``embeds1`` are split by ``task_divide``; one
    ``np.dot`` job per index group is submitted to a ``multiprocessing.Pool``
    and the partial results are concatenated along axis 0 in submission order.

    Parameters
    ----------
    embeds1 : array_like
        Left-hand matrix; indexed as ``embeds1[idx, :]``.
    embeds2 : array_like
        Right-hand matrix; its transpose is sent to every worker.
    threads_num : int
        Passed to ``task_divide``; the pool gets one process per returned
        index group.

    Returns
    -------
    sim_mat : ndarray, the concatenated partial products.
    """
    num = embeds1.shape[0]
    idx_list = task_divide(np.array(range(num)), threads_num)
    pool = multiprocessing.Pool(processes=len(idx_list))
    try:
        rests = [pool.apply_async(np.dot, (embeds1[idx, :], embeds2.T)) for idx in idx_list]
        sim_list = [res.get() for res in rests]
    finally:
        # Always release the worker processes, even if a job raised;
        # previously the pool was never closed or joined and leaked workers.
        pool.close()
        pool.join()
    sim_mat = np.concatenate(sim_list, axis=0)
    return sim_mat
def generate_neighbours(entity_embeds, entity_list, neighbors_num, threads_num):
    """Build a neighbour dictionary for ``entity_list`` using worker processes.

    ``entity_list`` and its positional indices are both split with
    ``task_divide``; for each fragment a ``find_neighbours`` job is submitted
    with the fragment, the full entity array, the fragment's embedding rows,
    the full embedding matrix and ``neighbors_num``. The per-fragment results
    are folded together with ``merge_dic``.

    Parameters
    ----------
    entity_embeds : array_like
        Embedding matrix, indexed by row as ``entity_embeds[indexes, :]``.
    entity_list : sequence
        Entities; converted to a NumPy array before splitting.
    neighbors_num : int
        Forwarded unchanged to ``find_neighbours``.
    threads_num : int
        Passed to ``task_divide``; the pool gets one process per fragment.

    Returns
    -------
    dic : the result of merging every worker's output with ``merge_dic``.
    """
    entity_list = np.array(entity_list)
    fragments = task_divide(entity_list, threads_num)
    fragment_rows = task_divide(np.array(range(len(entity_list))), threads_num)
    pool = multiprocessing.Pool(processes=len(fragments))
    pending = []
    for pos, fragment in enumerate(fragments):
        job_args = (fragment, entity_list, entity_embeds[fragment_rows[pos], :],
                    entity_embeds, neighbors_num)
        pending.append(pool.apply_async(find_neighbours, args=job_args))
    pool.close()
    pool.join()
    merged = dict()
    for job in pending:
        merged = merge_dic(merged, job.get())
    # Drop the async results and force a collection before returning.
    del pending
    gc.collect()
    return merged
def csls_sim_multi_threads(sim_mat, k, nums_threads):
    """Run ``calculate_nearest_k`` over row groups of ``sim_mat`` in parallel.

    Row indices are split by ``task_divide``; each group's rows are sent with
    ``k`` to ``calculate_nearest_k`` in a worker process. The first result is
    kept as-is and every later result is joined to it with ``np.append``
    (which flattens its inputs when no axis is given).

    Parameters
    ----------
    sim_mat : ndarray
        Similarity matrix, indexed by row as ``sim_mat[task, :]``.
    k : int
        Forwarded unchanged to ``calculate_nearest_k``.
    nums_threads : int
        Passed to ``task_divide``; the pool gets one process per row group.

    Returns
    -------
    sim_values : ndarray whose first dimension equals ``sim_mat.shape[0]``
        (enforced by an assertion).
    """
    row_groups = task_divide(np.array(range(sim_mat.shape[0])), nums_threads)
    pool = multiprocessing.Pool(processes=len(row_groups))
    jobs = [pool.apply_async(calculate_nearest_k, (sim_mat[rows, :], k)) for rows in row_groups]
    pool.close()
    pool.join()
    sim_values = None
    for job in jobs:
        part = job.get()
        sim_values = part if sim_values is None else np.append(sim_values, part)
    assert sim_values.shape[0] == sim_mat.shape[0]
    return sim_values
# run: top-level training / evaluation driver.
#
# Visible sequence of operations, in order:
#   1. Sums the local relation / attribute triple counts of kg1 and kg2, turns each
#      into a step count with ceil(count / args.batch_size), and splits the step
#      ranges with task_divide(..., args.batch_threads_num).
#   2. Creates an mp.Manager() and one manager.Queue() per view (relation, attribute).
#   3. Concatenates the kg1 + kg2 supervised relation / attribute triple lists and the
#      predicate_align_model's supervised relation / attribute alignment triples.
#   4. Calls valid(self, ...) for 'nv' and 'avg' before any training.
#   5. Main loop, i = 1 .. args.max_epoch: trains the relation view and its cross-KG
#      entity inference, then the attribute view and its cross-KG entity inference;
#      the cross-KG relation / attribute inference steps are guarded by
#      i > args.start_predicate_soft_alignment. When i >= args.start_valid and
#      i % args.eval_freq == 0 it validates 'rv', 'av', 'avg', calls valid_WVA and
#      feeds the flag to eva.early_stop. With i >= args.start_predicate_soft_alignment
#      the predicate alignment is updated from rel_embeds / attr_embeds and the two
#      cross-KG inference lists are rebuilt. With args.neg_sampling == 'truncated' and
#      i % args.truncated_freq == 0, neighbors1 / neighbors2 are regenerated through
#      bat.generate_neighbours using int((1 - truncated_epsilon) * entities_num).
#   6. Second loop, i = 1 .. args.shared_learning_max_epoch: calls
#      train_shared_space_mapping_1epo and validates 'final' on the same
#      start_valid / eval_freq schedule.
#   7. Calls self.save(), then test(...) for 'nv', 'rv', 'av', 'avg', test_WVA(self)
#      and test(..., 'final').
#
# NOTE(review): the inference training steps use a strict `i > start_predicate_soft_alignment`
# while the alignment update uses `i >= ...` -- presumably intentional so the first update
# precedes the first inference epoch; confirm.
# NOTE(review): `t` is assigned at the top but never read in the visible code.
# NOTE(review): this definition is collapsed onto three physical lines, so block nesting
# (e.g. whether the `break` check sits inside the validation branch) cannot be read off
# the layout -- verify against the properly indented original before relying on step 5.
def run(self): t = time.time() relation_triples_num = self.kgs.kg1.local_relation_triples_num + self.kgs.kg2.local_relation_triples_num attribute_triples_num = self.kgs.kg1.local_attribute_triples_num + self.kgs.kg2.local_attribute_triples_num relation_triple_steps = int(math.ceil(relation_triples_num / self.args.batch_size)) attribute_triple_steps = int(math.ceil(attribute_triples_num / self.args.batch_size)) relation_step_tasks = task_divide(list(range(relation_triple_steps)), self.args.batch_threads_num) attribute_step_tasks = task_divide(list(range(attribute_triple_steps)), self.args.batch_threads_num) manager = mp.Manager() relation_batch_queue = manager.Queue() attribute_batch_queue = manager.Queue() cross_kg_relation_triples = self.kgs.kg1.sup_relation_triples_list + self.kgs.kg2.sup_relation_triples_list cross_kg_entity_inference_in_attribute_triples = self.kgs.kg1.sup_attribute_triples_list + \ self.kgs.kg2.sup_attribute_triples_list cross_kg_relation_inference = self.predicate_align_model.sup_relation_alignment_triples1 + \ self.predicate_align_model.sup_relation_alignment_triples2 cross_kg_attribute_inference = self.predicate_align_model.sup_attribute_alignment_triples1 + \ self.predicate_align_model.sup_attribute_alignment_triples2 neighbors1, neighbors2 = None, None entity_list = self.kgs.kg1.entities_list + self.kgs.kg2.entities_list valid(self, embed_choice='nv') valid(self, embed_choice='avg') for i in range(1, self.args.max_epoch + 1): print('epoch {}:'.format(i)) self.train_relation_view_1epo(i, relation_triple_steps, relation_step_tasks, relation_batch_queue, neighbors1, neighbors2) self.train_cross_kg_entity_inference_relation_view_1epo(i, cross_kg_relation_triples) if i > self.args.start_predicate_soft_alignment: self.train_cross_kg_relation_inference_1epo(i, cross_kg_relation_inference) self.train_attribute_view_1epo(i, attribute_triple_steps, attribute_step_tasks, attribute_batch_queue, neighbors1, neighbors2) 
self.train_cross_kg_entity_inference_attribute_view_1epo(i, cross_kg_entity_inference_in_attribute_triples) if i > self.args.start_predicate_soft_alignment: self.train_cross_kg_attribute_inference_1epo(i, cross_kg_attribute_inference) if i >= self.args.start_valid and i % self.args.eval_freq == 0: valid(self, embed_choice='rv') valid(self, embed_choice='av') valid(self, embed_choice='avg') flag = valid_WVA(self) self.flag1, self.flag2, self.early_stop = eva.early_stop(self.flag1, self.flag2, flag) if self.early_stop or i == self.args.max_epoch: break if i >= self.args.start_predicate_soft_alignment: self.predicate_align_model.update_predicate_alignment(self.rel_embeds.eval(session=self.session)) self.predicate_align_model.update_predicate_alignment(self.attr_embeds.eval(session=self.session), predicate_type='attribute') cross_kg_relation_inference = self.predicate_align_model.sup_relation_alignment_triples1 + \ self.predicate_align_model.sup_relation_alignment_triples2 cross_kg_attribute_inference = self.predicate_align_model.sup_attribute_alignment_triples1 + \ self.predicate_align_model.sup_attribute_alignment_triples2 if self.args.neg_sampling == 'truncated' and i % self.args.truncated_freq == 0: t1 = time.time() assert 0.0 < self.args.truncated_epsilon < 1.0 neighbors_num1 = int((1 - self.args.truncated_epsilon) * self.kgs.kg1.entities_num) neighbors_num2 = int((1 - self.args.truncated_epsilon) * self.kgs.kg2.entities_num) neighbors1 = bat.generate_neighbours(self.eval_kg1_useful_ent_embeddings(), self.kgs.useful_entities_list1, neighbors_num1, self.args.batch_threads_num) neighbors2 = bat.generate_neighbours(self.eval_kg2_useful_ent_embeddings(), self.kgs.useful_entities_list2, neighbors_num2, self.args.batch_threads_num) ent_num = len(self.kgs.kg1.entities_list) + len(self.kgs.kg2.entities_list) print('neighbor dict:', len(neighbors1), type(neighbors2)) print("generating neighbors of {} entities costs {:.3f} s.".format(ent_num, time.time() - t1)) for i in 
range(1, self.args.shared_learning_max_epoch + 1): self.train_shared_space_mapping_1epo(i, entity_list) if i >= self.args.start_valid and i % self.args.eval_freq == 0: valid(self, embed_choice='final') self.save() test(self, embed_choice='nv') test(self, embed_choice='rv') test(self, embed_choice='av') test(self, embed_choice='avg') test_WVA(self) test(self, embed_choice='final')
def stable_alignment(embed1, embed2, metric, normalize, csls_k, nums_threads, cut=100, sim_mat=None):
    """Match two embedding sets with ``galeshapley`` and print the precision.

    A similarity matrix is taken from ``sim_mat`` or, when that is ``None``,
    computed with ``sim``. Candidate dictionaries are built for both
    directions by running ``arg_sort`` on row groups of the matrix (then of
    its transpose) in worker processes and merging the outputs with
    ``merge_dic``. The two dictionaries and ``cut`` are passed to
    ``galeshapley``; a pair counts as correct when the integer after the last
    ``'_'`` is the same in key and value.

    Parameters
    ----------
    embed1, embed2 : matrix_like
        Embeddings forwarded to ``sim`` when ``sim_mat`` is not supplied.
    metric, normalize, csls_k :
        Forwarded unchanged to ``sim``.
    nums_threads : int
        Passed to ``task_divide``; each pool gets one process per row group.
    cut : int
        Forwarded unchanged to ``galeshapley``.
    sim_mat : ndarray or None
        Precomputed similarity matrix; skips the call to ``sim``.

    Returns
    -------
    None. Timing and precision are reported via ``print`` only.
    """
    started = time.time()
    if sim_mat is None:
        sim_mat = sim(embed1, embed2, metric=metric, normalize=normalize, csls_k=csls_k)

    # Direction 1: rows of the similarity matrix ('x_' -> 'y_').
    row_count = sim_mat.shape[0]
    x_tasks = task_divide(np.array(range(row_count)), nums_threads)
    pool = multiprocessing.Pool(processes=len(x_tasks))
    x_jobs = [pool.apply_async(arg_sort, (task, sim_mat[task, :], 'x_', 'y_')) for task in x_tasks]
    # Every row index must have been assigned to exactly one task.
    assert sum(len(task) for task in x_tasks) == row_count
    pool.close()
    pool.join()
    kg1_candidates = dict()
    for job in x_jobs:
        kg1_candidates = merge_dic(kg1_candidates, job.get())

    # Direction 2: rows of the transposed matrix ('y_' -> 'x_').
    sim_mat = sim_mat.T
    col_count = sim_mat.shape[0]
    y_tasks = task_divide(np.array(range(col_count)), nums_threads)
    pool = multiprocessing.Pool(processes=len(y_tasks))
    y_jobs = [pool.apply_async(arg_sort, (task, sim_mat[task, :], 'y_', 'x_')) for task in y_tasks]
    pool.close()
    pool.join()
    kg2_candidates = dict()
    for job in y_jobs:
        kg2_candidates = merge_dic(kg2_candidates, job.get())

    print("generating candidate lists costs time {:.3f} s ".format(time.time() - started))

    started = time.time()
    matching = galeshapley(kg1_candidates, kg2_candidates, cut)
    correct = sum(
        1 for left, right in matching.items()
        if int(left.split('_')[-1]) == int(right.split('_')[-1])
    )
    elapsed = time.time() - started
    print("stable alignment precision = {:.3f}%, time = {:.3f} s ".format(
        correct / len(matching) * 100, elapsed))
def greedy_alignment(embed1, embed2, top_k, nums_threads, metric, normalize, csls_k, accurate):
    """
    Rank alignment candidates from a similarity matrix and report the metrics.

    The similarity matrix is obtained from ``sim``; ranking statistics come
    from ``calculate_rank``, either in a single call or, when
    ``nums_threads > 1``, from one worker process per row group whose
    partial results are summed / unioned.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    top_k : list of integers
        Hits@k metrics for evaluating results.
    nums_threads : int
        Values above 1 split the rows with ``task_divide`` and use a process pool.
    metric : string
        Forwarded to ``sim``; the distance metric to use.
    normalize : bool
        Forwarded to ``sim``; whether to normalize the input embeddings.
    csls_k : int
        Forwarded to ``sim``; also selects the "with csls" report line when > 0.
    accurate : bool
        Forwarded to ``calculate_rank``; when true the printed report also
        includes mr and mrr.

    Returns
    -------
    alignment_rest : set, as accumulated from ``calculate_rank``
    hits1 : float, the first entry of the hits array (a percentage rounded to 3 places)
    mr : value accumulated from ``calculate_rank``
    mrr : value accumulated from ``calculate_rank``
    """
    started = time.time()
    sim_mat = sim(embed1, embed2, metric=metric, normalize=normalize, csls_k=csls_k)
    num = sim_mat.shape[0]
    if nums_threads > 1:
        hits = [0] * len(top_k)
        mr, mrr = 0, 0
        alignment_rest = set()
        search_tasks = task_divide(np.array(range(num)), nums_threads)
        pool = multiprocessing.Pool(processes=len(search_tasks))
        jobs = [
            pool.apply_async(calculate_rank, (task, sim_mat[task, :], top_k, accurate, num))
            for task in search_tasks
        ]
        pool.close()
        pool.join()
        for job in jobs:
            sub_mr, sub_mrr, sub_hits, sub_hits1_rest = job.get()
            mr += sub_mr
            mrr += sub_mrr
            # With an ndarray on the right, this add is NumPy's elementwise one.
            hits += np.array(sub_hits)
            alignment_rest |= sub_hits1_rest
    else:
        mr, mrr, hits, alignment_rest = calculate_rank(list(range(num)), sim_mat, top_k, accurate, num)
    assert len(alignment_rest) == num

    # Convert raw hit counts to percentages rounded to three decimals.
    hits = np.array(hits) / num * 100
    for pos, value in enumerate(hits):
        hits[pos] = round(value, 3)
    cost = time.time() - started

    use_csls = csls_k > 0
    if accurate and use_csls:
        template = "accurate results with csls: csls={}, hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
        fields = (csls_k, top_k, hits, mr, mrr, cost)
    elif accurate:
        template = "accurate results: hits@{} = {}%, mr = {:.3f}, mrr = {:.6f}, time = {:.3f} s "
        fields = (top_k, hits, mr, mrr, cost)
    elif use_csls:
        template = "quick results with csls: csls={}, hits@{} = {}%, time = {:.3f} s "
        fields = (csls_k, top_k, hits, cost)
    else:
        template = "quick results: hits@{} = {}%, time = {:.3f} s "
        fields = (top_k, hits, cost)
    print(template.format(*fields))

    hits1 = hits[0]
    # Release the (potentially large) similarity matrix before returning.
    del sim_mat
    gc.collect()
    return alignment_rest, hits1, mr, mrr