def generate_neighbours(entity_embeds1, entity_list1, entity_embeds2, entity_list2, neighbors_num, threads_num=4):
    """Find, for each entity of KG1, its nearest neighbours among KG2 entities.

    The KG1 entity list is split into `threads_num` fragments which are
    processed sequentially (despite the name of the parameter, no thread or
    process pool is used here); the per-fragment results are merged into one
    dict.

    NOTE(review): this name is redefined later in the file by the
    single-KG, multiprocessing `generate_neighbours` — the later definition
    shadows this one at import time if both live in the same module; confirm
    intended.

    NOTE(review): the argument order passed to `find_neighbours` here
    (frag, frag_embeds, candidate_list, candidate_embeds, k) differs from the
    order used by `generate_neighbours_single_thread` below
    (frag, candidate_list, frag_embeds, candidate_embeds, k) — verify against
    the `find_neighbours` signature.

    :param entity_embeds1: embedding matrix for KG1, rows aligned with entity_list1.
    :param entity_list1: ids of KG1 entities (the query side).
    :param entity_embeds2: embedding matrix for KG2, rows aligned with entity_list2.
    :param entity_list2: ids of KG2 entities (the candidate side).
    :param neighbors_num: number of nearest neighbours to keep per entity.
    :param threads_num: number of fragments the workload is split into.
    :return: dict merged from all fragment results (entity -> neighbours).
    """
    ent_frags = task_divide(np.array(entity_list1), threads_num)
    # Positional indexes into entity_embeds1, split the same way as ent_frags.
    ent_frag_indexes = task_divide(np.array(range(len(entity_list1))), threads_num)
    dic = dict()
    for i in range(len(ent_frags)):
        res = find_neighbours(ent_frags[i], entity_embeds1[ent_frag_indexes[i], :], np.array(entity_list2),
                              entity_embeds2, neighbors_num)
        dic = merge_dic(dic, res)
    return dic
def generate_neighbours_single_thread(entity_embeds, entity_list, neighbors_num, threads_num):
    """Find each entity's nearest neighbours within one embedding space, sequentially.

    The entity list is cut into `threads_num` fragments and each fragment is
    handed to `find_neighbours` in turn; the per-fragment dicts are merged
    into a single result.

    :param entity_embeds: embedding matrix, rows aligned with entity_list.
    :param entity_list: ids of the entities to process.
    :param neighbors_num: number of nearest neighbours to keep per entity.
    :param threads_num: number of fragments the workload is split into.
    :return: merged dict of per-fragment neighbour results.
    """
    frags = task_divide(np.array(entity_list), threads_num)
    index_frags = task_divide(np.array(range(len(entity_list))), threads_num)
    neighbours = dict()
    for frag, idx in zip(frags, index_frags):
        partial = find_neighbours(frag, np.array(entity_list), entity_embeds[idx, :],
                                  entity_embeds, neighbors_num)
        neighbours = merge_dic(neighbours, partial)
    return neighbours
def generate_training_data(kgs: KGs, threshold=1.0):
    """Build (attribute, context_attribute) co-occurrence pairs for training.

    For every entity, its attribute set is unioned with the attribute set of
    its training-link counterpart (links are used in both directions), then
    restricted to the popular attributes selected by
    `get_kgs_popular_attributes(kgs, threshold)`. Every unordered pair of the
    remaining attributes yields one training sample.

    :param kgs: the combined knowledge-graphs object (provides kg1/kg2
        attribute dicts and train_entities1/2).
    :param threshold: popularity threshold forwarded to attribute selection.
    :return: list of (attr, context_attr) tuples.
    """
    kg1_selected_attributes, kg2_selected_attributes, selected_attributes = get_kgs_popular_attributes(
        kgs, threshold)
    entity_attributes_dict = merge_dic(kgs.kg1.entity_attributes_dict, kgs.kg2.entity_attributes_dict)
    print("entity attribute dict", len(entity_attributes_dict))
    training_data_list = list()
    # Training links, usable from either side.
    training_links_dict12 = dict(zip(kgs.train_entities1, kgs.train_entities2))
    training_links_dict21 = dict(zip(kgs.train_entities2, kgs.train_entities1))
    training_links_dict = merge_dic(training_links_dict12, training_links_dict21)
    for ent, attributes in entity_attributes_dict.items():
        if ent in training_links_dict:
            # Pool attributes with the aligned counterpart entity's attributes.
            attributes = attributes | entity_attributes_dict.get(training_links_dict.get(ent), set())
        attributes = attributes & selected_attributes
        # combinations() over a set never pairs an element with itself, so no
        # explicit attr != context_attr guard is needed.
        for attr, context_attr in itertools.combinations(attributes, 2):
            training_data_list.append((attr, context_attr))
    print("training data of attribute correlations", len(training_data_list))
    return training_data_list
def get_ent_embeds_from_attributes(kgs: KGs, attr_embeds, selected_attributes):
    """Compute an embedding per entity as the mean of its selected attributes' embeddings.

    Entities with no selected attribute get a zero vector. Rows are ordered by
    entity id 0..kgs.entities_num-1. The resulting matrix is L2-normalized
    row-wise.

    :param kgs: combined knowledge-graphs object (attribute dicts, entities_num).
    :param attr_embeds: attribute embedding matrix, indexed by attribute id.
    :param selected_attributes: set of attribute ids to average over.
    :return: (entities_num, dim) row-normalized float32 matrix.
    """
    print("get entity embeddings from attributes")
    start = time.time()
    entity_attributes_dict = merge_dic(kgs.kg1.entity_attributes_dict, kgs.kg2.entity_attributes_dict)
    zero_vec = np.zeros([1, attr_embeds.shape[1]], dtype=np.float32)
    # Collect rows and stack once at the end: incremental np.row_stack inside
    # the loop re-copies the whole matrix per entity (O(n^2)), and row_stack
    # is deprecated in recent NumPy in favour of vstack.
    rows = []
    for i in range(kgs.entities_num):
        attr_vec = zero_vec
        attributes = entity_attributes_dict.get(i, set()) & selected_attributes
        if len(attributes) > 0:
            attr_vecs = attr_embeds[list(attributes), ]
            attr_vec = np.mean(attr_vecs, axis=0, keepdims=True)
        rows.append(attr_vec)
    ent_mat = np.vstack(rows)
    print('cost time: {:.4f}s'.format(time.time() - start))
    return preprocessing.normalize(ent_mat)
def generate_neighbours(entity_embeds, entity_list, neighbors_num, threads_num):
    """Find each entity's nearest neighbours within one embedding space, in parallel.

    The entity list is cut into `threads_num` fragments, each fragment is
    searched by a worker process, and the per-fragment dicts are merged.

    NOTE(review): this redefines (and therefore shadows) the two-KG
    `generate_neighbours` defined earlier in the file — confirm intended.

    :param entity_embeds: embedding matrix, rows aligned with entity_list.
    :param entity_list: ids of the entities to process.
    :param neighbors_num: number of nearest neighbours to keep per entity.
    :param threads_num: number of fragments / worker processes.
    :return: merged dict of per-fragment neighbour results.
    """
    frags = task_divide(np.array(entity_list), threads_num)
    index_frags = task_divide(np.array(range(len(entity_list))), threads_num)
    pool = multiprocessing.Pool(processes=len(frags))
    async_results = []
    for frag, idx in zip(frags, index_frags):
        async_results.append(pool.apply_async(find_neighbours,
                                              args=(frag, np.array(entity_list), entity_embeds[idx, :],
                                                    entity_embeds, neighbors_num)))
    pool.close()
    pool.join()
    neighbours = dict()
    for async_res in async_results:
        neighbours = merge_dic(neighbours, async_res.get())
    # Drop the worker results promptly to release the fragment matrices.
    del async_results
    gc.collect()
    return neighbours
def load_attr(ent_num, kgs):
    """Build a binary entity-attribute feature matrix over the most frequent attributes.

    Attribute values are counted across both KGs; the top 70% most frequent
    ones become columns of a one-hot-style matrix with a 1.0 wherever an
    entity carries that attribute.

    :param ent_num: number of entities (rows of the returned matrix).
    :param kgs: combined knowledge-graphs object providing
        kg1/kg2 entity_attributes_dict (entity id -> set of attribute ids).
    :return: (ent_num, num_selected_attrs) float32 0/1 matrix.
    """
    # Local import: the module's import block is outside this view.
    from collections import Counter
    entity_attributes_dict = merge_dic(kgs.kg1.entity_attributes_dict, kgs.kg2.entity_attributes_dict)
    # Counter.most_common() sorts by count descending with ties kept in
    # first-encountered order — same ordering as the previous hand-rolled
    # sorted(cnt, key=cnt.get, reverse=True).
    cnt = Counter(v for vs in entity_attributes_dict.values() for v in vs)
    fre = cnt.most_common()
    print(fre)
    # Keep only the top 70% most frequent attributes, id'ed by frequency rank.
    num = int(0.7 * len(cnt))
    attr2id = {attr_val: i for i, (attr_val, _) in enumerate(fre[:num])}
    attr = np.zeros((ent_num, num), dtype=np.float32)
    for ent, vs in entity_attributes_dict.items():
        for v in vs:
            if v in attr2id:
                attr[ent][attr2id[v]] = 1.0
    return attr
def stable_alignment(embed1, embed2, metric, normalize, csls_k, nums_threads, cut=100, sim_mat=None):
    """Evaluate alignment accuracy via stable matching (Gale-Shapley).

    Builds the KG1->KG2 similarity matrix (unless one is supplied), computes
    ranked candidate lists for both directions in parallel worker processes,
    runs Gale-Shapley on the candidate lists, and prints the precision of the
    resulting one-to-one matching. Returns None; results are only printed.

    :param embed1: KG1 entity embedding matrix.
    :param embed2: KG2 entity embedding matrix.
    :param metric: similarity metric name forwarded to `sim`.
    :param normalize: whether `sim` should normalize embeddings.
    :param csls_k: CSLS neighbourhood size forwarded to `sim`.
    :param nums_threads: number of fragments / worker processes per direction.
    :param cut: candidate-list cutoff forwarded to `galeshapley`.
    :param sim_mat: optional precomputed similarity matrix (rows = KG1).
    """
    t = time.time()
    if sim_mat is None:
        sim_mat = sim(embed1, embed2, metric=metric, normalize=normalize, csls_k=csls_k)
    kg1_candidates, kg2_candidates = dict(), dict()
    # Direction 1: for each KG1 row, rank KG2 candidates.
    # NOTE(review): `arg_sort` presumably returns a dict keyed by
    # '<prefix><index>' strings (e.g. 'x_12') — confirm against its definition.
    num = sim_mat.shape[0]
    x_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(x_tasks))
    rests = list()
    total = 0
    for task in x_tasks:
        total += len(task)
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'x_', 'y_')))
    # Sanity check: fragments must cover every row exactly once.
    assert total == num
    pool.close()
    pool.join()
    for rest in rests:
        kg1_candidates = merge_dic(kg1_candidates, rest.get())
    # Direction 2: transpose and rank KG1 candidates for each KG2 row.
    sim_mat = sim_mat.T
    num = sim_mat.shape[0]
    y_tasks = task_divide(np.array(range(num)), nums_threads)
    pool = multiprocessing.Pool(processes=len(y_tasks))
    rests = list()
    for task in y_tasks:
        mat = sim_mat[task, :]
        rests.append(pool.apply_async(arg_sort, (task, mat, 'y_', 'x_')))
    pool.close()
    pool.join()
    for rest in rests:
        kg2_candidates = merge_dic(kg2_candidates, rest.get())
    print(
        "generating candidate lists costs time {:.3f} s ".format(time.time() - t))
    t = time.time()
    matching = galeshapley(kg1_candidates, kg2_candidates, cut)
    # Count matches whose numeric suffixes agree — i.e. a pair 'x_i' -> 'y_i'
    # is counted as correct; assumes aligned entities share the same index.
    n = 0
    for i, j in matching.items():
        if int(i.split('_')[-1]) == int(j.split('_')[-1]):
            n += 1
    cost = time.time() - t
    print("stable alignment precision = {:.3f}%, time = {:.3f} s ".format(
        n / len(matching) * 100, cost))