def get_all_rels(file_name, rel_json, rel_neo_csv, entities=None,
                 rel_csv_header=(":START_ID", ":END_ID", ":TYPE", "name"),
                 parse_line=parse_line):
    lines = read_from_file(file_name)
    relations = {}
    idx = 0
    rel_data = []
    if not entities:
        entities = get_all_entities(file_name, None, None)
    for line in tqdm(lines):
        line = line.strip()
        try:
            ent1, rel, ent2 = parse_line(line)
        except Exception:
            # skip lines that cannot be parsed into a triple
            print(line)
            continue
        if rel not in relations:
            incDic(relations, rel, idx)
            idx += 1
        rel_data.append((entities[ent1], entities[ent2], 'Relation', rel))
    logger.info("original relationships: %d" % len(rel_data))
    rel_data = list(set(rel_data))  # drop duplicate triples
    logger.info("after deduplication: %d" % len(rel_data))
    write_csv(rel_neo_csv, rel_data, rel_csv_header)
    dump_json(rel_json, relations)
    return relations
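# NOTE: the small dict helpers used throughout this module (incDic, appendDic,
# incDicWithAdd, incDicWithWeightAdd) are defined elsewhere in the repo. The
# sketch below shows the semantics these functions appear to assume; it is an
# illustration inferred from the call sites, not the repo's actual code.
def _sketch_incDic(dic, key, val):
    # record a value for a key only on first sight (used for id assignment)
    if key not in dic:
        dic[key] = val

def _sketch_appendDic(dic, key, val):
    # accumulate values for a key in a list
    dic.setdefault(key, []).append(val)

def _sketch_incDicWithAdd(dic, key, val):
    # collect values for a key in a set (used to index triples by head/tail/rel)
    dic.setdefault(key, set()).add(val)

def _sketch_incDicWithWeightAdd(dic, key, weight):
    # accumulate a numeric weight per key (used for edge weights)
    dic[key] = dic.get(key, 0.0) + weight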
def get_all_entities(file_name, entity_json, ent_neo_csv,
                     ent_csv_header=("id:ID", "name", ":LABEL"),
                     parse_line=parse_line):
    lines = read_from_file(file_name)
    entities = {}
    idx = 0
    csv_data = []
    for line in tqdm(lines):
        line = line.strip()
        try:
            ent1, rel, ent2 = parse_line(line)
        except Exception:
            print(line.split('\t'))
            continue
        if ent1 not in entities:
            incDic(entities, ent1, idx)
            # write the CSV row before incrementing so the CSV id matches the
            # id stored in the dict (which get_all_rels uses as :START_ID/:END_ID)
            csv_data.append((str(idx), ent1, 'Entity'))
            idx += 1
        if ent2 not in entities:
            incDic(entities, ent2, idx)
            csv_data.append((str(idx), ent2, 'Entity'))
            idx += 1
    if ent_neo_csv:
        write_csv(ent_neo_csv, csv_data, ent_csv_header)
    dump_json(entity_json, entities)
    return entities
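# A minimal usage sketch for building the neo4j-import CSVs from a triple
# file. The file names here are placeholders, not paths from the repo.
def _example_build_neo4j_csvs():
    entities = get_all_entities("triples.txt", "entities.json", "entities.csv")
    # reuse the entity->id map so relation endpoints reference the same ids
    get_all_rels("triples.txt", "relations.json", "relations.csv",
                 entities=entities)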
def split_train_valid_test_triplets(file_name, rel_tris_num, save_files,
                                    train_valid_test=(0.8, 0.1, 0.1),
                                    parse_line=parse_clean_line):
    lines = read_from_file(file_name)
    rel_with_tris = {}
    for line in tqdm(lines):
        line = line.strip()
        triples = parse_line(line)
        ent1, rel, ent2 = triples
        appendDic(rel_with_tris, rel, triples)
    train_triplets = []
    valid_triplets = []
    test_triplets = []
    rel_tri_nums = {}
    # split per relation so every relation is represented in all three sets
    for rel, triplets in rel_with_tris.items():
        num = len(triplets)
        rel_tri_nums[rel] = num
        test_num = math.ceil(train_valid_test[2] * num)
        valid_num = math.ceil(train_valid_test[1] * num)
        tests = triplets[:test_num]
        valids = triplets[test_num:valid_num + test_num]
        trains = triplets[valid_num + test_num:]
        train_triplets.extend(trains)
        valid_triplets.extend(valids)
        test_triplets.extend(tests)
    rel_tri_nums = sorted_dict(rel_tri_nums)
    dump_json(rel_tris_num, rel_tri_nums)
    write_to_file(save_files[0], get_lines_format(train_triplets))
    write_to_file(save_files[1], get_lines_format(valid_triplets))
    write_to_file(save_files[2], get_lines_format(test_triplets))
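# Usage sketch (file names are placeholders): an 80/10/10 per-relation split.
# Note that math.ceil guarantees at least one test and one valid triple per
# relation, so relations with very few triples may contribute nothing to train.
def _example_split():
    split_train_valid_test_triplets(
        "clean_triples.txt", "rel_counts.json",
        ("train.txt", "valid.txt", "test.txt"),
        train_valid_test=(0.8, 0.1, 0.1))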
def get_rel_barrel_graph(self, graph, barrel, **kwargs):
    # build the dual (relation-level) seed graph for the given barrel
    self.filtered_rel_dual_graph_origin = GetSeedGraph.get_dual_seed(graph, barrel)
    self.filtered_rel2id, self.filtered_rel_dual_graph = get_rel_dict(self.filtered_rel_dual_graph_origin)
    self.filtered_rel_dict2rel = transpose_dict(self.filtered_rel2id)
    save_tuples_to_txt(self.filtered_rel_dual_graph, self.rel_graph)
    dump_json(self.rel_dual_rel2dict, self.filtered_rel_dict2rel)
    return set(self.filtered_rel2id.keys())
def get_post_community_with_attr_and_entity(community_json, info_save_json):
    communities = load_json(community_json)
    info_dic = {}
    for key, comm in communities.items():
        info_dic[key] = {}
        info_dic[key]["ent_and_attr_number"] = len(comm)
        rel_list = []
        for item in comm:
            # members prefixed with "_<" mark attribute relations; strip the
            # leading underscore to recover the relation name
            if item.startswith("_<"):
                rel_list.append(item[1:])
        info_dic[key]["attr_number"] = len(rel_list)
        info_dic[key]["attr"] = rel_list
    dump_json(info_save_json, info_dic)
    return communities, info_dic
def save_related_attr_infos(self, community_graphs, community_related_infos,
                            community_significant_infos, community_graphs_json=None,
                            left_related_infos_json=None, left_significants_json=None,
                            no_attr_comm_json=None, single_ent_comm_json=None):
    dump_json(community_graphs_json, community_graphs)
    dump_json(left_related_infos_json, community_related_infos)
    dump_json(left_significants_json, community_significant_infos)
    dump_json(no_attr_comm_json, self.no_rel_comm)
    dump_json(single_ent_comm_json, self.filtered_comm)
def get_type2ents(file_name, json_file, ent_type_maps=None, type_key='<类型>',
                  encoding_format='utf-8'):
    if ent_type_maps is None:
        ent_type_maps = get_type_dic(file_name, None, type_key, encoding_format)
    type2ents_map = {}
    # invert the entity->types map into a type->entities map
    for ent, types in ent_type_maps.items():
        for typ in types:
            appendDic(type2ents_map, typ, ent)
    logger.info("number of types: %d" % len(type2ents_map))
    dump_json(json_file, type2ents_map)
    return type2ents_map
def load_graph(self, graph_triples=None, head_triples=None, tail_triples=None,
               rel_triples=None, **kwargs):
    if graph_triples is None:
        # no triples passed in: load the graph and its per-head/tail/relation
        # indexes from the configured source
        self.graphcls = BaseGetDualGraph(Config=self.config, **kwargs)
        self.graph = self.graphcls.graph
        self.graph_head_triples = self.graphcls.graph_head_triples
        self.graph_tail_triples = self.graphcls.graph_tail_triples
        self.graph_rel_triples = self.graphcls.graph_rel_triples
        is_origin_graph = self.is_origin_graph
        try:
            self.attr_graph = self.graphcls.attr_graph
            self.attr_head_triples, self.attr_tail_triples, self.attr_rel_triples \
                = self.graphcls.attr_head_triples, self.graphcls.attr_tail_triples, self.graphcls.attr_rel_triples
        except AttributeError:
            # the loaded graph may carry no attribute triples
            pass
    elif head_triples is None or tail_triples is None or rel_triples is None:
        # triples passed in without indexes: build the indexes here
        head_triples, tail_triples, rel_triples = {}, {}, {}
        for triple in graph_triples:
            head, rel, tail = triple
            incDicWithAdd(head_triples, head, triple)
            incDicWithAdd(tail_triples, tail, triple)
            incDicWithAdd(rel_triples, rel, triple)
        self.graph = graph_triples
        self.graph_head_triples = head_triples
        self.graph_tail_triples = tail_triples
        self.graph_rel_triples = rel_triples
        is_origin_graph = False
    else:
        # triples and all three indexes passed in
        self.graph = graph_triples
        self.graph_head_triples = head_triples
        self.graph_tail_triples = tail_triples
        self.graph_rel_triples = rel_triples
        is_origin_graph = False
    if self.is_dual_graph:
        self.dual_graphs, self.dict2rel = self.graphcls.get_dual_graph_from_tuples(self.graph, isTrans=None)
        self.dual_graph_origin, self.dual_graph = self.dual_graphs
        save_tuples_to_txt(self.dual_graph, self.dual_graph_txt)
        dump_json(self.origin_dual_rel2dict, self.dict2rel)
    entity, relations = get_tuples_dict(self.graph)
    self.total_ent_num = len(entity)
    self.total_rel_num = len(relations)
    if is_origin_graph:
        # the original triple file is already on disk; reuse it
        self.graph_txt = self.triple_txt
    else:
        save_tuples_to_txt(self.graph, self.graph_txt)
def get_type_dic(file_name, json_file, type_key='<类型>', encoding_format='utf-8',
                 parse_line=parse_line):
    lines = read_from_file(file_name)
    ent_types_maps = {}
    for line in tqdm(lines):
        line = line.strip()
        try:
            ent1, rel, ent2 = parse_line(line)
        except Exception:
            print(line)
            continue
        # keep only typing triples: (entity, <类型>, type)
        if rel == type_key:
            appendDic(ent_types_maps, ent1, ent2)
    dump_json(json_file, ent_types_maps)
    return ent_types_maps
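# Usage sketch (placeholder file names): build the entity->types map, then
# invert it into type->entities.
def _example_type_maps():
    ent2types = get_type_dic("triples.txt", "ent2types.json")
    get_type2ents("triples.txt", "type2ents.json", ent_type_maps=ent2types)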
def reverse_graph_tree(cls, filename: str, labelfile: str, outfile: Union[str, None],
                       node2node_renumber: Union[str, None] = None) -> dict:
    print("filename: {}".format(filename))
    lines = read_from_file(filename)
    print("read {} lines".format(len(lines)))
    id2label = load_json(labelfile)
    node2comm_list = cls.split_graph_tree(lines)
    last_partition = cls.cluster_community(node2comm_list)
    if node2node_renumber:
        # nodes were renumbered before clustering; map them back first
        node2node = load_json(node2node_renumber)
        last_partition = cls.reverse_node2label(node2node, last_partition)
    communs = cls.reverse_node2label(id2label, last_partition)
    communs_dict = cls.revert_list_to_dict(communs)
    dump_json(outfile, communs_dict)
    return communs_dict
def convert_triple_to_needs(filename, outfile, direct=True, save_node2idx=None,
                            weight_property=None):
    lines = read_from_file(filename)
    node_idx = {}
    node_weights = {}
    for line in tqdm(lines):
        line = line.strip()
        triple = parse_line(line)
        if triple is None:
            continue
        ent1, rel, ent2 = triple
        if save_node2idx is not None:
            # nodes are names: assign consecutive integer ids
            ent1idx = node2idx(node_idx, ent1)
            ent2idx = node2idx(node_idx, ent2)
        else:
            # nodes are already integer ids
            ent1idx = int(ent1)
            ent2idx = int(ent2)
        val = None
        try:
            val = float(rel)
        except ValueError:
            if weight_property is not None and rel.startswith("\""):
                val = weight_property
        if val is None:
            val = 1.0  # assumption: fall back to unit weight when the relation carries no numeric weight
        key = " ".join([str(ent1idx), str(ent2idx)])
        incDicWithWeightAdd(node_weights, key, val)
        if not direct:
            # undirected graph: also accumulate the reversed edge
            key = " ".join([str(ent2idx), str(ent1idx)])
            incDicWithWeightAdd(node_weights, key, val)
    all_out_lines = []
    for key, val in tqdm(node_weights.items()):
        all_out_lines.append(key + " " + str(val) + '\n')
    idx_node = {idx: node for node, idx in node_idx.items()}
    dump_json(save_node2idx, idx_node)
    write_to_file(outfile, all_out_lines)
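# Usage sketch (placeholder paths): turn a triple file into a weighted edge
# list ("src dst weight" per line), e.g. for community-detection tools that
# expect that format. Note that despite its name, save_node2idx receives the
# idx->node map.
def _example_edge_list():
    convert_triple_to_needs("triples.txt", "edges.txt", direct=False,
                            save_node2idx="idx2node.json")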
def filter_less_use_community(info_dic, left_comm_json, filtered_comm_json, no_rel_comm_json):
    filtered_comm = {}
    no_rel_comm = {}
    left_comm = {}
    for key, info in info_dic.items():
        if info["attr_number"] == 0:
            # community has no attribute relations at all
            no_rel_comm[len(no_rel_comm)] = info
        elif info["ent_and_attr_number"] - info["attr_number"] <= 1:
            # community contains at most one real entity
            filtered_comm[len(filtered_comm)] = info
        else:
            left_comm[len(left_comm)] = info
    dump_json(left_comm_json, left_comm)
    dump_json(filtered_comm_json, filtered_comm)
    dump_json(no_rel_comm_json, no_rel_comm)
    return left_comm, filtered_comm, no_rel_comm
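# Usage sketch (placeholder paths): summarize each community's attributes,
# then split off the communities that are too sparse to use.
def _example_filter_communities():
    _, info_dic = get_post_community_with_attr_and_entity(
        "communities.json", "community_infos.json")
    filter_less_use_community(info_dic, "left.json", "filtered.json", "no_rel.json")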
def __call__(self, **kwargs):
    self.modularity.load_graph()
    community = self.modularity.get_last_community(**self.modularity_setting, **kwargs)
    all_out_dict = ProcessModularity.revert_list_to_dict(community)
    dump_json(self.outfile, all_out_dict)