# Shared imports assumed by the functions below (module-level in the original
# files; collected here so the snippets run as shown).
import codecs
import json
import logging
import os
import pickle as pkl
import subprocess
import time
from collections import defaultdict

from tqdm import tqdm


def micro_f1(file):
    """Compute micro-averaged precision/recall/F1 over an NER result file."""
    label_list = get_labels(task_name="ner")
    id2label = {i: v for i, v in enumerate(label_list)}
    datasets = read_dataset(file)
    TP = FP = FN = 0
    for idx, data in enumerate(datasets):
        TP, FP, FN = cal_tp_fp_fn(idx, data, TP, FP, FN)
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F1 = 2 * P * R / (P + R)
    print("Precision is: {}\nRecall is: {}\nF1 score is: {}".format(P, R, F1))
    return P, R, F1
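
# Hedged addition (not in the original repo): micro_f1 and disambi_f1 both
# divide by TP + FP and TP + FN, which raises ZeroDivisionError on an empty
# or all-negative result file. A minimal defensive variant of that final
# arithmetic, under the same TP/FP/FN definitions, could look like this:
def safe_prf(TP, FP, FN):
    """Precision/recall/F1 that degrade to 0.0 instead of dividing by zero."""
    P = TP / (TP + FP) if TP + FP else 0.0
    R = TP / (TP + FN) if TP + FN else 0.0
    F1 = 2 * P * R / (P + R) if P + R else 0.0
    return P, R, F1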

def build_name_dict(kb_file, name_pkl):
    """Map every normalized entity name (subject and aliases) to the KB ids sharing it."""
    name_dict = defaultdict(list)
    datas = read_dataset(kb_file)
    for data in tqdm(datas):
        data = eval(data)
        subject = standard_string(data["subject"])
        subject_id = data["subject_id"]
        name_dict[subject].append(subject_id)
        for alias in data["alias"]:
            alias = standard_string(alias)
            if alias != subject:
                name_dict[alias].append(subject_id)
    pkl.dump(name_dict, open(name_pkl, "wb"))
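
# Hedged usage sketch: each kb_file line is assumed to be a dict literal with
# "subject", "subject_id", and "alias" keys (the shape eval'd above); the
# values below are invented, and standard_string is skipped for brevity.
def _name_dict_demo():
    records = [
        {"subject": "Apple", "subject_id": "10001", "alias": ["Apple Inc."]},
        {"subject": "Apple", "subject_id": "20002", "alias": []},
    ]
    demo = defaultdict(list)
    for rec in records:
        demo[rec["subject"]].append(rec["subject_id"])
        for alias in rec["alias"]:
            if alias != rec["subject"]:
                demo[alias].append(rec["subject_id"])
    assert demo["Apple"] == ["10001", "20002"]  # one name, two candidate ids
    return demo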

def chain(file):
    """Flatten per-line predicted and gold label-id sequences into two aligned label lists."""
    res_pred = []
    res_gold = []
    id2label = {"0": "O", "1": "B-ment", "2": "I-ment"}
    datasets = read_dataset(file)
    for d in datasets:
        pred = eval(d.split("###")[0])
        pred = [id2label[i] for i in pred]
        res_pred.extend(pred)
        gold = eval(d.split("###")[1])
        gold = [id2label[i] for i in gold]
        res_gold.extend(gold)
    return res_pred, res_gold
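
# Hedged sketch of the expected line format: predicted id list, "###", gold
# id list, with ids as strings so they key into id2label. A quick check of
# the flattening logic on one such (invented) line:
def _chain_demo():
    id2label = {"0": "O", "1": "B-ment", "2": "I-ment"}
    line = "['1', '2', '0']###['1', '2', '2']"
    pred = [id2label[i] for i in eval(line.split("###")[0])]
    gold = [id2label[i] for i in eval(line.split("###")[1])]
    assert pred == ["B-ment", "I-ment", "O"]
    assert gold == ["B-ment", "I-ment", "I-ment"]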

def hash_id2abstract(kb_data, id2abstract_pkl):
    """Pickle and return a subject_id -> (subject, concatenated attribute text) mapping."""
    datasets = read_dataset(kb_data)
    id2abstract = {}
    for data in datasets:
        data = eval(data)
        if len(data["data"]) == 0 or "object" not in data["data"][0]:
            continue
        # Alternative: keep only the "摘要" (abstract) attribute instead of all attributes.
        # if data["data"][0]["predicate"] == "摘要":
        #     id2abstract[data["subject_id"]] = (data["subject"], data["data"][0]["object"])
        text = "[type]" + " ".join(data["type"])
        for d in data["data"]:
            text += "[" + d["predicate"] + "]" + d["object"]
        id2abstract[data["subject_id"]] = (data["subject"], text)
    pkl.dump(id2abstract, open(id2abstract_pkl, "wb"))
    # Return the mapping so callers (e.g. gen_disambi) can use it without reloading the pickle.
    return id2abstract
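
# Hedged sketch of the text layout hash_id2abstract builds: "[type]" plus the
# space-joined types, then one "[predicate]object" span per attribute. The
# field names match the KB records eval'd above; the values are invented.
def _abstract_demo():
    record = {
        "subject": "Python",
        "subject_id": "30003",
        "type": ["Language", "Software"],
        "data": [
            {"predicate": "摘要", "object": "A programming language."},
            {"predicate": "作者", "object": "Guido van Rossum"},
        ],
    }
    text = "[type]" + " ".join(record["type"])
    for d in record["data"]:
        text += "[" + d["predicate"] + "]" + d["object"]
    assert text == "[type]Language Software[摘要]A programming language.[作者]Guido van Rossum"
    return record["subject_id"], (record["subject"], text)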

def disambi_f1(file):
    """Compute precision/recall/F1 for the binary disambiguation task (tag 1 = correct entity)."""
    datasets = read_dataset(file)
    TP = FP = FN = 0
    for data in datasets:
        pred = int(eval(data.split("\t")[0]))
        gold = int(eval(data.split("\t")[1]))
        if pred == 1 and pred == gold:
            TP += 1
        elif pred == 1 and pred != gold:
            FP += 1
        elif pred == 0 and pred != gold:
            FN += 1
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F1 = 2 * P * R / (P + R)
    print("Precision is: {}\nRecall is: {}\nF1 score is: {}".format(P, R, F1))
    return P, R, F1
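
# Hedged sketch: each result line is assumed to be "pred\tgold" with 0/1
# values. True negatives ("0\t0") never touch TP/FP/FN, which is why the
# elif chain above has no branch for them.
def _disambi_count_demo():
    TP = FP = FN = 0
    for line in ["1\t1", "1\t0", "0\t1", "0\t0"]:
        pred, gold = (int(x) for x in line.split("\t"))
        if pred == 1 and pred == gold:
            TP += 1
        elif pred == 1 and pred != gold:
            FP += 1
        elif pred == 0 and pred != gold:
            FN += 1
    assert (TP, FP, FN) == (1, 1, 1)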

def divide_set(infile):
    """Split the disambiguation file sequentially into 80/10/10 train/dev/test sets."""
    logging.info("Dividing file into train/dev/test...")
    train_writer = codecs.open("./data/disambi/train.txt", "w", "utf-8")
    dev_writer = codecs.open("./data/disambi/dev.txt", "w", "utf-8")
    test_writer = codecs.open("./data/disambi/test.txt", "w", "utf-8")
    datasets = read_dataset(infile)
    total_line = int(subprocess.getoutput("wc -l {}".format(infile)).split()[0])
    # total_line = 300000  # debug cap
    logging.info("total_line: {}".format(total_line))
    for idx, data in enumerate(datasets):
        if idx > total_line:
            break
        if idx < 0.8 * total_line:
            train_writer.write(data)
        elif idx < 0.9 * total_line:
            dev_writer.write(data)
        elif idx < total_line:
            test_writer.write(data)
    logging.info("Done")
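
# Note: divide_set splits sequentially, so it assumes infile is already
# shuffled; a sorted file would concentrate similar examples in one split.
# A hedged alternative (not in the original repo) that shuffles in memory,
# which is fine for files that fit in RAM:
def divide_set_shuffled(infile, seed=42):
    import random
    lines = list(read_dataset(infile))
    random.Random(seed).shuffle(lines)
    n = len(lines)
    return (lines[:int(0.8 * n)],              # train
            lines[int(0.8 * n):int(0.9 * n)],  # dev
            lines[int(0.9 * n):])              # test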

def get_test_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the test set."""
    return self._create_examples(read_dataset("./data/disambi/test.txt"), "test")

def get_dev_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the dev set."""
    return self._create_examples(read_dataset("./data/disambi/dev.txt"), "dev")

def gen_disambi(infile, outfile):
    """Generate (mention, candidate entity) training pairs with binary tags for disambiguation."""
    id2abstract_pkl = "./data/id2abstract.pkl"
    if not os.path.exists("./data/disambi/"):
        os.makedirs("./data/disambi/")
    datasets = read_dataset(infile)
    if not os.path.exists(id2abstract_pkl):
        logging.info("Building id2abstract.pkl...")
        start = time.time()
        id2abstract = hash_id2abstract("./original_data/kb_data", id2abstract_pkl)
        logging.info("Build id2abstract.pkl done! Total time: {} s".format(time.time() - start))
    else:
        id2abstract = pkl.load(open(id2abstract_pkl, "rb"))
    if not os.path.exists(args.name_dict_path):
        logging.info("Name dictionary not found; building it now.")
        build_name_dict(args.kb_data_path, args.name_dict_path)
    name_dict = pkl.load(open(args.name_dict_path, "rb"))
    outwriter = codecs.open(outfile, "w", "utf-8")
    pos_count = 0
    neg_count = 0
    total_entity = 0
    used_lines = 0
    max_leng = 0  # number of mentions with more than 20 candidates
    for data in tqdm(datasets):
        data = eval(data)
        candi_text = data["text"]
        for mention in data["mention_data"]:
            if mention["kb_id"] == "NIL":
                continue
            source_entity = mention["mention"]
            offset = int(mention["offset"])
            candi_offset = (offset, offset + len(source_entity))
            candi_entity = gen_candidate_entity(source_entity, name_dict, mode="exact")
            used_lines += 1
            total_entity += len(candi_entity)
            if not candi_entity:
                continue
            if len(candi_entity) > 20:
                max_leng += 1
                # continue  # optionally skip mentions with too many candidates
            for centity_id in candi_entity:
                if centity_id not in id2abstract:
                    continue
                out_line = {
                    "query_entity": source_entity,
                    "query_text": candi_text,
                    "query_offset": candi_offset,
                }
                out_line["candi_entity"], out_line["candi_abstract"] = id2abstract[centity_id]
                if centity_id == mention["kb_id"]:
                    out_line["tag"] = 1
                    pos_count += 1
                else:
                    out_line["tag"] = 0
                    neg_count += 1
                outwriter.write(json.dumps(out_line) + "\n")
    logging.info("Mentions with more than 20 candidates: {}".format(max_leng))
    logging.info("Total samples: {} ({} positive, {} negative)".format(
        pos_count + neg_count, pos_count, neg_count))
    logging.info("Average candidates per mention: {}".format(total_entity / used_lines))
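
# Hedged stand-in: gen_candidate_entity is defined elsewhere in the repo. In
# "exact" mode it presumably just looks the normalized mention up in the name
# dictionary built by build_name_dict; a minimal sketch under that assumption:
def gen_candidate_entity_sketch(source_entity, name_dict, mode="exact"):
    if mode == "exact":
        return name_dict.get(standard_string(source_entity), [])
    raise NotImplementedError("only exact lookup is sketched here")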