Example #1
0
def micro_f1(file):
    """Micro-averaged precision/recall/F1 over the NER prediction file."""
    label_list = get_labels(task_name="ner")
    id2label = {i: v for i, v in enumerate(label_list)}
    datasets = read_dataset(file)
    TP = FP = FN = 0
    for idx, data in enumerate(datasets):
        TP, FP, FN = cal_tp_fp_fn(idx, data, TP, FP, FN)
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F1 = 2 * P * R / (P + R)
    print("Precision is: {}\nRecall is: {}\nF1 score is: {}".format(P, R, F1))
    return P, R, F1
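# A minimal sketch of the counting helper used above (the repo's real
# cal_tp_fp_fn is not shown here): it assumes each line of `file` holds the
# predicted and gold entity spans as two tab-separated, eval()-able lists,
# and accumulates span-level matches.
def cal_tp_fp_fn_sketch(idx, data, TP, FP, FN):
    pred_spans, gold_spans = data.strip().split("\t")
    pred_spans = set(eval(pred_spans))  # e.g. {(start, end, type), ...}
    gold_spans = set(eval(gold_spans))
    TP += len(pred_spans & gold_spans)  # predicted spans that match gold exactly
    FP += len(pred_spans - gold_spans)  # predicted spans with no gold match
    FN += len(gold_spans - pred_spans)  # gold spans that were missed
    return TP, FP, FN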
def build_name_dict(kb_file, name_pkl):
    """Map every normalized subject name and alias to the KB ids that carry it, then pickle the dict."""
    name_dict = defaultdict(list)
    datas = read_dataset(kb_file)
    for data in tqdm(datas):
        data = eval(data)
        subject = standard_string(data["subject"])
        subject_id = data["subject_id"]
        name_dict[subject].append(subject_id)
        for alias in data["alias"]:
            alias = standard_string(alias)
            if alias != subject:
                name_dict[alias].append(subject_id)

    pkl.dump(name_dict, open(name_pkl, "wb"))
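# The alias dictionary above depends on the repo's standard_string helper; a
# minimal sketch of what such a normalizer might do (assumption, not the
# actual implementation): strip, lowercase, and fold full-width ASCII.
def standard_string_sketch(s):
    s = s.strip().lower()
    # fold full-width ASCII (U+FF01..U+FF5E) back to half-width
    return "".join(chr(ord(c) - 0xFEE0) if 0xFF01 <= ord(c) <= 0xFF5E else c
                   for c in s)
# After build_name_dict runs, name_dict maps every normalized name or alias to
# all KB ids that carry it, e.g. name_dict["apple"] -> ["10001", "24403"]
# (ids here are illustrative).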
Example #3
0
def chain(file):
    """Flatten the predicted and gold label-id sequences in `file` into two BIO tag lists."""
    res_pred = []
    res_gold = []
    id2label={"0": "O",
              "1": "B-ment",
              "2": "I-ment",}
    datasets = read_dataset(file)
    for d in datasets:
        pred = eval(d.split("###")[0])
        pred = [id2label[i] for i in pred]
        res_pred.extend(pred)
        gold = eval(d.split("###")[1])
        gold = [id2label[i] for i in gold]
        res_gold.extend(gold)
    return res_pred, res_gold
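# Example use of chain() (the path is illustrative): the two flattened BIO tag
# lists can be fed straight into a token-level report. Each input line is
# assumed to look like '["1", "2", "0"]###["1", "2", "2"]' -- predicted label
# ids, "###", gold label ids -- which is what the eval/split above implies.
from sklearn.metrics import classification_report

res_pred, res_gold = chain("./data/mention/test_result.txt")
print(classification_report(res_gold, res_pred, digits=4))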
def hash_id2abstract(kb_data, id2abstract_pkl):
    """Build {subject_id: (subject, flattened type/predicate/object text)} from the KB file and pickle it."""
    datasets = read_dataset(kb_data)
    id2abstract = {}
    for data in datasets:
        data = eval(data)
        if len(data["data"]) == 0 or "object" not in data["data"][0]:
            continue
#        if data["data"][0]["predicate"] == "摘要":
#            id2abstract[data["subject_id"]] = (data["subject"], data["data"][0]["object"])
        text  = "[type]" + " ".join(data["type"])
        for d in data["data"]:
            text += "[" + d["predicate"] + "]" + d["object"]
        id2abstract[data["subject_id"]] = (data["subject"], text)

    pkl.dump(id2abstract, open(id2abstract_pkl, "wb"))
    return id2abstract
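# Quick check of the flattening above with an illustrative record (the field
# values are made up; the field names are the ones the code itself reads):
sample = {"subject_id": "10001", "subject": "苹果", "alias": ["apple"],
          "type": ["Thing"],
          "data": [{"predicate": "摘要", "object": "一种水果"}]}
text = "[type]" + " ".join(sample["type"])
for d in sample["data"]:
    text += "[" + d["predicate"] + "]" + d["object"]
assert text == "[type]Thing[摘要]一种水果"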
Example #5
0
def disambi_f1(file):
    """Binary precision/recall/F1 over tab-separated (pred, gold) disambiguation labels."""
    label_list = [0, 1]
    datasets = read_dataset(file)
    TP = FP = FN = 0
    for idx, data in enumerate(datasets):
        pred = int(eval(data.split("\t")[0]))
        gold = int(eval(data.split("\t")[1]))
        if pred == 1 and pred == gold:
            TP += 1
        elif pred == 1 and pred != gold:
            FP += 1
        elif pred == 0 and pred != gold:
            FN += 1
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F1 = 2 * P * R / (P + R)
    print("Precision is: {}\nRecall is: {}\nF1 score is: {}".format(P, R, F1))
    return P, R, F1
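# Worked example of the counting above (pure arithmetic, no file needed):
# with pred = [1, 1, 0, 0, 1] against gold = [1, 0, 0, 1, 1] the loop yields
# TP = 2, FP = 1, FN = 1, so:
#   P  = 2 / 3 ≈ 0.667
#   R  = 2 / 3 ≈ 0.667
#   F1 = 2 * P * R / (P + R) ≈ 0.667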
def divide_set(infile):
    """Split `infile` 80/10/10 into train/dev/test files under ./data/disambi/."""
    logging.info("Dividing file into train/dev/test...")
    train_writer = codecs.open("./data/disambi/train.txt", "w", "utf-8")
    dev_writer = codecs.open("./data/disambi/dev.txt", "w", "utf-8")
    test_writer = codecs.open("./data/disambi/test.txt", "w", "utf-8")
    datasets = read_dataset(infile)
    total_line = int(
        subprocess.getoutput("wc -l {}".format(infile)).split()[0])
    #    total_line = 300000
    logging.info("total_line: {}".format(total_line))
    for idx, data in enumerate(datasets):
        if idx > total_line:
            break
        if idx < 0.8 * total_line:
            train_writer.write(data)
        elif idx < 0.9 * total_line:
            dev_writer.write(data)
        elif idx < total_line:
            test_writer.write(data)
    logging.info("Done")
def get_test_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the test set."""
    return self._create_examples(read_dataset("./data/disambi/test.txt"), "test")
def get_dev_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the dev set."""
    return self._create_examples(read_dataset("./data/disambi/dev.txt"), "dev")
def gen_disambi(infile, outfile):
    """Generate (mention, candidate-entity abstract) pairs with binary tags for the disambiguation model."""
    id2abstract_pkl = "./data/id2abstract.pkl"

    if not os.path.exists("./data/disambi/"):
        subprocess.getoutput("mkdir ./data/disambi/")
    datasets = read_dataset(infile)
    if not os.path.exists(id2abstract_pkl):
        logging.info("Building id2abstract.pkl...")
        start = time.time()
        id2abstract = hash_id2abstract("./original_data/kb_data",
                                       id2abstract_pkl)
        logging.info("Build id2abstract.pkl done!,  Total time {} s".format(
            time.time() - start))
    else:
        id2abstract = pkl.load(open(id2abstract_pkl, "rb"))

    if not os.path.exists(args.name_dict_path):
        logging.info("Name dictionary does not exist; building it now.")
        build_name_dict(args.kb_data_path, args.name_dict_path)
    name_dict = pkl.load(open(args.name_dict_path, "rb"))

    outwriter = codecs.open(outfile, "w", "utf-8")
    pos_count = 0
    neg_count = 0
    total_entity = 0
    used_lines = 0
    max_leng = 0
    for data in tqdm(datasets):
        data = eval(data)
        candi_text = data["text"]
        for mention in data["mention_data"]:
            if mention["kb_id"] == "NIL":
                continue
            source_entity = mention["mention"]
            offset = int(mention["offset"])
            candi_offset = (offset, len(source_entity) + offset)
            candi_entity = gen_candidate_entity(source_entity,
                                                name_dict,
                                                mode="exact")
            used_lines += 1
            total_entity += len(candi_entity)
            if not candi_entity:
                continue
            if len(candi_entity) > 20:
                max_leng += 1
#                continue
            for centity_id in candi_entity:
                if centity_id not in id2abstract:
                    continue
                out_line = {
                    "query_entity": source_entity,
                    "query_text": candi_text,
                    "query_offset": candi_offset
                }
                out_line["candi_entity"], out_line[
                    "candi_abstract"] = id2abstract[centity_id]
                if centity_id == mention["kb_id"]:
                    out_line["tag"] = 1
                    pos_count += 1
                else:
                    out_line["tag"] = 0
                    neg_count += 1
#                out_line["tag"] = 1 if centity_id == mention["kb_id"] else 0
                outwriter.write(json.dumps(out_line) + "\n")
    logging.info("upper max_length: {}".format(max_leng))
    logging.info(
        "Total samples: {}, of which {} positive and {} negative".format(
            pos_count + neg_count, pos_count, neg_count))
    logging.info("Avg candidate entity length: {}".format(total_entity /
                                                          used_lines))
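# gen_candidate_entity is not shown in this file; a minimal sketch of what
# "exact" mode plausibly does, given how name_dict is built above (assumption,
# not the repo's implementation): normalize the mention and look it up.
def gen_candidate_entity_sketch(source_entity, name_dict, mode="exact"):
    key = standard_string(source_entity)   # same normalization as build_name_dict
    if mode == "exact":
        return name_dict.get(key, [])      # all KB ids sharing this surface form
    raise NotImplementedError(mode)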