Example #1
def preprocess_vocab(args):
    # Build the base vocabulary either from GloVe vectors or from a language model.
    if args.vocab_type == 'glove':
        vocab = load_glove_vocab(GLOVE_PATH)
    elif args.vocab_type == 'lm':
        query_handler = load_language_model()
        vocab = vocab_from_lm(query_handler)
    else:
        raise ValueError("Invalid vocab type: {}".format(args.vocab_type))
    print("Vocab sample: ", vocab[300:500])
    sub_dict = None
    if args.modify_end:
        print("Modifying the end...")
    # Precompute perturbation neighborhoods for the chosen perturbation type.
    if args.perturb_type == 'ed1':
        typo2vocab, ed2_neighbors, neighbor_trans_map = preprocess_neighbors(
            vocab, filetype=args.filetype, sub_restrict=sub_dict)
    elif args.perturb_type == 'intprm':
        typo2vocab, ed2_neighbors, neighbor_trans_map = preprocess_neighbors_intprm(
            vocab)
    else:
        raise NotImplementedError
    # print("Typo2vocab bos: ", typo2vocab['bos'])
    print("typo2vocab size:", len(typo2vocab))
    print("ed2_neighbors size:", len(ed2_neighbors))
    # pkl_save(typo2vocab, 'typo2vocab.pkl')
    pkl_save(
        ed2_neighbors,
        'ed2_neighbors{}pt{}.pkl'.format(args.save_root, args.perturb_type))
    print("Saved ed2")
Example #2
import os

def save_clusterer(vocab_size=100000, perturb_type='ed1', save_dir='clusterers',
                   check_perturb_size=False):

    filename = 'vocab{}_{}.pkl'.format(vocab_size, perturb_type)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, filename)
    print("Will save at: {}".format(save_path))

    # Initialize the clusterer over the vocabulary.
    clusterer = Clusterer(vocab_size=vocab_size)
    # Build the perturbation graph for the chosen perturbation type.
    clusterer.construct_graph(perturb_type=perturb_type)
    # Group connected words into clusters.
    clusterer.construct_clusters()

    # Optional analysis of the number of perturbations per word.
    # if check_perturb_size:
    #     get_vocab_statistics(clusterer.vertices)
    #     return
    save_dict = {'cluster': clusterer.clusters,
                 'word2cluster': clusterer.word2cluster,
                 'cluster2representative': clusterer.cluster2representative,
                 'word2freq': clusterer.word2freq,
                 'typo2cluster': clusterer.typo2cluster}

    print("Saving everything at: ", save_path)
    pkl_save(save_dict, save_path)
    print("Number of clusters: {}, vocab size: {}".format(len(clusterer.clusters), vocab_size))
from collections import Counter, defaultdict
from itertools import permutations
from pathlib import Path

def bio2relation():
    TAG = "pred"
    sdiff = []
    relation_types = []
    pred_relations_plan1 = []
    pred_relations_plan2 = []
    mapping = []

    typed_entities = pkl_load(Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")
    for doc_id, ens in typed_entities.items():
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) /
            f"{doc_id}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{doc_id}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)
        # All ordered pairs of entity indices in this document.
        all_pairs = list(permutations(range(len(ens)), 2))

        for each in all_pairs:
            eid1, eid2 = each
            # (('son', 'FAMILYMEMBER', (334, 337), (342, 345)), ['NA', 'Son'], 'FAMILYMEMBER')
            en1 = ens[eid1]
            en2 = ens[eid2]
            # Keep only pairs where the first entity is a family member and the second is not.
            if (en1[-1].upper() != "FAMILYMEMBER"
                    or en2[-1].upper() == "FAMILYMEMBER"):
                continue

            # Locate each entity's sentence and skip pairs farther apart than the cutoff.
            sie1 = get_sent_idx(en1[0][3], sent_bound)
            sie2 = get_sent_idx(en2[0][3], sent_bound)
            if abs(sie1 - sie2) > GLOBAL_CUTOFF:
                continue

            bert_rels = insert_tags_for_relation(sents[sie1], sents[sie2],
                                                 en1[0], en2[0])
            tagged_s1, tagged_s2, pure_text1, pure_text2 = bert_rels
            pred_relations_plan1.append([
                TAG, tagged_s1, tagged_s2, pure_text1, pure_text2,
                f"{abs(sie1 - sie2)}",
                str()
            ])
            tp = generate_bert_relation_without_extra_sentence(
                sents[sie1], sents[sie2], en1[0], en2[0], sents, sie1, sie2)
            pred_relations_plan2.append([TAG, tp, f"{abs(sie1 - sie2)}"])
            mapping.append((doc_id, en1, en2))

    prel = Path(REL_TEST)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan2, prel / "test.tsv")

    prel = Path(REL_TESTa)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan1, prel / "test.tsv")
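to_tsv is used here and in the next function but not shown; a minimal sketch, assuming each row is a list of string fields written as one tab-separated line:

def to_tsv(rows, path):
    # Write each row as one tab-separated line; accepts str or pathlib.Path.
    with open(path, "w", encoding="utf-8") as f:
        for row in rows:
            f.write("\t".join(str(field) for field in row) + "\n")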
def bio2typing(res_dir, test_fids, tag=0):
    res = non_integrated_results(res_dir, test_fids)
    merged_entities = extract_entities(res, test_fids)
    ner_typing_root = Path(NER_TYPING_ROOT)
    ner_typing_root.mkdir(parents=True, exist_ok=True)
    pkl_save(merged_entities, ner_typing_root / f"merged_entities_{tag}.pkl")
    fm, ls, ob = [], [], []
    for test_fid in test_fids:
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) /
            f"{test_fid}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{test_fid}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)

        ens = merged_entities[test_fid]
        for en_idx, en in enumerate(ens):
            # ('son', 'FAMILYMEMBER', (334, 337), (342, 345))
            en_span = en[-1]
            en_type = en[1].lower()
            sidx = get_sent_idx(en_span, sent_bound)
            en_loc_sent = sents[sidx]
            pure_text = pre_txt[sidx]
            tagged_sent = insert_token_and_creat_text_for_testing(
                en_loc_sent, en_span)

            if valida_by_sent(tagged_sent):
                print(test_fid, en, tagged_sent)

            if en_type == "familymember":
                fm.append([
                    f"{test_fid}@{en_idx}", f"{test_fid}@{en_idx}", pure_text,
                    tagged_sent
                ])
            elif en_type == "observation":
                ob.append([f"{test_fid}@{en_idx}", pure_text, tagged_sent])
            elif en_type == "livingstatus":
                ls.append([f"{test_fid}@{en_idx}", pure_text, tagged_sent])
            else:
                raise RuntimeError(f"{en_type} is not recognized for {en}")

    # fms and fmr outputs share the same directory.
    pfm = Path(FMS_TEST.format(tag))
    pfm.mkdir(exist_ok=True, parents=True)
    to_tsv(fm, pfm / "test.tsv")

    pfo = Path(OBN_TEST.format(tag))
    pfo.mkdir(exist_ok=True, parents=True)
    to_tsv(ob, pfo / "test.tsv")

    pfl = Path(LSS_TEST.format(tag))
    pfl.mkdir(exist_ok=True, parents=True)
    to_tsv(ls, pfl / "test.tsv")

    pkl_save(fm, ner_typing_root / f"fm_{tag}.pkl")
    pkl_save(ob, ner_typing_root / f"ob_{tag}.pkl")
    pkl_save(ls, ner_typing_root / f"ls_{tag}.pkl")
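Both functions above map an entity span to its sentence via get_sent_idx over the boundaries from creat_sent_altered_boundary; neither helper is shown, so this is a sketch under the assumption that the boundaries are an ordered list of (start, end) character offsets and the span is a (start, end) tuple:

def get_sent_idx(span, sent_bound):
    # Return the index of the sentence whose character range contains the span start.
    start = span[0]
    for idx, (s_start, s_end) in enumerate(sent_bound):
        if s_start <= start < s_end:
            return idx
    # Fall back to the last sentence when the span lies beyond the final boundary.
    return len(sent_bound) - 1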
def gen_res_for_subtask1(test_fids):
    typed_entities_en = []

    for tag in range(5):
        ner_typing_root = Path(NER_TYPING_ROOT)

        merged_entities = pkl_load(ner_typing_root /
                                   f"merged_entities_{tag}.pkl")
        fm_test_input = pkl_load(ner_typing_root / f"fm_{tag}.pkl")
        ob_test_input = pkl_load(ner_typing_root / f"ob_{tag}.pkl")
        ls_test_input = pkl_load(ner_typing_root / f"ls_{tag}.pkl")

        fms_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("fms", tag))
        fmr_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("fmr", tag))
        obn_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("obn", tag))
        lss_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("lss", tag))

        fm_merged_res = merge_test_with_result(fm_test_input, fms_test,
                                               fmr_test)
        ob_merged_res = merge_test_with_result(ob_test_input, obn_test)
        ls_merged_res = merge_test_with_result(ls_test_input, lss_test)

        typed_entities = defaultdict(list)
        for tfid in test_fids:
            entities = merged_entities[tfid]
            mrs = (fm_merged_res[tfid] + ob_merged_res[tfid] +
                   ls_merged_res[tfid])
            for each in mrs:
                en_id, ner_types = each
                en = entities[en_id]
                typed_entities[tfid].append((en, ner_types, en[1]))

        nd = []
        for k, v in typed_entities.items():
            # (('sister', 'FamilyMember', (576, 582), (597, 603)), ['NA', 'Sister'], 'FamilyMember')
            for each in v:
                nd.append((k, each[0], tuple(each[1]), each[2]))
        typed_entities_en.extend(nd)

    final_res = [
        e[0] for e in Counter(typed_entities_en).most_common()
        if e[1] > ENSEMBLE_THRESHOLD
    ]

    typed_entities_f = defaultdict(list)
    for each in final_res:
        typed_entities_f[each[0]].append((each[1], list(each[2]), each[3]))

    task1_ens = []
    for doc_id, ens in typed_entities_f.items():
        for en in ens:
            new_en = [doc_id]
            en_type = en[-1]
            if en_type.upper() == 'FAMILYMEMBER':
                # Row: doc_id, entity type, normalized family member name, side of family.
                new_en.append(en_map[en_type])
                new_en.append(en[1][1])
                new_en.append(en[1][0])
                task1_ens.append(new_en)
            elif en_type.upper() == "OBSERVATION":
                # Row: doc_id, entity type, observation text.
                new_en.append(en_map[en_type])
                new_en.append(en[0][0])
                task1_ens.append(new_en)

    pkl_save(typed_entities_f,
             Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")

    task1_ens = sorted({tuple(e) for e in task1_ens},
                       key=lambda x: int(x[0].split("_")[-1]))

    with open(PRED_SUBTASK_1, "w") as f:
        for each in task1_ens:
            cont = "\t".join(each)
            f.write(f"{cont}\n")
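The Counter step above implements a simple ensemble vote: a typed entity is kept only if more than ENSEMBLE_THRESHOLD of the five tagged runs produced the exact same tuple. A self-contained sketch of that vote (the threshold value here is illustrative):

from collections import Counter

ENSEMBLE_THRESHOLD = 2  # illustrative; the real value is project-specific

def ensemble_filter(predictions_per_run):
    # predictions_per_run: one list of hashable prediction tuples per model run.
    # Keep predictions emitted by more than ENSEMBLE_THRESHOLD runs.
    votes = Counter(p for run in predictions_per_run for p in run)
    return [pred for pred, count in votes.most_common() if count > ENSEMBLE_THRESHOLD]

runs = [[("doc_1", "son")], [("doc_1", "son")], [("doc_1", "son")],
        [("doc_1", "daughter")], []]
print(ensemble_filter(runs))  # [('doc_1', 'son')]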
Example #6
import os

def pickle_data(cdata, filename, folder=None):
    # Resolve the output folder under the preprocessed-data root, creating it if needed.
    folder = f'{config.preprocessedfolder}/{folder}' if folder is not None else f'{config.preprocessedfolder}'
    if not os.path.exists(folder):
        os.makedirs(folder)
    utils.pkl_save(cdata, f'{folder}/{filename}')
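A hedged usage sketch; config.preprocessedfolder and the cached object are assumptions:

# Hypothetical call: caches tokenized documents under
# <config.preprocessedfolder>/train/tokens.pkl
tokenized_docs = {"doc_1": ["the", "patient", "s", "son"]}
pickle_data(tokenized_docs, "tokens.pkl", folder="train")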