def preprocess_vocab(args):
    """Build a vocabulary, compute its perturbation neighbors, and pickle them.

    Vocabulary comes either from GloVe or from a language model, selected by
    ``args.vocab_type``; neighbor generation is selected by ``args.perturb_type``
    ('ed1' = edit-distance-1, 'intprm' = internal permutation).
    """
    # Select the vocabulary source.
    if args.vocab_type == 'glove':
        vocab = load_glove_vocab(GLOVE_PATH)
    elif args.vocab_type == 'lm':
        query_handler = load_language_model()
        vocab = vocab_from_lm(query_handler)
    else:
        raise ValueError("Invalid vocab type of {}".format(args.vocab_type))

    print("Vocab sample: ", vocab[300:500])

    sub_dict = None  # no substitution restriction by default
    if args.modify_end:
        print("Modifying the end...")

    # Select the perturbation scheme.
    if args.perturb_type == 'ed1':
        typo2vocab, ed2_neighbors, neighbor_trans_map = preprocess_neighbors(
            vocab, filetype=args.filetype, sub_restrict=sub_dict)
    elif args.perturb_type == 'intprm':
        typo2vocab, ed2_neighbors, neighbor_trans_map = preprocess_neighbors_intprm(
            vocab)
    else:
        raise NotImplementedError

    #print("Typo2vocab bos: ", typo2vocab['bos'])
    print(len(typo2vocab))
    print(len(ed2_neighbors))

    #pkl_save(typo2vocab, 'typo2vocab.pkl')
    pkl_save(
        ed2_neighbors,
        'ed2_neighbors{}pt{}.pkl'.format(args.save_root, args.perturb_type))
    print("Saved ed2")
def save_clusterer(vocab_size=100000, perturb_type='ed1', save_dir='clusterers',
                   check_perturb_size=False):
    """Construct a Clusterer over the vocabulary graph and pickle its state.

    The saved dict carries the clusters plus the lookup maps needed to use
    them (word->cluster, cluster->representative, word frequencies,
    typo->cluster).
    """
    filename = 'vocab{}_{}.pkl'.format(vocab_size, perturb_type)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, filename)
    print("Will save at: {}".format(save_path))

    # Build clusterer: graph first, then clusters over it.
    clusterer = Clusterer(vocab_size=vocab_size)
    clusterer.construct_graph(perturb_type=perturb_type)
    clusterer.construct_clusters()

    #Option to analyze number of perturbations, etc.
    #if check_perturb_size:
    #    get_vocab_statistics(clusterer.vertices)
    #    return

    save_dict = {
        'cluster': clusterer.clusters,
        'word2cluster': clusterer.word2cluster,
        'cluster2representative': clusterer.cluster2representative,
        'word2freq': clusterer.word2freq,
        'typo2cluster': clusterer.typo2cluster,
    }
    print("Saving everything at: ", save_path)
    pkl_save(save_dict, save_path)
    print("Number of clusters: {}, vocab size: {}".format(
        len(clusterer.clusters), vocab_size))
def bio2relation():
    """Generate relation-classification test files from typed NER entities.

    For every (FAMILYMEMBER, non-FAMILYMEMBER) entity pair within
    GLOBAL_CUTOFF sentences of each other, emit two test formats
    (plan1 = tagged sentence pair, plan2 = single merged text) plus a
    mapping pickle so predictions can be traced back to entities.
    """
    TAG = "pred"
    sdiff = []
    relation_types = []
    pred_relations_plan1 = []
    pred_relations_plan2 = []
    mapping = []

    typed_entities = pkl_load(Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")
    for doc_id, ens in typed_entities.items():
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) / f"{doc_id}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{doc_id}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)

        # Ordered pairs: first entity must be a FAMILYMEMBER, second must not be.
        for eid1, eid2 in permutations(range(len(ens)), 2):
            # entity shape e.g.: (('son', 'FAMILYMEMBER', (334, 337), (342, 345)), ['NA', 'Son'], 'FAMILYMEMBER')
            en1, en2 = ens[eid1], ens[eid2]
            if en1[-1].upper() != "FAMILYMEMBER" or en2[-1].upper() == "FAMILYMEMBER":
                continue

            sie1 = get_sent_idx(en1[0][3], sent_bound)
            sie2 = get_sent_idx(en2[0][3], sent_bound)
            # Skip pairs too far apart to be plausibly related.
            if abs(sie1 - sie2) > GLOBAL_CUTOFF:
                continue

            bert_rels = insert_tags_for_relation(sents[sie1], sents[sie2],
                                                 en1[0], en2[0])
            tagged_s1, tagged_s2, pure_text1, pure_text2 = bert_rels
            pred_relations_plan1.append([
                TAG, tagged_s1, tagged_s2, pure_text1, pure_text2,
                f"{abs(sie1 - sie2)}",
                str()
            ])

            tp = generate_bert_relation_without_extra_sentence(
                sents[sie1], sents[sie2], en1[0], en2[0], sents, sie1, sie2)
            pred_relations_plan2.append([TAG, tp, f"{abs(sie1 - sie2)}"])
            mapping.append((doc_id, en1, en2))

    # Plan2 output directory.
    prel = Path(REL_TEST)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan2, prel / "test.tsv")

    # Plan1 output directory (same mapping, different test format).
    prel = Path(REL_TESTa)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan1, prel / "test.tsv")
def bio2typing(res_dir, test_fids, tag=0):
    """Turn merged NER results into per-type classification test files.

    Entities are bucketed by type (familymember / observation / livingstatus),
    each rendered as a tagged sentence, then written out as TSV test sets and
    pickled for later merging (see gen_res_for_subtask1).
    """
    res = non_integrated_results(res_dir, test_fids)
    merged_entities = extract_entities(res, test_fids)

    ner_typing_root = Path(NER_TYPING_ROOT)
    ner_typing_root.mkdir(parents=True, exist_ok=True)
    pkl_save(merged_entities, ner_typing_root / f"merged_entities_{tag}.pkl")

    for test_fid in test_fids:
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) / f"{test_fid}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{test_fid}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)

        ens = merged_entities[test_fid]
        fm, ls, ob = [], [], []
        for en_idx, en in enumerate(ens):
            # entity shape e.g.: ('son', 'FAMILYMEMBER', (334, 337), (342, 345))
            en_span = en[-1]
            en_type = en[1].lower()
            sidx = get_sent_idx(en_span, sent_bound)
            en_loc_sent = sents[sidx]
            pure_text = pre_txt[sidx]
            tagged_sent = insert_token_and_creat_text_for_testing(
                en_loc_sent, en_span)
            # Report entities whose tagged sentence fails validation.
            if valida_by_sent(tagged_sent):
                print(test_fid, en, tagged_sent)

            row_id = f"{test_fid}@{en_idx}"
            if en_type == "familymember":
                # id appears twice: same input feeds both fms and fmr models.
                fm.append([row_id, row_id, pure_text, tagged_sent])
            elif en_type == "observation":
                ob.append([row_id, pure_text, tagged_sent])
            elif en_type == "livingstatus":
                ls.append([row_id, pure_text, tagged_sent])
            else:
                raise RuntimeError(f"{en_type} is not recognized for {en}")

    # # fms, fmr share the same dir
    pfm = Path(FMS_TEST.format(tag))
    pfm.mkdir(exist_ok=True, parents=True)
    to_tsv(fm, pfm / "test.tsv")

    pfo = Path(OBN_TEST.format(tag))
    pfo.mkdir(exist_ok=True, parents=True)
    to_tsv(ob, pfo / "test.tsv")

    pfl = Path(LSS_TEST.format(tag))
    pfl.mkdir(exist_ok=True, parents=True)
    to_tsv(ls, pfl / "test.tsv")

    pkl_save(fm, ner_typing_root / f"fm_{tag}.pkl")
    pkl_save(ob, ner_typing_root / f"ob_{tag}.pkl")
    pkl_save(ls, ner_typing_root / f"ls_{tag}.pkl")
def gen_res_for_subtask1(test_fids):
    """Ensemble the five tagged typing runs and write subtask-1 predictions.

    Each of the 5 runs votes on (doc, entity, types) tuples; tuples seen more
    than ENSEMBLE_THRESHOLD times survive. Survivors are pickled for the
    relation stage and written, tab-separated, to PRED_SUBTASK_1.
    """
    typed_entities_en = []
    for tag in range(5):
        ner_typing_root = Path(NER_TYPING_ROOT)
        merged_entities = pkl_load(ner_typing_root / f"merged_entities_{tag}.pkl")
        fm_test_input = pkl_load(ner_typing_root / f"fm_{tag}.pkl")
        ob_test_input = pkl_load(ner_typing_root / f"ob_{tag}.pkl")
        ls_test_input = pkl_load(ner_typing_root / f"ls_{tag}.pkl")

        # Per-type classifier outputs for this run.
        fms_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("fms", tag))
        fmr_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("fmr", tag))
        obn_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("obn", tag))
        lss_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("lss", tag))

        fm_merged_res = merge_test_with_result(fm_test_input, fms_test, fmr_test)
        ob_merged_res = merge_test_with_result(ob_test_input, obn_test)
        ls_merged_res = merge_test_with_result(ls_test_input, lss_test)

        # Attach predicted types back onto the original entities.
        typed_entities = defaultdict(list)
        for tfid in test_fids:
            entities = merged_entities[tfid]
            mrs = fm_merged_res[tfid] + ob_merged_res[tfid] + ls_merged_res[tfid]
            for en_id, ner_types in mrs:
                en = entities[en_id]
                typed_entities[tfid].append((en, ner_types, en[1]))

        # Flatten to hashable tuples so runs can be counted for voting.
        # e.g. (('sister', 'FamilyMember', (576, 582), (597, 603)), ['NA', 'Sister'], 'FamilyMember')
        nd = []
        for k, v in typed_entities.items():
            for each in v:
                nd.append((k, each[0], tuple(each[1]), each[2]))
        typed_entities_en.extend(nd)

    # Majority vote across the 5 runs.
    final_res = [
        e[0] for e in Counter(typed_entities_en).most_common()
        if e[1] > ENSEMBLE_THRESHOLD
    ]
    typed_entities_f = defaultdict(list)
    for each in final_res:
        typed_entities_f[each[0]].append((each[1], list(each[2]), each[3]))

    # Format surviving entities as subtask-1 rows.
    task1_ens = []
    for doc_id, ens in typed_entities_f.items():
        for en in ens:
            new_en = [doc_id]
            en_type = en[-1]
            if en_type.upper() == 'FAMILYMEMBER':
                new_en.append(en_map[en_type])
                new_en.append(en[1][1])
                new_en.append(en[1][0])
                task1_ens.append(new_en)
            elif en_type.upper() == "OBSERVATION":
                new_en.append(en_map[en_type])
                new_en.append(en[0][0])
                task1_ens.append(new_en)

    pkl_save(typed_entities_f,
             Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")

    # De-duplicate and order by the numeric suffix of the doc id.
    task1_ens = sorted({tuple(e) for e in task1_ens},
                       key=lambda x: int(x[0].split("_")[-1]))
    with open(PRED_SUBTASK_1, "w") as f:
        for each in task1_ens:
            cont = "\t".join(each)
            f.write(f"{cont}\n")
def pickle_data(cdata, filename, folder=None):
    """Pickle *cdata* as *filename* under the preprocessed-data root.

    If *folder* is given, the file is written to that subfolder of
    ``config.preprocessedfolder`` (created if missing); otherwise it is
    written to the root itself.
    """
    folder = (f'{config.preprocessedfolder}/{folder}'
              if folder is not None else f'{config.preprocessedfolder}')
    # Fixed: os.path.exits / os.mkdirs do not exist (AttributeError at runtime);
    # the correct names are os.path.exists / os.makedirs.
    if not os.path.exists(folder):
        os.makedirs(folder)
    # Fixed: the target path previously hard-coded a placeholder instead of
    # using the filename argument, so every call overwrote the same file.
    utils.pkl_save(cdata, f'{folder}/{filename}')