def bio2relation():
    """Build relation-classification test inputs from the ensembled typed entities.

    Pairs every FAMILYMEMBER mention with every non-FAMILYMEMBER mention that
    lies within GLOBAL_CUTOFF sentences, and writes two alternative BERT input
    plans (REL_TESTa and REL_TEST) plus the entity-pair mapping used to decode
    predictions later.
    """
    TAG = "pred"
    sdiff = []
    relation_types = []
    pred_relations_plan1 = []
    pred_relations_plan2 = []
    mapping = []

    typed_entities = pkl_load(Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")
    for doc_id, ens in typed_entities.items():
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) / f"{doc_id}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{doc_id}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)

        # Every ordered pair of distinct entity indices.
        for eid1, eid2 in permutations(range(len(ens)), 2):
            # e.g. (('son', 'FAMILYMEMBER', (334, 337), (342, 345)), ['NA', 'Son'], 'FAMILYMEMBER')
            en1 = ens[eid1]
            en2 = ens[eid2]
            # Keep only FAMILYMEMBER -> non-FAMILYMEMBER pairs.
            if en1[-1].upper() != "FAMILYMEMBER" or en2[-1].upper() == "FAMILYMEMBER":
                continue
            sie1 = get_sent_idx(en1[0][3], sent_bound)
            sie2 = get_sent_idx(en2[0][3], sent_bound)
            if abs(sie1 - sie2) > GLOBAL_CUTOFF:
                continue

            tagged_s1, tagged_s2, pure_text1, pure_text2 = insert_tags_for_relation(
                sents[sie1], sents[sie2], en1[0], en2[0])
            pred_relations_plan1.append([
                TAG, tagged_s1, tagged_s2, pure_text1, pure_text2,
                f"{abs(sie1 - sie2)}", str()
            ])
            tp = generate_bert_relation_without_extra_sentence(
                sents[sie1], sents[sie2], en1[0], en2[0], sents, sie1, sie2)
            pred_relations_plan2.append([TAG, tp, f"{abs(sie1 - sie2)}"])
            mapping.append((doc_id, en1, en2))

    prel = Path(REL_TEST)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan2, prel / "test.tsv")

    prel = Path(REL_TESTa)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan1, prel / "test.tsv")
def read_word_embedding(replace=False):
    """Load (or rebuild and cache) the word-embedding matrix and word->index map.

    Returns the word2idx dict; "UNKNOWN" is mapped to 1 and real words start
    at index 2.  NOTE(review): the cached vector list places the unknown
    vector at position 0 and word vectors from position 1, which is one less
    than the word2idx indices above — confirm downstream prepends a padding
    row (e.g. for index 0) before lookup.
    """
    unknown_dir = "embedding/unknown.npy"
    vectors_dir = "embedding/vectors.npy"
    words_dir = "embedding/words.json"
    word_embeddings_dir = "embedding/word_embeddings.npy"
    word2idx_dir = "embedding/word2idx.json"
    print("read word embedding")

    cache_missing = (not os.path.exists(word_embeddings_dir)
                     or not os.path.exists(word2idx_dir))
    if replace or cache_missing:
        vectors = np.load(vectors_dir)
        unknown = np.load(unknown_dir)
        extension = utils.get_file_extension(words_dir)[1:]
        assert extension in ["json", "pl"]
        words = (utils.json_load(words_dir) if extension == "json"
                 else utils.pkl_load(words_dir))
        word2idx = {"UNKNOWN": 1, **{w: i + 2 for i, w in enumerate(words)}}
        vectors = [unknown, *list(vectors)]
        np.save(word_embeddings_dir, vectors)
        utils.json_dump(word2idx, word2idx_dir)
    else:
        word2idx = utils.json_load(word2idx_dir)

    print("vocab: %d words" % (len(word2idx) - 1))
    return word2idx
def bio2typing(res_dir, test_fids, tag=0):
    """Turn merged NER results into per-entity-type classification test inputs.

    Writes test.tsv files for the family-member (fms/fmr), observation (obn)
    and living-status (lss) classifiers, plus pickled copies under
    NER_TYPING_ROOT, all keyed by the ensemble `tag`.
    """
    res = non_integrated_results(res_dir, test_fids)
    merged_entities = extract_entities(res, test_fids)
    ner_typing_root = Path(NER_TYPING_ROOT)
    ner_typing_root.mkdir(parents=True, exist_ok=True)
    pkl_save(merged_entities, ner_typing_root / f"merged_entities_{tag}.pkl")

    for test_fid in test_fids:
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) / f"{test_fid}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{test_fid}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)
        ens = merged_entities[test_fid]
        # NOTE(review): these lists are re-initialized for every test_fid while
        # the tsv/pkl outputs below are written once after the loop, so only
        # the final document's rows appear to reach the files — confirm intended.
        fm, ls, ob = [], [], []
        for en_idx, en in enumerate(ens):
            # e.g. ('son', 'FAMILYMEMBER', (334, 337), (342, 345))
            en_span = en[-1]
            en_type = en[1].lower()
            sidx = get_sent_idx(en_span, sent_bound)
            en_loc_sent = sents[sidx]
            pure_text = pre_txt[sidx]
            tagged_sent = insert_token_and_creat_text_for_testing(
                en_loc_sent, en_span)
            if valida_by_sent(tagged_sent):
                print(test_fid, en, tagged_sent)
            uid = f"{test_fid}@{en_idx}"
            if en_type == "familymember":
                fm.append([uid, uid, pure_text, tagged_sent])
            elif en_type == "observation":
                ob.append([uid, pure_text, tagged_sent])
            elif en_type == "livingstatus":
                ls.append([uid, pure_text, tagged_sent])
            else:
                raise RuntimeError(f"{en_type} is not recognized for {en}")

    # fms, fmr share the same dir
    pfm = Path(FMS_TEST.format(tag))
    pfm.mkdir(exist_ok=True, parents=True)
    to_tsv(fm, pfm / "test.tsv")

    pfo = Path(OBN_TEST.format(tag))
    pfo.mkdir(exist_ok=True, parents=True)
    to_tsv(ob, pfo / "test.tsv")

    pfl = Path(LSS_TEST.format(tag))
    pfl.mkdir(exist_ok=True, parents=True)
    to_tsv(ls, pfl / "test.tsv")

    pkl_save(fm, ner_typing_root / f"fm_{tag}.pkl")
    pkl_save(ob, ner_typing_root / f"ob_{tag}.pkl")
    pkl_save(ls, ner_typing_root / f"ls_{tag}.pkl")
def get_neighbors(args):
    """Interactively look up precomputed edit-distance-2 neighbors for words.

    Loads the neighbor dictionary pickled at
    'ed2_neighbors{args.save_root}pt{args.perturb_type}.pkl' and loops
    forever prompting for a word (case-insensitive) and printing its
    neighbor list, or a notice if the word was not preprocessed.
    """
    neighbor_path = 'ed2_neighbors{}pt{}.pkl'.format(args.save_root,
                                                     args.perturb_type)
    neighbor_dict = pkl_load(neighbor_path)
    while True:
        # Bug fix: removed debug leftover that printed
        # `'broad' in neighbor_dict['bold']` on every iteration.
        inpt = input("Enter a word: ").lower()
        if inpt not in neighbor_dict:
            print("Word not preprocessed...")
        else:
            print("Neighbors for {}:".format(inpt))
            print(neighbor_dict[inpt])
def __getitem__(self, index):
    """
    Args:
        index (int): Index in range [0, self.__len__ - 1]

    Returns:
        image: torch.Tensor of size [3,H,W].
        bboxes: torch.Tensor of size [n_bbox, 4] i.e. n boxes each as
            [top_left_x, top_left_y, bottom_right_x, bottom_right_y]
        context_indices: torch.Tensor of size [n_bbox, 2*context_size] i.e.
            0-indexed bbox indices of each box's neighbors; padded with -1
            when fewer than 2*context_size neighbors exist
        labels: torch.Tensor of size [n_bbox], label of each bbox
    """
    img_id = self.ids[index]
    img = Image.open('%s/imgs/%s.png' % (self.root, img_id)).convert('RGB')
    img = self.img_transform(img)

    bboxes = pkl_load('%s/bboxes/%s.pkl' % (self.root, img_id))
    if self.max_bg_boxes > 0:
        ## TODO: Make sure order is preserved
        is_bg = bboxes[:, -1] == 0
        bg_boxes = bboxes[is_bg]
        pos_boxes = bboxes[~is_bg]
        # Random subsample of background boxes, capped at max_bg_boxes.
        keep = np.random.permutation(len(bg_boxes))[:self.max_bg_boxes]
        bboxes = np.concatenate((pos_boxes, bg_boxes[keep]), axis=0)

    labels = torch.LongTensor(bboxes[:, -1])
    bboxes = torch.Tensor(bboxes[:, :-1])
    bboxes[:, 2:] += bboxes[:, :2]  # convert from [x,y,w,h] to [x1,y1,x2,y2]

    n_bbox = bboxes.shape[0]
    context_indices = []
    for i in range(n_bbox):
        before = range(max(0, i - self.context_size), i)
        after = range(i + 1, min(n_bbox, i + self.context_size + 1))
        context = [*before, *after]
        context_indices.append(
            context + [-1] * (2 * self.context_size - len(context)))
    context_indices = torch.LongTensor(context_indices)

    return img, bboxes, context_indices, labels
def get_results(test_ds, type):
    """Evaluate `model` on `test_ds`, persist the report, and return the metrics.

    `type` tags the output locations (e.g. 'test' or 'valid'); the name shadows
    the builtin but is kept for interface compatibility.  Side effects: resets
    models/<modelname>/<type>_eval_results and writes <type>_report.dict.
    """
    dl = DataLoader(test_ds, batch_size=len(test_ds), num_workers=5)
    retdict = train.eval_model(model, dl, args.OP_tgt, 3)

    # Bug fix: was try/bare-except around makedirs; start from a clean
    # results directory explicitly instead of swallowing every exception.
    model_dir = os.path.join('models', args.modelname)
    results_dir = os.path.join(model_dir, f'{type}_eval_results')
    if os.path.isdir(results_dir):
        shutil.rmtree(results_dir)
    os.makedirs(results_dir)

    retdict['modelname'] = args.modelname
    # Print
    pprint.pprint(retdict)
    utils.pkl_dump(
        retdict,
        os.path.join('models', args.modelname, f'{type}_report.dict'))
    # Bug fix: return the metrics so `print(get_results(...))` below no
    # longer prints None (callers that ignored the return are unaffected).
    return retdict


valid_ds, test_ds = SeqDataset(args.dataset, 'valid'), SeqDataset(args.dataset, 'test')
X_Mean = utils.pkl_load(os.path.join('data', args.dataset, 'x_mean.pkl'))
input_size = X_Mean.size
# NOTE(review): this rebinding hides the imported `model` module afterwards.
model = model.StackedGRUDClassifier(input_size, args.output_dim, X_Mean, [])
model.load_state_dict(
    torch.load(os.path.join('models', args.modelname,
                            'checkpoint.pt'))['state_dict'])
if os.path.exists(os.path.join('models', args.modelname, 'report.txt')):
    os.remove(os.path.join('models', args.modelname, 'report.txt'))
print(get_results(test_ds, 'test'))
def gen_res_for_subtask2():
    """Decode the relation model's predictions into the subtask-2 output file."""
    mapping = pkl_load(os.path.join(REL_TEST, "relation_mappings.tsv"))
    bert_preds = load_bert_results(Path(REL_OUTPUT_ROOT) / "test_results.txt")
    formatted = format_bert_output(bert_preds, mapping)
    to_task2_output(formatted, PRED_SUBTASK_2)
def gen_res_for_subtask1(test_fids):
    """Ensemble the five NER-typing runs and write the subtask-1 predictions.

    Collects typed-entity predictions across tags 0..4, keeps those appearing
    in more than ENSEMBLE_THRESHOLD runs, caches the ensembled entities for
    the relation step, and writes the tab-separated subtask-1 file.
    """
    typed_entities_en = []
    for tag in range(5):
        ner_typing_root = Path(NER_TYPING_ROOT)
        merged_entities = pkl_load(ner_typing_root / f"merged_entities_{tag}.pkl")
        fm_test_input = pkl_load(ner_typing_root / f"fm_{tag}.pkl")
        ob_test_input = pkl_load(ner_typing_root / f"ob_{tag}.pkl")
        ls_test_input = pkl_load(ner_typing_root / f"ls_{tag}.pkl")

        fms_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("fms", tag))
        fmr_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("fmr", tag))
        obn_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("obn", tag))
        lss_test = load_ner_typing_results(CLS_OUTPUT_ROOT.format("lss", tag))

        fm_merged_res = merge_test_with_result(fm_test_input, fms_test, fmr_test)
        ob_merged_res = merge_test_with_result(ob_test_input, obn_test)
        ls_merged_res = merge_test_with_result(ls_test_input, lss_test)

        typed_entities = defaultdict(list)
        for tfid in test_fids:
            entities = merged_entities[tfid]
            merged = (fm_merged_res[tfid] + ob_merged_res[tfid]
                      + ls_merged_res[tfid])
            for en_id, ner_types in merged:
                en = entities[en_id]
                typed_entities[tfid].append((en, ner_types, en[1]))

        # Flatten to hashable tuples so identical predictions from different
        # runs can be counted together.
        for doc_id, preds in typed_entities.items():
            # e.g. (('sister', 'FamilyMember', (576, 582), (597, 603)), ['NA', 'Sister'], 'FamilyMember')
            for en, ner_types, en_type in preds:
                typed_entities_en.append((doc_id, en, tuple(ner_types), en_type))

    # Keep predictions made by more than ENSEMBLE_THRESHOLD of the runs.
    final_res = [
        item for item, cnt in Counter(typed_entities_en).most_common()
        if cnt > ENSEMBLE_THRESHOLD
    ]
    typed_entities_f = defaultdict(list)
    for doc_id, en, ner_types, en_type in final_res:
        typed_entities_f[doc_id].append((en, list(ner_types), en_type))

    task1_ens = []
    for doc_id, ens in typed_entities_f.items():
        for en in ens:
            new_en = [doc_id]
            en_type = en[-1]
            if en_type.upper() == 'FAMILYMEMBER':
                new_en.append(en_map[en_type])
                new_en.append(en[1][1])  # e.g. 'Sister'
                new_en.append(en[1][0])  # e.g. 'NA'
                task1_ens.append(new_en)
            elif en_type.upper() == "OBSERVATION":
                new_en.append(en_map[en_type])
                new_en.append(en[0][0])  # entity surface text
                task1_ens.append(new_en)

    pkl_save(typed_entities_f, Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")

    # Deduplicate and order by the numeric suffix of the doc id.
    task1_ens = sorted({tuple(e) for e in task1_ens},
                       key=lambda x: int(x[0].split("_")[-1]))
    with open(PRED_SUBTASK_1, "w") as f:
        for row in task1_ens:
            cont = "\t".join(row)
            f.write(f"{cont}\n")
def main(args):
    """Train the IMN model and report the best aspect/opinion/ABSA F1 scores.

    Reads pickled features/embeddings from ../data/pkl/IMN/, trains for
    args.epochs epochs with scheduled sampling of gold opinion labels, saves
    the best model to args.save/model.pt, and writes a summary log to
    f"{args.iters}.log".
    """
    # define location to save the model
    if args.save == "__":
        # NOTE(review): %d truncates args.lr if it is a float — confirm intended.
        args.save = "save/IMN_%d_%d" % (args.lr, args.batch_size)
    # Bug fix: os.mkdir fails when the parent 'save/' directory is missing;
    # makedirs(exist_ok=True) creates the whole path and tolerates reruns.
    os.makedirs(args.save, exist_ok=True)

    in_dir = "../data/pkl/IMN/"
    datasets = utils.pkl_load(in_dir + "features.pkl")

    sent_train_index = datasets["train_sent"]["index"]
    sent_test_index = datasets["test_sent"]["index"]
    ae_tag_train = datasets["train_sent"]["target_opinion"]
    as_tag_train = datasets["train_sent"]["target_polarity"]
    ae_tag_test = datasets["test_sent"]["target_opinion"]
    as_tag_test = datasets["test_sent"]["target_polarity"]
    op_label_input_train = datasets["train_sent"]["opinion_ex"]
    op_label_input_test = datasets["test_sent"]["opinion_ex"]
    ap_label_input_train = datasets["train_sent"]["aspect_ex"]
    ap_label_input_test = datasets["test_sent"]["aspect_ex"]

    train_batch = args.batch_size
    test_batch = args.batch_size
    train_sent_set = utils.SentDataset(sent_train_index, ae_tag_train,
                                       as_tag_train, op_label_input_train,
                                       ap_label_input_train, args.max_len)
    test_set = utils.SentDataset(sent_test_index, ae_tag_test, as_tag_test,
                                 op_label_input_test, ap_label_input_test,
                                 args.max_len)
    test_set_loader = DataLoader(dataset=test_set, batch_size=test_batch,
                                 shuffle=False)
    train_sent_loader = DataLoader(dataset=train_sent_set,
                                   batch_size=train_batch, shuffle=True)

    general_embeddings = utils.pkl_load(in_dir + "general_embeddings.pkl")
    domain_embeddings = utils.pkl_load(in_dir + "domain_embeddings.pkl")
    general_embeddings = torch.from_numpy(general_embeddings).float()
    domain_embeddings = torch.from_numpy(domain_embeddings).float()

    model = NewImn(general_embeddings, domain_embeddings,
                   ae_nums=args.ae_nums, as_nums=args.as_nums,
                   ds_nums=args.ds_nums, iters=args.iters,
                   dropout=args.dropout,
                   use_transission=args.use_transission)

    # Separate weights and biases into their own optimizer parameter groups.
    weight_p, bias_p = [], []
    for name, p in model.named_parameters():
        if 'bias' in name:
            bias_p += [p]
        else:
            weight_p += [p]
    optimizer = torch.optim.Adam([
        {'params': weight_p, 'weight_decay': 0},
        {'params': bias_p, 'weight_decay': 0},
    ], lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.95,
                                                       last_epoch=-1)
    model.cuda()

    f_aspect_best, f_opinion_best, f_absa_best = -np.inf, -np.inf, -np.inf
    tic = time.time()
    print("-----------------------------", args.epochs, len(train_sent_set),
          args.batch_size)
    for i in range(args.epochs):
        gold_prob = get_prob(i)
        rnd = np.random.uniform()
        # as epoch increasing, the probability of using gold opinion label decreases.
        if rnd < gold_prob:
            p_gold_op_train = np.ones((1, args.max_len))
        else:
            p_gold_op_train = np.zeros((1, args.max_len))
        p_gold_op_train = torch.from_numpy(p_gold_op_train).float()

        print("--------------\nEpoch %d begins!" % (i))
        loss = train(model, optimizer, train_sent_loader, p_gold_op_train)
        print("loss=", loss)
        print(" using %.5f seconds" % (time.time() - tic))
        tic = time.time()

        print("\n Begin to predict the results on Validation")
        p_gold_op_test = torch.from_numpy(np.zeros((1, args.max_len))).float()
        f_aspect, f_opinion, f_absa = test(model, test_set_loader,
                                           p_gold_op_test, name="train")
        print(" ---%f %f %f---" % (f_aspect, f_opinion, f_absa))
        print(" ----Old best aspect f1 score on test is %f" % f_aspect_best)
        print(" ----Old best opinion f1 score on test is %f" % f_opinion_best)
        print(" ----Old best ABSA f1 score on test is %f" % f_absa_best)
        if f_aspect > f_aspect_best:
            print(" ----New best aspect f1 score on test is %f" % f_aspect)
            f_aspect_best = f_aspect
        if f_opinion > f_opinion_best:
            print(" ----New best opinion f1 score on test is %f" % f_opinion)
            f_opinion_best = f_opinion
        if f_absa > f_absa_best:
            print(" ----New best ABSA f1 score on test is %f" % f_absa)
            f_absa_best = f_absa
            # Persist the best model so far.
            with open(args.save + "/model.pt", 'wb') as to_save:
                torch.save(model, to_save)
        scheduler.step()
        print("lr=", optimizer.param_groups[0]['lr'])

    print("best ABSA f1 score on test is %f" % f_absa_best)
    print("best Aspect f1 score on test is %f" % f_aspect_best)
    print("best Opinion f1 score on test is %f" % f_opinion_best)

    log = f"{args.iters}.log"
    with open(log, "w") as f:
        # Bug fix: original writes lacked newlines, so the three summary
        # lines ran together in the log file.
        f.write("best ABSA f1 score on test is %f\n" % f_absa_best)
        f.write("best Aspect f1 score on test is %f\n" % f_aspect_best)
        f.write("best Opinion f1 score on test is %f\n" % f_opinion_best)
def __getitem__(self, index):
    """Return the (sequence, label) pair for the sample at `index`."""
    sample_id = self.list_IDs[index]
    # NOTE(review): the file has a .npy suffix but is read with pkl_load —
    # confirm the arrays were pickled despite the extension.
    X = utils.pkl_load(os.path.join(self.datadir, sample_id + '.npy'))
    y = self.label_dict[sample_id]  # dict of available target values
    return X, y
def __init__(self, datadir, type):
    """Index the split at data/<datadir>/<type> via its label_dict.pkl.

    `type` names the split (e.g. 'train'/'valid'/'test'); it shadows the
    builtin but is kept for interface compatibility.
    """
    self.datadir = os.path.join('data', datadir, type)
    label_path = os.path.join(self.datadir, 'label_dict.pkl')
    self.label_dict = utils.pkl_load(label_path)
    self.list_IDs = list(self.label_dict.keys())
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# filename: rander_prob_emit.py
"""Render the pickled emission-probability table to text and gzip files.

Each state key k1 becomes a section headed by a blank line and the key,
followed by one "<char> <prob>D" line per emission (the trailing D marks a
Java double literal for the consuming Java code).
"""
import gzip
from utils import pkl_load

emitP_FILE = "../src/main/resources/prob_emit.p"
OUTPUT_TXT_FILE = "../src/main/resources/prob_emit.txt"
OUTPUT_GZ_FILE = "../src/main/resources/prob_emit.gz"

# Perf fix: accumulate parts and join once instead of quadratic string +=.
parts = []
for k1, v1 in pkl_load(emitP_FILE).items():
    parts.append(f"\n{k1}\n")
    for k2, v2 in v1.items():
        # k2 = k2.encode("unicode-escape").decode("utf-8")  # to unicode string
        parts.append(f"{k2} {v2}D\n")
output = "".join(parts)

with open(OUTPUT_TXT_FILE, "w", encoding="utf-8") as fp:
    fp.write(output)

with gzip.open(OUTPUT_GZ_FILE, "wb") as fp:
    fp.write(output.encode("utf-8"))
import java.util.HashMap; import java.util.Map; class Model {""" + "\n" print(header) ### END header ### ### startP ### map_startP = "" map_startP += tab( 1) + r"protected static final Map<Character, Double> startP;" + "\n" map_startP += "\n" map_startP += tab(1) + r"static {" + "\n" map_startP += tab(2) + r"startP = new HashMap<Character, Double>() {{" + "\n" for k1, v1 in pkl_load(startP_FILE).items(): map_startP += tab(3) + f"put('{k1}', {v1}D);" + "\n" map_startP += tab(2) + r"}};" + "\n" map_startP += tab(1) + r"}" + "\n" print(map_startP) ### END startP ### ### transP ### map_transP = "" map_transP += tab( 1 ) + r"protected static final Map<Character, Map<Character, Double>> transP;" + "\n" map_transP += "\n" map_transP += tab(1) + r"static {" + "\n" map_transP += tab( 2) + r"transP = new HashMap<Character, Map<Character, Double>>() {{" + "\n"
def load_data(filename):
    """Load a pickled file from the configured preprocessed-data folder.

    NOTE(review): the path ends in the literal "(unknown)" and never uses
    `filename` — this looks like a redacted placeholder; confirm the real
    file name before relying on this helper.
    """
    path = f'{config.preprocessedfolder}/(unknown)'
    return utils.pkl_load(path)