def decode(hp):
    tokenizer = BertTokenizer.from_pretrained(hp.bert_model_dir, do_lower_case=False)
    if hp.dataset == "lm_raw_data_finance":
        dict_file = "../data/dataset_finance/annual_report_entity_list"
    elif hp.dataset == "lm_raw_data_novel":
        dict_file = "../data/dataset_book9/entity_book9"
    entity_dict = EntityDict(hp.dataset, dict_file)
    entity_dict.load(os.path.join(hp.bert_model_dir, 'entity.dict'))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    ner_label = NerLabel([hp.decodeset])
    if os.path.exists(os.path.join(hp.logdir, 'dict.pt')):
        ner_label.load(os.path.join(hp.logdir, 'dict.pt'))
    else:
        print('dict.pt does not exist')
        exit()

    decode_dataset = NerDataset(hp.decodeset, ner_label, tokenizer, entity_dict)

    model = Net(hp.bert_model_dir, hp.top_rnns, len(ner_label.VOCAB),
                entity_dict.entity_num, device, hp.finetuning).to(device)
    model = nn.DataParallel(model)

    ## Load the model parameters
    if os.path.exists(os.path.join(hp.logdir, 'model.pt')):
        model.load_state_dict(torch.load(os.path.join(hp.logdir, 'model.pt')))
    else:
        print("the pretrained model path does not exist!")
        exit()

    decode_iter = data.DataLoader(dataset=decode_dataset,
                                  batch_size=hp.batch_size,
                                  shuffle=True,
                                  num_workers=4,
                                  collate_fn=pad)

    fname = os.path.join(hp.logdir, '_')
    precision, recall, f1 = evaluate(model, decode_iter, fname, ner_label, verbose=False)
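# decode() reloads the label vocabulary that train() saved as dict.pt, so the
# tag-to-id mapping at decode time matches the one used during training. A
# minimal sketch of the save/load pair a NerLabel class would need for this
# round-trip; the attribute names (VOCAB, tag2idx, idx2tag) are assumptions,
# not the repository's actual fields.
import torch

class NerLabel:
    def __init__(self, files):
        ...  # scan the given files and build VOCAB / tag2idx / idx2tag

    def save(self, fname):
        torch.save({'VOCAB': self.VOCAB,
                    'tag2idx': self.tag2idx,
                    'idx2tag': self.idx2tag}, fname)

    def load(self, fname):
        d = torch.load(fname)
        self.VOCAB, self.tag2idx, self.idx2tag = d['VOCAB'], d['tag2idx'], d['idx2tag']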
if __name__=="__main__": parser = argparse.ArgumentParser() parser.add_argument("--batch_size", type=int, default=128) parser.add_argument("--lr", type=float, default=0.0001) parser.add_argument("--n_epochs", type=int, default=30) parser.add_argument("--finetuning", dest="finetuning", action="store_true") parser.add_argument("--explainable", dest="explainable", action="store_true") parser.add_argument("--top_rnns", dest="top_rnns", action="store_true") parser.add_argument("--logdir", type=str, default="checkpoints/01") parser.add_argument("--trainset", type=str, default="conll2003/train.txt") parser.add_argument("--validset", type=str, default="conll2003/valid.txt") hp = parser.parse_args() device = 'cuda' if torch.cuda.is_available() else 'cpu' train_dataset = NerDataset(hp.trainset) eval_dataset = NerDataset(hp.validset) if hp.explainable: print('WARNING: use EXPLAINABLE mode') maxlen = max(train_dataset.maxlen, eval_dataset.maxlen) _pad = partial(pad, maxlen=maxlen) model = Net(top_rnns=hp.top_rnns, vocab_size=len(VOCAB), device=device, finetuning=hp.finetuning, explainable=hp.explainable).to(device) model.init_tf(maxlen=maxlen) #model = nn.DataParallel(model) else: model = Net(hp.top_rnns, len(VOCAB), device, hp.finetuning).to(device) model = nn.DataParallel(model) _pad = pad
def train(hp):
    tokenizer = BertTokenizer.from_pretrained(hp.bert_model_dir, do_lower_case=False)
    if hp.dataset == "lm_raw_data_finance":
        dict_file = "../data/dataset_finance/raw_data/annual_report_entity_list"
    elif hp.dataset == "lm_raw_data_novel":
        dict_file = "../data/dataset_book9/raw_data/entity_book9"
    elif hp.dataset == "lm_raw_data_thuner":
        dict_file = "../data/dataset_thuner/raw_data/thu_entity.txt"
    entity_dict = EntityDict(hp.dataset, dict_file)
    entity_dict.load(os.path.join(hp.bert_model_dir, 'entity.dict'))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    ner_label = NerLabel([hp.trainset, hp.validset])
    fname = os.path.join(hp.logdir, 'dict.pt')
    ner_label.save(fname)

    train_dataset = NerDataset(hp.trainset, ner_label, tokenizer, entity_dict)
    eval_dataset = NerDataset(hp.validset, ner_label, tokenizer, entity_dict)
    test_dataset = NerDataset(hp.testset, ner_label, tokenizer, entity_dict)

    model = Net(hp.bert_model_dir, hp.top_rnns, len(ner_label.VOCAB),
                entity_dict.entity_num, device, hp.finetuning).to(device)
    device_ids = [0, 1]
    model = nn.DataParallel(model, device_ids=device_ids)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=pad)
    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=hp.batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=pad)
    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=hp.batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=pad)

    optimizer = optim.Adam(model.parameters(), lr=hp.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    ## train the model
    best_eval = -10
    for epoch in range(1, hp.n_epochs + 1):
        train_epoch(model, train_iter, optimizer, criterion, tokenizer)

        print(f"=========eval at epoch={epoch}=========")
        if not os.path.exists(hp.logdir):
            os.makedirs(hp.logdir)
        fname = os.path.join(hp.logdir, 'model')
        precision, recall, f1 = evaluate(model, eval_iter, fname, ner_label, verbose=False)
        if f1 > best_eval:
            best_eval = f1
            print("epoch {} got the best eval f-score: {}".format(epoch, best_eval))
            torch.save(model.state_dict(), f"{fname}.pt")
            print(f"weights were saved to {fname}.pt")

        print(f"=========test at epoch={epoch}=========")
        if not os.path.exists(hp.logdir):
            os.makedirs(hp.logdir)
        fname = os.path.join(hp.logdir, str(epoch))
        precision, recall, f1 = evaluate(model, test_iter, fname, ner_label, verbose=False)
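# train() delegates each pass over the data to train_epoch(), which this
# excerpt does not show. A minimal sketch, under the assumption that batches
# arrive in the (words, token_ids, label_ids, tags, seqlens) layout of the pad
# sketch above and that the model maps token ids to per-token logits; the real
# forward signature may take additional entity features.
def train_epoch(model, iterator, optimizer, criterion, tokenizer):
    model.train()
    for step, batch in enumerate(iterator):
        words, token_ids, label_ids, tags, seqlens = batch
        optimizer.zero_grad()
        logits = model(token_ids)                   # (batch, seq_len, n_labels)
        loss = criterion(logits.view(-1, logits.shape[-1]),
                         label_ids.view(-1).to(logits.device))
        loss.backward()
        optimizer.step()
        if step % 100 == 0:
            print(f"step: {step}, loss: {loss.item():.4f}")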
hp = parser.parse_args()

device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("=========build model=========")
model = Net(hp.top_rnns, len(VOCAB), device, hp.finetuning).cuda()
# model=1
print("=========parallel=========")
model = nn.DataParallel(model)

df_case = pd.read_csv('df_case.csv')
# make_ner_txt(df_case.inf, hp.testset)
print("=========load data=========")
test_dataset = NerDataset(hp.testset)
test_iter = data.DataLoader(dataset=test_dataset,
                            batch_size=hp.batch_size,
                            shuffle=False,
                            num_workers=4,
                            collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr=hp.lr)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# f1_best = 0
# fname = os.path.join(hp.logdir, 'model')
# print("=========ner=========")
# test_model = Net(hp.top_rnns, len(VOCAB), device, hp.finetuning)
# test_model = nn.DataParallel(test_model)
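# The fragment above builds test_iter (and even an optimizer) but stops before
# any prediction is made; the commented-out lines hint at a separate test
# model. A minimal sketch of the inference loop that would typically follow,
# assuming the batch layout of the pad sketch above and argmax decoding over
# per-token logits:
model.eval()
with torch.no_grad():
    for words, token_ids, label_ids, tags, seqlens in test_iter:
        logits = model(token_ids)            # (batch, seq_len, n_labels)
        preds = logits.argmax(-1).cpu().tolist()
        # trim each sequence to its true length; ids map back to tag strings
        preds = [p[:l] for p, l in zip(preds, seqlens)]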
torch.set_rng_state(state['torch_rng_state'])
torch.cuda.set_rng_state(state['cuda_rng_state'])
torch.cuda.set_rng_state_all(state['cuda_rng_state_all'])
np.random.set_state(state['np_rng_state'])
random.setstate(state['random_rng_state'])
if state.get('model') is not None and model is not None:
    model.load_state_dict(state['model'])


if __name__ == "__main__":
    for fold_iter_idx in range(args.fold_num):
        if args.do_train:
            Fold_iter_idx = fold_iter_idx
            # Prepare Documents
            if args.task.lower() == 'bc2gm':
                train_dataset = NerDataset(f"../{args.task}/train.tsv", args.task, args.pretrain_dir)
                eval_dataset = NerDataset(f"../{args.task}/test.tsv", args.task, args.pretrain_dir)
            elif args.task.lower() == 'bc6pm':
                if args.do_cross_valid:
                    if fold_iter_idx == 0:
                        if args.train_withGNP:
                            train_data_dir = os.path.join(
                                os.environ.get('BC6PM_dir'), 'GNormPlus',
                                'withAnn-Result', 'PMtask_Relations_TrainingSet_r.json')
                        else:
                            train_data_dir = os.path.join(
                                os.environ.get('BC6PM_dir'), 'json',
                                'PMtask_Relations_TrainingSet.json')
                        with open(train_data_dir) as f:
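# The restore block above expects a checkpoint dict with matching keys. For it
# to round-trip, the save side has to capture the same state; a minimal sketch
# of that counterpart (the function name is ours, but every getter mirrors one
# of the setters above):
import random
import numpy as np
import torch

def capture_rng_state(model=None):
    return {
        'torch_rng_state': torch.get_rng_state(),
        'cuda_rng_state': torch.cuda.get_rng_state(),
        'cuda_rng_state_all': torch.cuda.get_rng_state_all(),
        'np_rng_state': np.random.get_state(),
        'random_rng_state': random.getstate(),
        'model': model.state_dict() if model is not None else None,
    }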
fout.write(f"precision={score[0]}\n") fout.write(f"recall={score[1]}\n") fout.write(f"f1={score[2]}\n") os.remove(f) print("precision=%.2f" % score[0]) print("recall=%.2f" % score[1]) print("f1=%.2f" % score[2]) return score[0], score[1], score[2] if __name__ == "__main__": train_dataset = NerDataset("Data/train.tsv", 'i2b2') eval_dataset = NerDataset("Data/test.tsv", 'i2b2') # Define model config = BertConfig( vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE) model = Net(config=config, bert_state_dict=model_state_dict, vocab_len=len(hp.VOCAB), device=hp.device) # 'bc5cdr': ('<PAD>', 'B-Chemical', 'O', 'B-Disease' , 'I-Disease', 'I-Chemical'), class_sample_count = [ 10, 1, 20, 3, 4 ] # dataset has 10 class-1 samples, 1 class-2 samples, etc.
fout.write(f"precision={precision}\n") fout.write(f"recall={recall}\n") fout.write(f"f1={f1}\n") os.remove(f) print("precision=%.2f" % precision) print("recall=%.2f" % recall) print("f1=%.2f" % f1) return precision, recall, f1 if __name__ == "__main__": train_dataset = NerDataset("data/train.tsv", 'bc5cdr') # here bc5cdr is dataset type eval_dataset = NerDataset("data/test.tsv", 'bc5cdr') hp = HParams('bc5cdr') # Define model config = BertConfig( vocab_size_or_config_json_file=parameters.BERT_CONFIG_FILE) model = Net(config=config, bert_state_dict=state_dict, vocab_len=len(hp.VOCAB), device=hp.device) if torch.cuda.is_available(): model.cuda() model.train() # update with already pretrained weight
fout.write(f"precision={precision}\n") fout.write(f"recall={recall}\n") fout.write(f"f1={f1}\n") os.remove(f) print("precision=%.2f"%precision) print("recall=%.2f"%recall) print("f1=%.2f"%f1) return precision, recall, f1 if __name__=="__main__": model = Net(training=False) model.to('cuda') train_dataset = NerDataset("conll2003/train.txt") eval_dataset = NerDataset("conll2003/valid.txt") train_iter = data.DataLoader(dataset=train_dataset, batch_size=hp.batch_size, shuffle=True, num_workers=4, collate_fn=pad) eval_iter = data.DataLoader(dataset=eval_dataset, batch_size=hp.batch_size, shuffle=False, num_workers=4, collate_fn=pad) optimizer = optim.Adam(model.parameters(), lr = hp.lr) # optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)