import os

from torch.utils.data import DataLoader, RandomSampler


def get_dataloader(data_path, preprocessor, batch_size):
    """Build train/validation/test dataloaders from the three split files."""
    train_dataset = NerDataset(os.path.join(data_path, "train_data.txt"), preprocessor)
    val_dataset = NerDataset(os.path.join(data_path, "val_data.txt"), preprocessor)
    test_dataset = NerDataset(os.path.join(data_path, "test_data.txt"), preprocessor)

    # Only the training split is shuffled, via a random sampler.
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler
    )
    # drop_last=True discards the final partial batch on val/test.
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, drop_last=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True)
    return train_dataloader, val_dataloader, test_dataloader
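# Usage sketch (assumptions flagged): NerDataset's (path, preprocessor)
# signature is taken from get_dataloader above; the `preprocessor` object and
# the "data/" directory are illustrative placeholders, not from the source.
if __name__ == "__main__":
    preprocessor = ...  # whatever tokenizer/encoder object NerDataset expects
    train_dl, val_dl, test_dl = get_dataloader("data/", preprocessor, batch_size=32)
    print(len(train_dl), "training batches per epoch")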
import torch


def run_ner_infer(sent):
    """Run the trained NER model on one whitespace-tokenized sentence."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    top_rnns = True
    model = Net(top_rnns, len(VOCAB), device, finetuning=True)
    # Load the checkpoint, remapping to CPU when no GPU is available.
    if device == 'cpu':
        model.load_state_dict(
            torch.load('models/banner_model.pt', map_location=torch.device('cpu')))
    else:
        model.load_state_dict(torch.load('models/banner_model.pt'))
    model.to(device)

    # Dummy 'O' tags so the sentence fits the NerDataset interface.
    tags = ['O'] * len(sent.split())
    sent_infer = [["[CLS]"] + sent.split() + ["[SEP]"]]
    tags_infer = [["<PAD>"] + tags + ["<PAD>"]]

    infer_data = NerDataset(sent_infer, tags_infer)
    infer_iter = torch.utils.data.DataLoader(dataset=infer_data, batch_size=1,
                                             shuffle=False, collate_fn=pad,
                                             num_workers=0)
    # Project-level eval() (it shadows the builtin) returns predicted tags.
    pred = eval(model, infer_iter)
    # Map any residual <PAD> predictions to the outside tag.
    for x in range(len(pred[0])):
        if pred[0][x] == '<PAD>':
            pred[0][x] = 'O'
    # Strip the [CLS]/[SEP] positions before returning tokens and tags.
    return sent_infer[0][1:-1], pred[0][1:-1]
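# Example call (hedged): run_ner_infer returns tokens and predicted tags
# aligned one-to-one. The sample sentence is illustrative; input should be in
# whatever language banner_model.pt was trained on.
tokens, tags = run_ner_infer("John lives in Dhaka")
for tok, tag in zip(tokens, tags):
    print(f"{tok}\t{tag}")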
def main():
    args = parse()
    # Choose between a locally fine-tuned embedding checkpoint and the stock
    # Japanese BERT from the Hugging Face hub.
    if args.use_pretrained_embedding:
        model_version = 'embedding/adl-pretrained-model/bert-embedding-epoch-9/'
    else:
        model_version = 'cl-tohoku/bert-base-japanese'
    model, cus_model, device = setting(model_version, args)

    if args.bert:
        trainset = EE_dataset(mode='train')
        validset = EE_dataset(mode='valid')
        train(trainset, validset, device, model, cus_model,
              BATCH_SIZE=args.batch_size)
    elif args.bert_bilstm_crf:
        tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese',
                                                  do_lower_case=True)
        trainset = NerDataset('./BIO_data/BIO_data.pkl', tokenizer, tag2idx)
        validset = NerDataset('./BIO_data/BIO_data_dev.pkl', tokenizer, tag2idx)
        train_with_crf(model, trainset, validset, device, 5)
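# A minimal parse() sketch consistent with the flags main() reads above. This
# is an assumed CLI layout, not the project's actual argument parser.
import argparse

def parse():
    parser = argparse.ArgumentParser()
    parser.add_argument('--use_pretrained_embedding', action='store_true')
    parser.add_argument('--bert', action='store_true')
    parser.add_argument('--bert_bilstm_crf', action='store_true')
    parser.add_argument('--batch_size', type=int, default=8)
    return parser.parse_args()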
    tags_li_train.append(["<PAD>"] + y + ["<PAD>"])

# Wrap the validation split with the same [CLS]/[SEP] and <PAD> markers.
sents_valid, tags_li_valid = [], []
for x in valid_texts:
    sents_valid.append(["[CLS]"] + x + ["[SEP]"])
for y in valid_labels:
    tags_li_valid.append(["<PAD>"] + y + ["<PAD>"])

# Same wrapping for the test split.
sents_test, tags_li_test = [], []
for x in test_texts:
    sents_test.append(["[CLS]"] + x + ["[SEP]"])
for y in test_labels:
    tags_li_test.append(["<PAD>"] + y + ["<PAD>"])

train_dataset = NerDataset(sents_train, tags_li_train)
eval_dataset = NerDataset(sents_valid, tags_li_valid)
test_dataset = NerDataset(sents_test, tags_li_test)

# Only the training iterator shuffles; both iterators share the pad collate_fn.
train_iter = torch.utils.data.DataLoader(dataset=train_dataset,
                                         batch_size=batch_size, shuffle=True,
                                         collate_fn=pad, num_workers=0)
eval_iter = torch.utils.data.DataLoader(dataset=eval_dataset,
                                        batch_size=batch_size, shuffle=False,
                                        collate_fn=pad, num_workers=0)
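# A plausible sketch of the `pad` collate_fn used by the DataLoaders above:
# it right-pads every sequence in the batch to the batch's longest length.
# The per-sample layout (token ids plus tag ids) and the use of 0 as both the
# pad token id and the <PAD> tag index are assumptions, not the project's
# exact record format.
import torch

def pad(batch):
    token_ids, tag_ids = zip(*batch)
    maxlen = max(len(t) for t in token_ids)
    x = torch.zeros(len(batch), maxlen, dtype=torch.long)
    y = torch.zeros(len(batch), maxlen, dtype=torch.long)
    for i, (t, g) in enumerate(zip(token_ids, tag_ids)):
        x[i, :len(t)] = torch.as_tensor(t)
        y[i, :len(g)] = torch.as_tensor(g)
    return x, y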
fout.write(f"precision={precision}\n") fout.write(f"recall={recall}\n") fout.write(f"f1={f1}\n") os.remove("temp_bert") print("precision=%.4f" % precision) print("recall=%.4f" % recall) print("f1=%.4f" % f1) return precision, recall, f1 model = Net(hp.top_rnns, len(VOCAB), device, hp.finetuning).cuda() model = nn.DataParallel(model) train_dataset = NerDataset(hp.trainset) eval_dataset = NerDataset(hp.validset) train_iter = data.DataLoader(dataset=train_dataset, batch_size=hp.batch_size, shuffle=True, num_workers=4, collate_fn=pad) eval_iter = data.DataLoader(dataset=eval_dataset, batch_size=hp.batch_size, shuffle=False, num_workers=4, collate_fn=pad) optimizer = optim.Adam(model.parameters(), lr=hp.lr) criterion = nn.CrossEntropyLoss(ignore_index=0)
fout.write(f"precision={precision}\n") fout.write(f"recall={recall}\n") fout.write(f"f1={f1}\n") os.remove(f) print("precision=%.2f" % precision) print("recall=%.2f" % recall) print("f1=%.2f" % f1) return precision, recall, f1 if __name__ == "__main__": train_dataset = NerDataset("../input/train.tsv", 'bc5cdr') # here bc5cdr is dataset type eval_dataset = NerDataset("../input/test.tsv", 'bc5cdr') hp = HParams('bc5cdr') # Define model #config = BertConfig(vocab_size_or_config_json_file=config.BERT_CONFIG_FILE) model = Net(config=config.BERT_CONFIG_FILE, weight=config.BERT_WEIGHTS, vocab_len=len(hp.VOCAB), device=hp.device) if torch.cuda.is_available(): model.cuda() model.train() # update with already pretrained weight