def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse, opt, fold_idx, isMeddra_dict):
    """Train the VSM-based normalization model and checkpoint the best version.

    Builds word/dict alphabets from the dictionary and corpora, initializes the
    word embedding (pretrained if ``d.config['norm_emb']`` is set, otherwise
    random), trains a ``VsmNormer`` with Adam, and evaluates on dev data each
    epoch when ``opt.dev_file`` is set.

    Args:
        train_data: list of documents with ``.entities``; may be extended
            in place with external TAC corpus data (side effect on caller's list).
        dev_data / test_data: documents used for dev evaluation / alphabet building.
        d: data/config holder exposing ``config`` (dict) and ``word_emb_dim``.
        dictionary / dictionary_reverse: concept dictionary and its reverse map.
        opt: options namespace (``dev_file``, ``test_file``, ``batch_size``,
            ``lr``, ``l2``, ``iter``, ``patience``, ``gradient_clip``,
            ``tune_wordemb``, ``output``, ``word_emb_dim``).
        fold_idx: cross-validation fold index, or None for a single run
            (controls the checkpoint filename suffix).
        isMeddra_dict: True when the dictionary is MedDRA-style; selects the
            instance-generation routine.

    Returns:
        (best_dev_p, best_dev_r, best_dev_f) — best dev precision/recall/F1
        observed; all remain -10 when no dev file is configured.
    """
    logging.info("train the vsm-based normalization model ...")

    # Optionally merge an external corpus into the training data.
    # NOTE: extends the caller's train_data list in place.
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                # NOTE(review): v.get('types') is passed twice — presumably one
                # positional slot is "types" and the next a related filter;
                # confirm against load_data_fda's signature.
                external_train_data.extend(load_data_fda(v['path'], True, v.get('types'), v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)

    # Build the word alphabet over the dictionary plus every corpus we will see,
    # then freeze it so unseen evaluation text cannot grow it.
    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary, isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)
    logging.info("alphabet size {}".format(word_alphabet.size()))

    # Word embedding: load pretrained vectors when configured, else random init.
    # Index 0 is reserved as padding in both cases.
    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(torch.from_numpy(random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    # Label alphabet over dictionary concepts (frozen, like the word alphabet).
    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)

    # Precompute word-index vectors (and their lengths) for every dictionary
    # concept name, used by the VSM model for similarity scoring.
    logging.info("init_vector_for_dict")
    poses, poses_lengths = init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict)

    vsm_model = VsmNormer(word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses, poses_lengths)

    # Turn entity mentions into (X, Y) training instances; the EHR variant needs
    # the reverse dictionary to resolve gold concept ids.
    logging.info("generate instances for training ...")
    train_X = []
    train_Y = []
    for doc in train_data:
        if isMeddra_dict:
            temp_X, temp_Y = generate_instances(doc.entities, word_alphabet, dict_alphabet)
        else:
            temp_X, temp_Y = generate_instances_ehr(doc.entities, word_alphabet, dict_alphabet, dictionary_reverse)

        train_X.extend(temp_X)
        train_Y.extend(temp_Y)

    train_loader = DataLoader(MyDataset(train_X, train_Y), opt.batch_size, shuffle=True, collate_fn=my_collate)

    optimizer = optim.Adam(vsm_model.parameters(), lr=opt.lr, weight_decay=opt.l2)

    # Optionally keep the embedding table fixed during training.
    if opt.tune_wordemb == False:
        freeze_net(vsm_model.word_embedding)

    # Optional warm-up phase: pretrain the model on the dictionary itself.
    if d.config['norm_vsm_pretrain'] == '1':
        dict_pretrain(dictionary, dictionary_reverse, d, isMeddra_dict, optimizer, vsm_model)

    # Sentinel starting scores; any real dev F1 will exceed -10.
    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0  # epochs since last dev improvement (early-stop counter)

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()

        vsm_model.train()
        train_iter = iter(train_loader)
        num_iter = len(train_loader)

        sum_loss = 0
        correct, total = 0, 0

        for i in range(num_iter):
            x, lengths, y = next(train_iter)

            l, y_pred = vsm_model.forward_train(x, lengths, y)
            sum_loss += l.item()
            l.backward()

            if opt.gradient_clip > 0:
                torch.nn.utils.clip_grad_norm_(vsm_model.parameters(), opt.gradient_clip)
            optimizer.step()
            # Zeroing the model's grads (rather than the optimizer's) clears the
            # same parameter gradients here since the optimizer holds all of them.
            vsm_model.zero_grad()

            # Running training accuracy from the batch argmax predictions.
            total += y.size(0)
            _, pred = torch.max(y_pred, 1)
            correct += (pred == y).sum().item()

        epoch_finish = time.time()
        accuracy = 100.0 * correct / total
        logging.info("epoch: %s training finished. Time: %.2fs. loss: %.4f Accuracy %.2f" % (idx, epoch_finish - epoch_start, sum_loss / num_iter, accuracy))

        if opt.dev_file:
            p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, vsm_model, None, None, d, isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            # No dev set: f equals best_dev_f, so the improvement branch below
            # can never fire (and p/r are never referenced in that case).
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f))
            # Save the whole model object; filename carries the fold number
            # (1-based) during cross-validation.
            if fold_idx is None:
                torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
            else:
                torch.save(vsm_model, os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1)))

            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            bad_counter = 0
        else:
            bad_counter += 1

        # Early stopping only applies when a dev file is configured.
        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    # Without a dev set, nothing was checkpointed above — save the final model.
    if len(opt.dev_file) == 0:
        torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))

    return best_dev_p, best_dev_r, best_dev_f
def train(train_data, dev_data, test_data, dictionary, dictionary_reverse, opt, fold_idx, isMeddra_dict):
    """Train a BERT-based normalization classifier and checkpoint the best model.

    Builds a concept (label) alphabet from the dictionary, generates EHR-style
    instances, fine-tunes a ``BertForSequenceClassification`` with ``BertAdam``
    (decoupled weight decay groups), and evaluates on dev data each epoch when
    ``opt.dev_file`` is set.

    Args:
        train_data: documents with ``.entities`` used to build training instances.
        dev_data: documents for per-epoch dev evaluation.
        test_data: unused in this function's visible body.
        dictionary / dictionary_reverse: concept dictionary and reverse map.
        opt: options namespace (``gpu``, ``bert_dir``, ``batch_size``, ``lr``,
            ``iter``, ``patience``, ``dev_file``, ``save``).
        fold_idx: cross-validation fold index; unused in this visible body.
        isMeddra_dict: dictionary-type flag; unused in this visible body
            (only the EHR instance generator is called).

    Returns:
        (best_dev_p, best_dev_r, best_dev_f); all remain -10 when no dev file
        is configured.
    """
    logging.info("build alphabet ...")
    # Label alphabet over dictionary concepts, closed so it cannot grow.
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, dictionary)
    dict_alphabet.close()

    train_X = []
    train_Y = []
    for doc in train_data:
        temp_X, temp_Y = generate_instances_ehr(doc.entities, dict_alphabet, dictionary_reverse)

        train_X.extend(temp_X)
        train_Y.extend(temp_Y)

    train_loader = DataLoader(MyDataset(train_X, train_Y), opt.batch_size, shuffle=True, collate_fn=my_collate)

    # Select device: specific CUDA device if requested and available, else CPU.
    if opt.gpu >= 0 and torch.cuda.is_available():
        device = torch.device('cuda', opt.gpu)
    else:
        device = torch.device('cpu')

    # NOTE(review): this from_pretrained returns a (model, extra) pair — a
    # project-local BERT wrapper, not the stock transformers API; confirm.
    model, _ = BertForSequenceClassification.from_pretrained(opt.bert_dir, target=dict_alphabet)
    model.dict_alphabet = dict_alphabet
    model.to(device)

    # Standard BERT fine-tuning: exclude biases and LayerNorm params from
    # weight decay, decay everything else at 0.01.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=opt.lr)

    # Sentinel starting scores; any real dev F1 will exceed -10.
    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0  # epochs since last dev improvement (early-stop counter)

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()

        model.train()
        train_iter = iter(train_loader)
        num_iter = len(train_loader)

        sum_loss = 0
        correct, total = 0, 0

        for i in range(num_iter):
            # Batch layout comes from my_collate: token ids, attention mask,
            # sentence/segment ids, labels, (unused), entity tokens, entity mask.
            x, mask, sentences, y, _, tokens_ent, mask_ent = next(train_iter)

            _, y_pred = model.forward(x, sentences, mask, tokens_ent, mask_ent)
            l = model.loss(y_pred, y)
            sum_loss += l.item()
            l.backward()
            optimizer.step()
            model.zero_grad()

            # Running training accuracy from batch argmax predictions.
            total += y.size(0)
            _, pred = torch.max(y_pred, 1)
            correct += (pred == y).sum().item()

        epoch_finish = time.time()
        accuracy = 100.0 * correct / total
        logging.info("epoch: %s training finished. Time: %.2fs. loss: %.4f Accuracy %.2f" % (idx, epoch_finish - epoch_start, sum_loss / num_iter, accuracy))

        if opt.dev_file:
            p, r, f = evaluate(dev_data, dictionary, dictionary_reverse, model)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            # No dev set: f equals best_dev_f, so the improvement branch below
            # can never fire (p/r are only read inside that branch).
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f))
            torch.save(model, os.path.join(opt.save, "norm_neural.pkl"))

            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            bad_counter = 0
        else:
            bad_counter += 1

        # Early stopping only applies when a dev file is configured.
        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    return best_dev_p, best_dev_r, best_dev_f
def pretrain(opt):
    """Run BERT pre-training over pregenerated epoch files with a normalization head.

    Scans ``opt.instance_dir`` for ``epoch_{i}.json`` / ``epoch_{i}_metrics.json``
    pairs to size the schedule, prepares a ``BertForPreTraining`` model whose
    norm-label head is sized from the UMLS dictionary, then trains for
    ``opt.iter`` epochs with gradient accumulation, saving a checkpoint after
    every epoch.

    Args:
        opt: options namespace (``instance_dir``, ``iter``, ``gpu``,
            ``multi_gpu``, ``gradient_accumulation_steps``, ``batch_size``
            — mutated in place by the accumulation division — ``save``,
            ``bert_dir``, ``do_lower_case``, ``norm_dict``, ``lr``,
            ``warmup_proportion``).

    Side effects:
        Clears and repopulates ``opt.save``; writes one
        ``pytorch_model_{epoch}.bin`` state dict per epoch; calls ``exit()``
        when no pregenerated data exists at all.
    """
    # Discover how many epochs of pregenerated data exist and how many
    # examples each contains (from the metrics sidecar files).
    samples_per_epoch = []
    pregenerated_data = Path(opt.instance_dir)
    for i in range(opt.iter):
        epoch_file = pregenerated_data / f"epoch_{i}.json"
        metrics_file = pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.iter}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        # for/else: every requested epoch file was found.
        num_data_epochs = opt.iter

    # Device selection: all GPUs via DataParallel, one specific GPU, or CPU.
    if opt.gpu >= 0 and torch.cuda.is_available():
        if opt.multi_gpu:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device('cuda', opt.gpu)
            n_gpu = 1
    else:
        device = torch.device("cpu")
        n_gpu = 0

    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    if opt.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(opt.gradient_accumulation_steps))

    # The configured batch size is the *effective* one; divide it so that each
    # forward pass uses a micro-batch and accumulation restores the total.
    # NOTE: mutates opt in place.
    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    makedir_and_clear(opt.save)

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir, do_lower_case=opt.do_lower_case)

    total_train_examples = 0
    for i in range(opt.iter):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / opt.batch_size / opt.gradient_accumulation_steps)

    # Build the concept label alphabet from UMLS MRCONSO; its size determines
    # the normalization head's output dimension.
    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(opt.norm_dict)
    logging.info("dict concept number {}".format(len(UMLS_dict)))
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, UMLS_dict)
    dict_alphabet.close()

    # Prepare model
    # NOTE(review): project-local BERT wrapper returning a (model, extra) pair;
    # num_norm_labels sizes the normalization classification head.
    model, _ = BertForPreTraining.from_pretrained(opt.bert_dir, num_norm_labels=get_dict_size(dict_alphabet))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: standard BERT grouping — no weight decay for biases
    # and LayerNorm parameters, 0.01 for everything else.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=opt.lr, warmup=opt.warmup_proportion, t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", opt.batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(opt.iter):
        # Each training epoch reads one pregenerated data epoch (cycled via
        # num_data_epochs when fewer data epochs exist than training epochs).
        epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, dict_alphabet=dict_alphabet)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=opt.batch_size)

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        epoch_start = time.time()
        sum_loss = 0
        sum_orginal_loss = 0
        num_iter = len(train_dataloader)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next, input_ids_ent, input_mask_ent, norm_label_ids = batch
                # The model returns the combined loss and the original
                # (pre-normalization-head) BERT pre-training loss.
                loss, original_loss = model(input_ids, segment_ids, input_mask, lm_label_ids, input_ids_ent, input_mask_ent, is_next, norm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    original_loss = original_loss.mean()
                if opt.gradient_accumulation_steps > 1:
                    # Scale down so accumulated gradients average over the
                    # effective batch rather than summing.
                    loss = loss / opt.gradient_accumulation_steps
                    original_loss = original_loss / opt.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * opt.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                # Step the optimizer only every accumulation-steps micro-batches.
                if (step + 1) % opt.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                sum_loss += loss.item()
                sum_orginal_loss += original_loss.item()

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs. loss: %.4f, original_loss %.4f" % (epoch, epoch_finish - epoch_start, sum_loss / num_iter, sum_orginal_loss / num_iter))

        # Save a trained model
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        # Unwrap DataParallel before saving so the state dict has clean keys.
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(opt.save, "pytorch_model_{}.bin".format(str(epoch + 1)))
        torch.save(model_to_save.state_dict(), str(output_model_file))
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse, opt, fold_idx, isMeddra_dict):
    """Train the ensemble normalization model (rule sieve + VSM + neural).

    Always initializes the rule-based multi-sieve normalizer. Then, depending on
    ``opt.ensemble``:

    * ``'learn'`` — trains a single ``Ensemble`` model that learns to combine
      the component scores (optionally warm-started from pretrained component
      linear layers);
    * otherwise — trains the VSM and neural normalizers separately and ensembles
      them at evaluation time.

    Args:
        train_data: documents; may be extended in place with external TAC data.
        dev_data / test_data: documents for dev evaluation / alphabet building.
        d: data/config holder exposing ``config`` and ``word_emb_dim``.
        dictionary / dictionary_reverse: concept dictionary and reverse map.
        opt: options namespace (``ensemble``, ``dev_file``, ``test_file``,
            ``batch_size``, ``lr``, ``l2``, ``iter``, ``patience``,
            ``gradient_clip``, ``tune_wordemb``, ``output``, ``word_emb_dim``,
            ``cross_validation``).
        fold_idx: cross-validation fold index or None (controls checkpoint
            names and when the sieve is fully finalized).
        isMeddra_dict: True for MedDRA-style dictionaries; selects instance
            generators and evaluation behavior.

    Returns:
        (best_dev_p, best_dev_r, best_dev_f); all remain -10 when no dev file
        is configured.
    """
    logging.info("train the ensemble normalization model ...")

    # Optionally merge an external corpus into the training data
    # (extends the caller's list in place).
    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                # NOTE(review): v.get('types') passed twice — confirm against
                # load_data_fda's signature.
                external_train_data.extend(load_data_fda(v['path'], True, v.get('types'), v.get('types'), False, True))
            else:
                raise RuntimeError("not support external corpus")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)

    # Build and freeze the word alphabet over dictionary + all corpora.
    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary, isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)

    # Word embedding: pretrained if configured, else random; index 0 = padding.
    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(torch.from_numpy(random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    # Frozen label alphabet over dictionary concepts.
    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)

    # rule
    logging.info("init rule-based normer")
    multi_sieve.init(opt, train_data, d, dictionary, dictionary_reverse, isMeddra_dict)

    if opt.ensemble == 'learn':
        # Learned ensemble: one model combines rule/vsm/neural signals.
        logging.info("init ensemble normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict)
        ensemble_model = Ensemble(word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses)
        # NOTE(review): pretrain_neural_model / pretrain_vsm_model are free
        # variables — presumably module-level globals set elsewhere; verify.
        # When available, warm-start the ensemble's component linear layers.
        if pretrain_neural_model is not None:
            ensemble_model.neural_linear.weight.data.copy_(pretrain_neural_model.linear.weight.data)
        if pretrain_vsm_model is not None:
            ensemble_model.vsm_linear.weight.data.copy_(pretrain_vsm_model.linear.weight.data)
        ensemble_train_X = []
        ensemble_train_Y = []
        for doc in train_data:
            # Note: whole doc is passed here (not doc.entities) — the ensemble
            # instance generator needs document context plus both dictionaries.
            temp_X, temp_Y = generate_instances(doc, word_alphabet, dict_alphabet, dictionary, dictionary_reverse, isMeddra_dict)

            ensemble_train_X.extend(temp_X)
            ensemble_train_Y.extend(temp_Y)

        ensemble_train_loader = DataLoader(MyDataset(ensemble_train_X, ensemble_train_Y), opt.batch_size, shuffle=True, collate_fn=my_collate)
        ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(ensemble_model.word_embedding)
    else:
        # vsm
        logging.info("init vsm-based normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict)
        # alphabet can share between vsm and neural since they don't change
        # but word_embedding cannot
        vsm_model = vsm.VsmNormer(word_alphabet, copy.deepcopy(word_embedding), embedding_dim, dict_alphabet, poses)
        vsm_train_X = []
        vsm_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = vsm.generate_instances(doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = vsm.generate_instances_ehr(doc.entities, word_alphabet, dict_alphabet, dictionary_reverse)

            vsm_train_X.extend(temp_X)
            vsm_train_Y.extend(temp_Y)

        vsm_train_loader = DataLoader(vsm.MyDataset(vsm_train_X, vsm_train_Y), opt.batch_size, shuffle=True, collate_fn=vsm.my_collate)
        vsm_optimizer = optim.Adam(vsm_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(vsm_model.word_embedding)

        # Optional dictionary warm-up for the VSM component.
        if d.config['norm_vsm_pretrain'] == '1':
            vsm.dict_pretrain(dictionary, dictionary_reverse, d, True, vsm_optimizer, vsm_model)

        # neural
        logging.info("init neural-based normer")
        # Deep-copied embedding so the two components tune independently.
        neural_model = norm_neural.NeuralNormer(word_alphabet, copy.deepcopy(word_embedding), embedding_dim, dict_alphabet)

        neural_train_X = []
        neural_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = norm_neural.generate_instances(doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = norm_neural.generate_instances_ehr(doc.entities, word_alphabet, dict_alphabet, dictionary_reverse)

            neural_train_X.extend(temp_X)
            neural_train_Y.extend(temp_Y)

        neural_train_loader = DataLoader(norm_neural.MyDataset(neural_train_X, neural_train_Y), opt.batch_size, shuffle=True, collate_fn=norm_neural.my_collate)
        neural_optimizer = optim.Adam(neural_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(neural_model.word_embedding)

        # Optional dictionary warm-up for the neural component.
        # NOTE(review): called as an instance method but also passed
        # neural_model as its last argument — confirm the intended signature.
        if d.config['norm_neural_pretrain'] == '1':
            neural_model.dict_pretrain(dictionary, dictionary_reverse, d, True, neural_optimizer, neural_model)

    # Sentinel starting scores; any real dev F1 will exceed -10.
    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0  # epochs since last dev improvement (early-stop counter)

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()

        if opt.ensemble == 'learn':
            # One optimization loop over the learned ensemble.
            ensemble_model.train()
            ensemble_train_iter = iter(ensemble_train_loader)
            ensemble_num_iter = len(ensemble_train_loader)

            for i in range(ensemble_num_iter):
                x, rules, lengths, y = next(ensemble_train_iter)

                y_pred = ensemble_model.forward(x, rules, lengths)
                l = ensemble_model.loss(y_pred, y)
                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(ensemble_model.parameters(), opt.gradient_clip)
                ensemble_optimizer.step()
                ensemble_model.zero_grad()
        else:
            # Train the VSM and neural components back-to-back each epoch.
            vsm_model.train()
            vsm_train_iter = iter(vsm_train_loader)
            vsm_num_iter = len(vsm_train_loader)

            for i in range(vsm_num_iter):
                x, lengths, y = next(vsm_train_iter)

                l, _ = vsm_model.forward_train(x, lengths, y)
                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(vsm_model.parameters(), opt.gradient_clip)
                vsm_optimizer.step()
                vsm_model.zero_grad()

            neural_model.train()
            neural_train_iter = iter(neural_train_loader)
            neural_num_iter = len(neural_train_loader)

            for i in range(neural_num_iter):
                x, lengths, y = next(neural_train_iter)

                y_pred = neural_model.forward(x, lengths)
                l = neural_model.loss(y_pred, y)
                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(neural_model.parameters(), opt.gradient_clip)
                neural_optimizer.step()
                neural_model.zero_grad()

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs" % (idx, epoch_finish - epoch_start))

        if opt.dev_file:
            if opt.ensemble == 'learn':
                # logging.info("weight w1: %.4f, w2: %.4f, w3: %.4f" % (ensemble_model.w1.data.item(), ensemble_model.w2.data.item(), ensemble_model.w3.data.item()))
                p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, None, None, ensemble_model, d, isMeddra_dict)
            else:
                p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, vsm_model, neural_model, None, d, isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            # No dev set: f equals best_dev_f, so the improvement branch below
            # can never fire (p/r are only read inside that branch).
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f))
            # Checkpoint whichever model(s) this mode trains; filenames carry
            # the 1-based fold number during cross-validation.
            if opt.ensemble == 'learn':
                if fold_idx is None:
                    torch.save(ensemble_model, os.path.join(opt.output, "ensemble.pkl"))
                else:
                    torch.save(ensemble_model, os.path.join(opt.output, "ensemble_{}.pkl".format(fold_idx + 1)))
            else:
                if fold_idx is None:
                    torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
                    torch.save(neural_model, os.path.join(opt.output, "norm_neural.pkl"))
                else:
                    torch.save(vsm_model, os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1)))
                    torch.save(neural_model, os.path.join(opt.output, "norm_neural_{}.pkl".format(fold_idx + 1)))

            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            bad_counter = 0
        else:
            bad_counter += 1

        # Early stopping only applies when a dev file is configured.
        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    # Release sieve resources; fully finalize only on a single run or on the
    # last cross-validation fold.
    if fold_idx is None:
        multi_sieve.finalize(True)
    else:
        if fold_idx == opt.cross_validation - 1:
            multi_sieve.finalize(True)
        else:
            multi_sieve.finalize(False)

    # Without a dev set, nothing was checkpointed above — save the final models.
    if len(opt.dev_file) == 0:
        if opt.ensemble == 'learn':
            torch.save(ensemble_model, os.path.join(opt.output, "ensemble.pkl"))
        else:
            torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
            torch.save(neural_model, os.path.join(opt.output, "norm_neural.pkl"))

    return best_dev_p, best_dev_r, best_dev_f