def experiment(self, seeds, epoch_num, learning_rate):
    # Run every configured experiment once per random seed, training a fresh BERT each time.
    for seed in seeds:
        torch.manual_seed(seed)
        random.seed(seed)

        # Pool the novel tokens and training data from all experiments
        novel_tokens = []
        for experiment in self.experiments:
            novel_tokens += experiment.novel_tokens
        train_data = []
        for experiment in self.experiments:
            train_data += experiment.train_data

        model = BERT(novel_tokens, learning_rate)

        print('Training')
        for epoch in range(epoch_num):
            model.model.train()
            model.optimizer.zero_grad()
            loss = model.get_loss(train_data)
            loss.backward()
            model.optimizer.step()
            print('loss:', loss.item())

        for experiment in self.experiments:
            experiment.run(model)

    # Persist the (now populated) experiments to disk
    with open(self.save_name + '.pkl', 'wb') as f:
        pickle.dump(self.experiments, f)
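# Usage sketch (hedged): the class that owns experiment() is not shown, so the name
# ExperimentRunner and its constructor below are illustrative assumptions. The call
# simply mirrors the signature experiment(self, seeds, epoch_num, learning_rate).
#
# runner = ExperimentRunner(experiments=[exp_a, exp_b], save_name='bert_probe')
# runner.experiment(seeds=[0, 1, 2], epoch_num=20, learning_rate=1e-4)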
    vocab_size, train_loader, fine_tuning_loader, validation_loader = preprocess_GPT(gpt_hyperparams)
else:
    print("loading data for BERT model")
    vocab_size, train_loader, fine_tuning_loader, validation_loader = preprocess_BERT(bert_hyperparams)

# create model
if run_GPT:
    model = GPT(device=device,
                seq_len=gpt_hyperparams["seq_len"],
                num_words=vocab_size,
                d_model=gpt_hyperparams["d_model"],
                h=gpt_hyperparams["num_heads"],
                n=gpt_hyperparams["num_layers"]).to(device)
    if args.load:
        model.load_state_dict(torch.load('./gpt_model.pt'))
else:
    model = BERT(device=device,
                 seq_len=bert_hyperparams["seq_len"],
                 num_words=vocab_size,
                 d_model=bert_hyperparams["d_model"],
                 h=bert_hyperparams["num_heads"],
                 n=bert_hyperparams["num_layers"]).to(device)
    if args.load:
        model.load_state_dict(torch.load('./bert_model.pt'))

# train model
if args.train:
    print("training model ...")
    if run_GPT:
        train_GPT(model, train_loader, gpt_hyperparams)
        if args.save:
            torch.save(model.state_dict(), './gpt_model.pt')
    else:
        train_BERT(model, train_loader, bert_hyperparams)
        if args.save:
            torch.save(model.state_dict(), './bert_model.pt')
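# Hedged sketch of the CLI flags the fragment above relies on (args.load, args.train,
# args.save). Flag names simply mirror the attribute names; how run_GPT and the
# hyperparameter dicts are actually chosen is not shown here and is left out.
import argparse

def build_arg_parser():
    parser = argparse.ArgumentParser(description='Train or evaluate a GPT/BERT model')
    parser.add_argument('--load', action='store_true',
                        help='load weights from ./gpt_model.pt or ./bert_model.pt')
    parser.add_argument('--train', action='store_true',
                        help='run the training loop')
    parser.add_argument('--save', action='store_true',
                        help='save weights after training')
    return parser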
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    model = None
    log.info('Building model...')
    max_context_len, max_question_len = args.para_limit, args.ques_limit
    if args.model_type in ("bidaf", "bert-bidaf"):
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=0)
    elif args.model_type in ("dcn", "dcn-bidaf"):
        model = DCN(word_vectors=word_vectors,
                    hidden_size=args.hidden_size,
                    max_context_len=max_context_len,
                    max_question_len=max_question_len,
                    drop_prob=0)
    elif args.model_type == "bert-basic":
        model = BERT(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=0)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)['{}_record_file'.format(args.split)]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info('Evaluating on {} split...'.format(args.split))
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)['{}_eval_file'.format(args.split)]
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Additions for BERT: fetch precomputed BERT embeddings for this batch
            max_context_len, max_question_len = args.para_limit, args.ques_limit
            if "bert" in args.model_type:
                bert_embeddings = get_embeddings(args.split, ids,
                                                 args.para_limit, args.ques_limit)
            else:
                bert_embeddings = None

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_embeddings,
                                   max_context_len, max_question_len, device)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items())
        log.info('{} {}'.format(args.split.title(), results_str))

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info('Writing submission file to {}...'.format(sub_path))
    with open(sub_path, 'w') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    # Comment out to only use 1 GPU on nv12
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = None
    max_context_len, max_question_len = args.para_limit, args.ques_limit
    if args.model_type in ("bidaf", "bert-bidaf"):
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif args.model_type in ("dcn", "bert-dcn"):
        model = DCN(word_vectors=word_vectors,
                    hidden_size=args.hidden_size,
                    max_context_len=max_context_len,
                    max_question_len=max_question_len,
                    drop_prob=args.drop_prob)
    elif args.model_type == "bert-basic":
        model = BERT(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=args.drop_prob)
    if model is None:
        raise ValueError('Model is unassigned. Please ensure --model_type '
                         'chooses between {bidaf, bert-bidaf, dcn, bert-dcn, bert-basic}')
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    count_skip = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                batch_size = cw_idxs.size(0)
                count_skip += 1
                # Optionally skip 4 of every 5 batches (speeds up debugging runs)
                if args.skip_examples and count_skip % 5 != 0:
                    step += batch_size
                    progress_bar.update(batch_size)
                    steps_till_eval -= batch_size
                    continue

                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Additions for BERT: fetch precomputed BERT embeddings for this batch
                max_context_len, max_question_len = args.para_limit, args.ques_limit
                if "bert" in args.model_type:
                    bert_train_embeddings = get_embeddings("train", ids,
                                                           args.para_limit, args.ques_limit)
                else:
                    bert_train_embeddings = None

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_train_embeddings,
                                       max_context_len, max_question_len, device)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2, args)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
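# Example launch (a sketch): the script name "train.py" is an assumption; the flag
# spellings follow the args referenced above (--name, --model_type, --load_path).
#   python train.py --name baseline --model_type bert-bidaf
#   python train.py --name resumed --model_type bert-bidaf --load_path <checkpoint path>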
def train(**kwargs):
    # The received parameters will be used to update the configuration dict
    opt.parse(kwargs)

    # Step 0: data and device
    inputs = get_inputs(data_dir=opt.data_dir,
                        corpus_file=opt.corpus_file,
                        vocab_file=opt.vocab_path)
    train_dataloader = DataLoader(dataset=BERTDataset(inputs, max_sen_len=opt.max_sen_len),
                                  shuffle=True,
                                  batch_size=opt.batch_size,
                                  collate_fn=BERTCollate_fn)
    use_cuda = bool(opt.use_cuda and torch.cuda.is_available())
    if use_cuda:
        torch.cuda.empty_cache()
    device = torch.device('cuda' if use_cuda else 'cpu')
    writer = SummaryWriter()

    # Step 1: model
    bert = BERT(n_layers=opt.n_layers,
                d_model=opt.d_model,
                vocab_size=opt.max_vocab_size,
                max_len=opt.max_sen_len,
                n_heads=opt.n_heads,
                n_seg=opt.n_seg,
                ff_hidden=opt.n_ff_hidden,
                device=device).to(device)
    masked_lm = MaskedLM(d_model=opt.d_model,
                         vocab_size=opt.max_vocab_size,
                         bert=bert).to(device)
    next_pred = NextPred(d_model=opt.d_model).to(device)

    # Write model graph to TensorBoard
    dummy_input_ids = torch.zeros((opt.batch_size, opt.max_sen_len)).long().to(device)
    dummy_seg_ids = torch.zeros((opt.batch_size, opt.max_sen_len)).long().to(device)
    writer.add_graph(bert, (dummy_input_ids, dummy_seg_ids), False)
    # dummy_bertout = torch.zeros((opt.batch_size, opt.max_sen_len, opt.d_model)).long().to(device)
    # dummy_masked_pos = torch.zeros((opt.batch_size, opt.max_mask_len)).long().to(device)
    # writer.add_graph(masked_lm, (dummy_bertout, dummy_masked_pos), True)
    # writer.add_graph(next_pred, (dummy_bertout), True)

    # Step 2: criterion and optimizer
    criterion = nn.CrossEntropyLoss()
    num_paras = sum(p.numel()
                    for model in (bert, masked_lm, next_pred)
                    for p in model.parameters() if p.requires_grad)
    paras = list(bert.parameters()) + list(masked_lm.parameters()) + list(next_pred.parameters())
    print("Total number of parameters is {}".format(num_paras))
    optimizer = torch.optim.Adam(paras, lr=0.0001, betas=(0.9, 0.999), weight_decay=0.01)

    # Step 3: train
    print("Start training ...")
    for epoch in range(opt.epochs):
        epoch_loss = 0
        for i, batch_data in enumerate(train_dataloader, 1):
            input_ids, seg_ids, masked_pos, masked_token, isnext = map(
                lambda x: x.to(device), batch_data)

            # Reset gradients and forward
            optimizer.zero_grad()
            bertout = bert(input_ids, seg_ids)
            logits_lm = masked_lm(bertout, masked_pos)
            logits_clsf = next_pred(bertout)

            # Compute loss
            logits_lm = logits_lm.view(-1, logits_lm.size(-1))        # (bz * len_mask, vocab)
            masked_token = masked_token.view(-1)                      # (bz * len_mask, )
            logits_clsf = logits_clsf.view(-1, logits_clsf.size(-1))  # (bz, 2)
            isnext = isnext.view(-1)                                  # (bz, )
            loss_lm = criterion(logits_lm, masked_token)
            loss_clsf = criterion(logits_clsf, isnext)
            loss = loss_lm + loss_clsf

            # Accuracy for masked-LM and next-sentence prediction
            _, mask_preds = torch.max(logits_lm, dim=-1)
            _, next_preds = torch.max(logits_clsf, dim=-1)
            mask_pred_acc = mask_preds.eq(masked_token).sum().item() / masked_token.size(0)
            next_pred_acc = next_preds.eq(isnext).sum().item() / isnext.size(0)

            if i % 20 == 0:
                global_step = i + epoch * len(train_dataloader)
                writer.add_scalar('loss_lm', loss_lm.item(), global_step)
                writer.add_scalar('loss_clsf', loss_clsf.item(), global_step)
                writer.add_scalar('lm_acc', mask_pred_acc, global_step)
                writer.add_scalar('next_acc', next_pred_acc, global_step)
                print('Epoch {}, Batch {}/{}, loss_lm={}, loss_next={}, lm_acc={}, next_acc={}'
                      .format(epoch + 1, i, len(train_dataloader),
                              loss_lm.item(), loss_clsf.item(),
                              mask_pred_acc, next_pred_acc))
            epoch_loss += loss.item()

            # Backward and update
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 1 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(epoch_loss))

    print('finished train')

    # Step 4: Save model
    ckpt_file_name = dt.strftime(dt.now(), '%Y-%m-%d %H: %M: %S.ckpt')
    save_path = os.path.join(opt.ckpt_path, ckpt_file_name)
    torch.save(bert.state_dict(), save_path)
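# Example invocation (a sketch): train() takes keyword overrides that opt.parse(kwargs)
# merges into the configuration; the keys shown (epochs, batch_size) exist on opt in the
# code above, and the values here are purely illustrative.
#
# train(epochs=5, batch_size=32)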
def finetune(ckpt_file=None):
    # Default to the most recent file in the checkpoints directory
    if ckpt_file is None:
        files = os.listdir('checkpoints')
        if len(files):
            ckpt_file = files[-1]
    ckpt_path = os.path.join('checkpoints', ckpt_file)

    # Step 0: data and device
    inputs = get_inputs(data_dir=opt.data_dir,
                        corpus_file=opt.corpus_file,
                        vocab_file=opt.vocab_path)
    train_dataloader = DataLoader(dataset=BERTDataset(inputs, max_sen_len=opt.max_sen_len),
                                  shuffle=True,
                                  batch_size=opt.batch_size,
                                  collate_fn=BERTCollate_fn)
    use_cuda = bool(opt.use_cuda and torch.cuda.is_available())
    if use_cuda:
        torch.cuda.empty_cache()
    device = torch.device('cuda' if use_cuda else 'cpu')
    writer = SummaryWriter(comment='finetune')

    # Step 1: model
    bert = BERT(n_layers=opt.n_layers,
                d_model=opt.d_model,
                vocab_size=opt.max_vocab_size,
                max_len=opt.max_sen_len,
                n_heads=opt.n_heads,
                n_seg=opt.n_seg,
                ff_hidden=opt.n_ff_hidden,
                device=device).to(device)
    bert.load_state_dict(torch.load(ckpt_path))
    clf = NextSenCLF(d_model=opt.d_model, bert=bert).to(device)

    # Step 2: criterion and optimizers
    criterion = nn.CrossEntropyLoss()
    # Tiny learning rate for the pretrained encoder, larger one for the new classifier head
    optimizer1 = torch.optim.Adam(bert.parameters(), lr=1e-8)
    optimizer2 = torch.optim.Adam(clf.clf.parameters(), lr=0.001)

    # Step 3: training
    for epoch in range(opt.epochs):  # fine-tune for the configured number of epochs
        epoch_loss = 0
        for i, batch_data in enumerate(train_dataloader, 1):
            input_ids, seg_ids, _, _, isnext = map(lambda x: x.to(device), batch_data)

            # Reset gradients and forward
            optimizer1.zero_grad()
            optimizer2.zero_grad()
            clfout = clf(input_ids, seg_ids)

            # Compute loss
            clfout = clfout.view(-1, clfout.size(-1))
            isnext = isnext.view(-1)
            loss = criterion(clfout, isnext)
            _, next_preds = torch.max(clfout, dim=-1)
            next_pred_acc = next_preds.eq(isnext).sum().item() / isnext.size(0)

            if i % 5 == 0:
                writer.add_scalar('clf_loss', loss.item(), i + epoch * len(train_dataloader))
                writer.add_scalar('clf_acc', next_pred_acc, i + epoch * len(train_dataloader))
                print('Epoch {}, Batch {}/{}, clf_loss={}, clf_acc={}'.format(
                    epoch + 1, i, len(train_dataloader), loss.item(), next_pred_acc))
            epoch_loss += loss.item()

            # Backward and update
            loss.backward()
            optimizer1.step()
            optimizer2.step()
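# Example invocation (a sketch): with no argument, finetune() picks the last file in
# ./checkpoints; the explicit file name below is hypothetical.
#
# finetune()
# finetune(ckpt_file='pretrained_bert.ckpt')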
                      batch_size=args.batch_size,
                      sort_key=lambda x: len(x.text),
                      device=args.device,
                      train=True,
                      sort=True,
                      sort_within_batch=True)
test_iter = Iterator(test_dataset,
                     batch_size=args.batch_size,
                     device=args.device,
                     train=False,
                     shuffle=False,
                     sort=False)
logger.info("Initializations done")

model = BERT().to(args.device)
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=2e-6)

logger.info("Begin Training")
train(model=model,
      optimizer=optimizer,
      device=args.device,
      train_loader=train_iter,
      valid_loader=valid_iter,
      num_epochs=args.epochs,
      eval_every=args.eval_every,
      ckpt_dir=args.ckpt_dir)
print("Training over")

best_model = BERT().to(args.device)
    model = vanillaLSTM(size_vocab, args.dim_emb, args.dim_hid, args.nlayers,
                        n_labels, args.dropout, pad_id, corpus,
                        no_glove=args.no_glove, freeze=args.freeze,
                        bidirectional=True).to(my_device)
elif args.model == 'lstm_crf':
    model = LSTMCRF(size_vocab, args.dim_emb, args.dim_hid, args.nlayers,
                    n_labels, args.dropout, pad_id, corpus,
                    no_glove=args.no_glove, freeze=args.freeze,
                    bidirectional=False).to(my_device)
elif args.model == 'bilstm_crf':
    model = LSTMCRF(size_vocab, args.dim_emb, args.dim_hid, args.nlayers,
                    n_labels, args.dropout, pad_id, corpus,
                    no_glove=args.no_glove, freeze=args.freeze,
                    bidirectional=True).to(my_device)
elif 'bert' in args.model:
    model = BERT(n_labels, corpus, seq2seq, args.model, args.dropout).to(my_device)
    # Freeze BERT's pretrained parameters:
    # for param in model.encoder.bert.parameters():
    #     param.requires_grad = False
else:
    raise ValueError('Choose a model among the five options.')

# choose minimization method
# if 'bert' in args.model:
#     # Update only the classifier head and keep the pretrained encoder fixed
#     # https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/
#     param_optimizer = list(model.encoder.classifier.named_parameters())
#     optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
#     optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.lr)
# else:
optimizer = optim.Adam(model.parameters(), lr=args.lr)