def on_valid_end(self, eval_result, metric_key, optimizer, better_result):
    """Log validation results to fitlog; on a new best, also run every extra tester.

    :param eval_result: dict of metric results from the validation set.
    :param metric_key: metric name used for comparison (unused here).
    :param optimizer: optimizer in use (unused here).
    :param better_result: True when this validation beat the previous best.
    """
    if better_result:
        # Work on a copy so the caller's dict is not mutated by the
        # step/epoch bookkeeping below.
        eval_result = deepcopy(eval_result)
        eval_result['step'] = self.step
        eval_result['epoch'] = self.epoch
        fitlog.add_best_metric(eval_result)
    fitlog.add_metric(eval_result, step=self.step, epoch=self.epoch)
    if len(self.testers) > 0:
        for key, tester in self.testers.items():
            try:
                eval_result = tester.test()
                if self.verbose != 0:
                    self.pbar.write(
                        "FitlogCallback evaluation on {}:".format(key))
                    self.pbar.write(
                        tester._format_eval_results(eval_result))
                fitlog.add_metric(eval_result, name=key, step=self.step,
                                  epoch=self.epoch)
                if better_result:
                    fitlog.add_best_metric(eval_result, name=key)
            except Exception:
                self.pbar.write(
                    "Exception happens when evaluate on DataSet named `{}`."
                    .format(key))
                # FIX: bare `raise` re-raises the active exception with its
                # original traceback; the original `raise e` is redundant and
                # non-idiomatic in Python 3.
                raise
def on_valid_end(self, eval_result, metric_key, optimizer, better_result):
    """Push validation metrics to fitlog; on a new best result, refresh the
    best-metric record and every cached per-dataset metric."""
    if better_result:
        # Copy before annotating so the caller's dict stays untouched.
        eval_result = deepcopy(eval_result)
        eval_result.update(step=self.step, epoch=self.epoch)
        fitlog.add_best_metric(eval_result)
    fitlog.add_metric(eval_result, step=self.step, epoch=self.epoch)
    if better_result:
        for name, saved_result in self._save_metrics.items():
            fitlog.add_best_metric(saved_result, name=name)
def train(model, train_data, test_data):
    """Train `model` on `train_data`, validating on `test_data` every epoch.

    Logs the validation loss to fitlog, pickles the checkpoint with the
    lowest validation loss to ``C.model_save``, and returns the model from
    the last completed epoch (not necessarily the best one).
    """
    train_iter = DataSetIter(train_data, batch_size=C.batch_size)
    test_iter = DataSetIter(test_data, batch_size=C.batch_size)
    # ignore_index=0: presumably index 0 is the padding token — TODO confirm.
    loss_func = nn.CrossEntropyLoss(ignore_index=0)
    optim = tc.optim.Adam(params=model.parameters(), lr=C.lr,
                          weight_decay=C.weight_decay)
    scheduler = get_cosine_schedule_with_warmup(
        optim,
        num_warmup_steps=C.warmup,
        num_training_steps=train_iter.num_batches * C.epoch_number,
    )
    # -1 sentinels mean "no epoch finished yet".
    best_test_loss = -1
    best_test_epoch = -1
    best_step = -1
    try:
        for epoch_n in range(C.epoch_number):
            tra_loss = run(model, train_iter, loss_func, epoch_n, optim,
                           scheduler, True)
            tes_loss = run(model, test_iter, loss_func, epoch_n, None, None,
                           False)
            logger.log("Epoch %d ended. Train loss = %.4f , Valid loss = %.4f" % (
                epoch_n, tra_loss, tes_loss,
            ))
            fitlog.add_metric(
                tes_loss,
                step=train_iter.num_batches * (epoch_n + 1),
                epoch=epoch_n,
                name="valid loss"
            )
            if best_test_epoch < 0 or tes_loss < best_test_loss:
                best_test_loss = tes_loss
                best_test_epoch = epoch_n
                # NOTE(review): `fitlog_loss_step` is a module-level dict
                # maintained elsewhere — verify it tracks the train-loss step.
                best_step = fitlog_loss_step["train loss"]
                fitlog.add_best_metric(best_test_loss, name="loss")
                with open(C.model_save, "wb") as fil:  # save the best model so far
                    pickle.dump(model, fil)
                fitlog.add_hyper(name="best_step", value="%d / %d" % (
                    best_step,
                    train_iter.num_batches * C.epoch_number,
                ))
    except KeyboardInterrupt:  # manual early stop
        pass
    logger.log("Train end.")
    logger.log("Got best valid loss %.4f in epoch %d" % (best_test_loss, best_test_epoch))
    return model
def valid_one_epoch(fold, epoch, model, criterion, optimizer, dataloader,
                    device, scheduler=None):
    """One validation epoch for segmentation: accumulate sigmoid predictions,
    compute the dice score, log it to fitlog, and checkpoint on improvement.

    `criterion`, `optimizer` and `scheduler` are accepted for signature
    symmetry with the training loop but are not used here.
    """
    global min_loss, max_dice, save_dir
    model.eval()
    img_labels, img_preds = [], []
    total_loss, length = .0, 0  # NOTE(review): accumulated nowhere below
    pbar = tqdm(dataloader)
    for step, (imgs, labels) in enumerate(pbar):
        imgs = imgs.to(device)
        labels = labels.to(device)
        preds = model(imgs)
        # assumes the caller wraps this in torch.no_grad() — TODO confirm
        img_preds.append(preds.sigmoid())
        img_labels.append(labels)
    preds = torch.cat(img_preds)
    labels = torch.cat(img_labels).type_as(preds)
    dice = calc_dice(preds, labels, args.thersh)
    fitlog.add_metric({"val": {f"fold_{fold}_dice": dice}}, step=epoch)
    if not save_dir:
        # Save checkpoints inside the fitlog log folder for this run.
        save_dir = fitlog.get_log_folder(absolute=True)
    if dice > max_dice:
        max_dice = dice
        fitlog.add_best_metric({"val": {f"fold_{fold}_dice": max_dice}})
        torch.save(
            model.state_dict(),
            f'{save_dir}/{args.structure}_{args.encoder}_fold{fold}_best.pth')
    print(f'fold {fold} epoch {epoch}, valid dice {dice:.4f}')
def valid_one_epoch(fold, epoch, model, criterion, optimizer, dataloader,
                    device, scheduler=None):
    """One validation epoch for binary classification: threshold sigmoid
    outputs at 0.5, compute accuracy, log to fitlog, checkpoint on improvement.

    `criterion`, `optimizer` and `scheduler` are unused; they keep the
    signature parallel to the training loop.
    """
    global min_loss, max_acc, save_dir
    model.eval()
    img_labels, img_preds = [], []
    total_loss, length = .0, 0  # NOTE(review): never accumulated below
    pbar = tqdm(dataloader)
    for step, (imgs, labels) in enumerate(pbar):
        imgs = imgs.to(device)
        labels = labels.to(device)
        preds = model(imgs)
        # img_preds.append(torch.argmax(preds, dim=1).detach().cpu().numpy())
        img_preds.append((preds.sigmoid() > 0.5).detach().cpu().numpy())
        img_labels.append(labels.detach().cpu().numpy())
    img_preds = np.concatenate(img_preds)
    img_labels = np.concatenate(img_labels)
    acc = (img_preds == img_labels).mean()
    fitlog.add_metric({"val": {f"fold_{fold}_acc": acc}}, step=epoch)
    # Checkpoints go into the fitlog log folder for this run.
    save_dir = fitlog.get_log_folder(absolute=True)
    if acc > max_acc:
        max_acc = acc
        fitlog.add_best_metric({"val": {f"fold_{fold}_acc": max_acc}})
        torch.save(model.state_dict(),
                   f'{save_dir}/{args.model}_fold{fold}_best.pth')
    print(f'fold {fold} epoch {epoch}, valid acc {acc:.4f}')
def valid_one_epoch(fold, epoch, model, loss_fn, val_loader, device,
                    scheduler=None, schd_loss_update=False):
    """One validation epoch for multi-class classification.

    Tracks the running loss and accuracy, records new bests (and the epoch
    they occurred in) to fitlog, checkpoints on a new best accuracy, and
    optionally steps the LR scheduler with the mean validation loss.
    """
    global max_acc, min_loss
    model.eval()
    t = time.time()
    loss_sum = 0
    sample_num = 0
    image_preds_all = []
    image_targets_all = []
    pbar = tqdm(enumerate(val_loader), total=len(val_loader))
    for step, (imgs, image_labels) in pbar:
        imgs = imgs.to(device).float()
        image_labels = image_labels.to(device).long()
        image_preds = model(imgs)  # output = model(input)
        # print(image_preds.shape, exam_pred.shape)
        image_preds_all += [
            torch.argmax(image_preds, 1).detach().cpu().numpy()
        ]
        image_targets_all += [image_labels.detach().cpu().numpy()]
        loss = loss_fn(image_preds, image_labels)
        # Weight by batch size so loss_sum / sample_num is the true mean.
        loss_sum += loss.item() * image_labels.shape[0]
        sample_num += image_labels.shape[0]
        if ((step + 1) % args.verbose == 0) or ((step + 1) == len(val_loader)):
            description = f'epoch {epoch} loss: {loss_sum/sample_num:.4f}'
            pbar.set_description(description)
    image_preds_all = np.concatenate(image_preds_all)
    image_targets_all = np.concatenate(image_targets_all)
    acc = (image_preds_all == image_targets_all).mean()
    # NOTE(review): this compares/logs the *summed* loss, not the per-sample
    # mean (loss_sum / sample_num) — confirm that is intentional.
    if loss_sum < min_loss:
        min_loss = loss_sum
        fitlog.add_best_metric({'loss': min_loss})
        fitlog.add_best_metric({'loss_epoch': epoch})
    if acc > max_acc:
        max_acc = acc
        fitlog.add_best_metric({'acc': max_acc})
        fitlog.add_best_metric({'acc_epoch': epoch})
        torch.save(model.state_dict(),
                   '{}/{}_fold_{}_best'.format(save_dir, args.model, fold))
    print('validation multi-class accuracy = {:.4f}'.format(
        (image_preds_all == image_targets_all).mean()))
    if scheduler is not None:
        if schd_loss_update:
            # e.g. ReduceLROnPlateau-style schedulers take the metric value.
            scheduler.step(loss_sum / sample_num)
        else:
            scheduler.step()
# NOTE(review): fragment — this is the interior/tail of a cross-validation
# fold loop; the enclosing function/loop header is outside this view, so the
# original nesting of these statements cannot be confirmed.
train_one_epoch(epoch, model, criterion, optimizer, train_loader, device,
                scheduler=scheduler, schd_batch_update=False)
with torch.no_grad():
    valid_one_epoch(fold, epoch, model, criterion, val_loader, device,
                    scheduler=None, schd_loss_update=False)
# Always keep the last-epoch weights alongside the best checkpoint.
torch.save(model.state_dict(),
           '{}/{}_fold_{}_last'.format(save_dir, args.model, fold))
# Free GPU memory before the next fold.
del model, optimizer, train_loader, val_loader, scaler, scheduler
torch.cuda.empty_cache()
acc.append(max_acc)
if len(acc) > 1:
    # Log the mean best accuracy across the folds seen so far.
    nfold = len(acc)
    fitlog.add_best_metric({str(nfold) + 'fold': np.mean(acc)})
fitlog.finish()
def _train(self, criterion, optimizer, train_data_loader, val_data_loader,
           test_data_loader):
    """Training loop for an ABSA model with optional regularizers and
    adversarial training.

    Logs hyper-parameters and metrics to fitlog, keeps only the latest
    best-val-acc checkpoint on disk (deleting the previous one), and returns
    the path of the last saved model.
    """
    fitlog.add_hyper({
        "model_name": self.opt.model_name,
        "dataset": self.opt.dataset,
        'resplit': self.opt.resplit,
        "domain": self.opt.domain,
        "aug": self.opt.aug,
        "adv": self.opt.adv,
        "aux": self.opt.aux,
        "adv_aux": self.opt.adv_aux,
        'chg': self.opt.chg
    })
    max_val_acc = 0
    max_val_f1 = 0
    global_step = 0
    last_model_path = None
    # model_path =None
    path = None
    pgd = PGD(self.model)  # PGD attacker — only used by the commented block below
    k = 3
    for epoch in range(self.opt.num_epoch):
        logger.info('>' * 100)
        logger.info('epoch: {}'.format(epoch))
        n_correct, n_total, loss_total = 0, 0, 0
        # switch model to training mode
        self.model.train()
        for i_batch, sample_batched in enumerate(train_data_loader):
            global_step += 1
            # clear gradient accumulators
            optimizer.zero_grad()
            inputs = [
                sample_batched[col].to(self.opt.device)
                for col in self.opt.inputs_cols
            ]
            # NOTE(review): both branches are identical — the
            # 'bert_multi_target' special case appears vestigial.
            if self.opt.model_name == 'bert_multi_target':
                targets = sample_batched['polarity'].to(self.opt.device)
            else:
                targets = sample_batched['polarity'].to(self.opt.device)
            if self.opt.model_name in reg_list:
                # Models in reg_list also return auxiliary logits and
                # regularization losses (can / aux / chg) plus BERT outputs.
                aux_cls_logeits, outputs, reg_can_loss, reg_aux_loss, bert_word_output, reg_chg_loss = self.model(
                    inputs, None)
            else:
                outputs = self.model(inputs)
                reg_can_loss = 0
                reg_aux_loss = 0
                reg_chg_loss = 0
            # print('outputs',outputs.shape)
            # print('targets',targets.shape)
            loss_1 = criterion(outputs, targets)
            loss_2 = reg_can_loss
            loss_3 = reg_aux_loss
            loss_4 = reg_chg_loss
            # Each regularizer is scaled by its CLI-provided weight.
            weighted_loss_2 = loss_2 * self.opt.can
            weighted_loss_3 = loss_3 * self.opt.aux
            weighted_loss_4 = loss_4 * self.opt.chg
            loss = 1 * loss_1 + weighted_loss_2 + weighted_loss_3 + weighted_loss_4
            if self.opt.adv > 0:
                # Adversarial (virtual adversarial style) loss with
                # perturbation magnitude self.opt.adv.
                if self.opt.adv_aux == 1:
                    loss_adv = self._loss_adv(weighted_loss_3,
                                              bert_word_output, criterion,
                                              inputs, targets,
                                              p_mult=self.opt.adv)
                else:
                    loss_adv = self._loss_adv(loss, bert_word_output,
                                              criterion, inputs, targets,
                                              p_mult=self.opt.adv)
                loss += loss_adv
            else:
                loss_adv = 0
            loss.backward()
            # pgd.backup_grad()
            # for t in range(K):
            #     pgd.attack(is_first_attack=(t==0))  # add adversarial perturbation on the embedding; back up param.data on the first attack
            #     if t != K-1:
            #         model.zero_grad()
            #     else:
            #         pgd.restore_grad()
            #     loss_adv = model(batch_input, batch_label)
            #     loss_adv.backward()  # backprop, accumulating the adversarial gradient on top of the normal grad
            # pgd.restore()  # restore the embedding parameters
            optimizer.step()
            n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
            n_total += len(outputs)
            loss_total += loss.item() * len(outputs)
            if global_step % self.opt.log_step == 0:
                train_acc = n_correct / n_total
                train_loss = loss_total / n_total
                logger.info(
                    'loss_total: {:.4f}, acc: {:.4f},loss_main: {:.4f},reg_can_loss: {:.4f},loss_adv: {:.4f},reg_aux_loss {:.4f},reg_chg_loss {:.4f}'
                    .format(train_loss, train_acc, loss_1, weighted_loss_2,
                            loss_adv, weighted_loss_3, weighted_loss_4))
                # NOTE(review): the inner braces form a *set* containing one
                # formatted string — probably meant to be a metric dict.
                fitlog.add_metric(
                    {
                        "Train": {
                            'loss_total: {:.4f}, acc: {:.4f},loss_main: {:.4f},reg_can_loss: {:.4f},loss_adv: {:.4f},reg_aux_loss {:.4f},reg_chg_loss {:.4f}'
                            .format(train_loss, train_acc, loss_1,
                                    weighted_loss_2, loss_adv,
                                    weighted_loss_3, weighted_loss_4)
                        }
                    },
                    step=global_step)
                val_acc, val_f1 = self._evaluate_acc_f1(val_data_loader)
                test_acc, test_f1 = self._evaluate_acc_f1(test_data_loader)
                logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(
                    val_acc, val_f1))
                logger.info('> test_acc: {:.4f}, test_f1: {:.4f}'.format(
                    test_acc, test_f1))
                if val_acc > max_val_acc:
                    max_val_acc = val_acc
                    if not os.path.exists('state_dict'):
                        os.mkdir('state_dict')
                    # NOTE(review): "doamin" looks like a typo for "domain",
                    # but the path is kept verbatim (existing checkpoints /
                    # tooling may depend on it).
                    model_path = 'state_dict/{0}_{1}_doamin-{2}_can{3}_aug{4}_adv{5}_aux{6}_val_acc{7}_resplit{8}'.format(
                        self.opt.model_name, self.opt.dataset,
                        self.opt.domain, self.opt.can, self.opt.aug,
                        self.opt.adv, self.opt.aux, round(val_acc, 4),
                        self.opt.resplit)
                    bert_path = 'state_dict/{0}_{1}_doamin-{2}_can{3}_aug{4}_adv{5}_aux{6}_val_acc{7}_resplit{8}_bert'.format(
                        self.opt.model_name, self.opt.dataset,
                        self.opt.domain, self.opt.can, self.opt.aug,
                        self.opt.adv, self.opt.aux, round(val_acc, 4),
                        self.opt.resplit)
                    # fitlog.add_hyper({"model_name":self.opt.model_name,"dataset":self.opt.dataset,'resplit':self.opt.resplit,"domain":self.opt.domain,"aug":self.opt.aug,"adv":self.opt.adv,"aux":self.opt.aux})
                    fitlog.add_metric(
                        {"val": {
                            "val_acc": val_acc,
                            "val_f1": val_f1
                        }},
                        step=global_step)
                    fitlog.add_metric(
                        {"test": {
                            "test_acc": test_acc,
                            "test_f1": test_f1
                        }},
                        step=global_step)
                    fitlog.add_best_metric(
                        {"val": {
                            "val_acc": val_acc,
                            "val_f1": val_f1
                        }})
                    fitlog.add_best_metric(
                        {"test": {
                            "test_acc": test_acc,
                            "test_f1": test_f1
                        }})
                    # Keep only the newest best checkpoint on disk.
                    if last_model_path != None:
                        os.remove(last_model_path)
                        if self.opt.model_name not in ['lcf_bert']:
                            os.remove(last_bert_path)
                    last_model_path = model_path
                    last_bert_path = bert_path
                    torch.save(self.model.state_dict(), model_path)
                    if self.opt.model_name not in ['lcf_bert']:
                        torch.save(self.model.bert.state_dict(), bert_path)
                    logger.info('>> saved: {}'.format(model_path))
                    # max_val_f1 = val_f1
                if val_f1 > max_val_f1:
                    max_val_f1 = val_f1
                # fitlog.add_metric(acc,name="Acc",step=step)
    return model_path
def fit(self):
    """Main training loop (MXNet/gluon, multi-GPU via split_and_load).

    Trains for `self.conf.epochs` epochs, logging a smoothed loss curve to
    fitlog, validating each epoch, and saving 'best' (by mIoU) and 'last'
    checkpoints.  Final bookkeeping (best mIoU, record id, split sizes) is
    written to fitlog at the end.
    """
    last_miou = .0  # record the best validation mIoU
    loss_step = 0  # step count
    for epoch in range(self.conf.epochs):
        train_loss = .0
        start = time.time()
        for i, (data, target) in enumerate(self.train_iter):
            # Scatter the batch across all configured devices.
            gpu_datas = split_and_load(data, ctx_list=self.ctx)
            gpu_targets = split_and_load(target, ctx_list=self.ctx)
            with autograd.record():
                loss_gpu = [
                    self.criterion(*self.net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(
                        gpu_datas, gpu_targets)
                ]
            for loss in loss_gpu:
                autograd.backward(loss)
            self.trainer.step(self.conf.bs_train)
            # Block until all async GPU work finishes before reading losses.
            nd.waitall()
            loss_temp = .0
            for losses in loss_gpu:
                loss_temp += losses.sum().asscalar()
            train_loss += (loss_temp / self.conf.bs_train)
            # log every n batch
            # add loss to draw curve, train_loss <class numpy.float64>
            # Log densely early on, then sparsely once the curve settles.
            interval = 5 if loss_step < 5000 else 50
            if (i % interval == 0) or (i + 1 == len(self.train_iter)):
                fitlog.add_loss(name='loss',
                                value=round(train_loss / (i + 1), 5),
                                step=loss_step)
                loss_step += 1
                self.logger.info(
                    "Epoch %d, batch %d, training loss %.5f." %
                    (epoch, i, train_loss / (i + 1)))
        # log each epoch
        self.logger.info(
            ">>>>>> Epoch %d complete, time cost: %.1f sec. <<<<<<" %
            (epoch, time.time() - start))
        # validation each epoch
        if self.val:
            pixel_acc, mean_iou = self._validation()
            self.logger.info(
                "Epoch %d validation, PixelAccuracy: %.4f, mIoU: %.4f." %
                (epoch, pixel_acc, mean_iou))
            fitlog.add_metric(value=mean_iou, step=epoch, name='mIoU')
            fitlog.add_metric(value=pixel_acc, step=epoch, name='PA')
            if mean_iou > last_miou:
                f_name = self._save_model(tag='best')
                self.logger.info(
                    "Epoch %d mIoU: %.4f > %.4f(previous), save model: %s" %
                    (epoch, mean_iou, last_miou, f_name))
                last_miou = mean_iou
    # save the final-epoch params
    f_name = self._save_model(tag='last')
    self.logger.info(">>>>>> Training complete, save model: %s. <<<<<<" % f_name)
    # record
    fitlog.add_best_metric(value=round(last_miou, 4), name='mIoU')
    fitlog.add_other(value=self.id, name='record_id')
    fitlog.add_other(value=self.num_train, name='train')
    fitlog.add_other(value=self.num_val, name='val')
def _train(self, criterion, optimizer):
    """Train the model, evaluating on the test set every `log_step` batches.

    Logs test acc/f1 to fitlog, records bests, optionally saves the model
    when the global best f1 improves, and early-stops after
    `self.opt.early_stop` consecutive epochs without improvement.

    :return: (max_test_acc, max_test_f1) over the whole run.
    """
    max_test_acc = 0
    max_test_f1 = 0
    global_step = 0
    continue_not_increase = 0
    for epoch in range(self.opt.num_epoch):
        print(">" * 100)
        print("epoch: ", epoch)
        n_correct, n_total = 0, 0
        increase_flag = False
        for i_batch, sample_batched in enumerate(self.train_data_loader):
            global_step += 1
            # switch model to training mode, clear gradient accumulators
            self.model.train()
            optimizer.zero_grad()
            inputs = [
                sample_batched[col].to(self.opt.device)
                for col in self.opt.inputs_cols
            ]
            targets = sample_batched["polarity"].to(self.opt.device)
            outputs = self.model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            if global_step % self.opt.log_step == 0:
                # NOTE(review): train accuracy is accumulated only on the
                # logged batches, so it estimates accuracy over every
                # log_step-th batch rather than the full epoch.
                n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
                n_total += len(outputs)
                train_acc = n_correct / n_total
                test_acc, test_f1 = self._evaluate_acc_f1()
                ################fitlog code####################
                fitlog.add_metric(test_acc, name="acc", step=global_step)
                fitlog.add_metric(test_f1, name="f1", step=global_step)
                ################fitlog code####################
                if test_acc > max_test_acc:
                    increase_flag = True
                    fitlog.add_best_metric(test_acc, "acc")
                    max_test_acc = test_acc
                if test_f1 > max_test_f1:
                    increase_flag = True
                    max_test_f1 = test_f1
                    fitlog.add_best_metric(max_test_f1, "f1")
                    # self.global_f1 persists across calls: only save when
                    # this run beats the best f1 seen by any run so far.
                    if self.opt.save and test_f1 > self.global_f1:
                        self.global_f1 = test_f1
                        torch.save(
                            self.model.state_dict(),
                            "state_dict/" + self.opt.model_name + "_" +
                            self.opt.dataset + ".pkl",
                        )
                        print(">>> best model saved.")
                print(
                    "loss: {:.4f}, acc: {:.4f}, test_acc: {:.4f}, test_f1: {:.4f}"
                    .format(loss.item(), train_acc, test_acc, test_f1))
        # Epoch-level early stopping on lack of improvement.
        if increase_flag == False:
            if continue_not_increase >= self.opt.early_stop:
                print("early stop.")
                break
            continue_not_increase += 1
        else:
            continue_not_increase = 0
    return max_test_acc, max_test_f1
def train(args, train_dataset, model, test_dataset):
    """Train `model` on `train_dataset`, periodically evaluating on
    `test_dataset` and logging the best acc/f1 to fitlog.

    :return: (global_step, mean training loss per step, list of all eval results)
    """
    tb_writer = SummaryWriter()
    args.train_batch_size = args.per_gpu_train_batch_size
    train_sampler = RandomSampler(train_dataset)
    collate_fn = get_collate_fn(args)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)

    # Total optimization steps: either an explicit budget (max_steps, which
    # also rescales the epoch count) or derived from the epoch count.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    if args.embedding_type in ('bert', 'roberta'):
        optimizer = get_bert_optimizer(args, model)
    else:
        parameters = filter(lambda param: param.requires_grad,
                            model.parameters())
        optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)

    # Train
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    print("Total steps:", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    all_eval_results = []
    model.zero_grad()
    best_acc = 0
    best_f1 = 0
    with tqdm(total=args.num_train_epochs, desc='Epoch') as pbar:
        for _ in range(int(args.num_train_epochs)):
            pbar.update()
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(args.device) for t in batch)
                inputs, labels = get_input_from_batch(args, batch)
                logit = model(**inputs)
                loss = F.cross_entropy(logit, labels)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                tr_loss += loss.item()
                optimizer.step()
                model.zero_grad()
                global_step += 1
                # Log metrics every `logging_steps` optimizer steps.
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    results, eval_loss = evaluate(args, test_dataset, model)
                    all_eval_results.append(results)
                    if results['acc'] > best_acc:
                        best_acc = results['acc']
                        best_f1 = results['f1']
                        pbar.write(
                            f"Step:{global_step} acc:{round(best_acc, 4)}, f1:{round(best_f1, 4)}")
                        fitlog.add_best_metric({
                            'acc': best_acc,
                            'f1': best_f1,
                            'step': global_step
                        })
                    fitlog.add_metric(name='f1', value=results['f1'],
                                      step=global_step)
                    fitlog.add_metric(name='acc', value=results['acc'],
                                      step=global_step)
                # FIX: the original called `epoch_iterator.close()` here, but
                # `epoch_iterator` survived only in commented-out code, so
                # hitting max_steps raised NameError.  A plain break suffices.
                if args.max_steps > 0 and global_step > args.max_steps:
                    break
            if args.max_steps > 0 and global_step > args.max_steps:
                break
    tb_writer.close()
    return global_step, tr_loss / global_step, all_eval_results
def main():
    """Entry point: build the Roberta-based DFGN model, train on HotpotQA-style
    data, evaluate every epoch, and checkpoint when dev F1 improves.

    Stops early after 3 epochs without dev-F1 improvement.
    """
    args = set_config()
    # Fix seeds for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Prepare model
    # encoder = BertForQuestionAnswering.from_pretrained(args.bert_model)
    # encoder.to(device)
    # encoder.eval()
    # freeze bert:
    # for name, param in model.named_parameters():
    #     if "bert" in name:
    #         param.requires_grad = False
    # model = GraphFusionNet(args)
    # NOTE(review): hard-coded local Windows path to the pretrained weights.
    model = DFGN_Roberta.from_pretrained(
        r'E:\DATA\bert_pretrained\roberta-large', graph_config=args)
    model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    global_step = 0
    if args.do_train:
        # load train data
        train_examples, train_features, train_graph = get_train_feature(
            args, args.do_train, tokenizer)
        train_examples_dict = example_dict(train_examples)
        train_data = DataIteratorPack(train_features, train_examples_dict,
                                      train_graph, args.train_batch_size,
                                      device, sent_limit=25, entity_limit=80,
                                      n_layers=args.n_layers,
                                      sequential=False)
        # (features, example_dict, graph_dict, bsz, device, sent_limit,
        #  entity_limit, n_layers=2, entity_type_dict=None, sequential=False)
        # load dev data
        eval_examples, eval_features, eval_graph = get_train_feature(
            args, not args.do_train, tokenizer)
        eval_examples_dict = example_dict(eval_examples)
        eval_data = DataIteratorPack(eval_features, eval_examples_dict,
                                     eval_graph, args.predict_batch_size,
                                     device, sent_limit=25, entity_limit=80,
                                     n_layers=args.n_layers,
                                     sequential=False)
        # Gold answers for the official evaluation script.
        with open(args.predict_file) as f:
            gold = json.load(f)
        logger.info("***** Running predictions *****")
        logger.info(" Num orig examples = %d", len(train_examples))
        logger.info(" Num split examples = %d", len(train_features))
        logger.info(" Batch size = %d", args.train_batch_size)
        cur_patience = 0
        VERBOSE_STEP = 100
        best_dev_F1 = None
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            total_train_loss = [0] * 5
            for step, batch in enumerate(train_data):
                input_ids = batch["context_idxs"]
                input_mask = batch["context_mask"]
                segment_ids = batch["segment_idxs"]
                start, end, sp, Type, softmask, ent, yp1, yp2 = model(
                    input_ids, segment_ids, input_mask, batch=batch,
                    return_yp=True, is_train=True)
                loss_list = compute_loss(batch, start, end, sp, Type,
                                         softmask, args)
                if args.gradient_accumulation_steps > 1:
                    loss_list = loss_list / args.gradient_accumulation_steps
                # loss_list[0] is presumed to be the combined loss — confirm
                # against compute_loss.
                loss_list[0].backward()
                if (global_step + 1) % args.grad_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                global_step += 1
                for i, l in enumerate(loss_list):
                    if not isinstance(l, int):
                        total_train_loss[i] += l.item()
                if global_step % VERBOSE_STEP == 0:
                    print("-- In Epoch{}: ".format(epoch))
                    for i, l in enumerate(total_train_loss):
                        print("Avg-LOSS{}/batch/step: {}".format(
                            i, l / VERBOSE_STEP))
                    total_train_loss = [0] * 5
            train_data.refresh()
            if args.do_predict:
                eval_examples_dict = example_dict(eval_examples)
                eval_features_dict = example_dict(eval_features)
                logger.info("***** Running predictions *****")
                logger.info(" Num split examples = %d", len(eval_features))
                logger.info(" Batch size = %d", args.predict_batch_size)
                model.eval()
                all_results = []
                answer_dict = {}
                sp_dict = {}
                total_test_loss = [0] * 5
                logger.info("Start evaluating")
                for step, batch in enumerate(eval_data):
                    input_ids = batch["context_idxs"]
                    input_mask = batch["context_mask"]
                    segment_ids = batch["segment_idxs"]
                    if len(sp_dict) % 1000 == 0:
                        logger.info("Processing example: %d" %
                                    (len(all_results)))
                    with torch.no_grad():
                        start, end, sp, Type, softmask, ent, yp1, yp2 = model(
                            input_ids, segment_ids, input_mask, batch=batch,
                            return_yp=True)
                    # context_encoding = encoder(input_ids, segment_ids, input_mask)
                    # loss_list = model(context_encoding, batch=batch)
                    # start, end, sp, Type, softmask, ent, yp1, yp2 = model(
                    #     context_encoding, batch=batch, return_yp=True)
                    loss_list = compute_loss(batch, start, end, sp, Type,
                                             softmask, args)
                    Type = Type.argmax(dim=1)
                    for i, l in enumerate(loss_list):
                        if not isinstance(l, int):
                            total_test_loss[i] += l.item()
                    answer_dict_ = convert_to_tokens(
                        eval_examples_dict, eval_features_dict, batch['ids'],
                        yp1.data.cpu().numpy().tolist(),
                        yp2.data.cpu().numpy().tolist(),
                        Type.cpu().numpy())
                    answer_dict.update(answer_dict_)
                    # Supporting-fact probabilities, thresholded per sentence.
                    predict_support_np = torch.sigmoid(
                        sp[:, :, 1]).data.cpu().numpy()
                    for i in range(predict_support_np.shape[0]):
                        cur_sp_pred = []
                        cur_id = batch['ids'][i]
                        for j in range(predict_support_np.shape[1]):
                            if j >= len(eval_examples_dict[cur_id].sent_names):
                                break
                            if predict_support_np[i, j] > args.sp_threshold:
                                cur_sp_pred.append(
                                    eval_examples_dict[cur_id].sent_names[j])
                        sp_dict.update({cur_id: cur_sp_pred})
                prediction = {'answer': answer_dict, 'sp': sp_dict}
                output_answer_sp_file = os.path.join(
                    args.output_dir,
                    "predictions_answer_sp_{}.json".format(epoch))
                with open(output_answer_sp_file, 'w') as f:
                    json.dump(prediction, f)
                # record results
                # NOTE(review): `eval` here is a project evaluation function
                # that shadows the builtin — confirm its import.
                metrics = eval(prediction, gold)
                # NOTE(review): this divides *train* losses by the number of
                # eval features — verify this mixed bookkeeping is intended.
                for i, l in enumerate(total_train_loss):
                    metrics["LOSS{}".format(i)] = l / len(eval_features)
                    print("Avg-LOSS{}/batch/step: {}".format(
                        i, l / len(eval_features)))
                fitlog.add_best_metric({"Test": metrics})
                metrics = evaluate(eval_examples_dict, answer_dict)
                print('hotpotqa | EM {:.4f} | F1 {:.4f}'.format(
                    metrics['exact_match'], metrics['f1']))
                eval_data.refresh()
                dev_F1 = metrics['f1']
                if best_dev_F1 is None or dev_F1 > best_dev_F1:
                    best_dev_F1 = dev_F1
                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    # Only save the model it-self (unwrap DataParallel).
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    logger.info("model save in %s" % output_model_file)
                    # model_to_save.save_pretrained(output_model_file)
                    # tokenizer.save_pretrained(args.output_dir)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    cur_patience = 0
                else:
                    cur_patience += 1
                    if cur_patience >= 3:
                        # for param_group in optimizer.param_groups:
                        #     param_group['lr'] /= 2.0
                        #     if param_group['lr'] < 1e-8:
                        #         stop_train = True
                        break
# NOTE(review): fragment — tail of a dev-set evaluation loop followed by a
# test-set loop; the enclosing function and the dev loop header are outside
# this view, so the original nesting cannot be confirmed.
batch_size = score.size(0)
embedding_size = 300
score = score.cuda()
data = data.cuda()
predict = model(data)
# print(predict.shape)
loss = loss_function(predict, score)
losss2.append(loss.item())
acc2 += (score == torch.argmax(predict, -1)).cpu().sum().item()
total2 += score.size(0)
print("dev epoch %s accuracy is %s loss is %s " %
      (str(i), str(acc2 / total2), str(np.mean(losss2))))
fitlog.add_metric(acc2 / total2, name="test_acc", step=i)
if acc2 / total2 > best_acc:
    best_acc = acc2 / total2
    fitlog.add_best_metric({"dev": best_acc})
# Reset the accumulators before the test-set pass.
losss2 = []
acc2 = 0
total2 = 0
for num, (score, data) in tqdm(enumerate(test_loader)):
    batch_size = score.size(0)
    embedding_size = 300
    score = score.cuda()
    data = data.cuda()
    predict = model(data)
    loss = loss_function(predict, score)
    losss2.append(loss.item())
    acc2 += (score == torch.argmax(predict, -1)).cpu().sum().item()
    total2 += score.size(0)
    # for each in zip(score,predict,data):
    #     file.write(str(each)+"\n")
T_0=epochs, T_mult=1, eta_min=1e-6)  # NOTE(review): tail of a scheduler constructor (call start not in view — presumably CosineAnnealingWarmRestarts)
scaler = GradScaler()
for epoch in range(epochs):
    train_one_epoch(fold, epoch, model, criterion, optimizer, trn_loader,
                    device, scheduler=scheduler)
    valid_one_epoch(fold, epoch, model, criterion, optimizer, val_loader,
                    device, scheduler=None)
print(f'fold {fold} max acc {max_acc}')
acc.append(max_acc)
# NOTE(review): `break` implies this whole fragment sits inside an enclosing
# fold loop that is outside this view.
if not args.nfold:
    break
if args.nfold:
    # Report the mean of the per-fold best accuracies.
    fitlog.add_best_metric({"val": {"mean_acc": np.mean(acc)}})
fitlog.finish()
def main():
    """Entry point for the joint constituency-parsing / NER trainer.

    Builds the joint encoder model, trains with gradient accumulation,
    periodically evaluates on dev and test, logs the best dev parsing-F
    (and corresponding test NER-F) to fitlog, and saves the best model.
    """
    # ====== preprocess ====== #
    args = preprocess()
    # ====== Loading dataset ====== #
    train_data, dev_data, test_data, joint_vocabs, parsing_vocabs = load_data(
        args.joint_input, args.parsing_input, args.batch_size,
        args.accum_steps, args.shuffle, args.num_workers, args.drop_last)
    # cross_labels_idx = generate_cross_labels_idx(vocabs['labels'])
    # ======= Preparing Model ======= #
    print("\nModel Preparing starts...")
    model = JointEncoderModel(
        joint_vocabs,
        parsing_vocabs,
        # cross_labels_idx,
        # Embedding
        args.subword,
        args.use_pos_tag,
        args.bert_path,
        args.transliterate,
        args.d_model,
        args.partition,
        args.pos_tag_emb_dropout,
        args.position_emb_dropout,
        args.bert_emb_dropout,
        args.emb_dropout,
        # Encoder
        args.layer_num,
        args.hidden_dropout,
        args.attention_dropout,
        args.dim_ff,
        args.nhead,
        args.kqv_dim,
        # classifier
        args.label_hidden,
        # loss
        args.lambda_scaler,
        args.alpha_scaler,
        args.language,
        args.device).cuda()
    # print(model, end='\n\n\n')
    optimizer = Optim(model, args.optim, args.lr, args.lr_fine_tune,
                      args.warmup_steps, args.lr_decay_factor,
                      args.weight_decay, args.clip_grad,
                      args.clip_grad_max_norm)
    optimizer.zero_grad()
    # if args.freeze_bert:
    #     optimizer.set_freeze_by_idxs(
    #         [str(num) for num in range(0, config.freeze_bert_layers)], True)
    #     optimizer.free_embeddings()
    #     optimizer.freeze_pooler()
    #     print('freeze model of BERT %d layers' % config.freeze_bert_layers)
    # ========= Training ========= #
    print('Training starts...')
    start = time.time()
    steps, loss_value, total_batch_size = 1, 0., 0
    best_dev, best_test = None, None
    patience = args.patience
    for epoch_i in range(1, args.epoch):
        for batch_i, insts in enumerate(train_data, start=1):
            model.train()
            # Drop over-long sentences, then split the batch into chunks that
            # respect the per-batch length limit.
            insts, batch_size, max_len = batch_filter(
                insts, args.language, args.DATASET_MAX_SNT_LENGTH)
            insts_list = batch_spliter(insts, max_len,
                                       args.BATCH_MAX_SNT_LENGTH)
            total_batch_size += batch_size
            for insts in insts_list:
                loss = model(insts)
                if loss.item() > 0.:
                    loss.backward()
                    loss_value += loss.item()
            # loss_value must stay a float; a lingering Tensor would keep the
            # autograd graph (and GPU memory) alive.
            assert not isinstance(loss_value, torch.Tensor), 'GPU memory leak'
            if batch_i == args.accum_steps and not args.debug:
                args.visual_logger.visual_histogram(
                    model, steps // args.accum_steps)
            # Apply accumulated gradients every accum_steps micro-batches.
            if steps % args.accum_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            if steps % (args.accum_steps * args.log_interval) == 0:
                print('[%d/%d], [%d/%d] Loss: %.05f' %
                      (epoch_i, args.epoch, batch_i // args.accum_steps,
                       len(train_data) // args.accum_steps,
                       loss_value / total_batch_size), flush=True)
                visual_dic = {
                    'loss/train': loss_value,
                    'lr': optimizer.get_lr()[0]
                }
                if args.clip_grad:
                    visual_dic['norm'] = optimizer.get_dynamic_gard_norm()
                if not args.debug:
                    args.visual_logger.visual_scalars(
                        visual_dic, steps // args.accum_steps)
                loss_value, total_batch_size = 0., 0
                torch.cuda.empty_cache()
            if steps % (args.accum_steps * args.eval_interval) == 0:
                print('model evaluating starts...', flush=True)
                joint_fscore_dev, res_data_dev = eval_model(
                    model, dev_data, args.language,
                    args.DATASET_MAX_SNT_LENGTH, args.BATCH_MAX_SNT_LENGTH,
                    args.evalb_path, 'dev')
                joint_fscore_test, res_data_test = eval_model(
                    model, test_data, args.language,
                    args.DATASET_MAX_SNT_LENGTH, args.BATCH_MAX_SNT_LENGTH,
                    args.evalb_path, 'test')
                visual_dic = {
                    'F/parsing_dev': joint_fscore_dev.parsing_f,
                    'F/parsing_test': joint_fscore_test.parsing_f,
                    'F/ner_dev': joint_fscore_dev.ner_f,
                    'F/ner_test': joint_fscore_test.ner_f
                }
                if not args.debug:
                    args.visual_logger.visual_scalars(
                        visual_dic, steps // args.accum_steps)
                # Model selection on dev parsing-F.
                if best_dev is None or joint_fscore_dev.parsing_f > best_dev.parsing_f:
                    best_dev, best_test = joint_fscore_dev, joint_fscore_test
                    fitlog.add_best_metric({
                        'parsing_f_dev': best_dev.parsing_f,
                        'ner_f_test': best_test.ner_f
                    })
                    patience = args.patience
                    write_joint_data(args.save_path, res_data_dev, 'dev')
                    write_joint_data(args.save_path, res_data_test, 'test')
                    if args.save:
                        torch.save(
                            model.pack_state_dict(),
                            os.path.join(args.save_path,
                                         args.name + '.best.model.pt'))
                print('best performance:\ndev: %s\ntest: %s' %
                      (best_dev, best_test))
                print('model evaluating ends...', flush=True)
                del res_data_dev, res_data_test
                if args.debug:
                    exit(0)
            steps += 1
            if args.early_stop:
                patience -= 1
                if patience < 0:
                    print('early stop')
                    break
    # ====== postprocess ====== #
    postprocess(args, start)
return data  # NOTE(review): tail of a function whose definition starts outside this view


def run(path, force_reprocess, name="data.pkl"):
    """Return (vocab, data), loading it from the `name` pickle cache when
    present (unless `force_reprocess`), otherwise rebuilding from `path`.

    NOTE(review): the cache is always rewritten to the literal "data.pkl",
    not to `name` — confirm whether that mismatch is intentional.
    """
    if (not force_reprocess) and os.path.exists(name):
        with open(name, "rb") as fil:
            ret = pickle.load(fil)
    else:
        train_data = load(path)
        train_data = indexize(train_data)
        # `vocab` is presumably a module-level object filled by indexize —
        # TODO confirm.
        ret = vocab, train_data
        #pdb.set_trace()
    logger.log("vocab len:", len(ret[0]))
    logger.log(" data len:", len(ret[1]))
    with open("data.pkl", "wb") as fil:
        pickle.dump(ret, fil)
    return ret


if __name__ == "__main__":
    run(C.data_path, C.force_reprocess)
    # Smoke-test value so the fitlog record is marked complete.
    fitlog.add_best_metric(2333, "test")
    fitlog.finish()
T_0=epochs, T_mult=1, eta_min=1e-6)  # NOTE(review): tail of a scheduler constructor (call start not in view — presumably CosineAnnealingWarmRestarts)
scaler = GradScaler()
for epoch in range(epochs):
    train_one_epoch(fold, epoch, model, criterion, optimizer, trn_loader,
                    device, scheduler=scheduler)
    valid_one_epoch(fold, epoch, model, criterion, optimizer, val_loader,
                    device, scheduler=None)
print(f'fold {fold} max dice {max_dice}')
# max_dice is a GPU tensor here; move it to host memory before storing.
dice.append(max_dice.cpu().numpy())
# NOTE(review): `break` implies this fragment sits inside an enclosing fold
# loop that is outside this view.
if not args.nfold:
    break
if args.nfold:
    # Report the mean of the per-fold best dice scores.
    fitlog.add_best_metric({"val": {"mean_dice": np.mean(dice)}})
fitlog.finish()