def tester(cfg):
    """Evaluate a saved checkpoint on the test split.

    Args:
        cfg: experiment config; ``cfg.TEST.MODEL`` is the checkpoint path
            (a leading '.' is expanded to the current working directory).
    """
    print('testing')
    dataloader_test, dataset_size_test = data.make_dataloader(cfg, is_train=False)

    # Built model is immediately replaced by the loaded checkpoint below;
    # building first keeps the custom module classes imported for torch.load.
    model = modeling.build(cfg)

    if cfg.TEST.MODEL.startswith('.'):
        # Expand only the LEADING dot. The previous unbounded replace()
        # also rewrote dots inside the filename (e.g. 'model.pth').
        load_path = cfg.TEST.MODEL.replace(".", os.path.realpath("."), 1)
    else:
        load_path = cfg.TEST.MODEL

    # Loads the entire pickled nn.Module (not a state_dict).
    model = torch.load(load_path)
    model.cuda()

    vis_test = Visualization(cfg, dataset_size_test, is_train=False)

    writer_path = os.path.join(cfg.VISUALIZATION_DIRECTORY, cfg.EXPERIMENT_NAME)
    writer = SummaryWriter(writer_path)

    total_iterations = 0
    total_iterations_val = 0

    model.eval()
    epoch = 1
    for iteration, batch in enumerate(dataloader_test):
        # Batch layout assumed from the collate order — confirm in data module.
        index = batch[0]
        videoFeat = batch[1].cuda()
        videoFeat_lengths = batch[2].cuda()
        tokens = batch[3].cuda()
        tokens_lengths = batch[4].cuda()
        start = batch[5].cuda()
        end = batch[6].cuda()
        localiz = batch[7].cuda()
        localiz_lengths = batch[8]
        time_starts = batch[9]
        time_ends = batch[10]
        factors = batch[11]
        fps = batch[12]
        frame_start = batch[13]
        frame_end = batch[14]

        loss, individual_loss, pred_start, pred_end, attention, atten_loss = model(
            videoFeat, videoFeat_lengths, tokens, tokens_lengths,
            start, end, localiz, frame_start, frame_end)

        aux = vis_test.run(
            index, pred_start, pred_end, start, end, videoFeat_lengths,
            epoch, loss.detach(), individual_loss, attention, atten_loss,
            time_starts, time_ends, factors, fps)
        total_iterations_val += 1

    # Produce the aggregate plots/metrics for this evaluation pass.
    a = vis_test.plot(epoch)
def main():
    """Entry point for evaluation: prepare config, build model, run eval.

    Keyboard interrupts and other failures are routed to the project's
    cleanup handlers.
    """
    # Pre-bind cfg so the exception handlers never hit a NameError when
    # prepare_experiment itself raises.
    cfg = None
    try:
        cfg = prepare_experiment(eval_cfg, 'e')
        model = YOLOv2Model(cfg, training=False)
        eval_dataloader = make_dataloader(cfg, training=False)
        model.eval(eval_dataloader)
    except KeyboardInterrupt:
        handle_keyboard_interruption(cfg)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/GeneratorExit
        # are no longer swallowed.
        handle_other_exception(cfg)
def main():
    """Entry point for training: prepare config, build model, run training.

    Keyboard interrupts and other failures are routed to the project's
    cleanup handlers.
    """
    # Pre-bind cfg so the exception handlers never hit a NameError when
    # prepare_experiment itself raises.
    cfg = None
    try:
        cfg = prepare_experiment(train_cfg, 't')
        model = YOLOv2Model(cfg, training=True)
        train_dataloader, eval_dataloader = make_dataloader(cfg, training=True)
        model.train(train_dataloader, eval_dataloader)
    except KeyboardInterrupt:
        handle_keyboard_interruption(cfg)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/GeneratorExit
        # are no longer swallowed.
        handle_other_exception(cfg)
def __init__(self, dataset, n_epochs, epochs_per_line, lr, lr_schedule, batch_size,
             save_frequency, incompressible_flow, empirical_vars, data_root_dir='./',
             n_classes=None, n_data_points=None, init_identity=True):
    """Set up a GIN/GLOW normalizing-flow experiment.

    Args:
        dataset: '10d' (artificial data) or 'EMNIST'.
        n_epochs: total number of training epochs.
        epochs_per_line: epochs between progress lines (presumably consumed
            by the training loop — not used here).
        lr: learning rate.
        lr_schedule: LR schedule (stored for use elsewhere).
        batch_size: training batch size.
        save_frequency: checkpoint frequency; capped at n_epochs.
        incompressible_flow: truthy -> 'gin' coupling blocks, else 'glow'.
        empirical_vars: truthy -> latent mu/sigma estimated empirically;
            otherwise learned as nn.Parameters.
        data_root_dir: root directory for EMNIST data.
        n_classes: number of classes (must be an int for '10d').
        n_data_points: number of artificial data points ('10d' only).
        init_identity: init coupling blocks to identity (required for EMNIST).

    Raises:
        RuntimeError: on an unknown dataset name, or EMNIST with
            init_identity=False.
    """
    super().__init__()
    self.dataset = dataset
    self.n_epochs = n_epochs
    self.epochs_per_line = epochs_per_line
    self.lr = lr
    self.lr_schedule = lr_schedule
    self.batch_size = batch_size
    # Never checkpoint less often than the run is long.
    self.save_frequency = min(save_frequency, n_epochs)
    self.incompressible_flow = bool(incompressible_flow)
    self.empirical_vars = bool(empirical_vars)
    self.init_identity = bool(init_identity)
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Timestamp doubles as the unique save-directory name.
    self.timestamp = str(int(time()))
    if self.dataset == '10d':
        self.net = construct_net_10d(coupling_block='gin' if self.incompressible_flow else 'glow',
                                     init_identity=init_identity)
        assert type(n_classes) is int
        self.n_classes = n_classes
        self.n_dims = 10
        self.save_dir = os.path.join('./artificial_data_save/', self.timestamp)
        # Generated latents/targets are kept so ground truth is recoverable.
        self.latent, self.data, self.target = generate_artificial_data_10d(self.n_classes, n_data_points)
        self.train_loader = make_dataloader(self.data, self.target, self.batch_size)
    elif self.dataset == 'EMNIST':
        if not init_identity:
            raise RuntimeError('init_identity=False not implemented for EMNIST experiments')
        self.net = construct_net_emnist(coupling_block='gin' if self.incompressible_flow else 'glow')
        self.n_classes = 10
        self.n_dims = 28*28
        self.save_dir = os.path.join('./emnist_save/', self.timestamp)
        self.data_root_dir = data_root_dir
        self.train_loader = make_dataloader_emnist(batch_size=self.batch_size, train=True,
                                                   root_dir=self.data_root_dir)
        self.test_loader = make_dataloader_emnist(batch_size=1000, train=False,
                                                  root_dir=self.data_root_dir)
    else:
        raise RuntimeError("Check dataset name. Doesn't match.")
    if not empirical_vars:
        # Learned per-class latent means and log-stddevs.
        self.mu = nn.Parameter(torch.zeros(self.n_classes, self.n_dims).to(self.device)).requires_grad_()
        self.log_sig = nn.Parameter(torch.zeros(self.n_classes, self.n_dims).to(self.device)).requires_grad_()
        # initialize these parameters to reasonable values
        self.set_mu_sig(init=True)
    self.to(self.device)
def train(args, model, optimizer, criterion, gids=None):
    """Train a re-ID model and run a final evaluation.

    Args:
        args: run configuration (num_epochs, lr, loss_type, exp_root, ...).
        model: the network (possibly wrapped in DataParallel).
        optimizer: optimizer over model parameters.
        criterion: loss callable, or a dict of losses for 'softmax-triplet'.
        gids: GPU id list; >1 entries implies a DataParallel-wrapped model.
    """
    tb = SummaryWriter(comment='_{}'.format(args.loss_type))
    model.train()

    def _save(path):
        # DataParallel keeps the real network under .module; save the
        # unwrapped state_dict so checkpoints load on a single GPU too.
        if gids is not None and len(gids) > 1:
            torch.save(model.module.state_dict(), path)
        else:
            torch.save(model.state_dict(), path)

    train_loss = []
    t0 = int(time.time())
    for epoch in range(args.num_epochs):
        # Rebuild the dataloader every 10 epochs (e.g. to resample episodes).
        if epoch % 10 == 0:
            dataloader = make_dataloader(args, epoch)
        print('=== Epoch {}/{} ==='.format(epoch, args.num_epochs))
        adjust_lr_exp(optimizer, args.lr, epoch+1, args.num_epochs, args.lr_decay_start_epoch)

        for iteration, (image, label) in enumerate(dataloader):
            if args.cuda:
                image, label = image.cuda(gids[0]), label.cuda(gids[0])

            if args.loss_type == 'softmax':
                _, logits = model(image)
                loss = criterion(logits, label)
            elif args.loss_type == 'softmax-triplet':
                # Weighted blend of classification and metric losses.
                feat, logits = model(image)
                loss = args.alpha * criterion['softmax'](logits, label) \
                    + (1 - args.alpha) * criterion['triplet'](feat, label)
            else:
                feat = model(image)
                loss = criterion(feat, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # print training info
            train_loss.append(loss.item())
            if args.loss_type == 'dmml':
                print('Episode: {}, Loss: {:.6f}'.format(iteration, loss.item()))
            else:
                print('Batch: {}, Loss: {:.6f}'.format(iteration, loss.item()))

        avg_training_loss = np.mean(train_loss)
        print('Average loss: {:.6f}'.format(avg_training_loss))
        tb.add_scalar('Train loss', avg_training_loss, epoch+1)
        train_loss = []

        t = int(time.time())
        print('Time elapsed: {}h {}m'.format((t - t0) // 3600, ((t - t0) % 3600) // 60))

        # Periodic checkpoints only in the second half of training.
        if epoch % 100 == 0 and epoch >= args.num_epochs // 2:
            _save(os.path.join(args.exp_root, 'model_{}.pth'.format(epoch)))
            print('Model {} saved.'.format(epoch))

    # Fixed final-checkpoint name (the old '.format(epoch)' here was a no-op
    # on a placeholder-less string and has been removed).
    _save(os.path.join(args.exp_root, 'model_last.pth'))
    print('Final model saved.')

    tb.close()
    # `eval` is the project's evaluation routine (shadows the builtin).
    eval(gid=gids[0], dataset=args.dataset, dataset_root=args.dataset_root,
         which='last', exp_dir=args.exp_root)
def _train():
    """Fine-tune BertMouth on masked-token data with per-epoch validation.

    Reads hyperparameters from the module-level ``args``; saves the model
    (timestamped) even after a KeyboardInterrupt.
    """
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Compare the device *type*: `torch.device(...) != "cpu"` compares a
    # device object to a str, which is version-dependent in PyTorch.
    if device.type != "cpu":
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-japanese-whole-word-masking',
        do_lower_case=False, tokenize_chinese_chars=False)

    logger.info("loading data")
    logger.info(f"loading data {args.train_file}")
    train_dataloader = make_dataloader(args.train_file, args.max_seq_length,
                                       args.train_batch_size, tokenizer)
    valid_dataloader = make_dataloader(args.valid_file, args.max_seq_length,
                                       args.train_batch_size, tokenizer)

    logger.info("building model")
    model = BertMouth.from_pretrained(args.bert_model,
                                      num_labels=tokenizer.vocab_size)
    model.to(device)

    # Drop the pooler — it is unused for masked-token prediction.
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    logger.info("setting optimizer")
    # Standard BERT recipe: no weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimization_steps = len(train_dataloader) * args.num_train_epochs
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate, eps=args.adam_epsilon)
    # Fixed: get_linear_schedule_with_warmup takes num_warmup_steps /
    # num_training_steps; the old warmup_steps / t_total kwargs belong to
    # the deprecated WarmupLinearSchedule API and raise TypeError here.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=optimization_steps)
    loss_fct = CrossEntropyLoss(ignore_index=0)

    def calc_batch_loss(batch):
        # Cross-entropy over the vocabulary, ignoring padding (label 0).
        batch = tuple(t.to(device) for t in batch)
        input_ids, y, input_mask, input_type_id, masked_pos = batch
        logits = model(input_ids, input_type_id, input_mask)
        logits = logits.view(-1, tokenizer.vocab_size)
        y = y.view(-1)
        loss = loss_fct(logits, y)
        return loss

    logger.info("train starts")
    model.train()
    summary_writer = SummaryWriter(log_dir="logs")
    generated_texts = []
    try:
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            train_loss = 0.
            running_num = 0
            for step, batch in enumerate(train_dataloader):
                loss = calc_batch_loss(batch)
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                train_loss += loss.item()
                running_num += len(batch[0])
            logger.info("[{0} epochs] "
                        "train loss: {1:.3g} ".format(epoch + 1,
                                                      train_loss / running_num))
            summary_writer.add_scalar("train_loss",
                                      train_loss / running_num, epoch)

            model.eval()
            valid_loss = 0.
            valid_num = 0
            # no_grad: identical loss values, no autograd graph retained.
            with torch.no_grad():
                for batch in valid_dataloader:
                    valid_loss += calc_batch_loss(batch).item()
                    valid_num += len(batch[0])
            generated_texts.append(
                generate(tokenizer=tokenizer, device=device, length=25,
                         max_length=args.max_seq_length, model=model))
            logger.info("[{0} epochs] valid loss: {1:.3g}".format(
                epoch + 1, valid_loss / valid_num))
            summary_writer.add_scalar("val_loss", valid_loss / valid_num, epoch)
            model.train()
    except KeyboardInterrupt:
        # Fall through to save whatever has been trained so far.
        logger.info("KeyboardInterrupt")

    summary_writer.close()
    dt_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    save(args, model, tokenizer, str(dt_now))
def train(args, tokenizer, device):
    """Fine-tune BertMouth on masked-token data with per-epoch validation.

    Args:
        args: run configuration (files, batch size, epochs, LR, ...).
        tokenizer: BERT tokenizer providing vocab_size.
        device: torch device for model and batches.
    """
    logger.info("loading data")
    train_dataloader = make_dataloader(args.train_file, args.max_seq_length,
                                       args.train_batch_size, tokenizer)
    valid_dataloader = make_dataloader(args.valid_file, args.max_seq_length,
                                       args.train_batch_size, tokenizer)

    logger.info("building model")
    model = BertMouth.from_pretrained(args.bert_model,
                                      num_labels=tokenizer.vocab_size)
    model.to(device)

    # Exclude the (unused) pooler parameters from optimization.
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    logger.info("setting optimizer")
    # Standard BERT recipe: no weight decay on biases / LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimization_steps = len(train_dataloader) * args.num_train_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=optimization_steps)
    loss_fct = CrossEntropyLoss(ignore_index=0)

    def calc_batch_loss(batch):
        # Returns a combined loss for one batch.
        batch = tuple(t.to(device) for t in batch)
        input_ids, y, input_mask, input_type_id, masked_pos = batch
        # NOTE(review): every pair is labeled 1 ("not next") — confirm this
        # constant next_sentence_label is intended.
        next_sentence_label = torch.tensor(
            [1 for _ in range(len(input_ids))]).to(device)
        masked_lm_labels = input_ids.clone()
        # Positions holding token id 4 (presumably [MASK]) are set to -100,
        # the ignore index for the LM loss — TODO confirm id 4 is [MASK].
        masked_lm_labels[masked_lm_labels == 4] = -100
        outputs = model(input_ids=input_ids, token_type_ids=input_type_id,
                        attention_mask=input_mask,
                        masked_lm_labels=masked_lm_labels,
                        next_sentence_label=next_sentence_label)
        # NOTE(review): outputs[0] is reshaped as vocabulary logits here —
        # verify BertMouth returns logits (not a loss) in slot 0 when labels
        # are supplied.
        logits = outputs[0].view(-1, tokenizer.vocab_size)
        y = y.view(-1)
        # NOTE(review): .item() detaches outputs[1]/outputs[2]; they change
        # the reported loss value but contribute NO gradients — confirm
        # this is intentional.
        loss = loss_fct(logits, y) + outputs[1].item() + outputs[2].item()
        return loss

    logger.info("train starts")
    model.train()
    summary_writer = SummaryWriter(log_dir="logs")
    generated_texts = []
    try:
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            train_loss = 0.
            running_num = 0
            for step, batch in enumerate(train_dataloader):
                loss = calc_batch_loss(batch)
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                # Accumulate for the per-epoch average.
                train_loss += loss.item()
                running_num += len(batch[0])
            logger.info("[{0} epochs] "
                        "train loss: {1:.3g} ".format(epoch + 1,
                                                      train_loss / running_num))
            summary_writer.add_scalar("train_loss",
                                      train_loss / running_num, epoch)

            model.eval()
            valid_loss = 0.
            valid_num = 0
            # NOTE(review): validation runs without torch.no_grad(); the
            # graph is built and discarded each batch.
            for batch in valid_dataloader:
                valid_loss += calc_batch_loss(batch).item()
                valid_num += len(batch[0])
            # Sample a generation per epoch to eyeball progress.
            generated_texts.append(generate(tokenizer=tokenizer, device=device,
                                            length=25,
                                            max_length=args.max_seq_length,
                                            model=model))
            logger.info("[{0} epochs] valid loss: {1:.3g}".format(
                epoch + 1, valid_loss / valid_num))
            summary_writer.add_scalar("val_loss", valid_loss / valid_num, epoch)
            model.train()
    except KeyboardInterrupt:
        # Fall through and save whatever has been trained so far.
        logger.info("KeyboardInterrupt")

    summary_writer.close()
    dt_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    save(args, model, tokenizer, str(dt_now))
def trainer(cfg):
    """Train the moment-localization model; evaluate on the test split each epoch.

    Args:
        cfg: experiment config (EPOCHS, EXPERIMENT_NAME, visualization dirs, ...).
    """
    print('trainer')
    dataloader_train, dataset_size_train = data.make_dataloader(cfg, is_train=True)
    dataloader_test, dataset_size_test = data.make_dataloader(cfg, is_train=False)

    model = modeling.build(cfg)
    model.cuda()
    #model = torch.load("/home/crodriguezo/projects/phd/moment-localization-with-NLP/mlnlp_lastversion/checkpoints/anet_config7/model_epoch_80")
    optimizer = solver.make_optimizer(cfg, model)

    # Per-split metric/visualization trackers (loss, mIoU, plots).
    vis_train = Visualization(cfg, dataset_size_train)
    vis_test = Visualization(cfg, dataset_size_test, is_train=False)

    writer_path = os.path.join(cfg.VISUALIZATION_DIRECTORY, cfg.EXPERIMENT_NAME)
    writer = SummaryWriter(writer_path)

    total_iterations = 0
    total_iterations_val = 0
    for epoch in range(cfg.EPOCHS):
        print("Epoch {}".format(epoch))
        model.train()
        for iteration, batch in enumerate(dataloader_train):
            # Batch layout assumed from the collate order — confirm in the
            # data module.
            index = batch[0]
            videoFeat = batch[1].cuda()
            videoFeat_lengths = batch[2].cuda()
            tokens = batch[3].cuda()
            tokens_lengths = batch[4].cuda()
            start = batch[5].cuda()
            end = batch[6].cuda()
            localiz = batch[7].cuda()
            localiz_lengths = batch[8]
            time_starts = batch[9]
            time_ends = batch[10]
            factors = batch[11]
            fps = batch[12]

            loss, individual_loss, pred_start, pred_end, attention, atten_loss = model(
                videoFeat, videoFeat_lengths, tokens, tokens_lengths,
                start, end, localiz)
            print("Loss :{}".format(loss))

            optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm at 5 to stabilize training.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            vis_train.run(index, pred_start, pred_end, start, end,
                          videoFeat_lengths, epoch, loss.detach(),
                          individual_loss, attention, atten_loss,
                          time_starts, time_ends, factors, fps)

            writer.add_scalar(f'mlnlp/Progress_Loss', loss.item(), total_iterations)
            writer.add_scalar(f'mlnlp/Progress_Attention_Loss', atten_loss.item(), total_iterations)
            writer.add_scalar(f'mlnlp/Progress_Mean_IoU', vis_train.mIoU[-1], total_iterations)
            total_iterations += 1.

        # Per-epoch aggregates over the training split.
        writer.add_scalar(f'mlnlp/Train_Loss', np.mean(vis_train.loss), epoch)
        writer.add_scalar(f'mlnlp/Train_Mean_IoU', np.mean(vis_train.mIoU), epoch)
        vis_train.plot(epoch)

        # Saves the whole pickled module (not a state_dict) every epoch.
        torch.save(
            model,
            "./checkpoints/{}/model_epoch_{}".format(cfg.EXPERIMENT_NAME, epoch))

        model.eval()
        # NOTE(review): evaluation runs without torch.no_grad(); the graph
        # is built and discarded each batch.
        for iteration, batch in enumerate(dataloader_test):
            index = batch[0]
            videoFeat = batch[1].cuda()
            videoFeat_lengths = batch[2].cuda()
            tokens = batch[3].cuda()
            tokens_lengths = batch[4].cuda()
            start = batch[5].cuda()
            end = batch[6].cuda()
            localiz = batch[7].cuda()
            localiz_lengths = batch[8]
            time_starts = batch[9]
            time_ends = batch[10]
            factors = batch[11]
            fps = batch[12]

            loss, individual_loss, pred_start, pred_end, attention, atten_loss = model(
                videoFeat, videoFeat_lengths, tokens, tokens_lengths,
                start, end, localiz)

            vis_test.run(index, pred_start, pred_end, start, end,
                         videoFeat_lengths, epoch, loss.detach(),
                         individual_loss, attention, atten_loss,
                         time_starts, time_ends, factors, fps)
            #print(loss)
            writer.add_scalar(f'mlnlp/Progress_Valid_Loss', loss.item(), total_iterations_val)
            writer.add_scalar(f'mlnlp/Progress_Valid_Atten_Loss', atten_loss.item(), total_iterations_val)
            writer.add_scalar(f'mlnlp/Progress_Valid_Mean_IoU', vis_test.mIoU[-1], total_iterations_val)
            total_iterations_val += 1

        # Per-epoch aggregates over the test split; plot() returns the
        # tIoU-threshold metrics dict logged below.
        writer.add_scalar(f'mlnlp/Valid_Loss', np.mean(vis_test.loss), epoch)
        writer.add_scalar(f'mlnlp/Valid_Mean_IoU', np.mean(vis_test.mIoU), epoch)
        a = vis_test.plot(epoch)
        writer.add_scalars(f'mlnlp/Valid_tIoU_th', a, epoch)
def tester(cfg):
    """Evaluate a saved checkpoint (object/human-graph variant) on the test split.

    Args:
        cfg: experiment config; ``cfg.TEST.MODEL`` is the checkpoint path.
    """
    print('testing')
    dataloader_test, dataset_size_test = data.make_dataloader(cfg, is_train=False)

    # Built model is immediately replaced by the loaded checkpoint; building
    # first keeps the custom module classes importable for torch.load.
    model = modeling.build(cfg)
    # torch.nn.Module.dump_patches = True
    model = torch.load(cfg.TEST.MODEL)
    # print(model)
    model.cuda()

    vis_test = Visualization(cfg, dataset_size_test, is_train=False)

    writer_path = os.path.join(cfg.VISUALIZATION_DIRECTORY, cfg.EXPERIMENT_NAME)
    writer = SummaryWriter(writer_path)

    total_iterations = 0
    total_iterations_val = 0

    model.eval()
    epoch = 1
    results_data = {}
    for iteration, batch in enumerate(dataloader_test):
        # Batch layout assumed from the collate order — confirm in the data
        # module; slots 13-16 carry object/human node features and lengths.
        index = batch[0]
        videoFeat = batch[1].cuda()
        videoFeat_lengths = batch[2].cuda()
        tokens = batch[3].cuda()
        tokens_lengths = batch[4].cuda()
        start = batch[5].cuda()
        end = batch[6].cuda()
        localiz = batch[7].cuda()
        localiz_lengths = batch[8]
        time_starts = batch[9]
        time_ends = batch[10]
        factors = batch[11]
        fps = batch[12]
        objects = batch[13].cuda()
        objects_lengths = batch[14].cuda()
        humans = batch[15].cuda()
        humans_lengths = batch[16].cuda()

        loss, individual_loss, pred_start, pred_end, attention, atten_loss, \
            attentionNodeQueryHO, attentionNodeQueryVH, attentionNodeQueryVO = model(
                videoFeat, videoFeat_lengths,
                objects, objects_lengths,
                humans, humans_lengths,
                tokens, tokens_lengths,
                start, end, localiz)

        aux = vis_test.run(index, pred_start, pred_end, start, end,
                           videoFeat_lengths, epoch, loss.detach(),
                           individual_loss, attention, atten_loss,
                           time_starts, time_ends, factors, fps,
                           attentionNodeQueryHO, attentionNodeQueryVH,
                           attentionNodeQueryVO)
        total_iterations_val += 1

        # Merge per-sample results from this batch.
        # NOTE(review): results_data is accumulated but never returned or
        # saved within this function — confirm whether output was intended.
        for k, v in aux.items():
            results_data[k] = v
def trainer(cfg):
    """Train the object/human-graph localization model with a StepLR schedule.

    Args:
        cfg: experiment config (EPOCHS, EXPERIMENT_NAME, visualization dirs, ...).
    """
    print('trainer')
    dataloader_train, dataset_size_train = data.make_dataloader(cfg, is_train=True)
    dataloader_test, dataset_size_test = data.make_dataloader(cfg, is_train=False)

    model = modeling.build(cfg)
    model.cuda()

    optimizer = solver.make_optimizer(cfg, model)
    # LR multiplied by 0.01 every 6 epochs.
    scheduler = StepLR(optimizer, step_size=6, gamma=0.01)

    # Per-split metric/visualization trackers (loss, mIoU, plots).
    vis_train = Visualization(cfg, dataset_size_train)
    vis_test = Visualization(cfg, dataset_size_test, is_train=False)

    writer_path = os.path.join(cfg.VISUALIZATION_DIRECTORY, cfg.EXPERIMENT_NAME)
    writer = SummaryWriter(writer_path)

    total_iterations = 0
    total_iterations_val = 0
    for epoch in range(cfg.EPOCHS):
        # Decay Learning Rate
        # print("Epoch {}".format(epoch))
        # NOTE(review): scheduler.get_lr() is deprecated in newer PyTorch in
        # favor of get_last_lr() — kept as-is for behavior parity.
        print('Epoch:', epoch, 'LR:', scheduler.get_lr())
        model.train()
        for iteration, batch in enumerate(dataloader_train):
            # Batch layout assumed from the collate order — confirm in the
            # data module; slots 13-16 carry object/human node features.
            index = batch[0]
            videoFeat = batch[1].cuda()
            videoFeat_lengths = batch[2].cuda()
            tokens = batch[3].cuda()
            tokens_lengths = batch[4].cuda()
            start = batch[5].cuda()
            end = batch[6].cuda()
            localiz = batch[7].cuda()
            localiz_lengths = batch[8]
            time_starts = batch[9]
            time_ends = batch[10]
            factors = batch[11]
            fps = batch[12]
            objects = batch[13].cuda()
            objects_lengths = batch[14].cuda()
            humans = batch[15].cuda()
            humans_lengths = batch[16].cuda()

            loss, individual_loss, pred_start, pred_end, attention, atten_loss, \
                attentionNodeQueryHO, attentionNodeQueryVH, attentionNodeQueryVO = model(
                    videoFeat, videoFeat_lengths,
                    objects, objects_lengths,
                    humans, humans_lengths,
                    tokens, tokens_lengths,
                    start, end, localiz)
            # print("Loss :{}".format(loss))

            optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm at 5 to stabilize training.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            vis_train.run(index, pred_start, pred_end, start, end,
                          videoFeat_lengths, epoch, loss.detach(),
                          individual_loss, attention, atten_loss,
                          time_starts, time_ends, factors, fps,
                          attentionNodeQueryHO, attentionNodeQueryVH,
                          attentionNodeQueryVO)

            writer.add_scalar(f'mlnlp/Progress_Loss', loss.item(), total_iterations)
            writer.add_scalar(f'mlnlp/Progress_Attention_Loss', atten_loss.item(), total_iterations)
            writer.add_scalar(f'mlnlp/Progress_Mean_IoU', vis_train.mIoU[-1], total_iterations)
            total_iterations += 1.

        # Per-epoch aggregates over the training split.
        writer.add_scalar(f'mlnlp/Train_Loss', np.mean(vis_train.loss), epoch)
        writer.add_scalar(f'mlnlp/Train_Mean_IoU', np.mean(vis_train.mIoU), epoch)
        # Advance the LR schedule once per epoch.
        scheduler.step()
        vis_train.plot(epoch)

        # Saves the whole pickled module (not a state_dict) every epoch.
        torch.save(
            model,
            "./checkpoints/{}/model_epoch_{}".format(cfg.EXPERIMENT_NAME, epoch))

        model.eval()
        # NOTE(review): evaluation runs without torch.no_grad(); the graph
        # is built and discarded each batch.
        for iteration, batch in enumerate(dataloader_test):
            index = batch[0]
            videoFeat = batch[1].cuda()
            videoFeat_lengths = batch[2].cuda()
            tokens = batch[3].cuda()
            tokens_lengths = batch[4].cuda()
            start = batch[5].cuda()
            end = batch[6].cuda()
            localiz = batch[7].cuda()
            localiz_lengths = batch[8]
            time_starts = batch[9]
            time_ends = batch[10]
            factors = batch[11]
            fps = batch[12]
            objects = batch[13].cuda()
            objects_lengths = batch[14].cuda()
            humans = batch[15].cuda()
            humans_lengths = batch[16].cuda()

            loss, individual_loss, pred_start, pred_end, attention, atten_loss, \
                attentionNodeQueryHO, attentionNodeQueryVH, attentionNodeQueryVO = model(
                    videoFeat, videoFeat_lengths,
                    objects, objects_lengths,
                    humans, humans_lengths,
                    tokens, tokens_lengths,
                    start, end, localiz)

            vis_test.run(index, pred_start, pred_end, start, end,
                         videoFeat_lengths, epoch, loss.detach(),
                         individual_loss, attention, atten_loss,
                         time_starts, time_ends, factors, fps,
                         attentionNodeQueryHO, attentionNodeQueryVH,
                         attentionNodeQueryVO)
            #print(index)
            writer.add_scalar(f'mlnlp/Progress_Valid_Loss', loss.item(), total_iterations_val)
            writer.add_scalar(f'mlnlp/Progress_Valid_Atten_Loss', atten_loss.item(), total_iterations_val)
            writer.add_scalar(f'mlnlp/Progress_Valid_Mean_IoU', vis_test.mIoU[-1], total_iterations_val)
            total_iterations_val += 1

        # Per-epoch aggregates over the test split; plot() returns the
        # tIoU-threshold metrics dict logged below.
        writer.add_scalar(f'mlnlp/Valid_Loss', np.mean(vis_test.loss), epoch)
        writer.add_scalar(f'mlnlp/Valid_Mean_IoU', np.mean(vis_test.mIoU), epoch)
        a = vis_test.plot(epoch)
        writer.add_scalars(f'mlnlp/Valid_tIoU_th', a, epoch)
def train(args, tokenizer, device): logger.info("loading data") # 教師データを読み込む train_dataloader = make_dataloader(args.train_file, args.max_seq_length, args.train_batch_size, tokenizer) # 評価データを読み込む valid_dataloader = make_dataloader(args.valid_file, args.max_seq_length, args.train_batch_size, tokenizer) logger.info("building model") # BertMouthモデルの事前学習モデル(おそらく通常のBERTモデルと同じ構造)を読み込む model = BertMouth.from_pretrained(args.bert_model, num_labels=tokenizer.vocab_size) # GPU/CPUを設定する model.to(device) # 名前がpoolerではないパラメータを取得する param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] logger.info("setting optimizer") # decayに含まれるパラメータとそうでないパラメータにわける no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] # ステップは教師データの大きさ * epoch optimization_steps = len(train_dataloader) * args.num_train_epochs # 最適化アルゴリズムの指定 optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) # スケジューラーは学習率を調整してくれる scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=optimization_steps) loss_fct = CrossEntropyLoss(ignore_index=0) def calc_batch_loss(batch): # データ型をGPU/CPUにあわせて変更 batch = tuple(t.to(device) for t in batch) input_ids, y, input_mask, input_type_id, masked_pos = batch # モデルから出力を計算 logits = model(input_ids, input_type_id, input_mask) logits = logits.view(-1, tokenizer.vocab_size) y = y.view(-1) # 損失を計算 loss = loss_fct(logits, y) return loss logger.info("train starts") # モデルを学習モードに変更 model.train() # ログの出力先を指定 summary_writer = SummaryWriter(log_dir="logs") generated_texts = [] try: # trangeは進捗バー表示ができる便利ツール for epoch in trange(int(args.num_train_epochs), desc="Epoch"): train_loss = 0. 
running_num = 0 # 教師データをバッチ別に処理していく for step, batch in enumerate(train_dataloader): # 損失を計算 loss = calc_batch_loss(batch) # 勾配を計算、パラメーターを更新 loss.backward() optimizer.step() scheduler.step() # 勾配は適宜初期化? optimizer.zero_grad() # 出力用、損失の合計とステップ数 train_loss += loss.item() running_num += len(batch[0]) logger.info("[{0} epochs] " "train loss: {1:.3g} ".format(epoch + 1, train_loss / running_num)) summary_writer.add_scalar("train_loss", train_loss / running_num, epoch) # モデルを評価モードにする model.eval() valid_loss = 0. valid_num = 0 for batch in valid_dataloader: # 評価データに対して予測を適用、出力用 valid_loss += calc_batch_loss(batch).item() valid_num += len(batch[0]) # 出力用リストにデータを追加 generated_texts.append(generate(tokenizer=tokenizer, device=device, length=25, max_length=args.max_seq_length, model=model)) logger.info("[{0} epochs] valid loss: {1:.3g}".format(epoch + 1, valid_loss / valid_num)) summary_writer.add_scalar("val_loss", valid_loss / valid_num, epoch) # 学習モードに再度切り替え model.train() except KeyboardInterrupt: logger.info("KeyboardInterrupt") # ログ出力を終了 summary_writer.close() dt_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') # モデルを保存 save(args, model, tokenizer, str(dt_now))
def trainer(cfg):
    """Load a checkpoint and run a single evaluation pass over the test split.

    Despite the name, only evaluation is performed: cfg.EPOCHS is forced to 1
    and the training dataloader is never iterated.

    Args:
        cfg: experiment config (MODE_TRAIN, MODEL_NAME, EXPERIMENT_NAME, ...).
    """
    print('trainer')
    dataloader_train, dataset_size_train = data.make_dataloader(cfg, is_train=True)
    dataloader_test, dataset_size_test = data.make_dataloader(cfg, is_train=False)
    print(dataset_size_train)
    print(dataset_size_test)

    model = modeling.build(cfg)
    if cfg.MODE_TRAIN == 'resume':
        model = torch.load("./checkpoints/{}/model_{}_epoch_{}".format(
            cfg.EXPERIMENT_NAME, cfg.MODEL_NAME, cfg.MODE_TRAIN_RESUME_EPOCH))
    # NOTE(review): this unconditional load overwrites the resume checkpoint
    # loaded just above — confirm whether it should be an else-branch.
    model = torch.load("./checkpoints/{}/model_{}".format(
        cfg.EXPERIMENT_NAME, cfg.MODEL_NAME))
    model.cuda()

    optimizer = solver.make_optimizer(cfg, model)
    #model = torch.load("/home/crodriguezo/projects/phd/moment-localization-with-NLP/mlnlp_lastversion/checkpoints/anet_config7/model_epoch_80")

    vis_train = Visualization(cfg, dataset_size_train)
    vis_test = Visualization(cfg, dataset_size_test, is_train=False)

    writer_path = os.path.join(cfg.VISUALIZATION_DIRECTORY, cfg.EXPERIMENT_NAME)
    writer = SummaryWriter(writer_path)

    total_iterations = 0
    total_iterations_val = 0

    # Force a single (evaluation-only) epoch.
    cfg.EPOCHS = 1
    for epoch in range(cfg.EPOCHS):
        model.eval()
        sumloss = 0
        sumsample = 0
        with torch.no_grad():
            for iteration, batch in enumerate(dataloader_test):
                index = batch[0]
                videoFeat = batch[1].cuda()
                videoFeat_lengths = batch[2].cuda()
                tokens = batch[3].cuda()
                tokens_lengths = batch[4].cuda()
                # Device placement of targets depends on the model variant:
                # TMLGA consumes start/end/localiz on GPU, the other variant
                # consumes frame indices on GPU instead.
                if cfg.MODEL_NAME == 'TMLGA':
                    start = batch[5].cuda()
                    end = batch[6].cuda()
                    localiz = batch[7].cuda()
                    frame_start = batch[13]
                    frame_end = batch[14]
                else:
                    start = batch[5]
                    end = batch[6]
                    localiz = batch[7]
                    frame_start = batch[13].cuda()
                    frame_end = batch[14].cuda()
                localiz_lengths = batch[8]
                time_starts = batch[9]
                time_ends = batch[10]
                factors = batch[11]
                fps = batch[12]
                duration = batch[15]
                vid_names = batch[16]

                loss, individual_loss, pred_start, pred_end, attention, atten_loss = model(
                    videoFeat, videoFeat_lengths, tokens, tokens_lengths,
                    start, end, localiz, frame_start, frame_end)

                # Sample-weighted running loss for the final average.
                sumloss += loss.item() * float(videoFeat.shape[0])
                sumsample += videoFeat.shape[0]
                # print("Test_Loss :{}".format(loss))

                vis_test.run(index, pred_start, pred_end, start, end,
                             videoFeat_lengths, epoch, loss.detach(),
                             individual_loss, attention, atten_loss,
                             time_starts, time_ends, factors, fps,
                             duration, vid_names)
                #print(loss)
                writer.add_scalar(f'mlnlp/Progress_Valid_Loss', loss.item(), total_iterations_val)
                writer.add_scalar(f'mlnlp/Progress_Valid_Atten_Loss', atten_loss.item(), total_iterations_val)
                writer.add_scalar(f'mlnlp/Progress_Valid_Mean_IoU', vis_test.mIoU[-1], total_iterations_val)
                total_iterations_val += 1
                # del videoFeat,videoFeat_lengths,tokens,tokens_lengths,start,end,localiz
                # torch.cuda.empty_cache()

        print("Test_Loss :{}".format(sumloss / sumsample))
        # Aggregates for this evaluation pass; plot() returns the
        # tIoU-threshold metrics dict logged below.
        writer.add_scalar(f'mlnlp/Valid_Loss', np.mean(vis_test.loss), epoch)
        writer.add_scalar(f'mlnlp/Valid_Mean_IoU', np.mean(vis_test.mIoU), epoch)
        a = vis_test.plot(epoch)
        writer.add_scalars(f'mlnlp/Valid_tIoU_th', a, epoch)