# Common imports assumed by the snippets below; nsml / use_nsml,
# dir_data_root, and the project-specific models, losses, and dataloaders
# come from the surrounding projects.
import os
import random
import time

import numpy as np
import torch
import torch.nn as nn
import torchvision.utils as vutils
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


def _infer(model, root_path, test_loader=None):
    args = get_args()
    # NOTE: test_debug must be set to False for submission!
    is_test = True if args.test_debug else False
    if test_loader is None:
        is_test = True
        test_loader = get_dataloader(root=os.path.join(root_path, 'test_data'),
                                     fnames=None, split='test',
                                     mask_channels=args.mask_channels,
                                     batch_size=args.batch_size,
                                     num_workers=args.num_workers)

    x_hats = []
    fnames = []
    model.eval()
    with torch.no_grad():
        for step, data in tqdm(enumerate(test_loader), desc='infer...',
                               total=len(test_loader), disable=use_nsml):
            if not is_test:
                fname, x_input, mask, _ = data
            else:
                fname, x_input, mask = data
            x_input = x_input.cuda()
            mask = mask.cuda()
            # PConvNet forward (the gated-conv variant below concatenates the
            # mask and calls model(x_mask) instead)
            x_hat, _ = model(x_input, mask)
            x_hat = compose(x_input, x_hat, mask)
            x_hats.append(x_hat.cpu())
            fnames = fnames + list(fname)
    x_hats = torch.cat(x_hats, dim=0)
    return fnames, x_hats
def _infer(model, root_path, test_loader=None):
    args = get_args()
    is_test = False
    if test_loader is None:
        is_test = True
        test_loader = get_dataloader(root=os.path.join(root_path, 'test_data'),
                                     fnames=None, split='test',
                                     bbox_constraint=None,
                                     mask_channels=args.mask_channels,
                                     batch_size=args.batch_size,
                                     num_workers=args.num_workers)

    x_hats = []
    fnames = []
    model.eval()
    with torch.no_grad():
        for data in tqdm(test_loader, desc='infer...',
                         total=len(test_loader), disable=not use_nsml):
            if not is_test:
                fname, x_input, mask, _ = data
            else:
                fname, x_input, mask = data
            x_input = x_input.cuda()
            mask = mask.cuda()
            # gated-conv variant: the mask is fed as extra input channels
            x_mask = torch.cat([x_input, mask], dim=1)
            x_hat = model(x_mask)
            x_hat = compose(x_input, x_hat, mask)
            x_hats.append(x_hat.cpu())
            fnames = fnames + list(fname)
    x_hats = torch.cat(x_hats, dim=0)
    return fnames, x_hats
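Both `_infer` variants (and the training loops below) rely on a `compose` helper that pastes the network output back over the input image. Its definition is not included in this section; a minimal sketch, assuming `mask` is 1 inside the holes and 0 on known pixels:

def compose(x_input, x_hat, mask):
    # Hypothetical reconstruction of the undefined helper: keep the known
    # pixels from x_input and take generated pixels from x_hat only inside
    # the masked (hole) region.
    return x_input * (1 - mask) + x_hat * mask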
def main():
    seed_everything()
    args = get_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    netG = InpaintGeneratorLight()
    netD = Discriminator()
    print('################################################################')
    print('Total number of parameters * 4:',
          (count_parameters(netG) + count_parameters(netD)) * 4)
    print('################################################################')
    netG = netG.to(device)
    netD = netD.to(device)

    optimG = torch.optim.Adam(netG.parameters(), lr=args.lr, betas=(0.0, 0.999))
    optimD = torch.optim.Adam(netD.parameters(), lr=args.lr * 0.1, betas=(0.0, 0.999))

    save, load = bind_nsml(netG, optimG)
    if args.pause == 1:
        nsml.paused(scope=locals())

    adversarial_loss = AdversarialLoss()
    l1_loss = nn.L1Loss()

    # load checkpoint
    current_epoch = 0
    if not use_nsml:
        writer = SummaryWriter(os.path.join('logs', args.nickname))
    if args.load:
        netG_name = os.path.join('checkpoints', args.nickname, 'netG_%03d.pth' % args.load_epoch)
        netD_name = os.path.join('checkpoints', args.nickname, 'netD_%03d.pth' % args.load_epoch)
        netG_dict = torch.load(netG_name)
        netD_dict = torch.load(netD_name)
        netG.load_state_dict(netG_dict['state_dict'])
        netD.load_state_dict(netD_dict['state_dict'])
        current_epoch = args.load_epoch + 1
        print('loaded')

    if args.mode == 'train':
        path_train = os.path.join(dir_data_root, 'train')
        path_train_data = os.path.join(dir_data_root, 'train', 'train_data')

        # train/val split
        fnames = os.listdir(path_train_data)
        if args.debug:
            fnames = fnames[:1000]
        random.shuffle(fnames)
        val_ratio = 0.1
        train_fnames = fnames[:-int(len(fnames) * val_ratio)]
        val_fnames = fnames[-int(len(fnames) * val_ratio):]

        postfix = dict()
        total_step = 0
        start = time.time()
        for epoch in range(current_epoch, args.num_epochs):
            # curriculum on mask size: gradually relax the bbox constraint
            if epoch < args.bbox_epochs[0]:
                bbox_constraint = 0.25
            elif epoch < args.bbox_epochs[1]:
                bbox_constraint = 0.75
            else:
                bbox_constraint = 1.0
            tr_loader = get_dataloader(path_train_data, train_fnames, 'train', bbox_constraint,
                                       args.mask_channels, args.batch_size, args.num_workers)
            val_loader = get_dataloader(path_train_data, val_fnames, 'val', bbox_constraint,
                                        args.mask_channels, args.batch_size, args.num_workers)
            print('train:', len(tr_loader) * args.batch_size,
                  'val:', len(val_loader) * args.batch_size)

            pbar = tqdm(enumerate(tr_loader), total=len(tr_loader), disable=True)
            for step, (_, x_input, mask, x_GT) in pbar:
                total_step += 1
                x_input = x_input.to(device)
                mask = mask.to(device)
                x_GT = x_GT.to(device)

                x_mask = torch.cat([x_input, mask], dim=1)
                x_hat = netG(x_mask)
                x_composed = compose(x_input, x_hat, mask)

                ###########################################
                # update D network
                ###########################################
                netD.zero_grad()
                netD_real = netD(x_GT)
                netD_real_loss = adversarial_loss(netD_real, True)
                # detach so the D step does not backprop into the generator
                netD_fake = netD(x_hat.detach())
                netD_fake_loss = adversarial_loss(netD_fake, False)
                netD_loss = netD_real_loss + netD_fake_loss
                netD_loss.backward()
                optimD.step()

                ###########################################
                # update G network
                ###########################################
                netG.zero_grad()  # was netD.zero_grad(): G's gradients were never cleared
                netG_fake = netD(x_hat)  # may need .view(-1)
                netG_fake_loss = adversarial_loss(netG_fake, True) * 0.1
                netG_L1_loss = l1_loss(x_hat, x_GT) / torch.mean(mask)
                netG_loss = netG_fake_loss + netG_L1_loss
                netG_loss.backward()
                optimG.step()

                postfix['netD_loss'] = netD_loss.item()
                postfix['netG_loss'] = netG_loss.item()
                postfix['epoch'] = epoch
                postfix['step_'] = step
                postfix['total_step'] = total_step
                postfix['steps_per_epoch'] = len(tr_loader)

                if step != 0 and step % (args.eval_every - 1) == 0:
                    metric_eval = local_eval(netG, val_loader, path_train_data)
                    postfix['metric_eval'] = metric_eval
                    print('metric eval:', metric_eval)
                    if not use_nsml:
                        sample_dir = os.path.join('samples', args.nickname)
                        os.makedirs(sample_dir, exist_ok=True)
                        vutils.save_image(x_GT, os.path.join(sample_dir, 'x_GT_%03d.png' % epoch), normalize=True)
                        vutils.save_image(x_input, os.path.join(sample_dir, 'x_input_%03d.png' % epoch), normalize=True)
                        vutils.save_image(x_hat, os.path.join(sample_dir, 'x_hat_%03d.png' % epoch), normalize=True)
                        vutils.save_image(mask, os.path.join(sample_dir, 'mask_%03d.png' % epoch), normalize=True)
                        vutils.save_image(x_composed, os.path.join(sample_dir, 'x_composed_%03d_%.1f.png' % (epoch, metric_eval)), normalize=True)
                        writer.add_scalar('train/netD_loss', netD_loss.item(), epoch)
                        writer.add_scalar('train/netG_loss', netG_loss.item(), epoch)

                if step % args.print_every == 0:
                    print("[%d/%d][%d/%d] time: %.2f, "
                          "netG_gan_loss: %.2f, netG_L1_loss: %.2f, netD_loss: %.2f"
                          % (epoch, args.num_epochs, step, len(tr_loader),
                             time.time() - start, netG_fake_loss.item(),
                             netG_L1_loss.item(), netD_loss.item()))
                if use_nsml:
                    nsml.report(**postfix, scope=locals(), step=total_step)

            if use_nsml:
                nsml.save(epoch)
            else:
                checkpoint_dir = os.path.join('checkpoints', args.nickname)
                os.makedirs(checkpoint_dir, exist_ok=True)
                netG_dict = {'state_dict': netG.state_dict()}
                netD_dict = {'state_dict': netD.state_dict()}
                torch.save(netG_dict, os.path.join(checkpoint_dir, 'netG_%03d.pth' % epoch))
                torch.save(netD_dict, os.path.join(checkpoint_dir, 'netD_%03d.pth' % epoch))
                print('saved')
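Both inpainting entry points report `count_parameters(...) * 4`, i.e. an estimate of the model size in bytes for float32 weights. The helper is not defined in this section; a minimal sketch of the usual implementation:

def count_parameters(model):
    # Number of trainable parameters; the callers multiply by 4 (bytes per
    # float32 value) to estimate the model size in bytes.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)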
def train(params):
    # initialize experiment
    logger = init_experiment(params, logger_filename=params.logger_filename)

    if params.bilstm:
        # dataloader
        dataloader_train, dataloader_dev, dataloader_test, vocab = get_dataloader_for_bilstmtagger(params)
        # BiLSTM-CRF model
        model = BiLSTMTagger(params, vocab)
        model.cuda()
        # trainer
        trainer = BaseTrainer(params, model)
    elif params.coach:
        # dataloader
        dataloader_train, dataloader_dev, dataloader_test, vocab = get_dataloader_for_coach(params)
        # Coach model
        binary_tagger = BiLSTMTagger(params, vocab)
        entity_predictor = EntityPredictor(params)
        binary_tagger.cuda()
        entity_predictor.cuda()
        # trainer
        trainer = CoachTrainer(params, binary_tagger, entity_predictor)
    else:
        # dataloader
        dataloader_train, dataloader_dev, dataloader_test = get_dataloader(params)
        # BERT-based NER tagger
        model = BertTagger(params)
        model.cuda()
        # trainer
        trainer = BaseTrainer(params, model)

    if params.conll and not params.joint:
        conll_trainloader, conll_devloader, conll_testloader = get_conll2003_dataloader(params.batch_size, params.tgt_dm)
        trainer.train_conll(conll_trainloader, conll_devloader, conll_testloader, params.tgt_dm)

    no_improvement_num = 0
    best_f1 = 0
    logger.info("Training on target domain ...")
    for e in range(params.epoch):
        logger.info("============== epoch %d ==============" % e)
        pbar = tqdm(enumerate(dataloader_train), total=len(dataloader_train))

        if params.bilstm:
            loss_list = []
            for i, (X, lengths, y) in pbar:
                X, lengths = X.cuda(), lengths.cuda()
                loss = trainer.train_step_for_bilstm(X, lengths, y)
                loss_list.append(loss)
                pbar.set_description("(Epoch {}) LOSS:{:.4f}".format(e, np.mean(loss_list)))
            logger.info("Finish training epoch %d. loss: %.4f" % (e, np.mean(loss_list)))
        elif params.coach:
            loss_bin_list, loss_entity_list = [], []
            for i, (X, lengths, y_bin, y_final) in pbar:
                X, lengths = X.cuda(), lengths.cuda()
                loss_bin, loss_entityname = trainer.train_step(X, lengths, y_bin, y_final)
                loss_bin_list.append(loss_bin)
                loss_entity_list.append(loss_entityname)
                pbar.set_description("(Epoch {}) LOSS BIN:{:.4f}; LOSS ENTITY:{:.4f}".format(
                    e, np.mean(loss_bin_list), np.mean(loss_entity_list)))
            logger.info("Finish training epoch %d. loss_bin: %.4f. loss_entity: %.4f" % (
                e, np.mean(loss_bin_list), np.mean(loss_entity_list)))
        else:
            loss_list = []
            for i, (X, y) in pbar:
                X, y = X.cuda(), y.cuda()
                loss = trainer.train_step(X, y)
                loss_list.append(loss)
                pbar.set_description("(Epoch {}) LOSS:{:.4f}".format(e, np.mean(loss_list)))
            logger.info("Finish training epoch %d. loss: %.4f" % (e, np.mean(loss_list)))

        logger.info("============== Evaluate epoch %d on Train Set ==============" % e)
        f1_train = trainer.evaluate(dataloader_train, params.tgt_dm, use_bilstm=params.bilstm)
        logger.info("Evaluate on Train Set. F1: %.4f." % f1_train)

        logger.info("============== Evaluate epoch %d on Dev Set ==============" % e)
        f1_dev = trainer.evaluate(dataloader_dev, params.tgt_dm, use_bilstm=params.bilstm)
        logger.info("Evaluate on Dev Set. F1: %.4f." % f1_dev)

        logger.info("============== Evaluate epoch %d on Test Set ==============" % e)
        f1_test = trainer.evaluate(dataloader_test, params.tgt_dm, use_bilstm=params.bilstm)
        logger.info("Evaluate on Test Set. F1: %.4f." % f1_test)

        if f1_dev > best_f1:
            logger.info("Found better model!!")
            best_f1 = f1_dev
            no_improvement_num = 0
            # trainer.save_model()
        else:
            no_improvement_num += 1
            logger.info("No better model found (%d/%d)" % (no_improvement_num, params.early_stop))

        if no_improvement_num >= params.early_stop:
            break
def main(params):
    logger = init_experiment(params, logger_filename=params.logger_filename)

    dataloader_tr, dataloader_val, dataloader_test, vocab = get_dataloader(
        params.tgt_dm, params.batch_size, params.tr, params.n_samples)

    coarse_slutagger = CoarseSLUTagger(params, vocab)
    coarse_slutagger = coarse_slutagger.cuda()

    dm_coarse = get_coarse_labels_for_domains()
    fine_predictor = FinePredictor(params, dm_coarse)
    fine_predictor = fine_predictor.cuda()

    sent_repre_generator = SentRepreGenerator(params, vocab)
    sent_repre_generator = sent_repre_generator.cuda()

    slu_trainer = SLUTrainer(params, coarse_slutagger, fine_predictor,
                             sent_repre_generator=sent_repre_generator)

    for e in range(params.epoch):
        loss_c_list = []
        pbar = tqdm(enumerate(dataloader_tr), total=len(dataloader_tr))
        logger.info("============== epoch {} ==============".format(e + 1))

        # chunking pretraining on the initial epochs plus hand-picked later ones
        if e < params.pretrained_epoch or e in (7, 8, 12, 13, 17, 20):
            if params.tr:
                for i, (X, lengths, y_0, y_bin, y_final, y_dm, templates, tem_lengths) in pbar:
                    X, lengths = X.cuda(), lengths.cuda()
                    loss_chunking = slu_trainer.chunking_pretrain(X, lengths, y_0)
                    loss_c_list.append(loss_chunking)
                    pbar.set_description("(Epoch {}) LOSS CHUNKING:{:.4f}".format(
                        e + 1, np.mean(loss_c_list)))
            else:
                for i, (X, lengths, y_0, y_bin, y_final, y_dm) in pbar:
                    X, lengths = X.cuda(), lengths.cuda()
                    loss_chunking = slu_trainer.chunking_pretrain(X, lengths, y_0)
                    loss_c_list.append(loss_chunking)
                    pbar.set_description("(Epoch {}) LOSS CHUNKING:{:.4f}".format(
                        e + 1, np.mean(loss_c_list)))

            logger.info("============== Evaluate Epoch {} ==============".format(e + 1))
            bin_f1 = slu_trainer.chunking_eval(dataloader_val)
            logger.info("Eval on dev set. Binary Slot-F1: {:.4f}".format(bin_f1))
            bin_f1 = slu_trainer.chunking_eval(dataloader_test)
            logger.info("Eval on test set. Binary Slot-F1: {:.4f}".format(bin_f1))
            continue

        loss_bin_list, loss_slotname_list = [], []
        if params.tr:
            loss_tem0_list, loss_tem1_list = [], []

        if params.tr:
            for i, (X, lengths, y_0, y_bin, y_final, y_dm, templates, tem_lengths) in pbar:
                X, lengths, templates, tem_lengths = \
                    X.cuda(), lengths.cuda(), templates.cuda(), tem_lengths.cuda()
                loss_bin, loss_slotname, loss_tem0, loss_tem1 = slu_trainer.train_step(
                    X, lengths, y_bin, y_final, y_dm,
                    templates=templates, tem_lengths=tem_lengths, epoch=e)
                loss_bin_list.append(loss_bin)
                loss_slotname_list.append(loss_slotname)
                loss_tem0_list.append(loss_tem0)
                loss_tem1_list.append(loss_tem1)
                pbar.set_description(
                    "(Epoch {}) LOSS BIN:{:.4f} LOSS SLOT:{:.4f} LOSS TEM0:{:.4f} LOSS TEM1:{:.4f}"
                    .format(e + 1, np.mean(loss_bin_list), np.mean(loss_slotname_list),
                            np.mean(loss_tem0_list), np.mean(loss_tem1_list)))
        else:
            for i, (X, lengths, y_0, y_bin, y_final, y_dm) in pbar:
                X, lengths = X.cuda(), lengths.cuda()
                loss_bin, loss_slotname = slu_trainer.train_step(X, lengths, y_bin, y_final, y_dm)
                loss_bin_list.append(loss_bin)
                loss_slotname_list.append(loss_slotname)
                pbar.set_description(
                    "(Epoch {}) LOSS BIN:{:.4f} LOSS SLOT:{:.4f}".format(
                        e + 1, np.mean(loss_bin_list), np.mean(loss_slotname_list)))

        if params.tr:
            logger.info(
                "Finish training epoch {}. LOSS BIN:{:.4f} LOSS SLOT:{:.4f} LOSS TEM0:{:.4f} LOSS TEM1:{:.4f}"
                .format(e + 1, np.mean(loss_bin_list), np.mean(loss_slotname_list),
                        np.mean(loss_tem0_list), np.mean(loss_tem1_list)))
        else:
            logger.info(
                "Finish training epoch {}. LOSS BIN:{:.4f} LOSS SLOT:{:.4f}".format(
                    e + 1, np.mean(loss_bin_list), np.mean(loss_slotname_list)))

        logger.info("============== Evaluate Epoch {} ==============".format(e + 1))
        bin_f1, final_f1, stop_training_flag = slu_trainer.evaluate(
            dataloader_val, istestset=False)
        logger.info("Eval on dev set. Binary Slot-F1: {:.4f}. Final Slot-F1: {:.4f}.".format(
            bin_f1, final_f1))
        bin_f1, final_f1, stop_training_flag = slu_trainer.evaluate(
            dataloader_test, istestset=True)
        logger.info("Eval on test set. Binary Slot-F1: {:.4f}. Final Slot-F1: {:.4f}.".format(
            bin_f1, final_f1))

        if stop_training_flag:
            break
def test_coach(params):
    logger = init_experiment(params, logger_filename='test')

    # get dataloader
    dataloader_tr, dataloader_val, dataloader_test, vocab = get_dataloader(
        params.tgt_dm, params.batch_size, params.tr, params.n_samples)

    print(params.model_path)
    model_path = params.model_path
    opti_path = './experiments/coach_patience/atp_0/opti.pth'
    assert os.path.isfile(model_path)
    reloaded = torch.load(model_path)

    coarse_slutagger = CoarseSLUTagger(params, vocab)
    coarse_slutagger = coarse_slutagger.cuda()

    dm_coarse = get_coarse_labels_for_domains()
    fine_tagger = FinePredictor(params, dm_coarse)
    fine_tagger = fine_tagger.cuda()

    coarse_slutagger.load_state_dict(reloaded["coarse_tagger"])
    fine_tagger.load_state_dict(reloaded["fine_tagger"])
    coarse_tagger = coarse_slutagger

    # restore the optimizer state through the trainer rather than rebuilding
    # the optimizer by hand
    slu_trainer = SLUTrainer(params, coarse_tagger, fine_tagger)
    slu_trainer.optimizer.load_state_dict(torch.load(opti_path))

    for e in range(params.epoch):
        logger.info("============== epoch {} ==============".format(e + 1))
        loss_bin_list, loss_slotname_list = [], []
        if params.tr:
            loss_tem0_list, loss_tem1_list = [], []
        pbar = tqdm(enumerate(dataloader_tr), total=len(dataloader_tr))

        if params.tr:
            for i, (X, lengths, y_bin, y_final, y_dm, templates, tem_lengths) in pbar:
                X, lengths, templates, tem_lengths = \
                    X.cuda(), lengths.cuda(), templates.cuda(), tem_lengths.cuda()
                loss_bin, loss_slotname, loss_tem0, loss_tem1 = slu_trainer.train_step(
                    X, lengths, y_bin, y_final, y_dm,
                    templates=templates, tem_lengths=tem_lengths, epoch=e)
                loss_bin_list.append(loss_bin)
                loss_slotname_list.append(loss_slotname)
                loss_tem0_list.append(loss_tem0)
                loss_tem1_list.append(loss_tem1)
                pbar.set_description(
                    "(Epoch {}) LOSS BIN:{:.4f} LOSS SLOT:{:.4f} LOSS TEM0:{:.4f} LOSS TEM1:{:.4f}"
                    .format(e + 1, np.mean(loss_bin_list), np.mean(loss_slotname_list),
                            np.mean(loss_tem0_list), np.mean(loss_tem1_list)))
        else:
            for i, (X, lengths, y_bin, y_final, y_dm) in pbar:
                if i == 2:
                    break  # debug: only run two batches per epoch
                X, lengths = X.cuda(), lengths.cuda()
                loss_bin, loss_slotname = slu_trainer.train_step(
                    X, lengths, y_bin, y_final, y_dm)
                loss_bin_list.append(loss_bin)
                loss_slotname_list.append(loss_slotname)
                pbar.set_description(
                    "(Epoch {}) LOSS BIN:{:.4f} LOSS SLOT:{:.4f}".format(
                        e + 1, np.mean(loss_bin_list), np.mean(loss_slotname_list)))

        if params.tr:
            logger.info(
                "Finish training epoch {}. LOSS BIN:{:.4f} LOSS SLOT:{:.4f} LOSS TEM0:{:.4f} LOSS TEM1:{:.4f}"
                .format(e + 1, np.mean(loss_bin_list), np.mean(loss_slotname_list),
                        np.mean(loss_tem0_list), np.mean(loss_tem1_list)))
        else:
            logger.info(
                "Finish training epoch {}. LOSS BIN:{:.4f} LOSS SLOT:{:.4f}".format(
                    e + 1, np.mean(loss_bin_list), np.mean(loss_slotname_list)))

        logger.info("============== Evaluate Epoch {} ==============".format(e + 1))
        bin_f1, final_f1, stop_training_flag = slu_trainer.evaluate(
            dataloader_val, istestset=False)
        logger.info("Eval on dev set. Binary Slot-F1: {:.4f}. Final Slot-F1: {:.4f}.".format(
            bin_f1, final_f1))
        bin_f1, final_f1, stop_training_flag = slu_trainer.evaluate(
            dataloader_test, istestset=True)
        logger.info("Eval on test set. Binary Slot-F1: {:.4f}. Final Slot-F1: {:.4f}.".format(
            bin_f1, final_f1))

        if stop_training_flag:
            break
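`test_coach` expects a checkpoint holding the two state dicts under the keys `coarse_tagger` and `fine_tagger`, plus a separate `opti.pth` with the optimizer state. A minimal sketch of the saving side that would produce this layout (the function name, `exp_dir` argument, and `best_model.pth` filename are hypothetical):

def save_coach_checkpoint(coarse_tagger, fine_tagger, optimizer, exp_dir):
    # Hypothetical counterpart to the loading code in test_coach(): writes the
    # {"coarse_tagger": ..., "fine_tagger": ...} checkpoint that
    # torch.load(model_path) above expects, plus the optimizer state.
    torch.save({"coarse_tagger": coarse_tagger.state_dict(),
                "fine_tagger": fine_tagger.state_dict()},
               os.path.join(exp_dir, "best_model.pth"))
    torch.save(optimizer.state_dict(), os.path.join(exp_dir, "opti.pth"))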
def main():
    seed_everything()
    args = get_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = PConvUNetNew()
    print('################################################################')
    print('Total number of parameters * 4:', count_parameters(model) * 4)
    print('################################################################')
    model = model.to(device)

    optim = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0, 0.999))

    save, load = bind_nsml(model)
    if args.pause == 1:
        nsml.paused(scope=locals())

    # load checkpoint
    current_epoch = 0
    if not use_nsml:
        writer = SummaryWriter(os.path.join('logs', args.nickname))
    if args.load:
        fname = os.path.join('checkpoints', args.nickname, 'model_%03d.pth' % args.load_epoch)
        state = torch.load(fname)
        model.load_state_dict(state['model'])
        current_epoch = args.load_epoch + 1
        print('loaded')

    path_test_data = 'data/test_data_original'
    test_loader = get_dataloader(root=os.path.join('data', 'test_data'),
                                 fnames=None, split='test',
                                 mask_channels=args.mask_channels,
                                 batch_size=args.batch_size,
                                 num_workers=args.num_workers)

    if args.mode == 'train':
        path_train = os.path.join(dir_data_root, 'train')
        path_train_data = os.path.join(dir_data_root, 'train', 'train_data')

        # train/val split
        fnames = os.listdir(path_train_data)
        if args.debug:
            fnames = fnames[:1000]
        random.shuffle(fnames)
        train_fnames = fnames[:-int(len(fnames) * args.val_ratio)]
        val_fnames = fnames[-int(len(fnames) * args.val_ratio):]
        tr_loader = get_dataloader(path_train_data, train_fnames, 'train',
                                   args.mask_channels, args.batch_size, args.num_workers)
        val_loader = get_dataloader(path_train_data, val_fnames, 'val',
                                    args.mask_channels, args.batch_size, args.num_workers)
        print('train:', len(tr_loader) * args.batch_size,
              'val:', len(val_loader) * args.batch_size)

        if args.test_debug:
            metric_eval = local_eval(model, test_loader, path_test_data)
            return

        postfix = dict()
        total_step = 0
        start = time.time()
        best_val_loss = float('inf')
        for epoch in range(current_epoch, args.num_epochs):
            if epoch >= args.lr_decay_epoch:
                optim.param_groups[0]['lr'] = 0.0001
            if epoch >= args.bn_freeze_epoch:
                # fine-tuning phase: freeze encoder BatchNorm, lower the lr
                model.freeze_enc_bn = True
                optim.param_groups[0]['lr'] = 0.00005
            model.train()

            for step, (fname, x_input, mask, x_GT) in enumerate(tr_loader):
                total_step += 1
                x_GT = x_GT.to(device)
                x_input = x_input.to(device)
                mask = mask.to(device)

                model.zero_grad()
                # PConvNet forward (the gated-conv variant concatenates the
                # mask and calls model(x_mask) instead)
                x_hat, _ = model(x_input, mask)
                loss = inpainting_loss(x_hat, x_GT, mask)
                loss.backward()
                optim.step()

                if use_nsml:
                    postfix['loss'] = loss.item()
                    postfix['epoch'] = epoch
                    postfix['step_'] = step
                    postfix['total_step'] = total_step
                    postfix['steps_per_epoch'] = len(tr_loader)
                    nsml.report(**postfix, scope=locals(), step=total_step)

                if step % args.print_every == 0:
                    print("[%d/%d][%d/%d] time: %.2f, train_loss: %.6f, lr: %f"
                          % (epoch, args.num_epochs, step, len(tr_loader),
                             time.time() - start, loss.item(),
                             optim.param_groups[0]['lr']))

            metric_eval = local_eval(model, val_loader, path_train_data)
            if use_nsml:
                postfix['metric_eval'] = metric_eval
                nsml.report(**postfix, scope=locals(), step=total_step)
            else:
                writer.add_scalar('train/metric_eval', metric_eval, epoch)
                writer.add_scalar('train/loss', loss.item(), epoch)

            if use_nsml:
                if metric_eval < best_val_loss:
                    nsml.save(epoch)
                    best_val_loss = metric_eval
            else:
                checkpoint_dir = os.path.join('checkpoints', args.nickname)
                os.makedirs(checkpoint_dir, exist_ok=True)
                state = {'model': model.state_dict()}
                torch.save(state, os.path.join(checkpoint_dir, 'model_%03d.pth' % epoch))
                print('saved')
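The PConv training loop optimizes `inpainting_loss(x_hat, x_GT, mask)`, which is not defined in this section. In the partial-convolution inpainting paper (Liu et al., 2018) the reconstruction part of the loss weights the hole region more heavily than the valid region; a minimal L1-only sketch under that assumption (the paper's full loss also adds perceptual, style, and total-variation terms):

import torch.nn.functional as F

def inpainting_loss(x_hat, x_GT, mask, hole_weight=6.0):
    # Minimal sketch, assuming mask == 1 inside the holes and 0 elsewhere.
    # hole_weight = 6 follows the L_hole weighting in Liu et al. (2018).
    loss_valid = F.l1_loss(x_hat * (1 - mask), x_GT * (1 - mask))
    loss_hole = F.l1_loss(x_hat * mask, x_GT * mask)
    return loss_valid + hole_weight * loss_hole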