def job(tuning, params_path, devices, resume, save_interval):
    global params
    if tuning:
        with open(params_path, 'r') as f:
            params = json.load(f)
        mode_str = 'tuning'
        setting = '_'.join(f'{tp}-{params[tp]}' for tp in params['tuning_params'])
    else:
        mode_str = 'train'
        setting = ''

    # Also vary the seed whenever the parameters change (hoping for a seed-averaging effect).
    seed = sum(ord(_) for _ in str(params.values()))
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False

    exp_path = ROOT + f'experiments/{params["ex_name"]}/'
    os.environ['CUDA_VISIBLE_DEVICES'] = devices

    logger, writer = utils.get_logger(
        log_dir=exp_path + f'{mode_str}/log/{setting}',
        tensorboard_dir=exp_path + f'{mode_str}/tf_board/{setting}')

    if params['augmentation'] == 'soft':
        params['scale_limit'] = 0.2
        params['brightness_limit'] = 0.1
    elif params['augmentation'] == 'middle':
        params['scale_limit'] = 0.3
        params['shear_limit'] = 4
        params['brightness_limit'] = 0.1
        params['contrast_limit'] = 0.1
    else:
        raise ValueError

    train_transform, eval_transform = data_utils.build_transforms(
        scale_limit=params['scale_limit'],
        shear_limit=params['shear_limit'],
        brightness_limit=params['brightness_limit'],
        contrast_limit=params['contrast_limit'],
    )
    data_loaders = data_utils.make_train_loaders(
        params=params,
        data_root=ROOT + 'input/' + params['data'],
        train_transform=train_transform,
        eval_transform=eval_transform,
        scale='S',
        test_size=0,
        class_topk=params['class_topk'],
        num_workers=8)

    model = models.LandmarkNet(
        n_classes=params['class_topk'],
        model_name=params['model_name'],
        pooling=params['pooling'],
        loss_module=params['loss'],
        s=params['s'],
        margin=params['margin'],
        theta_zero=params['theta_zero'],
        use_fc=params['use_fc'],
        fc_dim=params['fc_dim'],
    ).cuda()
    optimizer = utils.get_optim(params, model)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=params['epochs'] * len(data_loaders['train']),
        eta_min=3e-6)
    start_epoch = 0

    if len(devices.split(',')) > 1:
        model = nn.DataParallel(model)

    for epoch in range(start_epoch, params['epochs']):
        logger.info(
            f'Epoch {epoch}/{params["epochs"]} | lr: {optimizer.param_groups[0]["lr"]}')

        # ============================== train ============================== #
        model.train(True)
        losses = utils.AverageMeter()
        prec1 = utils.AverageMeter()

        for i, (_, x, y) in tqdm(enumerate(data_loaders['train']),
                                 total=len(data_loaders['train']),
                                 miniters=None, ncols=55):
            x = x.to('cuda')
            y = y.to('cuda')
            outputs = model(x, y)
            loss = criterion(outputs, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            acc = metrics.accuracy(outputs, y)
            losses.update(loss.item(), x.size(0))
            prec1.update(acc, x.size(0))

            if i % 100 == 99:
                logger.info(
                    f'{epoch+i/len(data_loaders["train"]):.2f}epoch | {setting} acc: {prec1.avg}')

        train_loss = losses.avg
        train_acc = prec1.avg

        writer.add_scalars('Loss', {'train': train_loss}, epoch)
        writer.add_scalars('Acc', {'train': train_acc}, epoch)
        writer.add_scalar('LR', optimizer.param_groups[0]['lr'], epoch)

        if (epoch + 1) == params['epochs'] or (epoch + 1) % save_interval == 0:
            output_file_name = exp_path + f'ep{epoch}_' + setting + '.pth'
            utils.save_checkpoint(path=output_file_name,
                                  model=model,
                                  epoch=epoch,
                                  optimizer=optimizer,
                                  params=params)

    model = model.module
    datasets = ('roxford5k', 'rparis6k')
    results = eval_datasets(model, datasets=datasets, ms=False, tta_gem_p=1.0, logger=logger)

    if tuning:
        tuning_result = {}
        for d in datasets:
            for key in ['mapE', 'mapM', 'mapH']:
                mapE, mapM, mapH, mpE, mpM, mpH, kappas = results[d]
                tuning_result[d + '-' + key] = [eval(key)]
        utils.write_tuning_result(params, tuning_result, exp_path + 'tuning/results.csv')
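# NOTE: utils.AverageMeter is used throughout these training loops but its
# definition is not part of this snippet. The sketch below is an assumption,
# not the project's actual implementation: a minimal running-average tracker
# consistent with the calls made above (update(value, batch_size), .avg, .reset()).
class AverageMeter:
    """Tracks a running sum and average of a scalar metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count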
def train():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    sample_dir = 'samples'
    os.makedirs(sample_dir, exist_ok=True)
    # make sure the weights directory exists before saving checkpoints below
    os.makedirs('weights', exist_ok=True)

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean=[0.5], std=[0.5])])
    mnist = torchvision.datasets.MNIST(root='data',
                                       train=True,
                                       transform=transform,
                                       download=True)
    data_loader = torch.utils.data.DataLoader(dataset=mnist,
                                              batch_size=params['batch_size'],
                                              shuffle=True)

    D = models.Discriminator(params['image_size'], params['hidden_size'])
    G = models.Generator(params['image_size'], params['latent_size'], params['hidden_size'])
    D = D.to(device)
    G = G.to(device)

    criterion = nn.BCELoss()
    d_optimizer = utils.get_optim(params, D)
    g_optimizer = utils.get_optim(params, G)

    d_losses = []
    g_losses = []
    total_step = len(data_loader)
    for epoch in range(params['epochs']):
        for i, (images, _) in enumerate(data_loader):  # labels are not used
            # (batch_size, 1, 28, 28) -> (batch_size, 1*28*28)
            b_size = images.size(0)
            images = images.reshape(b_size, -1).to(device)

            real_labels = torch.ones(b_size, 1).to(device)
            fake_labels = torch.zeros(b_size, 1).to(device)

            # Train discriminator
            outputs = D(images)
            d_loss_real = criterion(outputs, real_labels)
            real_score = outputs

            z = torch.randn(b_size, params['latent_size']).to(device)
            fake_images = G(z)
            # detach so the discriminator update does not backpropagate into G
            outputs = D(fake_images.detach())
            d_loss_fake = criterion(outputs, fake_labels)
            fake_score = outputs

            d_loss = d_loss_real + d_loss_fake
            d_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

            # Train generator
            fake_images = G(z)
            outputs = D(fake_images)
            g_loss = criterion(outputs, real_labels)

            g_optimizer.zero_grad()
            g_loss.backward()
            g_optimizer.step()

            print('Epoch [{}/{}], step [{}/{}], d_loss: {:.4f}, g_loss: {:.4f}, D(x): {:.2f}, D(G(z)): {:.2f}'
                  .format(epoch, params['epochs'], i + 1, total_step, d_loss.item(),
                          g_loss.item(), real_score.mean().item(), fake_score.mean().item()))
            # .item(): extract a Python number from a zero-dim tensor
            d_losses.append(d_loss.item())
            g_losses.append(g_loss.item())

        if (epoch + 1) == 1:
            images = images.reshape(b_size, 1, 28, 28)
            save_image(utils.denorm(images), os.path.join(sample_dir, 'real_images.png'))

        fake_images = fake_images.reshape(b_size, 1, 28, 28)
        save_image(utils.denorm(fake_images),
                   os.path.join(sample_dir, 'fake_images-{}.png'.format(epoch + 1)))

    torch.save(G.state_dict(), 'weights/G.ckpt')
    torch.save(D.state_dict(), 'weights/D.ckpt')

    plt.figure(figsize=(10, 5))
    plt.title("Generator and Discriminator Loss During Training")
    plt.plot(g_losses, label="Generator")
    plt.plot(d_losses, label="Discriminator")
    plt.xlabel("iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(os.path.join(sample_dir, 'loss.png'))
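# NOTE: utils.denorm is called above before save_image but is not defined in
# this snippet. Because the inputs are normalized with mean=0.5 / std=0.5,
# a minimal sketch that undoes that mapping ([-1, 1] -> [0, 1]) would look
# like the following; treat it as an assumption rather than the project's code.
def denorm(x):
    """Map a tensor normalized to [-1, 1] back to [0, 1] for saving as an image."""
    return ((x + 1) / 2).clamp(0, 1)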
model = models.LandmarkNet(n_classes=params['class_topk'],
                           model_name=params['model_name'],
                           pooling=params['pooling'],
                           loss_module=params['loss'],
                           s=params['s'],
                           margin=params['margin'],
                           theta_zero=params['theta_zero'],
                           use_fc=params['use_fc'],
                           fc_dim=params['fc_dim'],
                           )
num_GPU = len(devices.split(','))
if num_GPU > 0:
    model = model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = utils.get_optim(params, model)

if resume:
    # sdict = torch.load(ROOT + params['base_ckpt_path'])['state_dict']
    # del sdict['final.weight']  # remove fully-connected layer
    # model.load_state_dict(sdict, strict=False)
    rets = load_checkpoint(resume, model=model, optimizer=None, params=False, epoch=True)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=params['epochs'] * len(data_loaders['train']),
        eta_min=3e-6)
    start_epoch, end_epoch = (0, params['epochs'] - params['scaleup_epochs'])
    start_epoch = rets['epoch']
    # fast-forward the scheduler to the resumed position
    # (assumed body of the loop, which was truncated in the original)
    for _ in range(start_epoch * len(data_loaders['train'])):
        scheduler.step()
def train_mlt_single(args): global logger logger.info(args) task_lst, vocabs = utils.get_data(args.data_path) task_db = task_lst[args.task_id] train_data = task_db.train_set dev_data = task_db.dev_set test_data = task_db.test_set task_name = task_db.task_name if args.debug: train_data = train_data[:200] dev_data = dev_data[:200] test_data = test_data[:200] args.epochs = 3 args.pruning_iter = 3 summary_writer = SummaryWriter( log_dir=os.path.join(args.tb_path, "global/%s" % task_name) ) logger.info("task name: {}, task id: {}".format(task_db.task_name, task_db.task_id)) logger.info( "train len {}, dev len {}, test len {}".format( len(train_data), len(dev_data), len(test_data) ) ) # init model model = get_model(args, task_lst, vocabs) logger.info("model: \n{}".format(model)) if args.init_weights is not None: utils.load_model(model, args.init_weights) if utils.need_acc(task_name): metrics = [AccuracyMetric(target="y"), MetricInForward(val_name="loss")] metric_key = "acc" else: metrics = [ YangJieSpanMetric( tag_vocab=vocabs[task_name], pred="pred", target="y", seq_len="seq_len", encoding_type="bioes" if task_name == "ner" else "bio", ), MetricInForward(val_name="loss"), ] metric_key = "f" logger.info(metrics) need_cut_names = list(set([s.strip() for s in args.need_cut.split(",")])) prune_names = [] for name, p in model.named_parameters(): if not p.requires_grad or "bias" in name: continue for n in need_cut_names: if n in name: prune_names.append(name) break # get Pruning class pruner = Pruning( model, prune_names, final_rate=args.final_rate, pruning_iter=args.pruning_iter ) if args.init_masks is not None: pruner.load(args.init_masks) pruner.apply_mask(pruner.remain_mask, pruner._model) # save checkpoint os.makedirs(args.save_path, exist_ok=True) logger.info('Saving init-weights to {}'.format(args.save_path)) torch.save( model.cpu().state_dict(), os.path.join(args.save_path, "init_weights.th") ) torch.save(args, os.path.join(args.save_path, "args.th")) # start training and pruning summary_writer.add_scalar("remain_rate", 100.0, 0) summary_writer.add_scalar("cutoff", 0.0, 0) if args.init_weights is not None: init_tester = Tester( test_data, model, metrics=metrics, batch_size=args.batch_size, num_workers=4, device="cuda", use_tqdm=False, ) res = init_tester.test() logger.info("No init testing, Result: {}".format(res)) del res, init_tester for prune_step in range(pruner.pruning_iter + 1): # reset optimizer every time optim_params = [p for p in model.parameters() if p.requires_grad] # utils.get_logger(__name__).debug(optim_params) utils.get_logger(__name__).debug(len(optim_params)) optimizer = get_optim(args.optim, optim_params) # optimizer = TriOptim(optimizer, args.n_filters, args.warmup, args.decay) factor = pruner.cur_rate / 100.0 factor = 1.0 # print(factor, pruner.cur_rate) for pg in optimizer.param_groups: pg["lr"] = factor * pg["lr"] utils.get_logger(__name__).info(optimizer) trainer = Trainer( train_data, model, loss=LossInForward(), optimizer=optimizer, metric_key=metric_key, metrics=metrics, print_every=200, batch_size=args.batch_size, num_workers=4, n_epochs=args.epochs, dev_data=dev_data, save_path=None, sampler=fastNLP.BucketSampler(batch_size=args.batch_size), callbacks=[ pruner, # LRStep(lstm.WarmupLinearSchedule(optimizer, args.warmup, int(len(train_data)/args.batch_size*args.epochs))) GradientClipCallback(clip_type="norm", clip_value=5), LRScheduler( lr_scheduler=LambdaLR(optimizer, lambda ep: 1 / (1 + 0.05 * ep)) ), LogCallback(path=os.path.join(args.tb_path, "No", 
str(prune_step))), ], use_tqdm=False, device="cuda", check_code_level=-1, ) res = trainer.train() logger.info("No #{} training, Result: {}".format(pruner.prune_times, res)) name, val = get_metric(res) summary_writer.add_scalar("prunning_dev_acc", val, prune_step) tester = Tester( test_data, model, metrics=metrics, batch_size=args.batch_size, num_workers=4, device="cuda", use_tqdm=False, ) res = tester.test() logger.info("No #{} testing, Result: {}".format(pruner.prune_times, res)) name, val = get_metric(res) summary_writer.add_scalar("pruning_test_acc", val, prune_step) # prune and save torch.save( model.state_dict(), os.path.join( args.save_path, "best_{}_{}.th".format(pruner.prune_times, pruner.cur_rate), ), ) pruner.pruning_model() summary_writer.add_scalar("remain_rate", pruner.cur_rate, prune_step + 1) summary_writer.add_scalar("cutoff", pruner.last_cutoff, prune_step + 1) pruner.save( os.path.join( args.save_path, "{}_{}.th".format(pruner.prune_times, pruner.cur_rate) ) )
def job(tuning, params_path, devices, resume, save_interval): global params if tuning: with open(params_path, 'r') as f: params = json.load(f) mode_str = 'tuning' setting = '_'.join(f'{tp}-{params[tp]}' for tp in params['tuning_params']) else: mode_str = 'train' setting = '' exp_path = ROOT + f'experiments/{params["ex_name"]}/' os.environ['CUDA_VISIBLE_DEVICES'] = devices if resume is None: # C-AIRとABCIで整合性が取れるようにしている。 params[ 'base_ckpt_path'] = f'experiments/v1only/ep4_augmentation-soft_epochs-5_loss-{params["loss"]}.pth' params[ 'clean_path'] = ROOT + f'input/clean/train19_cleaned_verifythresh{params["verifythresh"]}_freqthresh{params["freqthresh"]}.csv' else: params = utils.load_checkpoint(path=resume, params=True)['params'] logger, writer = utils.get_logger( log_dir=exp_path + f'{mode_str}/log/{setting}', tensorboard_dir=exp_path + f'{mode_str}/tf_board/{setting}') if params['augmentation'] == 'soft': params['scale_limit'] = 0.2 params['brightness_limit'] = 0.1 elif params['augmentation'] == 'middle': params['scale_limit'] = 0.3 params['shear_limit'] = 4 params['brightness_limit'] = 0.1 params['contrast_limit'] = 0.1 else: raise ValueError train_transform, eval_transform = data_utils.build_transforms( scale_limit=params['scale_limit'], shear_limit=params['shear_limit'], brightness_limit=params['brightness_limit'], contrast_limit=params['contrast_limit'], ) data_loaders = data_utils.make_train_loaders( params=params, data_root=ROOT + 'input/' + params['data'], train_transform=train_transform, eval_transform=eval_transform, scale='SS2', test_size=0, class_topk=params['class_topk'], num_workers=8) model = models.LandmarkNet( n_classes=params['class_topk'], model_name=params['model_name'], pooling=params['pooling'], loss_module=params['loss'], s=params['s'], margin=params['margin'], theta_zero=params['theta_zero'], use_fc=params['use_fc'], fc_dim=params['fc_dim'], ).cuda() criterion = nn.CrossEntropyLoss() optimizer = utils.get_optim(params, model) if resume is None: sdict = torch.load(ROOT + params['base_ckpt_path'])['state_dict'] if params['loss'] == 'adacos': del sdict['final.W'] # remove fully-connected layer elif params['loss'] == 'softmax': del sdict['final.weight'], sdict[ 'final.bias'] # remove fully-connected layer else: del sdict['final.weight'] # remove fully-connected layer model.load_state_dict(sdict, strict=False) scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=params['epochs'] * len(data_loaders['train']), eta_min=3e-6) start_epoch, end_epoch = (0, params['epochs'] - params['scaleup_epochs']) else: ckpt = utils.load_checkpoint(path=resume, model=model, optimizer=optimizer, epoch=True) model, optimizer, start_epoch = ckpt['model'], ckpt[ 'optimizer'], ckpt['epoch'] + 1 end_epoch = params['epochs'] scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=params['epochs'] * len(data_loaders['train']), eta_min=3e-6, last_epoch=start_epoch * len(data_loaders['train'])) setting += 'scaleup_' + resume.split('/')[-1].replace('.pth', '') data_loaders = data_utils.make_verified_train_loaders( params=params, data_root=ROOT + 'input/' + params['data'], train_transform=train_transform, eval_transform=eval_transform, scale='M2', test_size=0, num_workers=8) batch_norm.freeze_bn(model) if len(devices.split(',')) > 1: model = nn.DataParallel(model) for epoch in range(start_epoch, end_epoch): logger.info(f'Epoch {epoch}/{end_epoch}') # ============================== train ============================== # model.train(True) losses = utils.AverageMeter() prec1 = 
utils.AverageMeter() for i, (_, x, y) in tqdm(enumerate(data_loaders['train']), total=len(data_loaders['train']), miniters=None, ncols=55): x = x.to('cuda') y = y.to('cuda') outputs = model(x, y) loss = criterion(outputs, y) optimizer.zero_grad() loss.backward() optimizer.step() scheduler.step() acc = metrics.accuracy(outputs, y) losses.update(loss.item(), x.size(0)) prec1.update(acc, x.size(0)) if i % 100 == 99: logger.info( f'{epoch+i/len(data_loaders["train"]):.2f}epoch | {setting} acc: {prec1.avg}' ) train_loss = losses.avg train_acc = prec1.avg writer.add_scalars('Loss', {'train': train_loss}, epoch) writer.add_scalars('Acc', {'train': train_acc}, epoch) writer.add_scalar('LR', optimizer.param_groups[0]['lr'], epoch) if (epoch + 1) == end_epoch or (epoch + 1) % save_interval == 0: output_file_name = exp_path + f'ep{epoch}_' + setting + '.pth' utils.save_checkpoint(path=output_file_name, model=model, epoch=epoch, optimizer=optimizer, params=params) model = model.module datasets = ('oxford5k', 'paris6k', 'roxford5k', 'rparis6k') results = eval_datasets(model, datasets=datasets, ms=True, tta_gem_p=1.0, logger=logger) if tuning: tuning_result = {} for d in datasets: if d in ('oxford5k', 'paris6k'): tuning_result[d] = results[d] else: for key in ['mapE', 'mapM', 'mapH']: mapE, mapM, mapH, mpE, mpM, mpH, kappas = results[d] tuning_result[d + '-' + key] = [eval(key)] utils.write_tuning_result(params, tuning_result, exp_path + 'tuning/results.csv')
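# The resume branch above rebuilds CosineAnnealingLR with
# last_epoch=start_epoch * len(data_loaders['train']) so the per-step cosine
# schedule continues from the checkpointed position instead of restarting.
# A self-contained sketch of that pattern follows; the model, optimizer and
# step counts are stand-ins, not values from this project.
import torch

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

steps_per_epoch = 1000   # stand-in for len(data_loaders['train'])
total_epochs = 5
resume_epoch = 3

# When last_epoch != -1, PyTorch expects 'initial_lr' in every param group
# (normally written there by the scheduler that ran before the checkpoint).
for group in optimizer.param_groups:
    group.setdefault('initial_lr', group['lr'])

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=total_epochs * steps_per_epoch,
    eta_min=3e-6,
    last_epoch=resume_epoch * steps_per_epoch)
print(optimizer.param_groups[0]['lr'])  # lr is already advanced to the resumed point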
def job(tuning, params_path, devices, resume, save_interval): global params if tuning: with open(params_path, 'r') as f: params = json.load(f) mode_str = 'tuning' setting = '_'.join(f'{tp}-{params[tp]}' for tp in params['tuning_params']) else: mode_str = 'train' setting = '' exp_path = ROOT + f'experiments/{params["ex_name"]}/' os.environ['CUDA_VISIBLE_DEVICES'] = devices logger, writer = utils.get_logger( log_dir=exp_path + f'{mode_str}/log/{setting}', tensorboard_dir=exp_path + f'{mode_str}/tf_board/{setting}') train_transform, eval_transform = build_transforms( scale_range=params['scale_range'], brightness_range=params['brightness_range']) data_loaders = data_utils.make_train_loaders( params=params, data_root=ROOT + 'input/train2018', train_transform=train_transform, eval_transform=eval_transform, class_topk=params['class_topk'], num_workers=8) model = models.LandmarkFishNet( n_classes=params['class_topk'], model_name=params['model_name'], pooling_strings=params['pooling'].split(','), loss_module='arcface', s=30.0, margin=params['margin'], use_fc=params['use_fc'], fc_dim=params['fc_dim'], ).cuda() optimizer = utils.get_optim(params, model) criterion = nn.CrossEntropyLoss() scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=params['epochs'] * len(data_loaders['train']), eta_min=1e-6) if len(devices.split(',')) > 1: model = nn.DataParallel(model) if resume is not None: model, optimizer = utils.load_checkpoint(path=resume, model=model, optimizer=optimizer) for epoch in range(params['epochs']): logger.info( f'Epoch {epoch}/{params["epochs"]} | lr: {optimizer.param_groups[0]["lr"]}' ) # ============================== train ============================== # model.train(True) losses = utils.AverageMeter() prec1 = utils.AverageMeter() for i, (_, x, y) in tqdm(enumerate(data_loaders['train']), total=len(data_loaders['train']), miniters=None, ncols=55): x = x.to('cuda') y = y.to('cuda') outputs = model(x, y) loss = criterion(outputs, y) optimizer.zero_grad() loss.backward() optimizer.step() scheduler.step() acc = metrics.accuracy(outputs, y) losses.update(loss.item(), x.size(0)) prec1.update(acc, x.size(0)) if i % 100 == 99: logger.info( f'{epoch+i/len(data_loaders["train"]):.2f}epoch | {setting} acc: {prec1.avg}' ) train_loss = losses.avg train_acc = prec1.avg # ============================== validation ============================== # model.train(False) losses.reset() prec1.reset() for i, (_, x, y) in tqdm(enumerate(data_loaders['val']), total=len(data_loaders['val']), miniters=None, ncols=55): x = x.to('cuda') y = y.to('cuda') with torch.no_grad(): outputs = model(x, y) loss = criterion(outputs, y) acc = metrics.accuracy(outputs, y) losses.update(loss.item(), x.size(0)) prec1.update(acc, x.size(0)) val_loss = losses.avg val_acc = prec1.avg logger.info(f'[Val] Loss: \033[1m{val_loss:.4f}\033[0m | ' f'Acc: \033[1m{val_acc:.4f}\033[0m\n') writer.add_scalars('Loss', {'train': train_loss}, epoch) writer.add_scalars('Acc', {'train': train_acc}, epoch) writer.add_scalars('Loss', {'val': val_loss}, epoch) writer.add_scalars('Acc', {'val': val_acc}, epoch) writer.add_scalar('LR', optimizer.param_groups[0]['lr'], epoch) if save_interval > 0: if (epoch + 1) == params['epochs'] or (epoch + 1) % save_interval == 0: output_file_name = exp_path + f'ep{epoch}_' + setting + '.pth' utils.save_checkpoint(path=output_file_name, model=model, epoch=epoch, optimizer=optimizer, params=params) if tuning: tuning_result = {} for key in ['train_loss', 'train_acc', 'val_loss', 'val_acc']: tuning_result[key] 
= [eval(key)] utils.write_tuning_result(params, tuning_result, exp_path + 'tuning/results.csv')
def train(dataset):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    sample_dir = os.path.join('samples', dataset)
    weights_dir = os.path.join('weights', dataset)
    os.makedirs(sample_dir, exist_ok=True)
    os.makedirs(weights_dir, exist_ok=True)

    transforms_ = [
        # resize the shorter side to this size, keeping the aspect ratio
        transforms.Resize(int(params['img_height'] * 1.12), Image.BICUBIC),
        transforms.RandomCrop((params['img_height'], params['img_width'])),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]

    # DataLoader
    data_loader = torch.utils.data.DataLoader(
        ImageDataset(os.path.join('data', dataset), transforms_=transforms_, unaligned=True),
        batch_size=params['batch_size'],
        shuffle=True
    )
    val_data_loader = torch.utils.data.DataLoader(
        ImageDataset(os.path.join('data', dataset), transforms_=transforms_, unaligned=True, mode='test'),
        batch_size=5,
        shuffle=True
    )

    # Models
    D_A = models.Discriminator(params['channels'])
    D_B = models.Discriminator(params['channels'])
    G_AB = models.Generator(params['channels'], params['n_residual_blocks'])
    G_BA = models.Generator(params['channels'], params['n_residual_blocks'])
    D_A = D_A.to(device)
    D_B = D_B.to(device)
    G_AB = G_AB.to(device)
    G_BA = G_BA.to(device)

    # Initialize model parameters
    D_A.apply(utils.weights_init)
    D_B.apply(utils.weights_init)
    G_AB.apply(utils.weights_init)
    G_BA.apply(utils.weights_init)

    # Losses
    criterion_GAN = nn.MSELoss()
    criterion_cycle = nn.L1Loss()
    criterion_identity = nn.L1Loss()

    # Optimizers
    # The two generators are optimized jointly
    optimizer_G = utils.get_optim(
        params,
        itertools.chain(G_AB.parameters(), G_BA.parameters()),
    )
    optimizer_D_A = utils.get_optim(params, D_A)
    optimizer_D_B = utils.get_optim(params, D_B)

    # Learning rate schedulers
    lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
        optimizer_G,
        lr_lambda=utils.LambdaLR(params['epochs'], decay_start_epoch=params['decay_epoch']).step
    )
    lr_scheduler_D_A = torch.optim.lr_scheduler.LambdaLR(
        optimizer_D_A,
        lr_lambda=utils.LambdaLR(params['epochs'], decay_start_epoch=params['decay_epoch']).step
    )
    lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR(
        optimizer_D_B,
        lr_lambda=utils.LambdaLR(params['epochs'], decay_start_epoch=params['decay_epoch']).step
    )

    # Buffers of previously generated samples
    fake_A_buffer = utils.ReplayBuffer()
    fake_B_buffer = utils.ReplayBuffer()

    def sample_images(epochs):
        """Saves a generated sample from the test set"""
        imgs = next(iter(val_data_loader))
        G_AB.eval()
        G_BA.eval()
        with torch.no_grad():
            real_A = imgs["A"].to(device)
            fake_B = G_AB(real_A)
            real_B = imgs["B"].to(device)
            fake_A = G_BA(real_B)
        # Arrange images along x-axis
        real_A = make_grid(real_A, nrow=5, normalize=True)
        real_B = make_grid(real_B, nrow=5, normalize=True)
        fake_A = make_grid(fake_A, nrow=5, normalize=True)
        fake_B = make_grid(fake_B, nrow=5, normalize=True)
        # Arrange images along y-axis
        image_grid = torch.cat((real_A, fake_B, real_B, fake_A), 1)
        save_image(image_grid, os.path.join(sample_dir, "fake_images-%s.png" % (epochs)), normalize=False)

    losses_D = []
    losses_G = []
    total_step = len(data_loader)
    for epoch in range(params['epochs']):
        for i, images in enumerate(data_loader):
            real_A = images['A'].to(device)
            real_B = images['B'].to(device)
            b_size = real_A.size(0)

            # TODO: can the requires_grad / G.train() bookkeeping be automated?
            real_labels = torch.ones((b_size, 1, 16, 16)).to(device)
            fake_labels = torch.zeros((b_size, 1, 16, 16)).to(device)

            # Train generators
            optimizer_G.zero_grad()

            # GAN loss
            fake_B = G_AB(real_A)
            loss_GAN_AB = criterion_GAN(D_B(fake_B), real_labels)
            fake_A = G_BA(real_B)
            loss_GAN_BA = criterion_GAN(D_A(fake_A), real_labels)
            loss_GAN = (loss_GAN_AB + loss_GAN_BA) / 2

            # Cycle loss
            recov_A = G_BA(fake_B)
            loss_cycle_A = criterion_cycle(recov_A, real_A)
            recov_B = G_AB(fake_A)
            loss_cycle_B = criterion_cycle(recov_B, real_B)
            loss_cycle = (loss_cycle_A + loss_cycle_B) / 2

            # Total loss
            loss_G = loss_GAN + params['lambda_cyc'] * loss_cycle
            loss_G.backward()
            optimizer_G.step()

            # Train discriminator A
            optimizer_D_A.zero_grad()
            loss_real_A = criterion_GAN(D_A(real_A), real_labels)
            fake_A_ = fake_A_buffer.push_and_pop(fake_A)
            loss_fake_A = criterion_GAN(D_A(fake_A_.detach()), fake_labels)
            loss_D_A = (loss_real_A + loss_fake_A) / 2
            loss_D_A.backward()
            optimizer_D_A.step()

            # Train discriminator B
            optimizer_D_B.zero_grad()
            loss_real_B = criterion_GAN(D_B(real_B), real_labels)
            fake_B_ = fake_B_buffer.push_and_pop(fake_B)
            loss_fake_B = criterion_GAN(D_B(fake_B_.detach()), fake_labels)
            loss_D_B = (loss_real_B + loss_fake_B) / 2
            loss_D_B.backward()
            optimizer_D_B.step()

            loss_D = (loss_D_A + loss_D_B) / 2

            print('Epoch [{}/{}], step [{}/{}], loss_D: {:.4f}, loss_G: {:.4f}, '
                  'loss_D_A_real: {:.2f}, loss_D_A_fake: {:.2f}, loss_D_B_real: {:.2f}, loss_D_B_fake: {:.2f}'
                  .format(epoch, params['epochs'], i + 1, total_step,
                          loss_D.item(), loss_G.item(),
                          loss_real_A.item(), loss_fake_A.item(),
                          loss_real_B.item(), loss_fake_B.item()))
            losses_G.append(loss_G.item())
            losses_D.append(loss_D.item())

        if epoch % params['checkpoint_interval'] == 0:
            torch.save(G_AB.state_dict(), os.path.join(weights_dir, 'G_AB.ckpt'))
            torch.save(G_BA.state_dict(), os.path.join(weights_dir, 'G_BA.ckpt'))
            torch.save(D_A.state_dict(), os.path.join(weights_dir, 'D_A.ckpt'))
            torch.save(D_B.state_dict(), os.path.join(weights_dir, 'D_B.ckpt'))

        # if (epoch + 1) == 1:
        #     save_image(utils.denorm(images), os.path.join(sample_dir, 'real_images.png'))

        sample_images(epoch + 1)

    plt.figure(figsize=(10, 5))
    plt.title("Generator and Discriminator Loss During Training")
    plt.plot(losses_G, label="Generator")
    plt.plot(losses_D, label="Discriminator")
    plt.xlabel("iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(os.path.join(sample_dir, 'loss.png'))
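# NOTE: utils.ReplayBuffer is used above (fake_A_buffer / fake_B_buffer) but
# not defined in this snippet. The sketch below is the commonly used CycleGAN
# image-history buffer that push_and_pop() suggests: it stores up to max_size
# past fakes and returns a mix of current and stored samples, which stabilizes
# discriminator training. Treat it as an assumption about the helper, not the
# project's exact code.
import random
import torch

class ReplayBuffer:
    def __init__(self, max_size=50):
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, batch):
        out = []
        for element in batch.detach():
            element = element.unsqueeze(0)
            if len(self.data) < self.max_size:
                # buffer not full yet: store and return the fresh sample
                self.data.append(element)
                out.append(element)
            elif random.uniform(0, 1) > 0.5:
                # with 50% probability, return an old sample and replace it
                idx = random.randint(0, self.max_size - 1)
                out.append(self.data[idx].clone())
                self.data[idx] = element
            else:
                out.append(element)
        return torch.cat(out)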
def job(tuning, params_path, devices, resume, save_interval): global params if tuning: with open(params_path, 'r') as f: params = json.load(f) mode_str = 'tuning' setting = '_'.join(f'{tp}-{params[tp]}' for tp in params['tuning_params']) else: mode_str = 'train' setting = '' exp_path = os.path.join(dataset_connector.result_dir, f'{params["ex_name"]}/') os.environ['CUDA_VISIBLE_DEVICES'] = devices print("CUDA Available:", torch.cuda.is_available(), "CUDA_VISIBLE_DEVICES:", os.environ['CUDA_VISIBLE_DEVICES']) logger, writer = utils.get_logger( log_dir=exp_path + f'{mode_str}/log/{setting}', tensorboard_dir=exp_path + f'{mode_str}/tf_board/{setting}') if params['augmentation'] == 'soft': params['scale_limit'] = 0.2 params['brightness_limit'] = 0.1 elif params['augmentation'] == 'middle': params['scale_limit'] = 0.3 params['shear_limit'] = 4 params['brightness_limit'] = 0.1 params['contrast_limit'] = 0.1 else: raise ValueError train_transform, eval_transform = data_utils.build_transforms( scale_limit=params['scale_limit'], shear_limit=params['shear_limit'], brightness_limit=params['brightness_limit'], contrast_limit=params['contrast_limit'], ) data_loaders = data_utils.make_train_loaders( params=params, data_root=ROOT + 'input/' + params['train_data'], use_clean_version=True, train_transform=train_transform, eval_transform=eval_transform, scale='S2', test_size=0.1, num_workers=os.cpu_count() * 2) model = models.LandmarkNet( n_classes=params['class_topk'], model_name=params['model_name'], pooling=params['pooling'], loss_module=params['loss'], s=params['s'], margin=params['margin'], theta_zero=params['theta_zero'], use_fc=params['use_fc'], fc_dim=params['fc_dim'], ).cuda() criterion = nn.CrossEntropyLoss() optimizer = utils.get_optim(params, model) # TODO: Missing initial weight file. 
# sdict = torch.load(ROOT + params['base_ckpt_path'])['state_dict'] # del sdict['final.weight'] # remove fully-connected layer # model.load_state_dict(sdict, strict=False) scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=params['epochs'] * len(data_loaders['train']), eta_min=3e-6) start_epoch, end_epoch = (0, params['epochs'] - params['scaleup_epochs']) if len(devices.split(',')) > 1: model = nn.DataParallel(model) for epoch in range(start_epoch, end_epoch): logger.info(f'Epoch {epoch}/{end_epoch}') # ============================== train ============================== # model.train(True) losses = utils.AverageMeter() prec1 = utils.AverageMeter() for i, (_, x, y) in tqdm(enumerate(data_loaders['train']), total=len(data_loaders['train']), miniters=None, ncols=55): x = x.to('cuda') y = y.to('cuda') outputs = model(x, y) loss = criterion(outputs, y) optimizer.zero_grad() loss.backward() optimizer.step() scheduler.step() acc = metrics.accuracy(outputs, y) losses.update(loss.item(), x.size(0)) prec1.update(acc, x.size(0)) logger.info("Training Loss:{},Accuracy(Prec1):{}".format( loss.item(), acc)) if i % 100 == 99: logger.info( f'{epoch + i / len(data_loaders["train"]):.2f}epoch | {setting} acc: {prec1.avg}' ) train_loss = losses.avg train_acc = prec1.avg writer.add_scalars('Loss', {'train': train_loss}, epoch) writer.add_scalars('Acc', {'train': train_acc}, epoch) writer.add_scalar('LR', optimizer.param_groups[0]['lr'], epoch) if (epoch + 1) == end_epoch or (epoch + 1) % save_interval == 0: output_file_name = exp_path + f'ep{epoch}_' + setting + '.pth' utils.save_checkpoint(path=output_file_name, model=model, epoch=epoch, optimizer=optimizer, params=params) model = model.module datasets = ('oxford5k', 'paris6k', 'roxford5k', 'rparis6k') results = eval_datasets(model, datasets=datasets, ms=True, tta_gem_p=1.0, logger=logger) if tuning: tuning_result = {} for d in datasets: if d in ('oxford5k', 'paris6k'): tuning_result[d] = results[d] else: for key in ['mapE', 'mapM', 'mapH']: mapE, mapM, mapH, mpE, mpM, mpH, kappas = results[d] tuning_result[d + '-' + key] = [eval(key)] utils.write_tuning_result(params, tuning_result, exp_path + 'tuning/results.csv')
def train(dataset, data):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    sample_dir = os.path.join('samples', dataset)
    weights_dir = os.path.join('weights', dataset)
    os.makedirs(sample_dir, exist_ok=True)
    os.makedirs(weights_dir, exist_ok=True)

    if dataset == 'mnist':
        dataset = torchvision.datasets.MNIST(
            root=data,
            download=True,
            train=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, ), (0.5, )),  # [0, 1] -> [-1, 1]
            ]))
        params['nc'] = 1

    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=params['batch_size'],
                                              shuffle=True)

    D = models.Discriminator(params['ndf'], params['image_size'], params['labels'])
    G = models.Generator(params['nz'], params['ngf'], params['image_size'], params['labels'])
    D = D.to(device)
    G = G.to(device)

    criterion = nn.BCELoss()
    d_optimizer = utils.get_optim(params, D)
    g_optimizer = utils.get_optim(params, G)

    d_losses = []
    g_losses = []
    total_step = len(data_loader)
    for epoch in range(params['epochs']):
        for i, (images, labels) in enumerate(data_loader):
            b_size = images.size(0)
            images = images.reshape(b_size, -1).to(device)
            labels = labels.to(device)

            real_labels = torch.ones(b_size).to(device)
            fake_labels = torch.zeros(b_size).to(device)
            random_labels = torch.LongTensor(np.random.randint(0, 10, b_size)).to(device)

            # Train discriminator
            outputs = D(images, labels)
            d_loss_real = criterion(outputs, real_labels)
            real_score = outputs

            z = torch.randn(b_size, params['nz']).to(device)
            fake_images = G(z, random_labels)
            outputs = D(fake_images.detach(), random_labels)
            d_loss_fake = criterion(outputs, fake_labels)
            fake_score = outputs

            d_loss = d_loss_real + d_loss_fake
            d_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()

            # Train generator
            # re-sample random labels
            random_labels = torch.LongTensor(np.random.randint(0, 10, b_size)).to(device)
            fake_images = G(z, random_labels)
            outputs = D(fake_images, random_labels)
            g_loss = criterion(outputs, real_labels)

            g_optimizer.zero_grad()
            g_loss.backward()
            g_optimizer.step()

            print('Epoch [{}/{}], step [{}/{}], d_loss: {:.4f}, g_loss: {:.4f}, D(x): {:.2f}, D(G(z)): {:.2f}'
                  .format(epoch, params['epochs'], i + 1, total_step, d_loss.item(),
                          g_loss.item(), real_score.mean().item(), fake_score.mean().item()))
            # .item(): extract a Python number from a zero-dim tensor
            g_losses.append(g_loss.item())
            d_losses.append(d_loss.item())

        if (epoch + 1) == 1:
            images = images.reshape(b_size, 1, 28, 28)
            save_image(utils.denorm(images), os.path.join(sample_dir, 'real_images.png'))

        fake_images = fake_images.reshape(b_size, 1, 28, 28)
        save_image(utils.denorm(fake_images),
                   os.path.join(sample_dir, 'fake_images-{}.png'.format(epoch + 1)))

    torch.save(G.state_dict(), os.path.join(weights_dir, 'G.ckpt'))
    torch.save(D.state_dict(), os.path.join(weights_dir, 'D.ckpt'))

    plt.figure(figsize=(10, 5))
    plt.title("Generator and Discriminator Loss During Training")
    plt.plot(g_losses, label="Generator")
    plt.plot(d_losses, label="Discriminator")
    plt.xlabel("iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(os.path.join(sample_dir, 'loss.png'))
def job(tuning, params_path, devices, resume): """ Example: python exp0.py job --devices 0,1 -s python exp0.py tuning --devices 0,1 --n-gpu 1 --mode 'random' --n-iter 4 """ exp_path = ROOT + f'experiments/{params["ex_name"]}/' os.environ['CUDA_VISIBLE_DEVICES'] = devices global params if tuning: with open(params_path, 'r') as f: params = json.load(f) mode_str = 'tuning' setting = '_'.join(f'{tp}-{params[tp]}' for tp in params['tuning_params']) else: mode_str = 'train' setting = '' logger, writer = utils.get_logger( log_dir=exp_path + f'{mode_str}/log/{setting}', tensorboard_dir=exp_path + f'{mode_str}/tf_board/{setting}') train_df = pd.read_csv(ROOT + 'data/train.csv') train_df, val_df = train_test_split(train_df, test_size=1024, random_state=params['seed']) model = models.UNet(in_channels=3, n_classes=2, depth=4, ch_first=32, padding=True, batch_norm=False, up_mode='upconv').cuda() optimizer = utils.get_optim(model, params) if resume is not None: model, optimizer = utils.load_checkpoint(model, resume, optimizer=optimizer) if len(devices.split(',')) > 1: model = nn.DataParallel(model) data_transforms = { 'train': transforms.Compose([ transforms.ToPILImage(), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]) ]), 'val': transforms.Compose([ transforms.ToPILImage(), transforms.ToTensor(), transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]) ]), } image_datasets = { 'train': data_utils.CSVDataset(train_df, data_transforms['train']), 'val': data_utils.CSVDataset(val_df, data_transforms['val']) } data_loaders = { 'train': DataLoader(image_datasets['train'], batch_size=params['batch_size'], pin_memory=True, shuffle=True, drop_last=True, num_workers=params['workers']), 'val': DataLoader(image_datasets['val'], batch_size=params['test_batch_size'], pin_memory=True, shuffle=False, num_workers=params['workers']) } criterion = nn.CrossEntropyLoss() scheduler = optim.lr_scheduler.MultiStepLR( optimizer, milestones=[int(params['epochs'] * 0.7), int(params['epochs'] * 0.9)], gamma=0.1) for epoch in range(params['epochs']): logger.info( f'Epoch {epoch}/{params["epochs"]} | lr: {optimizer.param_groups[0]["lr"]}' ) # ============================== train ============================== # model.train(True) losses = utils.AverageMeter() prec1 = utils.AverageMeter() for i, (x, y) in tqdm(enumerate(data_loaders['train']), total=len(data_loaders['train']), miniters=50): x = x.to('cuda:0') y = y.to('cuda:0', non_blocking=True) outputs = model(x) loss = criterion(outputs, y) optimizer.zero_grad() loss.backward() optimizer.step() acc = utils.accuracy(outputs, y) losses.update(loss.item(), x.size(0)) prec1.update(acc.item(), x.size(0)) train_loss = losses.avg train_acc = prec1.avg # ============================== validation ============================== # model.train(False) losses.reset() prec1.reset() for i, (x, y) in tqdm(enumerate(data_loaders['val']), total=len(data_loaders['val'])): x = x.cuda() y = y.cuda(non_blocking=True) with torch.no_grad(): outputs = model(x) loss = criterion(outputs, y) acc = utils.accuracy(outputs, y) losses.update(loss.item(), x.size(0)) prec1.update(acc.item(), x.size(0)) val_loss = losses.avg val_acc = prec1.avg logger.info(f'[Val] Loss: \033[1m{val_loss:.4f}\033[0m | ' f'Acc: \033[1m{val_acc:.4f}\033[0m\n') writer.add_scalars('Loss', {'train': train_loss}, epoch) writer.add_scalars('Acc', {'train': train_acc}, epoch) writer.add_scalars('Loss', {'val': val_loss}, epoch) writer.add_scalars('Acc', {'val': val_acc}, epoch) 
writer.add_scalar('LR', optimizer.param_groups[0]['lr'], epoch) scheduler.step() if not tuning: utils.save_checkpoint(model, epoch, exp_path + 'model_optim.pth', optimizer) if tuning: tuning_result = {} for key in ['train_loss', 'train_acc', 'val_loss', 'val_acc']: tuning_result[key] = [eval(key)] utils.write_tuning_result(params, tuning_result, exp_path + 'tuning/results.csv')
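# NOTE: utils.accuracy above is applied to UNet logits of shape (N, 2, H, W)
# and integer target masks; its definition is not included in this snippet.
# A minimal pixel-accuracy sketch consistent with that usage (an assumption,
# not the project's implementation, which might compute IoU or something else):
def accuracy(outputs, targets):
    """Fraction of pixels whose argmax class matches the target mask."""
    preds = outputs.argmax(dim=1)
    return (preds == targets).float().mean()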
logger.info("Model parameters:") params = list(model.named_parameters()) sum_param = 0 for name, param in params: if param.requires_grad: logger.info("{}: {}".format(name, param.shape)) sum_param += param.numel() logger.info("# Parameters: {}.".format(sum_param)) masker.to("cuda" if torch.cuda.is_available() else "cpu") Trainer = get_trainer_cls(args) if not args.evaluate: logger.info("========== Training Model ==========") base_params = filter(lambda p: p.requires_grad, model.parameters()) opt = utils.get_optim(args.optim, base_params) logger.info(opt) trainer = Trainer(masker, task_lst, vocabs, opt, args) trainer.train(args.epochs) logger.info("========== Testing Model ==========") trainer.model = utils.load_model( model, os.path.join(args.save_path, "best.th")) test_loss, test_acc = trainer._eval_epoch(dev=False) logger.info(args.exp_name) for acc in test_acc.items(): logger.info(acc) else: logger.info("========== Evaluating Model ==========")
def job(tuning, params_path, devices, resume, save_interval): global params for loss_input in [ "Softmax", "arcface", "cosface", "AdditiveMarginSoftmaxLoss" ]: params["loss"] = loss_input if tuning: with open(params_path, 'r') as f: params = json.load(f) mode_str = 'tuning' setting = '_'.join(f'{tp}-{params[tp]}' for tp in params['tuning_params']) else: mode_str = 'train' setting = '' exp_path = os.path.join(dataset_connector.result_dir, f'{params["ex_name"]}/', f'{params["loss"]}/') os.environ['CUDA_VISIBLE_DEVICES'] = devices print("CUDA Available:", torch.cuda.is_available(), "CUDA_VISIBLE_DEVICES:", os.environ['CUDA_VISIBLE_DEVICES']) logger, writer = utils.get_logger( log_dir=exp_path + f'{mode_str}/log/{setting}', tensorboard_dir=exp_path + f'{mode_str}/tf_board/{setting}') if params['augmentation'] == 'soft': params['scale_limit'] = 0.2 params['brightness_limit'] = 0.1 elif params['augmentation'] == 'middle': params['scale_limit'] = 0.3 params['shear_limit'] = 4 params['brightness_limit'] = 0.1 params['contrast_limit'] = 0.1 else: raise ValueError train_transform, eval_transform = data_utils.build_transforms( scale_limit=params['scale_limit'], shear_limit=params['shear_limit'], brightness_limit=params['brightness_limit'], contrast_limit=params['contrast_limit'], ) data_loaders = data_utils.make_train_loaders( params=params, data_root=ROOT + 'input/' + params['train_data'], use_clean_version=True, train_transform=train_transform, eval_transform=eval_transform, scale='S2', test_size=0.1, num_workers=os.cpu_count() * 2) model = models.LandmarkNet( n_classes=params['class_topk'], model_name=params['model_name'], pooling=params['pooling'], loss_module=params['loss'], s=params['s'], margin=params['margin'], theta_zero=params['theta_zero'], use_fc=params['use_fc'], fc_dim=params['fc_dim'], ).cuda() criterion = nn.CrossEntropyLoss() optimizer = utils.get_optim(params, model) state_dict = torch.load( os.path.join("../pretrained_model", "Epoch14_" + loss_input + ".pth")) clean_state_dict = OrderedDict() for k, v in state_dict.items(): name = k[7:] clean_state_dict[name] = v # sdict = torch.load(resume)['state_dict'] # del sdict['final.weight'] # remove fully-connected layer # model.load_state_dict(sdict, strict=False) # model.backbone.requires_grad = False # model.bn.requires_grad = False # model.dropout.requires_grad = False # model.fc.requires_grad = False scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=params['epochs'] * len(data_loaders['train']), eta_min=3e-6) start_epoch, end_epoch = (15, params['epochs'] - params['scaleup_epochs']) if len(devices.split(',')) > 1: model = nn.DataParallel(model) for epoch in range(start_epoch, end_epoch): logger.info(f'Epoch {epoch}/{end_epoch}') # ============================== train ============================== # model.train(True) losses = utils.AverageMeter() prec1 = utils.AverageMeter() for i, (_, x, y) in tqdm(enumerate(data_loaders['train']), total=len(data_loaders['train']), miniters=None, ncols=55): x = x.to('cuda') y = y.to('cuda') if params["loss"] in ["AdditiveMarginSoftmaxLoss"]: outputs, loss = model(x, y) loss = loss.mean() elif params["loss"] in ["LSoftmax", "arcface", "cosface"]: outputs = model(x, y) loss = criterion(outputs, y) elif params["loss"] in ["Softmax"]: outputs = model(x) loss = criterion(outputs, y) break optimizer.zero_grad() loss.backward() optimizer.step() scheduler.step() acc = metrics.accuracy(outputs, y) losses.update(loss.item(), x.size(0)) prec1.update(acc, x.size(0)) logger.info("Training 
Loss:{},Accuracy(Prec1):{}".format( loss.item(), acc)) if i % 100 == 99: logger.info( f'{epoch + i / len(data_loaders["train"]):.2f}epoch | {setting} acc: {prec1.avg}' ) break train_loss = losses.avg train_acc = prec1.avg writer.add_scalars('Loss', {'train': train_loss}, epoch) writer.add_scalars('Acc', {'train': train_acc}, epoch) writer.add_scalar('LR', optimizer.param_groups[0]['lr'], epoch) if (epoch + 1) == end_epoch or (epoch + 1) % save_interval == 0: output_file_name = exp_path + f'Epoch{epoch}_' + str( params["loss"]) + setting + '.pth' print("Model Saved:{}".format(output_file_name)) utils.save_checkpoint(path=output_file_name, model=model, epoch=epoch, optimizer=optimizer, params=params)
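# The OrderedDict loop above rebuilds the checkpoint with k[7:], i.e. it
# strips the 'module.' prefix that nn.DataParallel prepends to parameter
# names so the weights can be loaded into a single-GPU model. A small helper
# that does the same thing but only strips the prefix when it is actually
# present (a generalization of that loop, not code from this project):
from collections import OrderedDict

def strip_module_prefix(state_dict):
    cleaned = OrderedDict()
    for k, v in state_dict.items():
        cleaned[k[len('module.'):] if k.startswith('module.') else k] = v
    return cleaned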