def train(self, model, data_loader, validation_loader, tb=None, epochs=20, log_interval=100, checkpoint_interval=100):
    """Train `model` with teacher forcing and a label-smoothed MLE loss.

    Args:
        model: seq2seq model called as model(input_ids=..., decoder_input_ids=...).
        data_loader: training batches of (source, target) id tensors.
        validation_loader: accepted for interface compatibility; not used here.
        tb: optional tensorboard writer; batch stats are logged every
            `log_interval` batches when it is given.
        epochs: number of passes over `data_loader`.
        log_interval: batch period for tensorboard logging.
        checkpoint_interval: batch period for dumping sample predictions to
            'output_tests.txt' and saving a checkpoint.
    """
    optimizer = AdamW(model.parameters(), lr=6e-4)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=3000,
        num_training_steps=epochs * len(data_loader))
    for epoch in range(epochs):
        model.train()
        # Running totals since the last tensorboard log step.
        total_mle_loss = 0.0
        n_word_total = 0.0
        n_word_correct = 0.0
        # BUG FIX: separate whole-epoch accumulators.  The running totals
        # above are zeroed on every log step, so the old end-of-epoch stats
        # covered only the batches since the last log and raised
        # ZeroDivisionError whenever the final batch coincided with a log.
        epoch_mle_loss = 0.0
        epoch_word_total = 0.0
        epoch_word_correct = 0.0
        for batch_idx, batch in enumerate(
                tqdm(data_loader, mininterval=2, leave=False)):
            batch_xs, batch_ys = map(lambda x: x.to(self.device), batch)
            # Teacher forcing: decoder sees ys[:-1], is scored against ys[1:].
            trg_ys = batch_ys[:, 1:]
            pred_logits = model(input_ids=batch_xs,
                                decoder_input_ids=batch_ys[:, :-1])
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the loss.
            pred_logits = pred_logits.contiguous().view(
                -1, pred_logits.size(2))
            loss, n_correct, n_total = self.compute_mle_loss(
                pred_logits, trg_ys, smoothing=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_mle_loss += loss.item()
            optimizer.zero_grad()
            n_word_total += n_total
            n_word_correct += n_correct
            epoch_mle_loss += loss.item()
            epoch_word_total += n_total
            epoch_word_correct += n_correct
            if tb is not None and batch_idx % log_interval == 0:
                tb_mle_batch(tb, total_mle_loss, n_word_total,
                             n_word_correct, epoch, batch_idx,
                             len(data_loader))
                total_mle_loss = 0.0
                n_word_total = 0.0
                n_word_correct = 0.0
            if batch_idx != 0 and batch_idx % checkpoint_interval == 0:
                # NOTE(review): assumes a fixed decoder length of 127 —
                # TODO confirm against the data pipeline.
                pred_max = pred_logits.reshape(-1, 127,
                                               len(idx2word)).max(2)[1]
                pred = pd.DataFrame(pred_max.to('cpu').numpy())
                pred_words = np.where(pred.isin(idx2word.keys()),
                                      pred.replace(idx2word), UNKNOWN_WORD)
                trg_ys = pd.DataFrame(batch_ys[:, 1:].to('cpu').numpy())
                trg_words = np.where(trg_ys.isin(idx2word.keys()),
                                     trg_ys.replace(idx2word), UNKNOWN_WORD)
                # Append one decoded sample so training can be eyeballed.
                with open('output_tests.txt', 'a') as f:
                    f.write("On the iteration %d" % batch_idx)
                    f.write("The actual line:\n")
                    f.write(str(trg_words[0]))
                    f.write("The prediction of the line:\n")  # typo fixed
                    f.write(str(pred_words[0]))
                    f.write('\n\n\n\n\n')
                save_checkpoint(epoch, model, optimizer, scheduler,
                                suffix=str(batch_idx))
        # Whole-epoch statistics; guard against an empty data_loader.
        if epoch_word_total > 0:
            loss_per_word = epoch_mle_loss / epoch_word_total
            accuracy = epoch_word_correct / epoch_word_total
def train_sim(epoch_num=10, optim_type='ACGD', startPoint=None, start_n=0, z_dim=128, batchsize=64, l2_penalty=0.0, momentum=0.0, log=False, loss_name='WGAN', model_name='dc', model_config=None, data_path='None', show_iter=100, logdir='test', dataname='CIFAR10', device='cpu', gpu_num=1):
    """Train a GAN with simultaneous RMSprop updates of D and G.

    Both players are stepped on the gradients of the same loss computed with
    g_loss=False; `startPoint` optionally resumes D/G/optimizer state from a
    checkpoint.  Sample grids and checkpoints are written every `show_iter`
    iterations; scalar metrics go to wandb when `log` is set.
    """
    lr_d = 1e-4
    lr_g = 1e-4
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    # apply() returns the module, so init weights then move to device.
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    optim_d = RMSprop(D.parameters(), lr=lr_d)
    optim_g = RMSprop(G.parameters(), lr=lr_g)
    if startPoint is not None:
        # Resume both networks and both optimizer states.
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        optim_d.load_state_dict(chk['d_optim'])
        optim_g.load_state_dict(chk['g_optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0  # global iteration counter across all epochs
    # DCGAN generators expect 4-D noise; MLP-style generators take 2-D noise.
    if 'DCGAN' in model_name:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            if 'DCGAN' in model_name:
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            loss = get_loss(name=loss_name,
                            g_loss=False,
                            d_real=d_real,
                            d_fake=d_fake,
                            l2_weight=l2_penalty,
                            D=D)
            # Simultaneous update: both optimizers step on gradients of the
            # same discriminator-side loss.
            # NOTE(review): confirm this sign convention matches the intended
            # min-max update for `loss_name` — G also descends this loss.
            D.zero_grad()
            G.zero_grad()
            loss.backward()
            optim_d.step()
            optim_g.step()
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , Loss: %.5f, time: %.3fs' %
                      (count, loss.item(), time_cost))
                timer = time.time()
                # Dump a fixed-noise sample grid for visual progress checks.
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s%.3f_%d.pth' %
                                (optim_type, model_name, lr_g,
                                 count + start_n),
                                D=D,
                                G=G,
                                optimizer=optim_d,
                                g_optimizer=optim_g)
            # wandb may be None when the package is unavailable.
            if wandb and log:
                wandb.log({
                    'Real score': d_real.mean().item(),
                    'Fake score': d_fake.mean().item(),
                    'Loss': loss.item()
                })
            count += 1
def train(model, optimizer, train_loader, valid_loader, save_path, criterion, num_epochs=50, eval_every=50, best_valid_loss=float("Inf"), model_name="model"):
    """Train a binary classifier with periodic validation and checkpointing.

    Every `eval_every` steps the full validation set is scored; the model is
    checkpointed whenever the validation average improves, and loss curves
    are saved via save_metrics.
    NOTE(review): relies on module-level `device` and `log_file` globals —
    confirm they are defined where this function is used.
    """
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []
    # training loop
    print("Start training for", num_epochs, "epochs...")
    model.float()
    model.train()
    for epoch in range(num_epochs):
        print("Epoch", epoch + 1, "of", num_epochs)
        for train_batch in train_loader:
            labels = train_batch['binary_label'].unsqueeze(1).to(device)
            content = train_batch['content']
            # Batch arrives as a sequence of per-position tensors; stack into
            # (batch, seq) — presumably token ids, verify against the loader.
            content = torch.stack(content, dim=1).to(device)
            output = model(content).unsqueeze(1).to(device)
            loss = criterion(output, labels.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # update running values
            running_loss += loss.item()
            global_step += 1
            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():
                    # validation loop over the whole validation set
                    for val_batch in valid_loader:
                        labels = val_batch['binary_label'].unsqueeze(1).to(
                            device)
                        content = val_batch['content']
                        content = torch.stack(content, dim=1).to(device)
                        output = model(content).unsqueeze(1).to(device)
                        loss = criterion(output, labels.float())
                        valid_running_loss += loss.item()
                # averages: train over the last eval_every steps, valid over
                # the full validation pass.
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)
                # resetting running values
                running_loss = 0.0
                valid_running_loss = 0.0
                model.train()
                # print progress
                print(
                    'Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                    .format(epoch + 1, num_epochs, global_step,
                            num_epochs * len(train_loader),
                            average_train_loss, average_valid_loss))
                # checkpoint on validation improvement
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(save_path + model_name + '.pt', model,
                                    optimizer, best_valid_loss, log_file)
                    save_metrics(save_path + model_name + '_metrics.pt',
                                 train_loss_list, valid_loss_list,
                                 global_steps_list, log_file)
    # Final metrics dump after the last epoch (intentionally repeated even if
    # the best checkpoint already saved an earlier snapshot).
    save_metrics(save_path + model_name + '_metrics.pt', train_loss_list,
                 valid_loss_list, global_steps_list, log_file)
    print('Finished Training!')
# Fragment from a larger training loop (enclosing function not visible here):
# periodic checkpointing every 10 epochs plus patience-based early stopping.
if epoch % 10 == 0:
    # Bundle model/optimizer state with the grid-search hyperparameters for
    # this run so a checkpoint is self-describing.
    checkpoint = {
        'epoch': epoch,
        'batch_size': batch_size,
        'ME_state': me.state_dict(),
        'CE_state': ce.state_dict(),
        'Pred_state': predictor.state_dict(),
        'CE_hidden_size': grid[iter]['hce_hidden'],
        'Classifier_hidden_size': grid[iter]['pred_hidden'],
        'dropout': grid[iter]['dropout'],
        'optimME_state': optimME.state_dict(),
        'optimCE_state': optimCE.state_dict(),
        'optimPred_state': optimPred.state_dict()
    }
    save_checkpoint(checkpoint)
# Check for early stopping
if val_pred_loss < min_val_loss:
    # Improvement: reset the patience counter.
    epoch_no_improv = 0
    min_val_loss = val_pred_loss
else:
    epoch_no_improv += 1
    if epoch_no_improv == es_patience:
        # Patience exhausted: leave the epoch loop.
        print('=> Early stopping!')
        break
    else:
        continue
# Save trained model
def train_cgd(epoch_num=10, optim_type='ACGD', startPoint=None, start_n=0, z_dim=128, batchsize=64, tols={ 'tol': 1e-10, 'atol': 1e-16 }, l2_penalty=0.0, momentum=0.0, loss_name='WGAN', model_name='dc', model_config=None, data_path='None', show_iter=100, logdir='test', dataname='CIFAR10', device='cpu', gpu_num=1, ada_train=True, log=False, collect_info=False, args=None):
    """Train a GAN with a competitive-gradient-descent optimizer (BCGD/ICR/ACGD).

    A single joint `optimizer.step(loss)` updates both players.  When
    `ada_train` is set, the discriminator lr is rescaled each iteration from a
    moving window of its classification accuracy.  `args` must provide
    'lr_d' and 'lr_g'.
    NOTE: mutable default `tols={...}` is shared across calls — it is only
    read here, but callers must not mutate it.
    """
    lr_d = args['lr_d']
    lr_g = args['lr_g']
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    # Pick the joint two-player optimizer.  NOTE: any other `optim_type`
    # leaves `optimizer` undefined and fails later with NameError.
    if optim_type == 'BCGD':
        optimizer = BCGD(max_params=G.parameters(),
                         min_params=D.parameters(),
                         lr_max=lr_g,
                         lr_min=lr_d,
                         momentum=momentum,
                         tol=tols['tol'],
                         atol=tols['atol'],
                         device=device)
        # scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    elif optim_type == 'ICR':
        optimizer = ICR(max_params=G.parameters(),
                        min_params=D.parameters(),
                        lr=lr_d,
                        alpha=1.0,
                        device=device)
        # scheduler = icrScheduler(optimizer, milestone)
    elif optim_type == 'ACGD':
        optimizer = ACGD(max_params=G.parameters(),
                         min_params=D.parameters(),
                         lr_max=lr_g,
                         lr_min=lr_d,
                         tol=tols['tol'],
                         atol=tols['atol'],
                         device=device,
                         solver='cg')
        # scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    if startPoint is not None:
        # Resume network weights only; optimizer state restore is disabled.
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        # optimizer.load_state_dict(chk['optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0  # global iteration counter
    if 'DCGAN' in model_name:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    # Sliding window of the last `mod` discriminator accuracies, seeded at 0.8.
    mod = 10
    accs = torch.tensor([0.8 for _ in range(mod)])
    for e in range(epoch_num):
        # scheduler.step(epoch=e)
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            if 'DCGAN' in model_name:
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            loss = get_loss(name=loss_name,
                            g_loss=False,
                            d_real=d_real,
                            d_fake=d_fake,
                            l2_weight=l2_penalty,
                            D=D)
            # CGD-style joint update: the optimizer takes the loss itself.
            optimizer.zero_grad()
            optimizer.step(loss)
            # Discriminator accuracy: real scored > 0 and fake scored < 0.
            num_correct = torch.sum(d_real > 0) + torch.sum(d_fake < 0)
            acc = num_correct.item() / (d_real.shape[0] + d_fake.shape[0])
            accs[count % mod] = acc
            acc_indicator = sum(accs) / mod
            # Shrink the discriminator lr when it dominates (acc high) —
            # smallest ratio above 0.9, small below 0.80, full lr between.
            # NOTE(review): the <0.80 branch using 0.1 (not 1.0) looks
            # deliberate but is worth confirming with the experiment design.
            if acc_indicator > 0.9:
                ada_ratio = 0.05
            elif acc_indicator < 0.80:
                ada_ratio = 0.1
            else:
                ada_ratio = 1.0
            if ada_train:
                optimizer.set_lr(lr_max=lr_g, lr_min=ada_ratio * lr_d)
            if count % show_iter == 0 and count != 0:
                time_cost = time.time() - timer
                print('Iter :%d , Loss: %.5f, time: %.3fs' %
                      (count, loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s_%d.pth' %
                                (optim_type, model_name, count + start_n),
                                D=D,
                                G=G,
                                optimizer=optimizer)
            if wandb and log:
                wandb.log(
                    {
                        'Real score': d_real.mean().item(),
                        'Fake score': d_fake.mean().item(),
                        'Loss': loss.item(),
                        'Acc_indicator': acc_indicator,
                        'Ada ratio': ada_ratio
                    },
                    step=count,
                )
            # Optional solver diagnostics from the CGD optimizer.
            if collect_info and wandb:
                cgd_info = optimizer.get_info()
                wandb.log(
                    {
                        'CG iter num': cgd_info['iter_num'],
                        'CG runtime': cgd_info['time'],
                        'D gradient': cgd_info['grad_y'],
                        'G gradient': cgd_info['grad_x'],
                        'D hvp': cgd_info['hvp_y'],
                        'G hvp': cgd_info['hvp_x'],
                        'D cg': cgd_info['cg_y'],
                        'G cg': cgd_info['cg_x']
                    },
                    step=count)
            count += 1
# Fragment from a larger epoch loop (enclosing function not visible here):
# cyclic-epoch bookkeeping plus fastSWA / SWA weight-averaging updates.
else:
    # Past the base training epochs: report which lr cycle we are in.
    cycle = int((epoch - args.epochs) // args.cycle_interval + 2)
    print('In %d -th cycle' % cycle)
# do the fastSWA updates
if args.fastswa_frequencies is not None:
    for fastswa_freq, fastswa_net, fastswa_opt in zip(
            fastswa_freqs,
            fastswa_nets,
            fastswa_optims,
    ):
        # Update each fastSWA average at its own frequency, once the run is
        # within one cycle of the end of base training.
        if epoch >= (args.epochs - args.cycle_interval) and (
                epoch - args.epochs +
                args.cycle_interval) % fastswa_freq == 0:
            save_checkpoint(epoch, model, ema_model, swa_model,
                            fastswa_nets[0], accuracy, args, path_checkpoint)
            print("Evaluate fast-swa-{} at epoch {}".format(
                fastswa_freq, epoch))
            fastswa_opt.update(model)
            # Recompute batch-norm statistics for the averaged weights.
            update_batchnorm(fastswa_net, trainloader)
            # NOTE(review): evaluates `fastswa`, not `fastswa_net` — confirm
            # which object .test() is meant to run on.
            fastswa_acc = fastswa.test(testloader)
            accuracy['test_fastswa_acc'].append(fastswa_acc)
        else:
            # Keep the accuracy list aligned with the epoch index.
            accuracy['test_fastswa_acc'].append(None)
# swa update
if ((epoch >= args.epochs)) and ((epoch - args.epochs) %
                                 args.cycle_interval) == 0:
    # Plain SWA averages once per cycle after base training finishes.
    swa_model_optim.update(model)
    print("SWA Model Updated!")
def main():
    """Entry point: train the CASME classifier/decoder pair on ImageNet-style data.

    Builds both models and their optimizers, the train/val loaders, then for
    each epoch trains, evaluates, checkpoints, and appends one line of
    space-separated statistics to `args.log_path`.
    Relies on module globals: `args`, `device`, `archs`, and the helper
    functions `adjust_learning_rate`, `train_or_eval`, `save_checkpoint`.
    """
    global args
    ## create models and optimizers
    print("=> creating models...")
    classifier = archs.resnet50shared(pretrained=True).to(device)
    decoder = archs.decoder(final_upsample_mode=args.upsample).to(device)
    # One optimizer per sub-model: SGD for the classifier, Adam for the
    # mask decoder.
    optimizer = {}
    optimizer['classifier'] = torch.optim.SGD(classifier.parameters(),
                                              args.lr,
                                              momentum=args.momentum,
                                              weight_decay=args.weight_decay)
    optimizer['decoder'] = torch.optim.Adam(decoder.parameters(),
                                            args.lr_casme,
                                            weight_decay=args.weight_decay)
    cudnn.benchmark = True
    ## data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    # Standard ImageNet normalization constants.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               sampler=None)
    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=False)
    ## training loop
    for epoch in range(args.epochs):
        epoch_start_time = time.time()
        adjust_learning_rate(optimizer, epoch, args)
        ## train for one epoch
        tr_s = train_or_eval(train_loader, classifier, decoder, True,
                             optimizer, epoch)
        ## evaluate on validation set
        val_s = train_or_eval(val_loader, classifier, decoder)
        ## save checkpoint
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict_classifier': classifier.state_dict(),
                'state_dict_decoder': decoder.state_dict(),
                'optimizer_classifier': optimizer['classifier'].state_dict(),
                'optimizer_decoder': optimizer['decoder'].state_dict(),
                'args': args,
            }, args)
        ## log — presumably train_or_eval returns its stats as strings, since
        ## they are concatenated with '+' here; verify against train_or_eval.
        with open(args.log_path, 'a') as f:
            f.write(
                str(epoch + 1) + ' ' + str(time.time() - epoch_start_time) +
                ' ' + tr_s['acc'] + ' ' + val_s['acc'] + ' ' + tr_s['acc_m'] +
                ' ' + val_s['acc_m'] + ' ' + tr_s['avg_mask'] + ' ' +
                val_s['avg_mask'] + ' ' + tr_s['std_mask'] + ' ' +
                val_s['std_mask'] + ' ' + tr_s['entropy'] + ' ' +
                val_s['entropy'] + ' ' + tr_s['tv'] + ' ' + val_s['tv'] +
                '\n')
import argparse
import train_utils

# ---- command line ----------------------------------------------------------
parser = argparse.ArgumentParser(
    description='This script helps in training the model',
)

# (flag, keyword arguments) for every supported option; registered in a loop
# to keep the declarations in one table.
for _flag, _kwargs in (
        ('--data_directory',
         dict(dest='data_directory', action='store', default='./flowers')),
        ('--model_name',
         dict(dest='model_name', action='store', default='vgg16')),
        ('--save_dir',
         dict(dest='save_dir', action='store', default='checkpoint.pth')),
        ('--learning_rate',
         dict(dest='learning_rate', action='store', default=0.001,
              type=float)),
        ('--hidden_input',
         dict(dest='hidden_input', action='store', default=1024, type=int)),
        ('--epochs', dict(dest='epochs', action='store', default=5,
                          type=int)),
        ('--gpu', dict(dest="mode", action="store", default="gpu")),
):
    parser.add_argument(_flag, **_kwargs)

args = parser.parse_args()

# ---- training pipeline -----------------------------------------------------
# 1. data loaders
train_data, train_dataloader, test_dataloader, validate_dataloader = \
    train_utils.load_data(args.data_directory)

# 2. classifier, optimizer and loss criterion
model, optimizer, criterion = train_utils.create_model(
    args.model_name, args.hidden_input, args.learning_rate, args.mode)

# 3. optimization loop
train_utils.train_model(model, optimizer, criterion, train_dataloader,
                        validate_dataloader, args.epochs, args.mode)

# 4. persist the trained model as a checkpoint
train_utils.save_checkpoint(model, args, optimizer, train_data)
def train_d(epoch_num=10, logdir='test', optim='SGD', loss_name='JSD', show_iter=500, model_weight=None, load_d=False, load_g=False, compare_path=None, info_time=100, run_select=None, device='cpu'):
    """Train only the discriminator (G is fixed) on MNIST and log diagnostics.

    `optim` selects plain SGD on D or BCGD2 with update_max=False (D-only
    update).  `compare_path` enables distance tracking against a reference
    discriminator; `run_select` additionally tracks prediction drift on a
    fixed probe set.  TensorBoard scalars and periodic checkpoints are
    written under `logdir`.
    """
    lr_d = 0.001
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('discriminator lr: %.3f' % lr_d)
    dataset = get_data(dataname='MNIST', path='../datas/mnist')
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)
    if model_weight is not None:
        # Optionally warm-start either network from the same checkpoint.
        chk = torch.load(model_weight)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)
    if compare_path is not None:
        # Reference discriminator flattened to one vector for distance
        # tracking.  NOTE: this rebinds `model_weight` to the loaded dict.
        discriminator = dc_D().to(device)
        model_weight = torch.load(compare_path)
        discriminator.load_state_dict(model_weight['D'])
        model_vec = torch.cat(
            [p.contiguous().view(-1) for p in discriminator.parameters()])
        print('Load discriminator from %s' % compare_path)
    if run_select is not None:
        # Fixed probe batches plus their reference predictions.
        fixed_data = torch.load(run_select)
        real_set = fixed_data['real_set']
        fake_set = fixed_data['fake_set']
        real_d = fixed_data['real_d']
        fake_d = fixed_data['fake_d']
        fixed_vec = fixed_data['pred_vec']
        print('load fixed data set')
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' %
                           (logdir, current_time, lr_d))
    if optim == 'SGD':
        d_optimizer = SGD(D.parameters(), lr=lr_d)
        print('Optimizer SGD')
    else:
        # update_max=False: only the min player (D) is updated.
        d_optimizer = BCGD2(max_params=G.parameters(),
                            min_params=D.parameters(),
                            lr_max=lr_g,
                            lr_min=lr_d,
                            update_max=False,
                            device=device,
                            collect_info=True)
        print('Optimizer BCGD2')
    timer = time.time()
    count = 0  # global iteration counter
    d_losses = []
    g_losses = []
    for e in range(epoch_num):
        # Per-epoch accumulators (weighted by batch size).
        tol_correct = 0
        tol_dloss = 0
        tol_gloss = 0
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((real_x.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            D_loss = get_loss(name=loss_name,
                              g_loss=False,
                              d_real=d_real,
                              d_fake=d_fake)
            tol_dloss += D_loss.item() * real_x.shape[0]
            # G's loss is computed for logging only; G is never stepped.
            G_loss = get_loss(name=loss_name,
                              g_loss=True,
                              d_real=d_real,
                              d_fake=d_fake)
            tol_gloss += G_loss.item() * fake_x.shape[0]
            if compare_path is not None and count % info_time == 0:
                # Parameter-space distance to the reference discriminator.
                diff = get_diff(net=D, model_vec=model_vec)
                writer.add_scalar('Distance from checkpoint',
                                  diff.item(),
                                  global_step=count)
                if run_select is not None:
                    # Prediction drift on the fixed probe sets.
                    with torch.no_grad():
                        d_real_set = D(real_set)
                        d_fake_set = D(fake_set)
                        diff_real = torch.norm(d_real_set - real_d, p=2)
                        diff_fake = torch.norm(d_fake_set - fake_d, p=2)
                        d_vec = torch.cat([d_real_set, d_fake_set])
                        diff = torch.norm(d_vec.sub_(fixed_vec), p=2)
                        writer.add_scalars('L2 norm of pred difference', {
                            'Total': diff.item(),
                            'real set': diff_real.item(),
                            'fake set': diff_fake.item()
                        },
                                           global_step=count)
            d_optimizer.zero_grad()
            if optim == 'SGD':
                D_loss.backward()
                d_optimizer.step()
                # Gradient norms for both players (G receives gradients from
                # D_loss through fake_x even though it is not updated).
                gd = torch.norm(torch.cat(
                    [p.grad.contiguous().view(-1) for p in D.parameters()]),
                                p=2)
                gg = torch.norm(torch.cat(
                    [p.grad.contiguous().view(-1) for p in G.parameters()]),
                                p=2)
            else:
                d_optimizer.step(D_loss)
                cgdInfo = d_optimizer.get_info()
                gd = cgdInfo['grad_y']
                gg = cgdInfo['grad_x']
                writer.add_scalars('Grad', {'update': cgdInfo['update']},
                                   global_step=count)
            # Correct = real scored positive, fake scored negative.
            tol_correct += (d_real > 0).sum().item() + (d_fake <
                                                        0).sum().item()
            writer.add_scalars('Loss', {
                'D_loss': D_loss.item(),
                'G_loss': G_loss.item()
            },
                               global_step=count)
            writer.add_scalars('Grad', {
                'D grad': gd,
                'G grad': gg
            },
                               global_step=count)
            writer.add_scalars('Discriminator output', {
                'Generated image': d_fake.mean().item(),
                'Real image': d_real.mean().item()
            },
                               global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs' %
                      (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                save_checkpoint(path=logdir,
                                name='FixG-%.3f_%d.pth' % (lr_d, count),
                                D=D,
                                G=G)
            count += 1
    writer.close()
def run_lstm(learning_rate, batch_size, cuda, num_inputs, num_outputs,
             num_hidden, checkpoint_interval, total_batches, model_file):
    """Train the LSTM baseline on the binary copy task.

    Args:
        learning_rate: RMSprop learning rate.
        batch_size: sequences per batch (overridden when resuming).
        cuda: run on GPU when True (overridden when resuming).
        num_inputs / num_outputs / num_hidden: network dimensions.
        checkpoint_interval: save/evaluate every this many examples
            (0 disables checkpointing).
        total_batches: stop after this many checkpoint intervals.
        model_file: 'None' for a fresh model, otherwise a checkpoint path
            to resume from.
    """
    # Seeding for reproducible batches.
    SEED = 1000
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    # Model loading
    if model_file == 'None':
        lstm = LSTM(num_inputs, num_hidden)
        if cuda:
            lstm.cuda()
        # Constants for keeping track
        total_examples = 0
        losses = []
        costs = []
        seq_lens = []
    else:
        from_before = torch.load(model_file)
        state_dict = from_before['state_dict']
        num_inputs = from_before['num_inputs']
        num_outputs = from_before['num_outputs']
        batch_size = from_before['batch_size']
        cuda = from_before['cuda']
        lstm = LSTM(num_inputs, num_hidden)
        # BUG FIX: the checkpoint was loaded but never applied — resuming
        # silently restarted from random weights and then crashed with
        # NameError on `total_examples`.  Restore everything, mirroring the
        # NTM `run` loader.
        lstm.load_state_dict(state_dict)
        if cuda:
            lstm.cuda()
        losses = from_before['loss']
        costs = from_before['cost']
        seq_lens = from_before['seq_lengths']
        total_examples = from_before['total_examples']

    # Dataset creation
    training_dataset = random_binary(max_seq_length=20,
                                     num_sequences=200,
                                     vector_dim=8,
                                     batch_Size=batch_size)
    testing_dataset = random_binary(max_seq_length=10,
                                    num_sequences=50,
                                    vector_dim=8,
                                    batch_Size=batch_size)

    # Optimizer type and loss function
    optimizer = torch.optim.RMSprop(lstm.parameters(),
                                    lr=learning_rate,
                                    momentum=0.9)
    criterion = torch.nn.BCELoss()
    np.random.seed(
        SEED
    )  # reset training seed to ensure that batches remain the same between runs!

    for batch in training_dataset:
        lstm.init_hidden(batch_size)
        batch = Variable(batch)
        if cuda:
            batch = batch.cuda()
        optimizer.zero_grad()
        output = Variable(torch.zeros(batch.size()))
        if cuda:
            output = output.cuda()
        # Presentation phase: feed the sequence in.  BUG FIX: the outputs of
        # this phase used to be written into `output` only to be fully
        # overwritten by the response phase below — they are now discarded.
        for i in range(batch.size()[2]):
            lstm.forward(batch[:, :, i])
        # Response phase: zero input; the network must reproduce the
        # sequence from its hidden state.
        x = Variable(torch.zeros(batch.size()[0:2]))
        if cuda:
            x = x.cuda()
        for i in range(batch.size()[2]):
            output[:, :, i] = lstm.forward(x)
        loss = criterion(output, batch)
        loss.backward()
        optimizer.step()
        print("Current Batch Loss:", round(loss.data[0], 3))
        total_examples += batch_size

        # The cost is the number of error bits per sequence.
        binary_output = output.clone().data
        binary_output = binary_output > 0.5
        cost = torch.sum(torch.abs(binary_output.float() - batch.data))
        losses += [loss.data[0]]
        costs += [cost / batch_size]
        seq_lens += [batch.size(2)]

        # Checkpoint model
        if (checkpoint_interval != 0) and (total_examples %
                                           checkpoint_interval == 0):
            print("Saving checkpoint!")
            save_checkpoint(lstm, total_examples / batch_size, losses, costs,
                            seq_lens, total_examples, None, num_inputs,
                            num_outputs, None, None, None, None, None,
                            batch_size, cuda, num_hidden, 'LSTM')
            # Evaluate model on this saved checkpoint
            test_cost, prediction, input = evaluate_lstm_baseline(
                model=lstm,
                testset=testing_dataset,
                batch_size=batch_size,
                cuda=cuda)
            print("Total Test Cost (in bits per sequence):", test_cost)
            print("Example of Input/Output")
            print("prediction:", prediction[0])
            print("Input:", input[0])
        # BUG FIX: guard the stop condition — dividing by
        # checkpoint_interval == 0 used to raise ZeroDivisionError.
        if checkpoint_interval != 0 and \
                total_examples / checkpoint_interval >= total_batches:
            break
def train_cgd(epoch_num=10, milestone=None, optim_type='ACGD', startPoint=None, start_n=0, z_dim=128, batchsize=64, tols={ 'tol': 1e-10, 'atol': 1e-16 }, l2_penalty=0.0, momentum=0.0, loss_name='WGAN', model_name='dc', model_config=None, data_path='None', show_iter=100, logdir='test', dataname='CIFAR10', device='cpu', gpu_num=1, collect_info=False):
    """Train a GAN with a CGD-family optimizer and milestone lr scheduling.

    A single joint `optimizer.step(loss)` updates both players; scalar
    metrics and optional conjugate-gradient solver diagnostics go to
    TensorBoard.  Sample grids and checkpoints are written every
    `show_iter` iterations.
    NOTE: mutable default `tols={...}` is shared across calls — it is only
    read here, but callers must not mutate it.
    """
    lr_d = 0.01
    lr_g = 0.01
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' %
                           (logdir, current_time, lr_d))
    # Pick the joint two-player optimizer and its lr scheduler.  NOTE: any
    # other `optim_type` leaves both undefined and fails later with NameError.
    if optim_type == 'BCGD':
        optimizer = BCGD(max_params=G.parameters(),
                         min_params=D.parameters(),
                         lr_max=lr_g,
                         lr_min=lr_d,
                         momentum=momentum,
                         tol=tols['tol'],
                         atol=tols['atol'],
                         device=device)
        scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    elif optim_type == 'ICR':
        optimizer = ICR(max_params=G.parameters(),
                        min_params=D.parameters(),
                        lr=lr_d,
                        alpha=1.0,
                        device=device)
        scheduler = icrScheduler(optimizer, milestone)
    elif optim_type == 'ACGD':
        optimizer = ACGD(max_params=G.parameters(),
                         min_params=D.parameters(),
                         lr_max=lr_g,
                         lr_min=lr_d,
                         tol=tols['tol'],
                         atol=tols['atol'],
                         device=device,
                         solver='cg')
        scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    if startPoint is not None:
        # Resume networks and optimizer state from a checkpoint.
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        optimizer.load_state_dict(chk['optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0  # global iteration counter
    if 'DCGAN' in model_name:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        scheduler.step(epoch=e)
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            if 'DCGAN' in model_name:
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            loss = get_loss(name=loss_name,
                            g_loss=False,
                            d_real=d_real,
                            d_fake=d_fake,
                            l2_weight=l2_penalty,
                            D=D)
            # CGD-style joint update: the optimizer takes the loss itself.
            optimizer.zero_grad()
            optimizer.step(loss)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , Loss: %.5f, time: %.3fs' %
                      (count, loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(
                    path=logdir,
                    name='%s-%s%.3f_%d.pth' %
                    (optim_type, model_name, lr_g, count + start_n),
                    D=D,
                    G=G,
                    optimizer=optimizer)
            writer.add_scalars('Discriminator output', {
                'Generated image': d_fake.mean().item(),
                'Real image': d_real.mean().item()
            },
                               global_step=count)
            writer.add_scalar('Loss', loss.item(), global_step=count)
            # Optional conjugate-gradient solver diagnostics.
            if collect_info:
                cgd_info = optimizer.get_info()
                writer.add_scalar('Conjugate Gradient/iter num',
                                  cgd_info['iter_num'],
                                  global_step=count)
                writer.add_scalar('Conjugate Gradient/running time',
                                  cgd_info['time'],
                                  global_step=count)
                writer.add_scalars('Delta', {
                    'D gradient': cgd_info['grad_y'],
                    'G gradient': cgd_info['grad_x'],
                    'D hvp': cgd_info['hvp_y'],
                    'G hvp': cgd_info['hvp_x'],
                    'D cg': cgd_info['cg_y'],
                    'G cg': cgd_info['cg_x']
                },
                                   global_step=count)
            count += 1
    writer.close()
def run(learning_rate, batch_size, cuda, memory_feature_size, num_inputs, num_outputs, controller_size, controller_type, controller_layers, memory_size, integer_shift, checkpoint_interval, total_batches, model_file):
    """Train an NTM on the binary copy task, with resume and checkpointing.

    `model_file == 'None'` starts fresh; otherwise all hyperparameters,
    weights, and progress counters are restored from the checkpoint.  Every
    `checkpoint_interval` examples the model is saved and evaluated on a
    held-out dataset; training stops after `total_batches` intervals.
    """
    # model_file = "checkpoints/ntm/copy-batch-5120.0--LSTM.model"
    # Seeding for reproducible batches.
    SEED = 1000
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    # Model Loading
    if model_file == 'None':
        ntm = NTM(num_inputs=num_inputs,
                  num_outputs=num_outputs,
                  controller_size=controller_size,
                  controller_type=controller_type,
                  controller_layers=controller_layers,
                  memory_size=memory_size,
                  memory_feature_size=memory_feature_size,
                  integer_shift=integer_shift,
                  batch_size=batch_size,
                  use_cuda=cuda)
        # Constants for keeping track
        total_examples = 0
        losses = []
        costs = []
        seq_lens = []
    else:
        # Resume: checkpoint hyperparameters override the arguments.
        from_before = torch.load(model_file)
        state_dict = from_before['state_dict']
        controller_type = from_before['controller_type']
        num_inputs = from_before['num_inputs']
        num_outputs = from_before['num_outputs']
        controller_size = from_before['controller_size']
        controller_layers = from_before['controller_layers']
        memory_size = from_before['memory_size']
        memory_feature_size = from_before['memory_feature_size']
        integer_shift = from_before['integer_shift']
        batch_size = from_before['batch_size']
        cuda = from_before['cuda']
        saved_biases = True
        ntm = NTM(num_inputs=num_inputs,
                  num_outputs=num_outputs,
                  controller_size=controller_size,
                  controller_type=controller_type,
                  controller_layers=controller_layers,
                  memory_size=memory_size,
                  memory_feature_size=memory_feature_size,
                  integer_shift=integer_shift,
                  batch_size=batch_size,
                  use_cuda=cuda,
                  saved_biases=saved_biases)
        ntm.load_state_dict(state_dict)
        losses = from_before['loss']
        costs = from_before['cost']
        seq_lens = from_before['seq_lengths']
        total_examples = from_before['total_examples']
    # Dataset creation
    training_dataset = random_binary(max_seq_length=20,
                                     num_sequences=500,
                                     vector_dim=8,
                                     batch_Size=batch_size)
    testing_dataset = random_binary(max_seq_length=10,
                                    num_sequences=50,
                                    vector_dim=8,
                                    batch_Size=batch_size)
    # Optimizer type and loss function
    # optimizer = torch.optim.Adam(ntm.parameters(), lr=learning_rate)
    optimizer = torch.optim.RMSprop(ntm.parameters(),
                                    lr=learning_rate,
                                    momentum=0.9,
                                    alpha=0.95)
    criterion = torch.nn.BCELoss()
    np.random.seed(
        SEED
    )  # reset training seed to ensure that batches remain the same between runs!
    for batch in training_dataset:
        optimizer.zero_grad()
        # Initialize head weights and memory to zero
        ntm.init_headweights()
        ntm.init_memory()
        batch = Variable(batch)
        if cuda:
            batch = batch.cuda()
        next_r = ntm.read_head.create_state(batch_size)
        if controller_type == 'LSTM':
            lstm_h, lstm_c = ntm.controller.create_state(batch_size)
        # Read batch in — outputs during presentation are discarded; only the
        # read vector (and LSTM state) is threaded through.
        for i in range(batch.size()[2]):
            x = batch[:, :, i]
            if controller_type == 'LSTM':
                _, next_r, lstm_h, lstm_c = ntm.forward(x=x,
                                                        r=next_r,
                                                        lstm_h=lstm_h,
                                                        lstm_c=lstm_c)
            elif controller_type == 'MLP':
                _, next_r = ntm.forward(x=x, r=next_r)
        # Output response: zero input; target drops the last time step
        # (presumably a delimiter column — confirm with random_binary).
        x = Variable(torch.zeros(batch.size()[0:2]))
        output = Variable(torch.zeros(batch[:, :, :-1].size()))
        if cuda:
            x = x.cuda()
            output = output.cuda()
        for i in range(output.size()[2]):
            if controller_type == 'LSTM':
                output[:, :, i], next_r, lstm_h, lstm_c = ntm.forward(
                    x=x, r=next_r, lstm_h=lstm_h, lstm_c=lstm_c)
            elif controller_type == 'MLP':
                output[:, :, i], next_r = ntm.forward(x=x, r=next_r)
        loss = criterion(output, batch[:, :, :-1])
        # NOTE(review): retain_graph=True keeps each batch's graph alive —
        # looks unnecessary here and grows memory; confirm before removing.
        loss.backward(retain_graph=True)
        optimizer.step()
        print("Current Batch Loss:", round(loss.data[0], 3))
        total_examples += batch_size
        # The cost is the number of error bits per sequence
        binary_output = output.clone().data
        binary_output = binary_output > 0.5
        cost = torch.sum(
            torch.abs(binary_output.float() - batch.data[:, :, :-1]))
        losses += [loss.data[0]]
        costs += [cost / batch_size]
        seq_lens += [batch.size(2)]
        # Checkpoint model
        if (checkpoint_interval != 0) and (total_examples %
                                           checkpoint_interval == 0):
            print("Saving Checkpoint!")
            save_checkpoint(ntm, total_examples / batch_size, losses, costs,
                            seq_lens, total_examples, controller_type,
                            num_inputs, num_outputs, controller_size,
                            controller_layers, memory_size,
                            memory_feature_size, integer_shift, batch_size,
                            cuda)
            # Evaluate model on this saved checkpoint
            test_cost, prediction, input = evaluate(
                model=ntm,
                testset=testing_dataset,
                batch_size=batch_size,
                memory_feature_size=memory_feature_size,
                controller_type=controller_type,
                cuda=cuda)
            print("Total Test Cost (in bits per sequence):", test_cost)
            print("Example of Input/Output")
            print("prediction:", prediction[0])
            print("Input:", input[0])
        # NOTE(review): divides by checkpoint_interval unguarded — raises
        # ZeroDivisionError when checkpoint_interval == 0.
        if total_examples / checkpoint_interval >= total_batches:
            break
def trainValidateSegmentation(args):
    '''
    Main function for training and validation of EESPNet_Seg.

    Builds multi-scale training loaders, trains one epoch per scale,
    validates, and checkpoints the model (saving a separate copy of the
    best model by validation mIOU).

    :param args: global arguments (argparse namespace)
    :return: None
    '''
    # load the model
    cuda_available = torch.cuda.is_available()
    num_gpus = torch.cuda.device_count()
    model = net.EESPNet_Seg(args.classes,
                            s=args.s,
                            pretrained=args.pretrained,
                            gpus=num_gpus)
    if num_gpus >= 1:
        model = torch.nn.DataParallel(model)

    args.savedir = args.savedir + str(args.s) + '/'

    # create the directory if it does not exist (makedirs also creates
    # missing parent directories, unlike os.mkdir which would fail)
    if not os.path.exists(args.savedir):
        os.makedirs(args.savedir)

    # check if processed data file exists or not
    if not os.path.isfile(args.cached_data_file):
        dataLoad = ld.LoadData(args.data_dir, args.classes,
                               args.cached_data_file)
        data = dataLoad.processData()
        if data is None:
            print('Error while pickling data. Please check.')
            exit(-1)
    else:
        # use a context manager so the cache file handle is closed
        with open(args.cached_data_file, "rb") as cached_file:
            data = pickle.load(cached_file)

    if cuda_available:
        args.onGPU = True
        model = model.cuda()

    total_paramters = netParams(model)
    print('Total network parameters: ' + str(total_paramters))

    # define optimization criteria; per-class weights compensate imbalance
    weight = torch.from_numpy(
        data['classWeights'])  # convert the numpy array to torch
    if args.onGPU:
        weight = weight.cuda()

    criteria = torch.nn.CrossEntropyLoss(weight)
    if args.onGPU:
        criteria = criteria.cuda()

    print('Data statistics')
    print(data['mean'], data['std'])
    print(data['classWeights'])

    # compose the data with transforms; each "scale" pipeline crops at a
    # different resolution to augment the training data
    trainDataset_main = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(args.inWidth, args.inHeight)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDataset_scale1 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(int(args.inWidth * 1.5),
                                            int(1.5 * args.inHeight))),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDataset_scale2 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(int(args.inWidth * 1.25),
                                            int(1.25 * args.inHeight))),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDataset_scale3 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(int(args.inWidth * 0.75),
                                            int(0.75 * args.inHeight))),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    trainDataset_scale4 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.RandomCropResize(size=(int(args.inWidth * 0.5),
                                            int(0.5 * args.inHeight))),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor(args.scaleIn),
    ])

    valDataset = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(1024, 512),
        myTransforms.ToTensor(args.scaleIn),
    ])

    # since we are training from scratch, we create data loaders at
    # different scales so that we can generate more augmented data and
    # prevent the network from overfitting
    trainLoader = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'],
                               transform=trainDataset_main),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, pin_memory=True)

    trainLoader_scale1 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'],
                               transform=trainDataset_scale1),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, pin_memory=True)

    trainLoader_scale2 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'],
                               transform=trainDataset_scale2),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, pin_memory=True)

    trainLoader_scale3 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'],
                               transform=trainDataset_scale3),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, pin_memory=True)

    trainLoader_scale4 = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['trainIm'], data['trainAnnot'],
                               transform=trainDataset_scale4),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, pin_memory=True)

    valLoader = torch.utils.data.DataLoader(
        myDataLoader.MyDataset(data['valIm'], data['valAnnot'],
                               transform=valDataset),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.num_workers, pin_memory=True)

    if args.onGPU:
        cudnn.benchmark = True

    start_epoch = 0
    best_val = 0
    lr = args.lr

    optimizer = torch.optim.Adam(model.parameters(), lr, (0.9, 0.999),
                                 eps=1e-08, weight_decay=5e-4)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_val = checkpoint['best_val']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    logFileLoc = args.savedir + args.logFile
    if os.path.isfile(logFileLoc):
        logger = open(logFileLoc, 'a')
    else:
        logger = open(logFileLoc, 'w')
        # fixed: missing closing parenthesis in the 'mIOU (val)' header
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write("\n%s\t%s\t%s\t%s\t%s\t" %
                     ('Epoch', 'Loss(Tr)', 'Loss(val)', 'mIOU (tr)',
                      'mIOU (val)'))
    logger.flush()

    for epoch in range(start_epoch, args.max_epochs):
        poly_lr_scheduler(args, optimizer, epoch)
        lr = 0
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        print("Learning rate: " + str(lr))

        # train for one epoch:
        # we consider 1 epoch with all the training data (at different scales)
        train(args, trainLoader_scale1, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale2, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale4, model, criteria, optimizer, epoch)
        train(args, trainLoader_scale3, model, criteria, optimizer, epoch)
        lossTr, overall_acc_tr, per_class_acc_tr, per_class_iu_tr, mIOU_tr = train(
            args, trainLoader, model, criteria, optimizer, epoch)

        # evaluate on validation set
        lossVal, overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = val(
            args, valLoader, model, criteria)

        is_best = mIOU_val > best_val
        # fixed: best_val was never updated, so every epoch compared (and
        # checkpointed) against the initial value — 'model_best.pth' was
        # overwritten on every epoch whose mIOU beat the *starting* value
        best_val = max(mIOU_val, best_val)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr': lr,
                'best_val': best_val,
            }, args.savedir + 'checkpoint.pth.tar')

        # save the model also
        if is_best:
            model_file_name = args.savedir + os.sep + 'model_best.pth'
            torch.save(model.state_dict(), model_file_name)

        with open(args.savedir + 'acc_' + str(epoch) + '.txt', 'w') as log:
            log.write(
                "\nEpoch: %d\t Overall Acc (Tr): %.4f\t Overall Acc (Val): %.4f\t mIOU (Tr): %.4f\t mIOU (Val): %.4f"
                % (epoch, overall_acc_tr, overall_acc_val, mIOU_tr, mIOU_val))
            log.write('\n')
            log.write('Per Class Training Acc: ' + str(per_class_acc_tr))
            log.write('\n')
            log.write('Per Class Validation Acc: ' + str(per_class_acc_val))
            log.write('\n')
            log.write('Per Class Training mIOU: ' + str(per_class_iu_tr))
            log.write('\n')
            log.write('Per Class Validation mIOU: ' + str(per_class_iu_val))

        logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.7f" %
                     (epoch, lossTr, lossVal, mIOU_tr, mIOU_val, lr))
        logger.flush()
        print("Epoch : " + str(epoch) + ' Details')
        print(
            "\nEpoch No.: %d\tTrain Loss = %.4f\tVal Loss = %.4f\t mIOU(tr) = %.4f\t mIOU(val) = %.4f"
            % (epoch, lossTr, lossVal, mIOU_tr, mIOU_val))

    logger.close()
def train_scg(config, tols, milestone, device='cpu'):
    """Train a GAN with the SCG optimizer (joint max/min step via a closure).

    Unlike the alternating-Adam loop, generator and discriminator are
    updated together by a single SCG step: the solver repeatedly calls
    ``closure`` to re-evaluate the discriminator loss.

    :param config: dict of hyperparameters (lr_d, lr_g, optimizer, z_dim,
        model, epoch_num, show_iter, loss_type, d_penalty, logdir, startn,
        dataset, datapath, batchsize, checkpoint, gpu_num)
    :param tols: dict with 'tol' / 'atol' tolerances for the CG solver
    :param milestone: LR-schedule milestones forwarded to ``lr_scheduler``
    :param device: torch device string
    """
    lr_d = config['lr_d']
    lr_g = config['lr_g']
    optim_type = config['optimizer']
    z_dim = config['z_dim']
    model_name = config['model']
    epoch_num = config['epoch_num']
    show_iter = config['show_iter']
    loss_name = config['loss_type']
    l2_penalty = config['d_penalty']
    logdir = config['logdir']
    start_n = config['startn']
    dataset = get_data(dataname=config['dataset'],
                       path='../datas/%s' % config['datapath'])
    dataloader = DataLoader(dataset=dataset,
                            batch_size=config['batchsize'],
                            shuffle=True,
                            num_workers=4)
    # second loader with identical settings, consumed internally by SCG
    inner_loader = DataLoader(dataset=dataset,
                              batch_size=config['batchsize'],
                              shuffle=True,
                              num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim)
    # NOTE(review): the result of .to(device) is not reassigned — this
    # relies on the module being moved in place; confirm for this codebase.
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    # G is the max player, D the min player in the saddle-point problem
    optimizer = SCG(max_params=G.parameters(),
                    min_params=D.parameters(),
                    lr_max=lr_g,
                    lr_min=lr_d,
                    tol=tols['tol'],
                    atol=tols['atol'],
                    dataloader=inner_loader,
                    device=device,
                    solver='cg')
    scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    if config['checkpoint'] is not None:
        # resume both networks and the optimizer state
        startPoint = config['checkpoint']
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        optimizer.load_state_dict(chk['optim'])
        print('Start from %s' % startPoint)
    gpu_num = config['gpu_num']
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0  # global iteration counter across epochs
    # DCGAN variants expect a 4-D latent (N, z_dim, 1, 1)
    if model_name == 'DCGAN' or model_name == 'DCGAN-WBN':
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        scheduler.step(epoch=e)
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            optimizer.zero_grad()
            real_x = real_x[0]
            if model_name == 'DCGAN' or model_name == 'DCGAN-WBN':
                z = torch.randn((real_x.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((real_x.shape[0], z_dim), device=device)

            def closure(train_x):
                # Re-evaluates the discriminator loss on a given batch;
                # called repeatedly by the SCG solver.  `z` is captured
                # from the enclosing scope, so the same latent batch is
                # reused within one optimizer step.
                train_x = train_x.to(device)
                fake_x = G(z)
                d_fake = D(fake_x)
                d_real = D(train_x)
                loss = get_loss(name=loss_name,
                                g_loss=False,
                                d_real=d_real,
                                d_fake=d_fake,
                                l2_weight=l2_penalty,
                                D=D)
                return loss

            # SCG performs the joint update and returns the final loss
            loss = optimizer.step(closure=closure, img=real_x)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , Loss: %.5f, time: %.3fs' %
                      (count, loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    # sample from the fixed latent batch for visual progress
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (config['dataset'], logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s%.3f_%d.pth' %
                                (optim_type, model_name, lr_g,
                                 count + start_n),
                                D=D,
                                G=G,
                                optimizer=optimizer)
            count += 1
def main():
    """Entry point: build RB2 dataloaders, UNet3d + ImNet models and the
    PDE-constraint layer, then run the train/eval loop with checkpointing
    on best (lowest) training loss."""
    args = get_args()
    use_cuda = (not args.no_cuda) and torch.cuda.is_available()
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")
    # adjust batch size based on the number of gpus available
    args.batch_size = int(torch.cuda.device_count()) * args.batch_size_per_gpu

    # log and create snapshots of the source files for reproducibility
    os.makedirs(args.log_dir, exist_ok=True)
    filenames_to_snapshot = glob("*.py") + glob("*.sh")
    utils.snapshot_files(filenames_to_snapshot, args.log_dir)
    logger = utils.get_logger(log_dir=args.log_dir)
    with open(os.path.join(args.log_dir, "params.json"), 'w') as fh:
        json.dump(args.__dict__, fh, indent=2)
    logger.info("%s", repr(args))

    # tensorboard writer
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, 'tensorboard'))

    # random seed for reproducability
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # create dataloaders: the train set yields only low-res crops
    # (return_hres=False); the eval set also returns the high-res reference
    trainset = loader.RB2DataLoader(
        data_dir=args.data_folder, data_filename=args.train_data,
        nx=args.nx, nz=args.nz, nt=args.nt,
        n_samp_pts_per_crop=args.n_samp_pts_per_crop,
        downsamp_xz=args.downsamp_xz, downsamp_t=args.downsamp_t,
        normalize_output=args.normalize_channels, return_hres=False,
        lres_filter=args.lres_filter, lres_interp=args.lres_interp)
    evalset = loader.RB2DataLoader(
        data_dir=args.data_folder, data_filename=args.eval_data,
        nx=args.nx, nz=args.nz, nt=args.nt,
        n_samp_pts_per_crop=args.n_samp_pts_per_crop,
        downsamp_xz=args.downsamp_xz, downsamp_t=args.downsamp_t,
        normalize_output=args.normalize_channels, return_hres=True,
        lres_filter=args.lres_filter, lres_interp=args.lres_interp)

    # sampling with replacement defines a fixed-size "pseudo epoch"
    train_sampler = RandomSampler(trainset, replacement=True,
                                  num_samples=args.pseudo_epoch_size)
    eval_sampler = RandomSampler(evalset, replacement=True,
                                 num_samples=args.num_log_images)

    train_loader = DataLoader(trainset, batch_size=args.batch_size,
                              shuffle=False, drop_last=True,
                              sampler=train_sampler, **kwargs)
    eval_loader = DataLoader(evalset, batch_size=args.batch_size,
                             shuffle=False, drop_last=False,
                             sampler=eval_sampler, **kwargs)

    # setup model: UNet3d encodes the low-res crop into a latent grid;
    # ImNet decodes (3-D coordinates + latent) -> 4 output channels
    unet = UNet3d(in_features=4, out_features=args.lat_dims,
                  igres=trainset.scale_lres,
                  nf=args.unet_nf, mf=args.unet_mf)
    imnet = ImNet(dim=3, in_features=args.lat_dims, out_features=4,
                  nf=args.imnet_nf, activation=NONLINEARITIES[args.nonlin])
    # both networks share a single optimizer
    all_model_params = list(unet.parameters()) + list(imnet.parameters())

    if args.optim == "sgd":
        optimizer = optim.SGD(all_model_params, lr=args.lr)
    else:
        optimizer = optim.Adam(all_model_params, lr=args.lr)

    start_ep = 0
    global_step = np.zeros(1, dtype=np.uint32)
    tracked_stats = np.inf  # best (lowest) training loss seen so far

    if args.resume:
        resume_dict = torch.load(args.resume)
        start_ep = resume_dict["epoch"]
        global_step = resume_dict["global_step"]
        tracked_stats = resume_dict["tracked_stats"]
        unet.load_state_dict(resume_dict["unet_state_dict"])
        imnet.load_state_dict(resume_dict["imnet_state_dict"])
        optimizer.load_state_dict(resume_dict["optim_state_dict"])
        # move any optimizer state tensors back onto the target device
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    unet = nn.DataParallel(unet)
    unet.to(device)
    imnet = nn.DataParallel(imnet)
    imnet.to(device)

    model_param_count = lambda model: sum(x.numel() for x in model.parameters())
    logger.info("{}(unet) + {}(imnet) paramerters in total".format(
        model_param_count(unet), model_param_count(imnet)))

    checkpoint_path = os.path.join(args.log_dir, "checkpoint_latest.pth.tar")

    # get pdelayer for the RB2 equations; mean/std only when channels were
    # normalized so the PDE residual is computed in physical units
    if args.normalize_channels:
        mean = trainset.channel_mean
        std = trainset.channel_std
    else:
        mean = std = None
    # crop extents expressed as fractions of the simulation domain
    pde_layer = get_rb2_pde_layer(mean=mean, std=std,
                                  t_crop=args.nt * 0.125,
                                  z_crop=args.nz * (1. / 128),
                                  x_crop=args.nx * (1. / 128),
                                  prandtl=args.prandtl,
                                  rayleigh=args.rayleigh,
                                  use_continuity=args.use_continuity)

    if args.lr_scheduler:
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    # training loop
    for epoch in range(start_ep + 1, args.epochs + 1):
        loss = train(args, unet, imnet, train_loader, epoch, global_step,
                     device, logger, writer, optimizer, pde_layer)
        # NOTE: `eval` here is a project-level evaluation function (it
        # shadows the builtin only at this call site)
        eval(args, unet, imnet, eval_loader, epoch, global_step, device,
             logger, writer, optimizer, pde_layer)
        if args.lr_scheduler:
            scheduler.step(loss)
        # checkpoint flagged as best when training loss improves
        if loss < tracked_stats:
            tracked_stats = loss
            is_best = True
        else:
            is_best = False

        utils.save_checkpoint(
            {
                "epoch": epoch,
                # unwrap DataParallel so the saved weights load without it
                "unet_state_dict": unet.module.state_dict(),
                "imnet_state_dict": imnet.module.state_dict(),
                "optim_state_dict": optimizer.state_dict(),
                "tracked_stats": tracked_stats,
                "global_step": global_step,
            }, is_best, epoch, checkpoint_path, "_pdenet", logger)
def train(epoch_num=10, milestone=None, optim_type='Adam', lr_d=1e-4,
          lr_g=1e-4, startPoint=None, start_n=0, z_dim=128, batchsize=64,
          loss_name='WGAN', model_name='dc', model_config=None,
          data_path='None', show_iter=100, logdir='test',
          dataname='cifar10', device='cpu', gpu_num=1, saturating=False):
    """Alternating-Adam GAN training loop.

    Trains D then G each iteration, logs losses and discriminator outputs
    to TensorBoard, and periodically saves sample images and checkpoints.

    :param saturating: when True, the generator is stepped with the
        gradients it received from ``d_loss.backward()`` instead of a
        separate generator loss (see NOTE in the loop body).
    """
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset, batch_size=batchsize,
                            shuffle=True, num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    # NOTE(review): .to(device) result is not reassigned — relies on the
    # modules being moved in place
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s' % (logdir, current_time))
    d_optimizer = Adam(D.parameters(), lr=lr_d, betas=(0.5, 0.999))
    g_optimizer = Adam(G.parameters(), lr=lr_g, betas=(0.5, 0.999))
    if startPoint is not None:
        # resume both networks and both optimizer states
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        d_optimizer.load_state_dict(chk['d_optim'])
        g_optimizer.load_state_dict(chk['g_optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0  # global iteration counter across epochs
    # DCGAN variants expect a 4-D latent (N, z_dim, 1, 1)
    if 'DCGAN' in model_name:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            if 'DCGAN' in model_name:
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            # discriminator update.  fake_x is NOT detached, so
            # d_loss.backward() also writes gradients into G; both
            # optimizers are zeroed first so this is intentional for the
            # saturating branch below.
            d_loss = get_loss(name=loss_name, g_loss=False, d_real=d_real,
                              d_fake=d_fake)
            d_optimizer.zero_grad()
            g_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()
            if not saturating:
                # generator update with a fresh latent batch and its own loss
                if 'DCGAN' in model_name:
                    z = torch.randn((d_real.shape[0], z_dim, 1, 1),
                                    device=device)
                else:
                    z = torch.randn((d_real.shape[0], z_dim), device=device)
                fake_x = G(z)
                d_fake = D(fake_x)
                g_loss = get_loss(name=loss_name, g_loss=True, d_fake=d_fake)
                g_optimizer.zero_grad()
                g_loss.backward()
            else:
                # saturating variant: reuse the gradients G received from
                # d_loss.backward() above.  NOTE(review): stepping Adam on
                # those gradients DESCENDS d_loss w.r.t. G — confirm this
                # sign is what the loss formulation intends.
                g_loss = d_loss
            g_optimizer.step()
            writer.add_scalar('Loss/D loss', d_loss.item(), count)
            writer.add_scalar('Loss/G loss', g_loss.item(), count)
            writer.add_scalars('Discriminator output', {
                'Generated image': d_fake.mean().item(),
                'Real image': d_real.mean().item()
            }, global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter %d, D Loss: %.5f, G loss: %.5f, time: %.2f s' %
                      (count, d_loss.item(), g_loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    # sample from the fixed latent batch for visual progress
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img,
                                      path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s_%d.pth' %
                                (optim_type, model_name, count + start_n),
                                D=D, G=G, optimizer=d_optimizer,
                                g_optimizer=g_optimizer)
            count += 1
    writer.close()
def main_worker(gpu, ngpus_per_node, args):
    """One training worker (standard PyTorch ImageNet-example signature).

    Builds the model, loss and optimizer; assembles the train/test
    datasets with optional class subsetting, symmetric label-noise
    injection and CIFAR mixing; then runs the epoch loop with validation,
    optional Jacobian probing, weight tracking and checkpointing.
    """
    global best_acc1
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # Define training directory in case number of classes is required by
    # the model instance; class folders with too few images are dropped
    main_file = args.root / args.main_file
    num_classes = len([
        cur_dir.name for cur_dir in main_file.iterdir()
        if len(list(cur_dir.iterdir())) >= args.min_allowed_imgs
    ])
    if not num_classes == 1000:
        print('[INFO]: Using {} classes instead of 1000 ImageNet classes'.format(num_classes))

    # create model
    # NOTE(review): the pretrained path reads args.arch while the scratch
    # path reads args.model — confirm both attributes are always defined.
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.model))
        model = models.__dict__[args.model](num_classes=num_classes)

    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    # NOTE(review): an unrecognized args.loss_func leaves `criterion`
    # unbound (NameError later) — there is no else branch.
    if args.loss_func in ['cross', 'cross_entropy', 'entropy']:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    elif args.loss_func in ['l2', 'l2_squared', 'squared', 'MSE']:
        print('[INFO] Using MSE loss function instead of Cross Entropy.')
        args.loss_func = 'l2'
        criterion = nn.MSELoss().cuda(args.gpu)

    if args.opt.lower() == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adam':
        print('[INFO] Using Adam optimizer instead of SGD.')
        optimizer = torch.optim.Adam(model.parameters(), args.lr,
                                     weight_decay=args.weight_decay)
    elif args.opt.lower() == 'lbfgs':
        print('[INFO] Using LBFGS optimizer instead of SGD.')
        optimizer = torch.optim.LBFGS(model.parameters(), args.lr,
                                      history_size=20)
    else:
        raise ValueError('Incorrect optimizer selection {}'.format(args.opt))

    if args.initial_lr:
        # rebuild the optimizer with one parameter group per weight layer
        # (replaces the optimizer chosen above)
        param_setup = [{'params': cur_lay.parameters()}
                       for i, cur_lay in enumerate(model)
                       if 'weight' in dir(cur_lay)]
        optimizer = torch.optim.SGD(param_setup, args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    if args.schedule_lr:
        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer,
                                                      args.lr / 100, args.lr)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    test_file = args.root / args.test_file
    if args.sub_file:
        sub_file = args.root / args.sub_file
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_trans_list = []
    if not args.norandomcrop:
        train_trans_list.append(transforms.RandomResizedCrop(224))
    if not args.norandomflip:
        train_trans_list.append(transforms.RandomHorizontalFlip())
    train_trans_list = train_trans_list + [transforms.ToTensor(), normalize]

    train_dataset = datasets.ImageFolder(
        main_file, transforms.Compose(train_trans_list))

    # NOTE(review): torchvision's ImageFolder takes no `train=` kwarg —
    # presumably `datasets` here is a project-local module; verify.
    test_dataset = datasets.ImageFolder(test_file, transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ]), train=False)
    if args.sub_file:
        # NOTE(review): this builds the subset dataset from test_file, not
        # sub_file — the `sub_file` path computed above is never used.
        # Looks like a copy/paste bug; confirm intent.
        sub_dataset = datasets.ImageFolder(test_file, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]), train=False)

    if args.train_size or args.select_class_list:
        # restrict train/test sets to the selected classes (optionally
        # subsampling args.train_size images per class), then remap the
        # selected class labels to 0..len(select_class_list)-1
        if not args.select_class_list:
            args.select_class_list = list(range(args.num_classes))
        sel_idx = []
        for lbl in args.select_class_list:
            lbl_idx = [i for i, t in enumerate(train_dataset.targets) if t == lbl]
            sel_idx += random.sample(lbl_idx, (args.train_size if args.train_size else len(lbl_idx)))
        # NOTE(review): list-indexing samples/targets with `sel_idx` only
        # works if they are numpy arrays, not plain lists — verify the
        # dataset implementation.
        train_dataset.samples = train_dataset.samples[sel_idx]
        train_dataset.targets = train_dataset.targets[sel_idx]
        for cur_idx, cur_cls in enumerate(args.select_class_list):
            train_dataset.targets[train_dataset.targets==cur_cls] = cur_idx
        sel_idx = []
        for lbl in args.select_class_list:
            lbl_idx = [i for i, t in enumerate(test_dataset.targets) if t == lbl]
            sel_idx += lbl_idx
        test_dataset.samples = test_dataset.samples[sel_idx]
        test_dataset.targets = test_dataset.targets[sel_idx]
        for cur_idx, cur_cls in enumerate(args.select_class_list):
            test_dataset.targets[test_dataset.targets==cur_cls] = cur_idx

    # Inject symmetric noise to training set
    if args.inject_noise:
        im_per_class = int(len(train_dataset) / args.num_classes)
        noisy_labels = np.zeros((len(train_dataset),), dtype=int)
        # number of images per class relabelled to each other class
        num_shuffle = int(im_per_class * args.inject_noise)
        for i in range(args.num_classes):
            noisy_idx = []
            cur_idx = [idx for idx, label in enumerate(train_dataset.targets) if label==i]
            shuffled_idx = random.sample(cur_idx, len(cur_idx))
            for r in range(args.num_classes):
                # assign label r to one slice of this class's shuffled images
                noisy_idx += [r for idx in shuffled_idx[im_per_class - (r+1)*num_shuffle:im_per_class - r*num_shuffle]]
            # the remaining images keep the true label i
            noisy_idx += [i for idx in shuffled_idx[:im_per_class - args.num_classes*num_shuffle]]
            noisy_labels[cur_idx] = np.array(noisy_idx)
        train_dataset.targets = noisy_labels

    # TODO: Replace fraction of one training set randomly with another.
    if args.mix_cifar:
        assert args.mix_rate, "mix_rate should be given when mix_cifar is set"
        assert args.traindir2, "traindir2 must be given when mix_cifar is set"
        assert not args.inject_noise, "inject_noise should not be given when mix_cifar is set"
        assert not args.testdir2, "only one testdir can be set when mix_cifar is set"
        traindir2 = os.path.join(args.root, args.traindir2)
        clean_dataset = datasets.ImageFolder(
            traindir2, transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ]))
        im_per_class = int(len(train_dataset) / len(train_dataset.classes))
        num_shuffle = int(im_per_class * args.mix_rate)
        shuffled_samples = []
        clean_samples = []
        for i in range(len(train_dataset.classes)):
            # per class: keep (im_per_class - num_shuffle) original images
            # and splice in num_shuffle images from the second dataset
            cur_imgs = [s[0] for s in train_dataset.samples if s[1]==i]
            cur_imgs = random.sample(cur_imgs, im_per_class - num_shuffle)
            mix_imgs = [s[0] for s in clean_dataset.samples if s[1]==i]
            mix_imgs = random.sample(mix_imgs, num_shuffle)
            clean_samples += [(img, i) for img in mix_imgs]
            shuffled_samples += [(img, i) for img in cur_imgs + mix_imgs]
        train_dataset.samples = shuffled_samples
        clean_dataset.samples = clean_samples
        # NOTE(review): `train_sampler` is only assigned AFTER this block,
        # so referencing it here raises UnboundLocalError whenever
        # mix_cifar is enabled — the `train_sampler = None` line below
        # likely needs to move above this block.
        val_loader2 = torch.utils.data.DataLoader(
            clean_dataset, batch_size=args.batch_size,
            shuffle=(train_sampler is None),
            num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    if args.sub_file:
        val_loader2 = torch.utils.data.DataLoader(
            sub_dataset, batch_size=args.batch_size, shuffle=False,
            num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.compute_jacobian:
        # random probe vector for Jacobian-vector products, scaled down by
        # the dataset size
        gvec = (torch.randn((1, args.num_classes)) / len(train_dataset)).cuda(args.gpu, non_blocking=True)

    # TODO: tracking weights of the model
    if args.track_weights:
        layer_idx = [i for i, cl in enumerate(model) if 'weight' in dir(cl)]
        cur_weights = get_weights(model, layer_idx)
        if args.track_weights == 'filters':
            filter_w_file = args.outpath / 'filter_weights.pickle'
            filter_w_dict = {('layer_'+str(l)): [] for i, l in enumerate(layer_idx) if cur_weights[i].ndim > 2}
        if args.track_weights == 'norm':
            w_norm_dict = {('layer_'+str(l)): 0 for i, l in enumerate(layer_idx) if cur_weights[i].ndim > 1}

    # TODO: scaling the weights of the model manually
    if args.scale_weights:
        # NOTE(review): uses cur_weights/layer_idx from the track_weights
        # branch above — fails if scale_weights is set without
        # track_weights; confirm the options are meant to be coupled.
        scale_dict = {}
        for cur_l, cur_w in enumerate(cur_weights):
            if not (cur_w.ndim > 2):
                continue
            scale_dict['layer_' + str(layer_idx[cur_l])] = np.linalg.norm(cur_w.flatten()).item()
        rescale_weights(model, scale_dict)

    save_config(args)
    train_log = []
    log_file = args.outpath / 'log.json'

    for epoch in range(args.start_epoch, args.epochs):
        # manual LR decay unless a scheduler drives the LR
        if (epoch < args.max_lr_adjusting_epoch) and (not args.schedule_lr):
            adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        epoch_log = {'epoch': epoch}

        # update learning rate with scheduler
        if args.schedule_lr:
            scheduler.step()

        # evaluate on validation set (train split first, then test split)
        dum_acc1, dum_acc5 = validate(train_loader, model, criterion, args)
        epoch_log.update({'train': {'acc1': dum_acc1.cpu().numpy().item(),
                                    'acc5': dum_acc5.cpu().numpy().item()}})
        acc1, acc5 = validate(val_loader, model, criterion, args)
        epoch_log.update({'test': {'acc1': acc1.cpu().numpy().item(),
                                   'acc5': acc5.cpu().numpy().item()}})
        if args.sub_file or args.mix_cifar:
            dum_acc1, dum_acc5 = validate(val_loader2, model, criterion, args)
            epoch_log.update({'subset': {'acc1': dum_acc1.cpu().numpy().item(),
                                         'acc5': dum_acc5.cpu().numpy().item()}})

        # compute the jacobian of the network
        if args.compute_jacobian:
            jTg = get_jacobian_prod(train_loader, model, criterion, gvec, args)
            epoch_log.update({'J_norm': {str(k): v.item() for k, v in enumerate(jTg)}})

        # TODO: tracking the weights of the layers
        if args.track_weights:
            w_change_dict = {('layer_'+str(l)): 0 for l in layer_idx}
            new_weights = get_weights(model, layer_idx)
            if args.track_weights == 'norm':
                # log current per-layer weight norms
                for cur_l, cur_w in enumerate(new_weights):
                    if not (cur_w.ndim > 1):
                        continue
                    w_norm_dict['layer_' + str(layer_idx[cur_l])] = np.linalg.norm(cur_w.flatten()).item()
                epoch_log.update({'w_norm': {k: v for k, v in w_norm_dict.items()}})
            else:
                # log per-layer mean absolute weight change since last epoch
                for cur_l in range(len(layer_idx)):
                    cur_change = new_weights[cur_l] - cur_weights[cur_l]
                    if args.track_weights == 'filters':
                        if cur_change.ndim > 2:
                            # average the change over the spatial kernel dims
                            cur_change = np.mean(cur_change, axis=(2,3))
                        filter_w_dict['layer_' + str(layer_idx[cur_l])].append(np.absolute(cur_change))
                    chng = np.absolute(np.mean(cur_change))
                    w_change_dict['layer_' + str(layer_idx[cur_l])] = chng.item()
                epoch_log.update({'weight_change': {k: v for k, v in w_change_dict.items()}})
                if args.track_weights == 'filters':
                    # rewrite the full history each epoch
                    with open(filter_w_file, 'wb') as fn:
                        pickle.dump({k: np.stack(v) for k, v in filter_w_dict.items()}, fn)
            cur_weights = [wh for wh in new_weights]
            new_weight = None

        train_log.append(epoch_log)
        # rewrite the whole JSON log each epoch so it is always complete
        with open(log_file, 'w') as fn:
            json.dump(train_log, fn, indent=2)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)
def main(args):
    """Top-level training entry point.

    Saves the argparse config, trains ``Net`` with ``Loss`` for
    ``args.epochs``, logs the loss to TensorBoard every 10 iterations and
    checkpoints every 5000 iterations.

    :param args: parsed command-line arguments (expects CHK_DIR, LOG_DIR,
        train_id, lr, weight_decay, epochs, batch_size)
    :return: None
    """
    set_seed(args)
    save_dir = os.path.join(args.CHK_DIR, args.LOG_DIR, args.train_id)
    log_path = os.path.join('runs/', args.LOG_DIR, args.train_id)
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(log_path, exist_ok=True)

    ## save argparse parameters
    with open(os.path.join(log_path, args.train_id + '_args.yaml'), 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}: {}\n'.format(k, v))

    writer = SummaryWriter(log_path)
    train_loader = load_dataloaer(args)

    model = Net()
    model = try_gpu(model)
    model.train()
    criterion = Loss()
    # only optimize parameters that require gradients
    optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                               model.parameters()),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    init_epoch = 0
    # fixed: total_train_loss was appended to without ever being
    # initialized, which raised a NameError at the end of the first epoch
    total_train_loss = []

    ## init time
    zero_time = time.time()
    for epoch in range(init_epoch, args.epochs):
        start_time = time.time()
        avg_loss = 0
        for cnt, (img, target) in enumerate(train_loader, 1):
            print(cnt, img.shape, target.shape)
            img, target = try_gpu(img), try_gpu(target)
            pred = model(img)
            loss = criterion(pred, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            if (cnt % 10 == 0):
                writer.add_scalar(
                    'training loss', loss.item(),
                    epoch * (len(train_loader) // args.batch_size) + cnt)
            if (cnt % 5000 == 0):
                cp_file = os.path.join(
                    save_dir,
                    'epoch_' + str(epoch) + '_itr_' + str(cnt)) + '.pt'
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, cp_file)
        avg_loss /= len(train_loader)
        end_time = time.time()
        epoch_time = end_time - start_time  # wall time of this epoch
        total_time = end_time - zero_time   # wall time since training began
        total_train_loss.append(avg_loss)
    writer.close()
def perform_training(params):
    '''
    Attempts to train the remaining number of epochs.
    Will fail if no valid model is loaded.

    Keyword arguments:
    params > params (dict) -- currently loaded state dict.

    Returns: N/A
    '''
    if params['model'] is None:
        print(
            'No model loaded! Type -n to create a new model, or -l to load an existing one from file.\n'
        )
        return

    # Delete downstream checkpoints (i.e. those with greater epoch numbers)
    # for consistency in saved checkpoints
    if not train_utils.delete_future_checkpoints(params):
        return

    setup_cuda(params)
    print('\n--- COMMENCE TRAINING ---\n')

    classifier_state = None
    if params['is_generator'] and params['adversarial_train']:
        # Adversarial VAE training needs a separate classifier to attack.
        print(
            '\nAdversarially training a VAE. Please load a classifier model.')
        classifier_state = load_model(param_factory(False))
        setup_cuda(classifier_state)

    # Training/val loop. cur_epoch is the epoch we START at, total_epochs is
    # inclusive of the last epoch.
    for epoch in range(params['cur_epoch'], params['total_epochs'] + 1):
        print('--- TRAINING: begin epoch', epoch, '---')

        # LR Decay - currently a stepwise decay
        adjust_learning_rate(epoch, params)

        # Train for one epoch
        train_one_epoch(epoch, params, classifier_state=classifier_state)
        print('--- TRAINING: end epoch', epoch, '---')

        if params['evaluate']:
            # Evaluate on validation set
            acc1 = validate(params, save=True, adversarial=False)
            if params['adversarial_train'] and not params['is_generator']:
                # TODO: MAKE VALIDATE ACTUALLY SAVE PROPERLY FOR ADVERSARIAL VALIDATION
                ad_acc1 = validate(params,
                                   save=False,
                                   adversarial=True,
                                   adversarial_attack='FGSM',
                                   whitebox=True)

            # BUGFIX: the best-accuracy bookkeeping used to run even when
            # 'evaluate' was off, reading acc1/ad_acc1 before assignment
            # (NameError). It now only runs when validation actually happened;
            # behavior is unchanged when 'evaluate' is on.
            if not params['is_generator']:
                if params['adversarial_train']:
                    params['best_ad_val_acc'] = max(ad_acc1,
                                                    params['best_ad_val_acc'])
                params['best_val_acc'] = max(acc1, params['best_val_acc'])

        # Update the current epoch
        params['cur_epoch'] += 1

        # Save checkpoint every 'save_every' epochs.
        # N.B. params['cur_epoch'] is always the epoch we would START
        # training at. The epoch name in the save file is the number of
        # epochs we have FINISHED training (in other words,
        # params['cur_epoch'] == (named epoch) + 1).
        if epoch % params['save_every'] == 0:
            train_utils.save_checkpoint(params, epoch)

    # Make sure the final state is checkpointed even when total_epochs is not
    # a multiple of save_every.
    if params['total_epochs'] % params['save_every'] != 0:
        train_utils.save_checkpoint(params, params['total_epochs'])

    print('\n--- END TRAINING ---\n')
def train_mnist(epoch_num=10, show_iter=100, logdir='test', model_weight=None,
                load_d=False, load_g=False, compare_path=None, info_time=100,
                run_select=None, dataname='CIFAR10', data_path='None',
                device='cpu'):
    """Train a DCGAN (dc_D / dc_G) with plain SGD on both players.

    Alternates one generator step and one discriminator step per batch using
    the JSD GAN loss. Every `show_iter` iterations it logs losses, saves a
    grid of samples from a fixed noise vector to figs/<logdir>/, and writes a
    checkpoint. Optional diagnostics: distance of D's weights from a reference
    checkpoint (`compare_path`), and prediction drift on a fixed data subset
    (`run_select`).

    Args:
        epoch_num: number of passes over the dataset.
        show_iter: iteration interval for logging/sampling/checkpointing.
        logdir: name used for both the figure directory and checkpoint path.
        model_weight: optional checkpoint to (partially) initialize from.
        load_d / load_g: which sub-models to restore from `model_weight`.
        compare_path: checkpoint whose D weights serve as a distance reference.
        info_time: iteration interval for the `compare_path` diagnostic.
        run_select: file with a fixed real/fake set and recorded D outputs,
            used to measure how much D's predictions drift.
        dataname / data_path: forwarded to `get_data` (despite the function
            name, the default is CIFAR10).
        device: torch device string.
    """
    # Hyper-parameters are fixed here rather than exposed as arguments.
    lr_d = 0.01
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('MNIST, discriminator lr: %.3f, generator lr: %.3f' % (lr_d, lr_g))
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset, batch_size=batchsize,
                            shuffle=True, num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)
    if model_weight is not None:
        chk = torch.load(model_weight)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)
    if compare_path is not None:
        # Flatten the reference discriminator's parameters into one vector so
        # get_diff can measure an L2 distance from it during training.
        discriminator = dc_D().to(device)
        model_weight = torch.load(compare_path)
        discriminator.load_state_dict(model_weight['D'])
        model_vec = torch.cat(
            [p.contiguous().view(-1) for p in discriminator.parameters()])
        print('Load discriminator from %s' % compare_path)
    if run_select is not None:
        # Fixed evaluation set: inputs plus D's recorded outputs at save time.
        # NOTE(review): real_set/fake_set are presumably already tensors on the
        # right device — confirm against how run_select files are produced.
        fixed_data = torch.load(run_select)
        real_set = fixed_data['real_set']
        fake_set = fixed_data['fake_set']
        real_d = fixed_data['real_d']
        fake_d = fixed_data['fake_d']
        fixed_vec = fixed_data['pred_vec']
        print('load fixed data set')
    d_optimizer = SGD(D.parameters(), lr=lr_d)
    g_optimizer = SGD(G.parameters(), lr=lr_g)
    timer = time.time()
    count = 0
    # Fixed noise so the periodic sample grids are comparable across iterations.
    fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            # Detached copy: the later D update must not backprop into G.
            fake_x_c = fake_x.clone().detach()
            # --- generator update ---
            d_fake = D(fake_x)
            # writer.add_scalars('Discriminator output', {'Generated image': d_fake.mean().item(),
            #                                             'Real image': d_real.mean().item()},
            #                    global_step=count)
            G_loss = get_loss(name='JSD', g_loss=True, d_fake=d_fake)
            g_optimizer.zero_grad()
            G_loss.backward()
            g_optimizer.step()
            # Global L2 norm of the generator gradient (kept for the disabled
            # TensorBoard logging below).
            gg = torch.norm(torch.cat(
                [p.grad.contiguous().view(-1) for p in G.parameters()]), p=2)
            # --- discriminator update (on the detached fakes) ---
            d_fake_c = D(fake_x_c)
            D_loss = get_loss(name='JSD', g_loss=False, d_real=d_real,
                              d_fake=d_fake_c)
            if compare_path is not None and count % info_time == 0:
                # L2 distance of D's current weights from the reference.
                diff = get_diff(net=D, model_vec=model_vec)
                # writer.add_scalar('Distance from checkpoint', diff.item(), global_step=count)
                if run_select is not None:
                    with torch.no_grad():
                        d_real_set = D(real_set)
                        d_fake_set = D(fake_set)
                        diff_real = torch.norm(d_real_set - real_d, p=2)
                        diff_fake = torch.norm(d_fake_set - fake_d, p=2)
                        d_vec = torch.cat([d_real_set, d_fake_set])
                        # sub_ mutates d_vec in place; d_vec is local, so this
                        # does not touch the stored fixed_vec tensors.
                        diff = torch.norm(d_vec.sub_(fixed_vec), p=2)
                        # writer.add_scalars('L2 norm of pred difference',
                        #                    {'Total': diff.item(),
                        #                     'real set': diff_real.item(),
                        #                     'fake set': diff_fake.item()},
                        #                    global_step=count)
            d_optimizer.zero_grad()
            D_loss.backward()
            d_optimizer.step()
            gd = torch.norm(torch.cat(
                [p.grad.contiguous().view(-1) for p in D.parameters()]), p=2)
            # writer.add_scalars('Loss', {'D_loss': D_loss.item(),
            #                             'G_loss': G_loss.item()}, global_step=count)
            # writer.add_scalars('Grad', {'D grad': gd.item(),
            #                             'G grad': gg.item()}, global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs'
                      % (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s/' % logdir
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img, path + 'iter_%d.png' % count,
                                      normalize=True)
                    save_checkpoint(path=logdir,
                                    name='SGD-%.3f_%d.pth' % (lr_d, count),
                                    D=D, G=G)
            count += 1
def train_g(epoch_num=10, logdir='test', loss_name='JSD', show_iter=500,
            model_weight=None, load_d=False, load_g=False, device='cpu'):
    """Train only the generator against a frozen discriminator.

    Both losses are computed each batch, but only the generator is ever
    stepped (the discriminator optimizer exists solely so its zero_grad can
    clear gradients that G's backward pass deposits on D). Checkpoints are
    written every `show_iter` iterations under `logdir`.
    """
    lr_d = 0.01
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('MNIST, discriminator lr: %.3f' % lr_d)

    dataloader = DataLoader(dataset=get_data(dataname='MNIST',
                                             path='../datas/mnist'),
                            batch_size=batchsize,
                            shuffle=True,
                            num_workers=4)

    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)

    # Optionally restore either sub-model from a saved checkpoint.
    if model_weight is not None:
        state = torch.load(model_weight)
        if load_d:
            D.load_state_dict(state['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(state['G'])
            print('Load G from %s' % model_weight)

    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    # writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' % (logdir, current_time, lr_g))

    d_optimizer = SGD(D.parameters(), lr=lr_d)
    g_optimizer = SGD(G.parameters(), lr=lr_g)

    timer = time.time()
    count = 0
    for epoch in range(epoch_num):
        for batch in dataloader:
            images = batch[0].to(device)
            d_real = D(images)
            noise = torch.randn((images.shape[0], z_dim), device=device)
            d_fake = D(G(noise))

            D_loss = get_loss(name=loss_name, g_loss=False,
                              d_real=d_real, d_fake=d_fake)
            G_loss = get_loss(name=loss_name, g_loss=True,
                              d_real=d_real, d_fake=d_fake)

            # Clear both models' gradients, then update the generator only.
            d_optimizer.zero_grad()
            g_optimizer.zero_grad()
            G_loss.backward()
            g_optimizer.step()

            print('D_loss: {}, G_loss: {}'.format(D_loss.item(),
                                                  G_loss.item()))
            # writer.add_scalars('Loss', {'D_loss': D_loss.item(),
            #                             'G_loss': G_loss.item()},
            #                    global_step=count)
            # writer.add_scalars('Discriminator output',
            #                    {'Generated image': d_fake.mean().item(),
            #                     'Real image': d_real.mean().item()},
            #                    global_step=count)

            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs'
                      % (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                save_checkpoint(path=logdir,
                                name='FixD-%.3f_%d.pth' % (lr_d, count),
                                D=D, G=G)
            count += 1
def forward(self, num_updates, data_queue, data_event, process_event, tb=None,
            log_interval=100, checkpoint_interval=10000):
    """Worker loop for distributed meta-training (MAML-style inner/outer loop).

    Each iteration: wait for the master to signal data, pull a task from
    `data_queue`, sync weights from rank 0, run `num_updates` inner-loop SGD
    steps on the support set, compute the query-set loss/gradients, all-reduce
    them to rank 0, and (on rank 0) hand the averaged gradients to
    `self._write_grads` for the meta-update. Runs forever; signals
    `process_event` after each batch so the master can enqueue more data.

    Args:
        num_updates: number of inner-loop adaptation steps per task.
        data_queue: multiprocessing queue yielding
            (support_x, support_y, query_x, query_y) index arrays.
        data_event: event set by the master when data is available.
        process_event: event this worker sets when it has finished a batch.
        tb: optional TensorBoard writer (used on rank 0 only).
        log_interval: iterations between TensorBoard logs.
        checkpoint_interval: iterations between checkpoints (rank 0 only).
    """
    temp_grads = None
    while True:
        data_event.wait()
        data = data_queue.get()
        dist.barrier(async_op=True)
        if self.process_id == 0:
            original_state_dict = {}
            data_event.clear()
        if (self.process_id == 0 and self.num_iter != 0
                and self.num_iter % checkpoint_interval == 0):
            save_checkpoint(0, self.model, self.optimizer,
                            suffix=str(self.num_iter))
        # Broadcast weights from the master process to all others, and keep a
        # detached copy on rank 0 so the pre-adaptation weights can be
        # restored when writing meta-gradients later.
        for k, v in self.model.state_dict().items():
            if self.process_id == 0:
                original_state_dict[k] = v.clone().detach()
            dist.broadcast(v, src=0, async_op=True)
        self.model.to(self.device)
        self.model.train()

        support_x, support_y, query_x, query_y = map(
            lambda x: torch.LongTensor(x).to(self.device), data)

        # Inner loop: adapt on the support set. Targets are shifted by one
        # (teacher forcing): decoder sees y[:, :-1], loss is against y[:, 1:].
        for i in range(num_updates):
            self.meta_optimizer.zero_grad()
            pred_logits = self.model(input_ids=support_x,
                                     decoder_input_ids=support_y[:, :-1])
            pred_logits = pred_logits.contiguous().view(
                -1, pred_logits.size(2))
            loss, n_correct = self.compute_mle_loss(pred_logits,
                                                    support_y[:, 1:],
                                                    smoothing=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.meta_optimizer.step()

        # Outer loss on the query set with the adapted weights.
        pred_logits = self.model(input_ids=query_x,
                                 decoder_input_ids=query_y[:, :-1])
        pred_logits = pred_logits.contiguous().view(-1, pred_logits.size(2))
        loss, n_correct = self.compute_mle_loss(pred_logits, query_y[:, 1:],
                                                smoothing=True)
        # BUGFIX: was query_y[:1:] (the first ROW), which miscounted words and
        # skewed the accuracy. The shifted targets are query_y[:, 1:], matching
        # the loss computation above.
        non_pad_mask = query_y[:, 1:].ne(PAD_IDX)
        n_word = non_pad_mask.sum().item()
        acc = torch.FloatTensor([n_correct / n_word]).to(self.device)

        # Average loss/accuracy/gradients across workers onto rank 0.
        all_grads = autograd.grad(loss, self.model.parameters())
        dist.reduce(loss, 0, op=dist.ReduceOp.SUM, async_op=True)
        dist.reduce(acc, 0, op=dist.ReduceOp.SUM)
        for idx in range(len(all_grads)):
            dist.reduce(all_grads[idx].data, 0, op=dist.ReduceOp.SUM,
                        async_op=True)
            all_grads[idx].data = all_grads[idx].data / self.world_size

        if (self.process_id == 0 and tb is not None
                and self.num_iter % log_interval == 0):
            tb_mle_meta_batch(tb, loss.item() / self.world_size,
                              acc / self.world_size, self.num_iter)

        if self.process_id == 0:
            self.num_iter += 1
            # NOTE(review): temp_grads is always None here — _write_grads
            # presumably recomputes or ignores it; confirm its contract.
            self._write_grads(original_state_dict, temp_grads,
                              (query_x, query_y))
        # Finished this batch; let the master enqueue more data.
        process_event.set()