def train(args, dataloader, model):
    epoch = 1
    optimizer = optim.Adam(list(model.parameters()), lr=args.lr)
    # LR_milestones is assumed to be defined at module level; note that gamma is fed
    # args.lr here, i.e. the learning rate also serves as the decay factor in this script.
    scheduler = MultiStepLR(optimizer, milestones=LR_milestones, gamma=args.lr)
    model.train()
    for epoch in range(5000):
        for batch_idx, data in enumerate(dataloader):
            model.zero_grad()
            features = data['features'].float()
            adj_input = data['adj'].float()
            features = Variable(features).cuda()
            adj_input = Variable(adj_input).cuda()
            loss = model(features, adj_input)
            print('Epoch: ', epoch, ', Iter: ', batch_idx, ', Loss: ', loss)
            loss.backward()
            optimizer.step()
            scheduler.step()
            break  # only the first batch of each epoch is used
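# --- Aside: a minimal, self-contained sketch (not from the script above) of the usual
# --- Adam + MultiStepLR pairing. Here gamma is the multiplicative decay factor and
# --- scheduler.step() is called once per epoch, after the optimizer updates; the toy
# --- model, data, and milestone values below are placeholders.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

toy_model = nn.Linear(10, 1)
toy_batches = [(torch.randn(4, 10), torch.randn(4, 1)) for _ in range(8)]

toy_optimizer = optim.Adam(toy_model.parameters(), lr=1e-3)
toy_scheduler = MultiStepLR(toy_optimizer, milestones=[500, 1000], gamma=0.3)

for toy_epoch in range(5):
    for features, target in toy_batches:
        toy_optimizer.zero_grad()
        batch_loss = nn.functional.mse_loss(toy_model(features), target)
        batch_loss.backward()
        toy_optimizer.step()
    toy_scheduler.step()  # learning rate is multiplied by gamma at epochs 500 and 1000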
def main(): parser = argparse.ArgumentParser( description= 'Large-scale Point Cloud Semantic Segmentation with Superpoint Graphs') # Optimization arguments parser.add_argument('--wd', default=0, type=float, help='Weight decay') parser.add_argument('--lr', default=1e-2, type=float, help='Initial learning rate') parser.add_argument( '--lr_decay', default=0.7, type=float, help='Multiplicative factor used on learning rate at `lr_steps`') parser.add_argument( '--lr_steps', default='[]', help='List of epochs where the learning rate is decreased by `lr_decay`' ) parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') parser.add_argument( '--epochs', default=10, type=int, help='Number of epochs to train. If <=0, only testing will be done.') parser.add_argument('--batch_size', default=2, type=int, help='Batch size') parser.add_argument('--optim', default='adam', help='Optimizer: sgd|adam') parser.add_argument( '--grad_clip', default=1, type=float, help='Element-wise clipping of gradient. If 0, does not clip') parser.add_argument( '--loss_weights', default='none', help='[none, proportional, sqrt] how to weight the loss function') # Learning process arguments parser.add_argument('--cuda', default=1, type=int, help='Bool, use cuda') parser.add_argument( '--nworkers', default=0, type=int, help= 'Num subprocesses to use for data loading. 0 means that the data will be loaded in the main process' ) parser.add_argument('--test_nth_epoch', default=1, type=int, help='Test each n-th epoch during training') parser.add_argument('--save_nth_epoch', default=1, type=int, help='Save model each n-th epoch during training') parser.add_argument( '--test_multisamp_n', default=10, type=int, help='Average logits obtained over runs with different seeds') # Dataset parser.add_argument('--dataset', default='sema3d', help='Dataset name: sema3d|s3dis') parser.add_argument( '--cvfold', default=0, type=int, help='Fold left-out for testing in leave-one-out setting (S3DIS)') parser.add_argument('--odir', default='results', help='Directory to store results') parser.add_argument('--resume', default='', help='Loads a previously saved model.') parser.add_argument('--db_train_name', default='train') parser.add_argument('--db_test_name', default='test') parser.add_argument('--use_val_set', type=int, default=0) parser.add_argument('--SEMA3D_PATH', default='datasets/semantic3d') parser.add_argument('--S3DIS_PATH', default='datasets/s3dis') parser.add_argument('--VKITTI_PATH', default='datasets/vkitti') parser.add_argument('--CUSTOM_SET_PATH', default='datasets/custom_set') parser.add_argument( '--use_pyg', default=0, type=int, help='Wether to use Pytorch Geometric for graph convolutions') # Model parser.add_argument( '--model_config', default='gru_10,f_8', help= 'Defines the model as a sequence of layers, see graphnet.py for definitions of respective layers and acceptable arguments. 
In short: rectype_repeats_mv_layernorm_ingate_concat, with rectype the type of recurrent unit [gru/crf/lstm], repeats the number of message passing iterations, mv (default True) the use of matrix-vector (mv) instead vector-vector (vv) edge filters, layernorm (default True) the use of layernorms in the recurrent units, ingate (default True) the use of input gating, concat (default True) the use of state concatenation' ) parser.add_argument('--seed', default=1, type=int, help='Seed for random initialisation') parser.add_argument( '--edge_attribs', default= 'delta_avg,delta_std,nlength/ld,surface/ld,volume/ld,size/ld,xyz/d', help= 'Edge attribute definition, see spg_edge_features() in spg.py for definitions.' ) # Point cloud processing parser.add_argument( '--pc_attribs', default='xyzrgbelpsvXYZ', help= 'Point attributes fed to PointNets, if empty then all possible. xyz = coordinates, rgb = color, e = elevation, lpsv = geometric feature, d = distance to center' ) parser.add_argument( '--pc_augm_scale', default=0, type=float, help= 'Training augmentation: Uniformly random scaling in [1/scale, scale]') parser.add_argument( '--pc_augm_rot', default=1, type=int, help='Training augmentation: Bool, random rotation around z-axis') parser.add_argument( '--pc_augm_mirror_prob', default=0, type=float, help='Training augmentation: Probability of mirroring about x or y axes' ) parser.add_argument( '--pc_augm_jitter', default=1, type=int, help='Training augmentation: Bool, Gaussian jittering of all attributes' ) parser.add_argument( '--pc_xyznormalize', default=1, type=int, help='Bool, normalize xyz into unit ball, i.e. in [-0.5,0.5]') # Filter generating network parser.add_argument( '--fnet_widths', default='[32,128,64]', help= 'List of width of hidden filter gen net layers (excluding the input and output ones, they are automatic)' ) parser.add_argument( '--fnet_llbias', default=0, type=int, help='Bool, use bias in the last layer in filter gen net') parser.add_argument( '--fnet_orthoinit', default=1, type=int, help='Bool, use orthogonal weight initialization for filter gen net.') parser.add_argument( '--fnet_bnidx', default=2, type=int, help='Layer index to insert batchnorm to. -1=do not insert.') parser.add_argument( '--edge_mem_limit', default=30000, type=int, help= 'Number of edges to process in parallel during computation, a low number can reduce memory peaks.' ) # Superpoint graph parser.add_argument( '--spg_attribs01', default=1, type=int, help='Bool, normalize edge features to 0 mean 1 deviation') parser.add_argument('--spg_augm_nneigh', default=100, type=int, help='Number of neighborhoods to sample in SPG') parser.add_argument('--spg_augm_order', default=3, type=int, help='Order of neighborhoods to sample in SPG') parser.add_argument( '--spg_augm_hardcutoff', default=512, type=int, help= 'Maximum number of superpoints larger than args.ptn_minpts to sample in SPG' ) parser.add_argument( '--spg_superedge_cutoff', default=-1, type=float, help= 'Artificially constrained maximum length of superedge, -1=do not constrain' ) # Point net parser.add_argument( '--ptn_minpts', default=40, type=int, help= 'Minimum number of points in a superpoint for computing its embedding.' 
) parser.add_argument('--ptn_npts', default=128, type=int, help='Number of input points for PointNet.') parser.add_argument('--ptn_widths', default='[[64,64,128,128,256], [256,64,32]]', help='PointNet widths') parser.add_argument('--ptn_widths_stn', default='[[64,64,128], [128,64]]', help='PointNet\'s Transformer widths') parser.add_argument( '--ptn_nfeat_stn', default=11, type=int, help='PointNet\'s Transformer number of input features') parser.add_argument('--ptn_prelast_do', default=0, type=float) parser.add_argument( '--ptn_mem_monger', default=1, type=int, help= 'Bool, save GPU memory by recomputing PointNets in back propagation.') # Decoder parser.add_argument( '--sp_decoder_config', default="[]", type=str, help= 'Size of the decoder : sp_embedding -> sp_class. First layer of size sp_embed (* (1+n_ecc_iteration) if concatenation) and last layer is n_classes' ) args = parser.parse_args() args.start_epoch = 0 args.lr_steps = ast.literal_eval(args.lr_steps) args.fnet_widths = ast.literal_eval(args.fnet_widths) args.ptn_widths = ast.literal_eval(args.ptn_widths) args.sp_decoder_config = ast.literal_eval(args.sp_decoder_config) args.ptn_widths_stn = ast.literal_eval(args.ptn_widths_stn) print('Will save to ' + args.odir) if not os.path.exists(args.odir): os.makedirs(args.odir) with open(os.path.join(args.odir, 'cmdline.txt'), 'w') as f: f.write(" ".join([ "'" + a + "'" if (len(a) == 0 or a[0] != '-') else a for a in sys.argv ])) set_seed(args.seed, args.cuda) logging.getLogger().setLevel( logging.INFO) #set to logging.DEBUG to allow for more prints if (args.dataset == 'sema3d' and args.db_test_name.startswith('test')) or ( args.dataset.startswith('s3dis_02') and args.cvfold == 2): # needed in pytorch 0.2 for super-large graphs with batchnorm in fnet (https://github.com/pytorch/pytorch/pull/2919) torch.backends.cudnn.enabled = False if args.use_pyg: torch.backends.cudnn.enabled = False # Decide on the dataset if args.dataset == 'sema3d': import sema3d_dataset dbinfo = sema3d_dataset.get_info(args) create_dataset = sema3d_dataset.get_datasets elif args.dataset == 's3dis': import s3dis_dataset dbinfo = s3dis_dataset.get_info(args) create_dataset = s3dis_dataset.get_datasets elif args.dataset == 'vkitti': import vkitti_dataset dbinfo = vkitti_dataset.get_info(args) create_dataset = vkitti_dataset.get_datasets elif args.dataset == 'custom_dataset': import custom_dataset #<- to write! 
dbinfo = custom_dataset.get_info(args) create_dataset = custom_dataset.get_datasets else: raise NotImplementedError('Unknown dataset ' + args.dataset) # Create model and optimizer if args.resume != '': if args.resume == 'RESUME': args.resume = args.odir + '/model.pth.tar' model, optimizer, stats = resume(args, dbinfo) else: model = create_model(args, dbinfo) optimizer = create_optimizer(args, model) stats = [] train_dataset, test_dataset, valid_dataset, scaler = create_dataset(args) print( 'Train dataset: %i elements - Test dataset: %i elements - Validation dataset: %i elements' % (len(train_dataset), len(test_dataset), len(valid_dataset))) ptnCloudEmbedder = pointnet.CloudEmbedder(args) scheduler = MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_decay, last_epoch=args.start_epoch - 1) ############ def train(): """ Trains for one epoch """ model.train() loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, collate_fn=spg.eccpc_collate, num_workers=args.nworkers, shuffle=True, drop_last=True) if logging.getLogger().getEffectiveLevel() > logging.DEBUG: loader = tqdm(loader, ncols=65) loss_meter = tnt.meter.AverageValueMeter() acc_meter = tnt.meter.ClassErrorMeter(accuracy=True) confusion_matrix = metrics.ConfusionMatrix(dbinfo['classes']) t0 = time.time() # iterate over dataset in batches for bidx, (targets, GIs, clouds_data) in enumerate(loader): t_loader = 1000 * (time.time() - t0) model.ecc.set_info(GIs, args.cuda) label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:, 0], targets[:, 2:], targets[:, 1:].sum( 1 ) if args.cuda: label_mode, label_vec, segm_size = label_mode_cpu.cuda( ), label_vec_cpu.float().cuda(), segm_size_cpu.float().cuda() else: label_mode, label_vec, segm_size = label_mode_cpu, label_vec_cpu.float( ), segm_size_cpu.float() optimizer.zero_grad() t0 = time.time() embeddings = ptnCloudEmbedder.run(model, *clouds_data) outputs = model.ecc(embeddings) loss = nn.functional.cross_entropy(outputs, Variable(label_mode), weight=dbinfo["class_weights"]) loss.backward() ptnCloudEmbedder.bw_hook() if args.grad_clip > 0: for p in model.parameters(): p.grad.data.clamp_(-args.grad_clip, args.grad_clip) optimizer.step() t_trainer = 1000 * (time.time() - t0) #loss_meter.add(loss.data[0]) # pytorch 0.3 loss_meter.add(loss.item()) # pytorch 0.4 o_cpu, t_cpu, tvec_cpu = filter_valid(outputs.data.cpu().numpy(), label_mode_cpu.numpy(), label_vec_cpu.numpy()) acc_meter.add(o_cpu, t_cpu) confusion_matrix.count_predicted_batch(tvec_cpu, np.argmax(o_cpu, 1)) logging.debug( 'Batch loss %f, Loader time %f ms, Trainer time %f ms.', loss.data.item(), t_loader, t_trainer) t0 = time.time() return acc_meter.value()[0], loss_meter.value( )[0], confusion_matrix.get_overall_accuracy( ), confusion_matrix.get_average_intersection_union() ############ def eval(is_valid=False): """ Evaluated model on test set """ model.eval() if is_valid: #validation loader = torch.utils.data.DataLoader(valid_dataset, batch_size=1, collate_fn=spg.eccpc_collate, num_workers=args.nworkers) else: #evaluation loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, collate_fn=spg.eccpc_collate, num_workers=args.nworkers) if logging.getLogger().getEffectiveLevel() > logging.DEBUG: loader = tqdm(loader, ncols=65) acc_meter = tnt.meter.ClassErrorMeter(accuracy=True) loss_meter = tnt.meter.AverageValueMeter() confusion_matrix = metrics.ConfusionMatrix(dbinfo['classes']) # iterate over dataset in batches for bidx, (targets, GIs, clouds_data) in enumerate(loader): 
model.ecc.set_info(GIs, args.cuda) label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:, 0], targets[:, 2:], targets[:, 1:].sum( 1 ).float( ) if args.cuda: label_mode, label_vec, segm_size = label_mode_cpu.cuda( ), label_vec_cpu.float().cuda(), segm_size_cpu.float().cuda() else: label_mode, label_vec, segm_size = label_mode_cpu, label_vec_cpu.float( ), segm_size_cpu.float() embeddings = ptnCloudEmbedder.run(model, *clouds_data) outputs = model.ecc(embeddings) loss = nn.functional.cross_entropy(outputs, Variable(label_mode), weight=dbinfo["class_weights"]) loss_meter.add(loss.item()) o_cpu, t_cpu, tvec_cpu = filter_valid(outputs.data.cpu().numpy(), label_mode_cpu.numpy(), label_vec_cpu.numpy()) if t_cpu.size > 0: acc_meter.add(o_cpu, t_cpu) confusion_matrix.count_predicted_batch(tvec_cpu, np.argmax(o_cpu, 1)) return meter_value(acc_meter), loss_meter.value( )[0], confusion_matrix.get_overall_accuracy( ), confusion_matrix.get_average_intersection_union( ), confusion_matrix.get_mean_class_accuracy() ############ def eval_final(): """ Evaluated model on test set in an extended way: computes estimates over multiple samples of point clouds and stores predictions """ model.eval() acc_meter = tnt.meter.ClassErrorMeter(accuracy=True) confusion_matrix = metrics.ConfusionMatrix(dbinfo['classes']) collected, predictions = defaultdict(list), {} # collect predictions over multiple sampling seeds for ss in range(args.test_multisamp_n): test_dataset_ss = create_dataset(args, ss)[1] loader = torch.utils.data.DataLoader(test_dataset_ss, batch_size=1, collate_fn=spg.eccpc_collate, num_workers=args.nworkers) if logging.getLogger().getEffectiveLevel() > logging.DEBUG: loader = tqdm(loader, ncols=65) # iterate over dataset in batches for bidx, (targets, GIs, clouds_data) in enumerate(loader): model.ecc.set_info(GIs, args.cuda) label_mode_cpu, label_vec_cpu, segm_size_cpu = targets[:, 0], targets[:, 2:], targets[:, 1:].sum( 1 ).float( ) embeddings = ptnCloudEmbedder.run(model, *clouds_data) outputs = model.ecc(embeddings) fname = clouds_data[0][0][:clouds_data[0][0].rfind('.')] collected[fname].append( (outputs.data.cpu().numpy(), label_mode_cpu.numpy(), label_vec_cpu.numpy())) # aggregate predictions (mean) for fname, lst in collected.items(): o_cpu, t_cpu, tvec_cpu = list(zip(*lst)) if args.test_multisamp_n > 1: o_cpu = np.mean(np.stack(o_cpu, 0), 0) else: o_cpu = o_cpu[0] t_cpu, tvec_cpu = t_cpu[0], tvec_cpu[0] predictions[fname] = np.argmax(o_cpu, 1) o_cpu, t_cpu, tvec_cpu = filter_valid(o_cpu, t_cpu, tvec_cpu) if t_cpu.size > 0: acc_meter.add(o_cpu, t_cpu) confusion_matrix.count_predicted_batch(tvec_cpu, np.argmax(o_cpu, 1)) per_class_iou = {} perclsiou = confusion_matrix.get_intersection_union_per_class() for c, name in dbinfo['inv_class_map'].items(): per_class_iou[name] = perclsiou[c] return meter_value(acc_meter), confusion_matrix.get_overall_accuracy( ), confusion_matrix.get_average_intersection_union( ), per_class_iou, predictions, confusion_matrix.get_mean_class_accuracy( ), confusion_matrix.confusion_matrix ############ # Training loop try: best_iou = stats[-1]['best_iou'] except: best_iou = 0 TRAIN_COLOR = '\033[0m' VAL_COLOR = '\033[0;94m' TEST_COLOR = '\033[0;93m' BEST_COLOR = '\033[0;92m' epoch = args.start_epoch for epoch in range(args.start_epoch, args.epochs): print('Epoch {}/{} ({}):'.format(epoch, args.epochs, args.odir)) scheduler.step() acc, loss, oacc, avg_iou = train() print(TRAIN_COLOR + '-> Train Loss: %1.4f Train accuracy: %3.2f%%' % (loss, acc)) new_best_model = False if 
args.use_val_set: acc_val, loss_val, oacc_val, avg_iou_val, avg_acc_val = eval(True) print(VAL_COLOR + '-> Val Loss: %1.4f Val accuracy: %3.2f%% Val oAcc: %3.2f%% Val IoU: %3.2f%% best ioU: %3.2f%%' % \ (loss_val, acc_val, 100*oacc_val, 100*avg_iou_val,100*max(best_iou,avg_iou_val)) + TRAIN_COLOR) if avg_iou_val > best_iou: #best score yet on the validation set print(BEST_COLOR + '-> New best model achieved!' + TRAIN_COLOR) best_iou = avg_iou_val new_best_model = True torch.save( { 'epoch': epoch + 1, 'args': args, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'scaler': scaler }, os.path.join(args.odir, 'model.pth.tar')) elif epoch % args.save_nth_epoch == 0 or epoch == args.epochs - 1: torch.save( { 'epoch': epoch + 1, 'args': args, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'scaler': scaler }, os.path.join(args.odir, 'model.pth.tar')) #test every test_nth_epochs #or test after each enw model (but skip the first 5 for efficiency) if (not(args.use_val_set) and (epoch+1) % args.test_nth_epoch == 0) \ or (args.use_val_set and new_best_model and epoch > 5): acc_test, loss_test, oacc_test, avg_iou_test, avg_acc_test = eval( False) print(TEST_COLOR + '-> Test Loss: %1.4f Test accuracy: %3.2f%% Test oAcc: %3.2f%% Test avgIoU: %3.2f%%' % \ (loss_test, acc_test, 100*oacc_test, 100*avg_iou_test) + TRAIN_COLOR) else: acc_test, loss_test, oacc_test, avg_iou_test, avg_acc_test = 0, 0, 0, 0, 0 stats.append({ 'epoch': epoch, 'acc': acc, 'loss': loss, 'oacc': oacc, 'avg_iou': avg_iou, 'acc_test': acc_test, 'oacc_test': oacc_test, 'avg_iou_test': avg_iou_test, 'avg_acc_test': avg_acc_test, 'best_iou': best_iou }) """ if epoch % args.save_nth_epoch == 0 or epoch==args.epochs-1: with open(os.path.join(args.odir, 'trainlog.json'), 'w') as outfile: json.dump(stats, outfile,indent=4) torch.save({'epoch': epoch + 1, 'args': args, 'state_dict': model.state_dict(), 'optimizer' : optimizer.state_dict(), 'scaler': scaler}, os.path.join(args.odir, 'model.pth.tar')) """ if math.isnan(loss): break if len(stats) > 0: with open(os.path.join(args.odir, 'trainlog.json'), 'w') as outfile: json.dump(stats, outfile, indent=4) if args.use_val_set: args.resume = args.odir + '/model.pth.tar' model, optimizer, stats = resume(args, dbinfo) torch.save( { 'epoch': epoch + 1, 'args': args, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict() }, os.path.join(args.odir, 'model.pth.tar')) # Final evaluation if args.test_multisamp_n > 0 and 'test' in args.db_test_name: acc_test, oacc_test, avg_iou_test, per_class_iou_test, predictions_test, avg_acc_test, confusion_matrix = eval_final( ) print( '-> Multisample {}: Test accuracy: {}, \tTest oAcc: {}, \tTest avgIoU: {}, \tTest mAcc: {}' .format(args.test_multisamp_n, acc_test, oacc_test, avg_iou_test, avg_acc_test)) with h5py.File( os.path.join(args.odir, 'predictions_' + args.db_test_name + '.h5'), 'w') as hf: for fname, o_cpu in predictions_test.items(): hf.create_dataset(name=fname, data=o_cpu) #(0-based classes) with open( os.path.join(args.odir, 'scores_' + args.db_test_name + '.json'), 'w') as outfile: json.dump([{ 'epoch': args.start_epoch, 'acc_test': acc_test, 'oacc_test': oacc_test, 'avg_iou_test': avg_iou_test, 'per_class_iou_test': per_class_iou_test, 'avg_acc_test': avg_acc_test }], outfile) np.save(os.path.join(args.odir, 'pointwise_cm.npy'), confusion_matrix)
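# --- Aside (generic sketch, not part of the script above): the element-wise gradient
# --- clipping done there with p.grad.data.clamp_(-args.grad_clip, args.grad_clip) has a
# --- built-in equivalent, torch.nn.utils.clip_grad_value_, applied between backward()
# --- and optimizer.step(); the stand-in model below is only for illustration.
import torch
import torch.nn as nn

clip_demo_model = nn.Linear(8, 2)
clip_demo_loss = clip_demo_model(torch.randn(4, 8)).sum()
clip_demo_loss.backward()
torch.nn.utils.clip_grad_value_(clip_demo_model.parameters(), clip_value=1.0)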
def training_benchmark(arg, milestones): logging.basicConfig(filename=arg.log_path, level=logging.INFO) # log file logging.info('Started') if not os.path.exists(arg.model_path): os.makedirs(arg.model_path) model = VDN_NET(in_channels=arg.channels, depth_snet=arg.snet) model = model.float() clipping = bool(arguments.clipping) # Load training data obj_data = gd.TrainBenchmark(h5_file_=arg.train_data, patch_size=arg.patch, window=11, radius=5) if torch.cuda.is_available(): model.cuda() torch.backends.cudnn.benchmark = True data = DataLoader(obj_data, batch_size=arg.batch, shuffle=True, num_workers=arg.workers, pin_memory=True) else: data = DataLoader(obj_data, batch_size=arg.batch, shuffle=True) # network parameters epsilon = np.sqrt(1.0e-6) p_window = 7 if clipping: gadient_clip_Dnet = 1000.0 gadient_clip_Snet = 50.0 Dnet_parameters = [ x for name, x in model.named_parameters() if 'dnet' in name.lower() ] Snet_parameters = [ x for name, x in model.named_parameters() if 'snet' in name.lower() ] optimizer = optim.Adam(model.parameters(), lr=2e-4) scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=arg.gamma) print("Training model Benchmark now!") for epoch in range(arg.epochs): tic = time.time() if clipping: grad_D = 0.0 grad_S = 0.0 epoch_avg_loss = 0.0 mse_avg = 0.0 psnr_avg = 0.0 ssim_avg = 0.0 lr = optimizer.param_groups[0]['lr'] if lr < arg.learning: print("reach min learning rate at epoch" + str(epoch)) model.train() for i, batch_data in enumerate(data): if torch.cuda.is_available(): y_batch, x_batch, sigma_arr = Variable( batch_data[0]).cuda(), Variable( batch_data[1]).cuda(), Variable(batch_data[2]).cuda() else: y_batch, x_batch, sigma_arr = batch_data[0], batch_data[ 1], batch_data[2] optimizer.zero_grad() out_D, out_s = model(y_batch) loss, loglikelihood, kl_z, kl_sigma = loss_func.get_loss( x_batch, y_batch, sigma_arr, p_window, out_D[:, :arg.channels, :, :], out_D[:, arg.channels:, :, :], out_s[:, :arg.channels, :, :], out_s[:, arg.channels:, :, :], epsilon) loss.backward() if clipping: full_grad_D = nn.utils.clip_grad_norm_(Dnet_parameters, gadient_clip_Dnet) full_grad_S = nn.utils.clip_grad_norm_(Snet_parameters, gadient_clip_Snet) grad_D = (grad_D * (i / (i + 1)) + full_grad_D / (i + 1)) grad_S = (grad_S * (i / (i + 1)) + full_grad_S / (i + 1)) optimizer.step() epoch_avg_loss += loss.detach().item() predicted_image = y_batch - out_D[:, :arg.channels, :, :].detach( ).data predicted_image = predicted_image.clamp(0, 1) mse = calc_MSE(predicted_image, x_batch) mse_avg += mse psnr_avg += psnr(predicted_image * 255, x_batch * 255) ssim_avg += calculate_ssim(img_as_ubyte( predicted_image.permute(2, 3, 1, 0).cpu().numpy()), img_as_ubyte( x_batch.permute(2, 3, 1, 0).cpu().numpy()), multichannel=True) if i == 0: print("First ForwardPAss\n Loss: {}, MSE: {}".format( loss.detach().item(), mse)) if (i + 1) % 100 == 0: print("{} - Loss: {}, MSE:{}, epoch:{}".format( i + 1, loss.item(), mse, epoch + 1)) if i >= 5000: break if clipping: gadient_clip_Dnet = min(gadient_clip_Dnet, grad_D) gadient_clip_Dnet = min(gadient_clip_Dnet, grad_S) print("----------------------------------------------------------") print( "Epoch: {}, Avg MSE:{}, Avg Epoch Loss:{}, Avg PSNR:{}, Avg SSIM : {}, LR:{}" .format(epoch + 1, mse_avg / (i + 1), epoch_avg_loss / (i + 1), psnr_avg / (i + 1), ssim_avg / (i + 1), lr)) logging.info("av loss: {}, epoch: {}".format(epoch_avg_loss / (i + 1), epoch + 1)) # --------------- here comes the validation! 
--------------- model.eval() avg_psnr_validation = 0.0 avg_ssim_validation = 0.0 obj_data = gd.ValidationBenchmark(h5_file_=arg.val_data) if torch.cuda.is_available(): model.cuda() torch.backends.cudnn.benchmark = True for idx in range(obj_data.__len__()): noisy, image = obj_data.__getitem__(idx) ch, ht, wt = noisy.shape noisy = noisy.view(1, ch, ht, wt).cuda() image = image.cuda() model_out, _ = model(noisy) noise = noisy - model_out[:, :ch, ].detach().data clean_img_pred = noise.view(ch, ht, wt).permute(1, 2, 0).clamp(0, 1) image = image.view(ch, ht, wt).permute(1, 2, 0) avg_psnr_validation += psnr(image * 255, clean_img_pred * 255) avg_ssim_validation += compare_ssim( img_as_ubyte(image.cpu().numpy()), img_as_ubyte(clean_img_pred.cpu().numpy()), win_size=11, data_range=255, multichannel=True, gaussian_weights=True) print("average validation PSNR = ", avg_psnr_validation / obj_data.__len__()) print("average validation SSIM = ", avg_ssim_validation / obj_data.__len__()) # -------------- finish validation --------------------------------- scheduler.step() toc = time.time() print('Time for this epoch: {:.2f}'.format(toc - tic)) if epoch % arguments.epoch_save == 0: torch.save( model.state_dict(), os.path.join(arg.model_path, "model_" + str(epoch) + "_epochs.pth")) print("saved model as" + arg.model_path) print("Finished Training...\n Saving model now.....\n") torch.save(model.state_dict(), os.path.join(arg.model_path, "final_model.pth")) print("saved model as" + os.path.join(arg.model_path, "final_model.pth"))
# model = models.__dict__[opt.MODEL.ARCH]()
if opt.MODEL.ARCH.startswith('densenet'):
    assert (opt.MODEL.INPUT_SIZE % 32 == 0)
    model.avgpool = nn.AvgPool2d(opt.MODEL.INPUT_SIZE // 32, stride=1)
    # model.avgpool = nn.AdaptiveAvgPool2d(1)
    model.classifier = nn.Linear(model.classifier.in_features,
                                 DATA_INFO.NUM_CLASSES)
    model = torch.nn.DataParallel(model).cuda()
else:
    raise NotImplementedError

model = torch.nn.DataParallel(model).cuda()

optimizer = optim.Adam(model.module.parameters(), opt.TRAIN.LEARNING_RATE)
lr_scheduler = MultiStepLR(optimizer,
                           opt.TRAIN.LR_MILESTONES,
                           gamma=opt.TRAIN.LR_GAMMA,
                           last_epoch=-1)

if opt.TRAIN.RESUME is None:
    last_epoch = 0
    logger.info("Training will start from Epoch {}".format(last_epoch + 1))
else:
    last_checkpoint = torch.load(opt.TRAIN.RESUME)
    assert (last_checkpoint['arch'] == opt.MODEL.ARCH)
    model.module.load_state_dict(last_checkpoint['state_dict'])
    optimizer.load_state_dict(last_checkpoint['optimizer'])
    logger.info("Checkpoint '{}' was loaded.".format(opt.TRAIN.RESUME))
    last_epoch = last_checkpoint['epoch']
    logger.info("Training will be resumed from Epoch {}".format(last_epoch + 1))
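# --- Aside (a generic sketch, not this repository's checkpoint format): the snippet above
# --- restores the optimizer but rebuilds MultiStepLR with last_epoch=-1, so the schedule
# --- would restart from zero unless it is advanced elsewhere. Saving and loading the
# --- scheduler's state_dict alongside the optimizer keeps the milestones aligned with the
# --- resumed epoch; all names below are placeholders.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

demo_model = nn.Linear(4, 2)
demo_optimizer = optim.Adam(demo_model.parameters(), lr=1e-3)
demo_scheduler = MultiStepLR(demo_optimizer, milestones=[10, 20], gamma=0.1)

demo_checkpoint = {'optimizer': demo_optimizer.state_dict(),
                   'scheduler': demo_scheduler.state_dict()}
# ...later, after re-creating the optimizer and scheduler with the same arguments:
demo_optimizer.load_state_dict(demo_checkpoint['optimizer'])
demo_scheduler.load_state_dict(demo_checkpoint['scheduler'])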
        'params': model.features.parameters(),
        'lr': 1e-4 * 10
    }, {
        'params': model.classifier.parameters(),
        'lr': 1e-4
    }]
else:
    for param in model.embedding.parameters():
        param.requires_grad = False
    for param in model.features.parameters():
        param.requires_grad = False
    optim_configs = [{'params': model.classifier.parameters(), 'lr': 1e-4}]

optimizer = Adam(optim_configs, lr=1e-4)
lr_scheduler = MultiStepLR(
    optimizer,
    milestones=[int(NUM_EPOCHS * 0.5), int(NUM_EPOCHS * 0.7)],
    gamma=0.1)

print("# trainable parameters:",
      sum(param.numel() if param.requires_grad else 0
          for param in model.parameters()))

# record statistics
results = {
    'train_loss': [],
    'train_accuracy': [],
    'test_loss': [],
    'test_accuracy': []
}
# record current best test accuracy
net = ResNet50()
if USE_CUDA:
    net.cuda()
    devices = []
    for i in range(args.devices):
        devices.append(i)
    if len(devices) > 1:
        net = torch.nn.DataParallel(net, device_ids=devices)
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9,
                      weight_decay=args.weight_decay)
# Note: args.epochs * .25 etc. are floats; on recent PyTorch versions a non-integer
# milestone never matches the integer epoch counter, so the decay may silently not fire.
scheduler = MultiStepLR(optimizer,
                        milestones=[args.epochs * .25, args.epochs * .5, args.epochs * .75],
                        gamma=0.1)


def train(epoch):
    print('\nEpoch: %d' % epoch)
    global iter_count
    epoch_start_time = time.time()
    scheduler.step()
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        iter_count += 1
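# --- Aside (generic sketch with a hypothetical epoch count): recent PyTorch versions check
# --- MultiStepLR milestones by exact membership against the integer epoch counter, so a
# --- fractional milestone such as 90 * .25 == 22.5 can be skipped silently; casting to int
# --- keeps the intended 25%/50%/75% schedule. The model and optimizer below are stand-ins.
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

demo_epochs = 90
demo_net = nn.Linear(4, 2)
demo_sgd = optim.SGD(demo_net.parameters(), lr=0.1, momentum=0.9)
demo_milestones = [int(demo_epochs * f) for f in (0.25, 0.5, 0.75)]  # [22, 45, 67]
demo_multistep = MultiStepLR(demo_sgd, milestones=demo_milestones, gamma=0.1)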
def train(y_train, X_train, y_val, X_val, ld, frq, beta,
          alpha=1.0, rho=0.9, loss_name='L0'):
    LOSS = {
        'L0': LossFunc(False, False),
        'L1': LossFunc(True, False),   # Penalty 2
        'L2': LossFunc(False, True),   # Penalty 1
        'L3': LossFunc(True, True)     # Penalty 1 + 2
    }
    Loss = LOSS[loss_name]
    Weight = Variable(torch.FloatTensor(0.5 * np.ones(beta.shape)),
                      requires_grad=True)
    frq = Variable(torch.Tensor(frq))
    ld = Variable(torch.Tensor(ld))

    batch_size = 50
    train_dataset = AssocDataSet(X=X_train, y=y_train)
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=1)
    val_X = Variable(torch.Tensor(X_val), requires_grad=False)
    val_y = Variable(torch.Tensor(y_val), requires_grad=False)

    opt = torch.optim.Adam([Weight], lr=0.02)
    scheduler = MultiStepLR(opt,
                            milestones=([x * 5 for x in range(1, 25)] + [200, 300, 400]),
                            gamma=0.83)

    epoch_iterator = tqdm(range(101))
    for epoch in epoch_iterator:
        epoch_losses = []
        for cur_X, cur_y in train_loader:
            opt.zero_grad()
            cur_X = Variable(cur_X, requires_grad=False)
            cur_y = Variable(cur_y, requires_grad=False)
            loss = Loss(cur_X, cur_y, Weight, alpha=alpha, rho=rho,
                        gamma=frq, tau=ld)
            epoch_losses.append(loss.item())  # .item() replaces the legacy loss.data[0]
            loss.backward()
            opt.step()
        scheduler.step()

        val_loss = Loss(val_X, val_y, Weight, alpha=alpha, rho=rho,
                        gamma=frq, tau=ld).item()
        status = 'Epoch[{}]: loss: {}, val: {}; rho: {}; alpha: {}'.format(
            epoch, np.mean(epoch_losses), val_loss, rho, alpha)
        epoch_iterator.set_description(status)

    weight_name = '{}_rho_{}_alpha_{}.npy'.format(loss_name, str(rho)[:3],
                                                  str(alpha)[:3])
    weight_dir = os.path.join('weight', weight_name)
    weight_file = os.path.abspath(os.path.expanduser(weight_dir))
    weight = Weight.data.numpy()
    np.save(weight_file, weight)
    return val_loss
def _main( meta_dir: str, save_prefix: str = '', model_name: str = 'refine_unet_base', # or refine_spectrogram_unet save_dir: str = 'savedir', batch_size: int = 128, num_workers: int = 16, fix_len: float = 2., lr: float = 5e-4, beta1: float = 0.5, beta2: float = 0.9, weight_decay: float = 0.0, max_step: int = 100000, valid_max_step: int = 30, save_interval: int = 1000, log_interval: int = 100, grad_clip: float = 0.0, grad_norm: float = 30.0, milestones: Tuple[int] = None, gamma: float = 0.2, is_augment: bool = True, is_dsd: bool = False, # model args hidden_dim: int = 768, filter_len: int = 512, hop_len: int = 64, block_layers: int = 4, layers: int = 4, kernel_size: int = 3, norm: str = 'ins', act: str = 'comp', refine_layers: int = 1, ): betas = beta1, beta2 # setup model args model_args = { 'hidden_dim': hidden_dim, 'filter_len': filter_len, 'hop_len': hop_len, 'spec_dim': filter_len // 2 + 1, 'block_layers': block_layers, 'layers': layers, 'kernel_size': kernel_size, 'norm': norm, 'refine_layers': refine_layers, 'act': act } # create model model = build_model(model_name, extra_kwargs=model_args).cuda() # multi-gpu if torch.cuda.device_count() > 1: model = nn.DataParallel(model) # create optimizer optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay) if milestones: milestones = [int(x) for x in list(milestones)] scheduler = MultiStepLR(optimizer, milestones, gamma=gamma) else: scheduler = None # adopt dsd100 case if is_dsd: sr = 44100 if is_augment: dataset_func = get_datasets meta_cls = DSD100Meta else: dataset_func = dsd100.get_datasets else: sr = 22050 # load dataset if is_augment: dataset_func = get_datasets meta_cls = VoiceBankMeta else: dataset_func = voice_bank.get_datasets train_loader, valid_loader = dataset_func(meta_dir, batch_size=batch_size, num_workers=num_workers, meta_cls=meta_cls, fix_len=int(fix_len * sr), audio_mask=True) # train loss = Wave2WaveTrainer(model, optimizer, train_loader, valid_loader, max_step=max_step, valid_max_step=min(valid_max_step, len(valid_loader)), save_interval=save_interval, log_interval=log_interval, save_dir=save_dir, save_prefix=save_prefix, grad_clip=grad_clip, grad_norm=grad_norm, pretrained_path='', scheduler=scheduler, sr=sr).run() return { 'loss': loss, 'status': 'ok', }
                    batch_size=batch_size,
                    collate_fn=training.collate_pil)
'''
for i, (x, y) in enumerate(loader):
    mtcnn(x, save_path=y)
    print('\rBatch {} of {}'.format(i + 1, len(loader)), end='')
del mtcnn
'''

resnet = InceptionResnetV1(classify=True,
                           pretrained='vggface2',
                           num_classes=len(dataset.class_to_idx)).to(device)

optimizer = optim.Adam(resnet.parameters(), lr=0.001)
scheduler = MultiStepLR(optimizer, [5, 10])

trans = transforms.Compose(
    [np.float32, transforms.ToTensor(), fixed_image_standardization])
dataset = datasets.ImageFolder(data_dir + 'cropped', transform=trans)
img_inds = np.arange(len(dataset))
np.random.shuffle(img_inds)
train_inds = img_inds[:int(0.8 * len(img_inds))]
val_inds = img_inds[int(0.8 * len(img_inds)):]

train_loader = DataLoader(dataset,
                          num_workers=workers,
                          batch_size=batch_size,
                          sampler=SubsetRandomSampler(train_inds))
val_loader = DataLoader(dataset,
def main_worker(gpu, ngpus_per_node, cfg): cfg['GPU'] = gpu if gpu != 0: def print_pass(*args): pass builtins.print = print_pass cfg['RANK'] = cfg['RANK'] * ngpus_per_node + gpu dist.init_process_group(backend=cfg['DIST_BACKEND'], init_method=cfg["DIST_URL"], world_size=cfg['WORLD_SIZE'], rank=cfg['RANK']) # Data loading code batch_size = int(cfg['BATCH_SIZE']) per_batch_size = int(batch_size / ngpus_per_node) #workers = int((cfg['NUM_WORKERS'] + ngpus_per_node - 1) / ngpus_per_node) # dataload threads workers = int(cfg['NUM_WORKERS']) DATA_ROOT = cfg[ 'DATA_ROOT'] # the parent root where your train/val/test data are stored VAL_DATA_ROOT = cfg['VAL_DATA_ROOT'] RECORD_DIR = cfg['RECORD_DIR'] RGB_MEAN = cfg['RGB_MEAN'] # for normalize inputs RGB_STD = cfg['RGB_STD'] DROP_LAST = cfg['DROP_LAST'] LR_SCHEDULER = cfg['LR_SCHEDULER'] LR_STEP_SIZE = cfg['LR_STEP_SIZE'] LR_DECAY_EPOCH = cfg['LR_DECAY_EPOCH'] LR_DECAT_GAMMA = cfg['LR_DECAT_GAMMA'] LR_END = cfg['LR_END'] WARMUP_EPOCH = cfg['WARMUP_EPOCH'] WARMUP_LR = cfg['WARMUP_LR'] NUM_EPOCH = cfg['NUM_EPOCH'] USE_APEX = cfg['USE_APEX'] EVAL_FREQ = cfg['EVAL_FREQ'] SYNC_BN = cfg['SYNC_BN'] print("=" * 60) print("Overall Configurations:") print(cfg) print("=" * 60) transform_list = [ transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(mean=RGB_MEAN, std=RGB_STD), ] if cfg['RANDOM_ERASING']: transform_list.append(RandomErasing()) if cfg['CUTOUT']: transform_list.append(Cutout()) train_transform = transforms.Compose(transform_list) if cfg['RANDAUGMENT']: train_transform.transforms.insert( 0, RandAugment(n=cfg['RANDAUGMENT_N'], m=cfg['RANDAUGMENT_M'])) dataset_train = FaceDataset(DATA_ROOT, RECORD_DIR, train_transform) train_sampler = torch.utils.data.distributed.DistributedSampler( dataset_train) train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=per_batch_size, shuffle=(train_sampler is None), num_workers=workers, pin_memory=True, sampler=train_sampler, drop_last=DROP_LAST) SAMPLE_NUMS = dataset_train.get_sample_num_of_each_class() NUM_CLASS = len(train_loader.dataset.classes) print("Number of Training Classes: {}".format(NUM_CLASS)) lfw, cfp_fp, agedb_30, vgg2_fp, lfw_issame, cfp_fp_issame, agedb_30_issame, vgg2_fp_issame = get_val_data( VAL_DATA_ROOT) #======= model & loss & optimizer =======# BACKBONE_DICT = { 'MobileFaceNet': MobileFaceNet, 'ResNet_50': ResNet_50, 'ResNet_101': ResNet_101, 'ResNet_152': ResNet_152, 'IR_50': IR_50, 'IR_100': IR_100, 'IR_101': IR_101, 'IR_152': IR_152, 'IR_185': IR_185, 'IR_200': IR_200, 'IR_SE_50': IR_SE_50, 'IR_SE_100': IR_SE_100, 'IR_SE_101': IR_SE_101, 'IR_SE_152': IR_SE_152, 'IR_SE_185': IR_SE_185, 'IR_SE_200': IR_SE_200, 'AttentionNet_IR_56': AttentionNet_IR_56, 'AttentionNet_IRSE_56': AttentionNet_IRSE_56, 'AttentionNet_IR_92': AttentionNet_IR_92, 'AttentionNet_IRSE_92': AttentionNet_IRSE_92, 'PolyNet': PolyNet, 'PolyFace': PolyFace, 'EfficientPolyFace': EfficientPolyFace, 'ResNeSt_50': resnest50, 'ResNeSt_101': resnest101, 'ResNeSt_100': resnest100, 'GhostNet': GhostNet, 'MobileNetV3': MobileNetV3, 'ProxylessNAS': proxylessnas } #'HRNet_W30': HRNet_W30, 'HRNet_W32': HRNet_W32, 'HRNet_W40': HRNet_W40, 'HRNet_W44': HRNet_W44, 'HRNet_W48': HRNet_W48, 'HRNet_W64': HRNet_W64 BACKBONE_NAME = cfg['BACKBONE_NAME'] INPUT_SIZE = cfg['INPUT_SIZE'] assert INPUT_SIZE == [112, 112] backbone = BACKBONE_DICT[BACKBONE_NAME](INPUT_SIZE) print("=" * 60) print(backbone) print("{} Backbone Generated".format(BACKBONE_NAME)) print("=" * 60) HEAD_DICT = { 'Softmax': Softmax, 'ArcFace': 
ArcFace, 'Combined': Combined, 'CosFace': CosFace, 'SphereFace': SphereFace, 'Am_softmax': Am_softmax, 'CurricularFace': CurricularFace, 'ArcNegFace': ArcNegFace, 'SVX': SVXSoftmax, 'AirFace': AirFace, 'QAMFace': QAMFace, 'CircleLoss': CircleLoss } HEAD_NAME = cfg['HEAD_NAME'] EMBEDDING_SIZE = cfg['EMBEDDING_SIZE'] # feature dimension head = HEAD_DICT[HEAD_NAME](in_features=EMBEDDING_SIZE, out_features=NUM_CLASS) print("Params: ", count_model_params(backbone)) print("Flops:", count_model_flops(backbone)) #backbone = backbone.eval() #print("Flops: ", flops_to_string(2*float(profile_macs(backbone.eval(), torch.randn(1, 3, 112, 112))))) #backbone = backbone.train() print("=" * 60) print(head) print("{} Head Generated".format(HEAD_NAME)) print("=" * 60) #--------------------optimizer----------------------------- if BACKBONE_NAME.find("IR") >= 0: backbone_paras_only_bn, backbone_paras_wo_bn = separate_irse_bn_paras( backbone ) # separate batch_norm parameters from others; do not do weight decay for batch_norm parameters to improve the generalizability else: backbone_paras_only_bn, backbone_paras_wo_bn = separate_resnet_bn_paras( backbone ) # separate batch_norm parameters from others; do not do weight decay for batch_norm parameters to improve the generalizability LR = cfg['LR'] # initial LR WEIGHT_DECAY = cfg['WEIGHT_DECAY'] MOMENTUM = cfg['MOMENTUM'] optimizer = optim.SGD( [{ 'params': backbone_paras_wo_bn + list(head.parameters()), 'weight_decay': WEIGHT_DECAY }, { 'params': backbone_paras_only_bn }], lr=LR, momentum=MOMENTUM) if LR_SCHEDULER == 'step': scheduler = StepLR(optimizer, step_size=LR_STEP_SIZE, gamma=LR_DECAT_GAMMA) elif LR_SCHEDULER == 'multi_step': scheduler = MultiStepLR(optimizer, milestones=LR_DECAY_EPOCH, gamma=LR_DECAT_GAMMA) elif LR_SCHEDULER == 'cosine': scheduler = CosineWarmupLR(optimizer, batches=len(train_loader), epochs=NUM_EPOCH, base_lr=LR, target_lr=LR_END, warmup_epochs=WARMUP_EPOCH, warmup_lr=WARMUP_LR) print("=" * 60) print(optimizer) print("Optimizer Generated") print("=" * 60) # loss LOSS_NAME = cfg['LOSS_NAME'] LOSS_DICT = { 'Softmax': nn.CrossEntropyLoss(), 'LabelSmooth': LabelSmoothCrossEntropyLoss(classes=NUM_CLASS), 'Focal': FocalLoss(), 'HM': HardMining(), 'Softplus': nn.Softplus() } loss = LOSS_DICT[LOSS_NAME].cuda(gpu) print("=" * 60) print(loss) print("{} Loss Generated".format(loss)) print("=" * 60) torch.cuda.set_device(cfg['GPU']) backbone.cuda(cfg['GPU']) head.cuda(cfg['GPU']) #optionally resume from a checkpoint BACKBONE_RESUME_ROOT = cfg[ 'BACKBONE_RESUME_ROOT'] # the root to resume training from a saved checkpoint HEAD_RESUME_ROOT = cfg[ 'HEAD_RESUME_ROOT'] # the root to resume training from a saved checkpoint IS_RESUME = cfg['IS_RESUME'] if IS_RESUME: print("=" * 60) if os.path.isfile(BACKBONE_RESUME_ROOT): print("Loading Backbone Checkpoint '{}'".format( BACKBONE_RESUME_ROOT)) loc = 'cuda:{}'.format(cfg['GPU']) backbone.load_state_dict( torch.load(BACKBONE_RESUME_ROOT, map_location=loc)) if os.path.isfile(HEAD_RESUME_ROOT): print("Loading Head Checkpoint '{}'".format(HEAD_RESUME_ROOT)) checkpoint = torch.load(HEAD_RESUME_ROOT, map_location=loc) cfg['START_EPOCH'] = checkpoint['EPOCH'] head.load_state_dict(checkpoint['HEAD']) optimizer.load_state_dict(checkpoint['OPTIMIZER']) del (checkpoint) else: print( "No Checkpoint Found at '{}' and '{}'. 
Please Have a Check or Continue to Train from Scratch" .format(BACKBONE_RESUME_ROOT, HEAD_RESUME_ROOT)) print("=" * 60) ori_backbone = copy.deepcopy(backbone) if SYNC_BN: backbone = apex.parallel.convert_syncbn_model(backbone) if USE_APEX: [backbone, head], optimizer = amp.initialize([backbone, head], optimizer, opt_level='O2') backbone = DDP(backbone) head = DDP(head) else: backbone = torch.nn.parallel.DistributedDataParallel( backbone, device_ids=[cfg['GPU']]) head = torch.nn.parallel.DistributedDataParallel( head, device_ids=[cfg['GPU']]) # checkpoint and tensorboard dir MODEL_ROOT = cfg['MODEL_ROOT'] # the root to buffer your checkpoints LOG_ROOT = cfg['LOG_ROOT'] # the root to log your train/val status os.makedirs(MODEL_ROOT, exist_ok=True) os.makedirs(LOG_ROOT, exist_ok=True) writer = SummaryWriter( LOG_ROOT) # writer for buffering intermedium results # train for epoch in range(cfg['START_EPOCH'], cfg['NUM_EPOCH']): train_sampler.set_epoch(epoch) if LR_SCHEDULER != 'cosine': scheduler.step() #train for one epoch DISP_FREQ = 100 # 100 batch batch = 0 # batch index backbone.train() # set to training mode head.train() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() for inputs, labels in tqdm(iter(train_loader)): if LR_SCHEDULER == 'cosine': scheduler.step() # compute output start_time = time.time() inputs = inputs.cuda(cfg['GPU'], non_blocking=True) labels = labels.cuda(cfg['GPU'], non_blocking=True) if cfg['MIXUP']: inputs, labels_a, labels_b, lam = mixup_data( inputs, labels, cfg['GPU'], cfg['MIXUP_PROB'], cfg['MIXUP_ALPHA']) inputs, labels_a, labels_b = map(Variable, (inputs, labels_a, labels_b)) elif cfg['CUTMIX']: inputs, labels_a, labels_b, lam = cutmix_data( inputs, labels, cfg['GPU'], cfg['CUTMIX_PROB'], cfg['MIXUP_ALPHA']) inputs, labels_a, labels_b = map(Variable, (inputs, labels_a, labels_b)) features = backbone(inputs) outputs = head(features, labels) if cfg['MIXUP'] or cfg['CUTMIX']: lossx = mixup_criterion(loss, outputs, labels_a, labels_b, lam) else: lossx = loss(outputs, labels) if HEAD_NAME != 'CircleLoss' else loss( outputs).mean() end_time = time.time() duration = end_time - start_time if ((batch + 1) % DISP_FREQ == 0) and batch != 0: print("batch inference time", duration) # compute gradient and do SGD step optimizer.zero_grad() if USE_APEX: with amp.scale_loss(lossx, optimizer) as scaled_loss: scaled_loss.backward() else: lossx.backward() optimizer.step() # measure accuracy and record loss prec1, prec5 = accuracy(outputs.data, labels, topk=( 1, 5)) if HEAD_NAME != 'CircleLoss' else accuracy( features.data, labels, topk=(1, 5)) losses.update(lossx.data.item(), inputs.size(0)) top1.update(prec1.data.item(), inputs.size(0)) top5.update(prec5.data.item(), inputs.size(0)) # dispaly training loss & acc every DISP_FREQ if ((batch + 1) % DISP_FREQ == 0) or batch == 0: print("=" * 60) print('Epoch {}/{} Batch {}/{}\t' 'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( epoch + 1, cfg['NUM_EPOCH'], batch + 1, len(train_loader), loss=losses, top1=top1, top5=top5)) print("=" * 60) # perform validation & save checkpoints per epoch # validation statistics per epoch (buffer for visualization) if (batch + 1) % EVAL_FREQ == 0: #lr = scheduler.get_last_lr() lr = optimizer.param_groups[0]['lr'] print("Current lr", lr) print("=" * 60) print( "Perform Evaluation on LFW, CFP_FP, AgeD and VGG2_FP, and Save Checkpoints..." 
) accuracy_lfw, best_threshold_lfw, roc_curve_lfw = perform_val( EMBEDDING_SIZE, per_batch_size, backbone, lfw, lfw_issame) buffer_val(writer, "LFW", accuracy_lfw, best_threshold_lfw, roc_curve_lfw, epoch + 1) accuracy_cfp_fp, best_threshold_cfp_fp, roc_curve_cfp_fp = perform_val( EMBEDDING_SIZE, per_batch_size, backbone, cfp_fp, cfp_fp_issame) buffer_val(writer, "CFP_FP", accuracy_cfp_fp, best_threshold_cfp_fp, roc_curve_cfp_fp, epoch + 1) accuracy_agedb_30, best_threshold_agedb_30, roc_curve_agedb_30 = perform_val( EMBEDDING_SIZE, per_batch_size, backbone, agedb_30, agedb_30_issame) buffer_val(writer, "AgeDB", accuracy_agedb_30, best_threshold_agedb_30, roc_curve_agedb_30, epoch + 1) accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp = perform_val( EMBEDDING_SIZE, per_batch_size, backbone, vgg2_fp, vgg2_fp_issame) buffer_val(writer, "VGGFace2_FP", accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp, epoch + 1) print( "Epoch {}/{}, Evaluation: LFW Acc: {}, CFP_FP Acc: {}, AgeDB Acc: {}, VGG2_FP Acc: {}" .format(epoch + 1, NUM_EPOCH, accuracy_lfw, accuracy_cfp_fp, accuracy_agedb_30, accuracy_vgg2_fp)) print("=" * 60) print("=" * 60) print("Save Checkpoint...") if cfg['RANK'] % ngpus_per_node == 0: #torch.save(backbone.module.state_dict(), os.path.join(MODEL_ROOT, "Backbone_{}_Epoch_{}_Time_{}_checkpoint.pth".format(BACKBONE_NAME, epoch + 1, get_time()))) #save_dict = {'EPOCH': epoch+1, # 'HEAD': head.module.state_dict(), # 'OPTIMIZER': optimizer.state_dict()} #torch.save(save_dict, os.path.join(MODEL_ROOT, "Head_{}_Epoch_{}_Time_{}_checkpoint.pth".format(HEAD_NAME, epoch + 1, get_time()))) ori_backbone.load_state_dict(backbone.module.state_dict()) ori_backbone.eval() x = torch.randn(1, 3, 112, 112).cuda() traced_cell = torch.jit.trace(ori_backbone, (x)) #torch.save(ori_backbone, os.path.join(MODEL_ROOT, "model.pth")) torch.jit.save( traced_cell, os.path.join( MODEL_ROOT, "Epoch_{}_Time_{}_checkpoint.pth".format( epoch + 1, get_time()))) sys.stdout.flush() batch += 1 # batch index epoch_loss = losses.avg epoch_acc = top1.avg print("=" * 60) print('Epoch: {}/{}\t' 'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( epoch + 1, cfg['NUM_EPOCH'], loss=losses, top1=top1, top5=top5)) sys.stdout.flush() print("=" * 60) if cfg['RANK'] % ngpus_per_node == 0: writer.add_scalar("Training_Loss", epoch_loss, epoch + 1) writer.add_scalar("Training_Accuracy", epoch_acc, epoch + 1) writer.add_scalar("Top1", top1.avg, epoch + 1) writer.add_scalar("Top5", top5.avg, epoch + 1)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--eid', type=int, default=-1) parser.add_argument('--gpu_id', type=int, nargs='+', default=0) parser.add_argument('--yaml_file', type=str, default='configs/demo/mini/20way_1shot.yaml') outside_opts = parser.parse_args() if isinstance(outside_opts.gpu_id, int): outside_opts.gpu_id = [outside_opts.gpu_id] # int -> list config = {} config['options'] = { 'ctrl.yaml_file': outside_opts.yaml_file, 'ctrl.gpu_id': outside_opts.gpu_id } opts = Config(config['options']['ctrl.yaml_file'], config['options']) opts.setup() # DATA meta_test = None train_db_list, val_db_list, _, _ = data_loader(opts) # MODEL # NOTE: we use cpu mode for demo; change to gpu for experiments net = CTMNet(opts).to(opts.ctrl.device) net_summary, param_num = model_summarize(net) opts.logger('Model size: param num # {:f} Mb'.format(param_num)) opts.model.param_size = param_num resume_model(net, opts) if opts.ctrl.multi_gpu: opts.logger('Wrapping network into multi-gpu mode ...') net = torch.nn.DataParallel(net) # OPTIM AND LR SCHEDULE if opts.train.optim == 'adam': optimizer = optim.Adam(net.parameters(), lr=opts.train.lr, weight_decay=opts.train.weight_decay) elif opts.train.optim == 'sgd': optimizer = optim.SGD(net.parameters(), lr=opts.train.lr, weight_decay=opts.train.weight_decay, momentum=opts.train.momentum) elif opts.train.optim == 'rmsprop': optimizer = optim.RMSprop(net.parameters(), lr=opts.train.lr, weight_decay=opts.train.weight_decay, momentum=opts.train.momentum, alpha=0.9, centered=True) if opts.train.lr_policy == 'multi_step': scheduler = MultiStepLR(optimizer, milestones=opts.train.lr_scheduler, gamma=opts.train.lr_gamma) elif opts.train.lr_policy == 'exp': scheduler = ExponentialLR(optimizer, gamma=opts.train.lr_gamma) if opts.model.structure == 'original': # ignore previous setting optimizer = optim.Adam(net.parameters(), lr=0.001) scheduler = StepLR(optimizer, step_size=100, gamma=0.5) opts.train.lr_policy = 'step' opts.train.step_size = 100 if not opts.data.use_ori_relation else 3 opts.train.lr_scheduler = [-1] opts.train.lr = 0.001 opts.train.lr_gamma = 0.5 opts.train.weight_decay = .0 # VISUALIZE if opts.misc.vis.use: if opts.misc.vis.method == 'tensorboard': NotImplementedError() elif opts.misc.vis.method == 'visdom': if opts.io.resume: try: vis = Visualizer(opts, net.previous_loss_data) except: vis = Visualizer(opts, net.module.previous_loss_data) else: vis = Visualizer(opts) if not opts.ctrl.eager: opts.print_args() opts.logger(net) else: opts.logger('config file is {:s}'.format(opts.ctrl.yaml_file)) opts.logger('configs not shown here in eager mode ...') opts.logger(net) # ############################################### # ################## PIPELINE ################### best_accuracy = opts.io.previous_acc RESET_BEST_ACC = True # for evolutionary train last_epoch, last_iter = opts.io.saved_epoch, opts.io.saved_iter opts.logger('CTM Pipeline starts now !!! 
(cpu demo purpose)') show_str = '[TRAIN FROM SCRATCH] LOG' if not opts.io.resume else '[RESUME] LOG' opts.logger('{}\n'.format(show_str)) total_ep = opts.train.nep if opts.ctrl.start_epoch > 0 or opts.ctrl.start_iter > 0: assert opts.io.resume RESUME = True else: RESUME = False for epoch in range(opts.ctrl.start_epoch, total_ep): if epoch > opts.ctrl.start_epoch and opts.data.change_on_every_ep: opts.logger('') opts.logger('Changing a new set of data at new epoch ...') train_db_list, val_db_list, _, _ = data_loader(opts) # adjust learning rate old_lr = optimizer.param_groups[0]['lr'] scheduler.step(epoch) new_lr = optimizer.param_groups[0]['lr'] if epoch == opts.ctrl.start_epoch: opts.logger('Start lr is {:.8f}, at epoch {}\n'.format( old_lr, epoch)) if new_lr != old_lr: opts.logger( 'LR changes from {:.8f} to {:.8f} at epoch {:d}\n'.format( old_lr, new_lr, epoch)) # select proper train_db (legacy reason) which_ind = 0 curr_shot = opts.fsl.k_shot[0] curr_query = opts.fsl.k_query[ 0] # only for display (for evolutionary train) train_db = train_db_list[0] val_db = val_db_list[0] total_iter = opts.ctrl.total_iter_train[0] eval_length = opts.ctrl.total_iter_val[0] for step, batch in enumerate(train_db): step_t = time.time() if RESUME: if step < opts.ctrl.start_iter: continue else: RESUME = False if step >= total_iter: break support_x, support_y, query_x, query_y = process_input( batch, opts, mode='train') loss, _ = net.forward_CTM(support_x, support_y, query_x, query_y, True) loss = loss.mean(0) vis_loss = loss.data.cpu().numpy() vis_loss *= opts.train.total_loss_fac loss *= opts.train.total_loss_fac if len(loss) > 1: total_loss = loss[0] else: total_loss = loss optimizer.zero_grad() total_loss.backward() if opts.train.clip_grad: # doesn't affect that much torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5) optimizer.step() iter_time = (time.time() - step_t) left_time = compute_left_time(iter_time, epoch, total_ep, step, total_iter) # SHOW TRAIN LOSS if step % opts.io.iter_vis_loss == 0 or step == total_iter - 1: opts.logger( opts.io.loss_vis_str.format(epoch, total_ep, step, total_iter, total_loss.item())) # time if step % 1000 * opts.io.iter_vis_loss == 0 or step == total_iter - 1: opts.logger( opts.io.time_vis_str.format(left_time[0], left_time[1], left_time[2])) # VALIDATION and SAVE BEST MODEL if epoch > opts.test.do_after_ep and \ ((step % opts.io.iter_do_val == 0 and step > 0) or step == total_iter - 1): # execute once only if RESET_BEST_ACC and opts.fsl.evolution and epoch >= opts.fsl.epoch_schedule[ -1]: best_accuracy, last_epoch, last_iter = -1.0, -1, -1 RESET_BEST_ACC = False arguments = { 'step': step, 'epoch': epoch, 'eval_length': eval_length, 'which_ind': which_ind, 'curr_shot': curr_shot, 'curr_query': curr_query, 'best_accuracy': best_accuracy, 'last_epoch': last_epoch, 'last_iter': last_iter, 'new_lr': new_lr, 'train_db': train_db, 'total_iter': total_iter, 'optimizer': optimizer, 'meta_test': meta_test } try: stats = run_test(opts, val_db, net, vis, **arguments) except RuntimeError: vis.show_dynamic_info(phase='error') if sum(stats) != -1: best_accuracy, last_epoch, last_iter = stats[0], stats[ 1], stats[2] # DONE with validation process opts.logger('') opts.logger('Training done! check your work using:') if opts.misc.vis.use and opts.misc.vis.method == 'visdom': vis.show_dynamic_info(phase='train_finish') if not opts.ctrl.eager: opts.logger('visdom state saved!') vis.save()
def main(args: argparse.Namespace): logger = CompleteLogger(args.log, args.phase) print(args) if args.seed is not None: random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True warnings.warn('You have chosen to seed training. ' 'This will turn on the CUDNN deterministic setting, ' 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') cudnn.benchmark = True # Data loading code normalize = T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) train_transform = T.Compose([ T.RandomRotation(args.rotation), T.RandomResizedCrop(size=args.image_size, scale=args.resize_scale), T.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25), T.GaussianBlur(), T.ToTensor(), normalize ]) val_transform = T.Compose( [T.Resize(args.image_size), T.ToTensor(), normalize]) image_size = (args.image_size, args.image_size) heatmap_size = (args.heatmap_size, args.heatmap_size) source_dataset = datasets.__dict__[args.source] train_source_dataset = source_dataset(root=args.source_root, transforms=train_transform, image_size=image_size, heatmap_size=heatmap_size) train_source_loader = DataLoader(train_source_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True, drop_last=True) val_source_dataset = source_dataset(root=args.source_root, split='test', transforms=val_transform, image_size=image_size, heatmap_size=heatmap_size) val_source_loader = DataLoader(val_source_dataset, batch_size=args.batch_size, shuffle=False, pin_memory=True) target_dataset = datasets.__dict__[args.target] train_target_dataset = target_dataset(root=args.target_root, transforms=train_transform, image_size=image_size, heatmap_size=heatmap_size) train_target_loader = DataLoader(train_target_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True, drop_last=True) val_target_dataset = target_dataset(root=args.target_root, split='test', transforms=val_transform, image_size=image_size, heatmap_size=heatmap_size) val_target_loader = DataLoader(val_target_dataset, batch_size=args.batch_size, shuffle=False, pin_memory=True) print("Source train:", len(train_source_loader)) print("Target train:", len(train_target_loader)) print("Source test:", len(val_source_loader)) print("Target test:", len(val_target_loader)) train_source_iter = ForeverDataIterator(train_source_loader) train_target_iter = ForeverDataIterator(train_target_loader) # create model model = models.__dict__[args.arch]( num_keypoints=train_source_dataset.num_keypoints).to(device) criterion = JointsMSELoss() # define optimizer and lr scheduler optimizer = Adam(model.get_parameters(lr=args.lr)) lr_scheduler = MultiStepLR(optimizer, args.lr_step, args.lr_factor) # optionally resume from a checkpoint start_epoch = 0 if args.resume: checkpoint = torch.load(args.resume, map_location='cpu') model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) start_epoch = checkpoint['epoch'] + 1 # define visualization function tensor_to_image = Compose([ Denormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), ToPILImage() ]) def visualize(image, keypoint2d, name): """ Args: image (tensor): image in shape 3 x H x W keypoint2d (tensor): keypoints in shape K x 2 name: name of the saving image """ train_source_dataset.visualize( tensor_to_image(image), keypoint2d, logger.get_image_path("{}.jpg".format(name))) if args.phase == 'test': # evaluate on 
validation set source_val_acc = validate(val_source_loader, model, criterion, None, args) target_val_acc = validate(val_target_loader, model, criterion, visualize, args) print("Source: {:4.3f} Target: {:4.3f}".format(source_val_acc['all'], target_val_acc['all'])) for name, acc in target_val_acc.items(): print("{}: {:4.3f}".format(name, acc)) return # start training best_acc = 0 for epoch in range(start_epoch, args.epochs): logger.set_epoch(epoch) lr_scheduler.step() # train for one epoch train(train_source_iter, train_target_iter, model, criterion, optimizer, epoch, visualize if args.debug else None, args) # evaluate on validation set source_val_acc = validate(val_source_loader, model, criterion, None, args) target_val_acc = validate(val_target_loader, model, criterion, visualize if args.debug else None, args) # remember best acc and save checkpoint torch.save( { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args }, logger.get_checkpoint_path(epoch)) if target_val_acc['all'] > best_acc: shutil.copy(logger.get_checkpoint_path(epoch), logger.get_checkpoint_path('best')) best_acc = target_val_acc['all'] print("Source: {:4.3f} Target: {:4.3f} Target(best): {:4.3f}".format( source_val_acc['all'], target_val_acc['all'], best_acc)) for name, acc in target_val_acc.items(): print("{}: {:4.3f}".format(name, acc)) logger.close()
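# --- Aside (a generic pattern, not this trainer's code): since PyTorch 1.1 the documented
# --- order is optimizer.step() inside the batch loop first, then scheduler.step() once at
# --- the end of the epoch; stepping the scheduler at the top of the epoch (as above)
# --- shifts the schedule by one epoch and triggers a UserWarning on recent versions.
# --- The model, milestones, and inner loop below are placeholders.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

order_demo_model = nn.Linear(4, 2)
order_demo_optimizer = optim.Adam(order_demo_model.parameters(), lr=1e-3)
order_demo_scheduler = MultiStepLR(order_demo_optimizer, milestones=[45, 60], gamma=0.1)

for order_demo_epoch in range(70):
    for _ in range(10):  # stands in for one pass over the training loader
        order_demo_optimizer.zero_grad()
        order_demo_loss = order_demo_model(torch.randn(8, 4)).sum()
        order_demo_loss.backward()
        order_demo_optimizer.step()
    order_demo_scheduler.step()  # after the epoch's optimizer updates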
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

vgg16_cifar100 = models.vgg16_bn(pretrained=False, **{'num_classes': 100})
vgg16_cifar100 = vgg16_cifar100.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(vgg16_cifar100.parameters(), lr=0.01,
                      weight_decay=0.0005, momentum=0.9)
sched = MultiStepLR(optimizer, milestones=[20, 30], gamma=0.1)

train_log, val_log, vgg16_cifar100_1_acc, vgg16_cifar100 = train(
    vgg16_cifar100, optimizer, criterion, dataset_loader_train,
    dataset_loader_test, 40, sched, 31, 1, 100)
torch.save(vgg16_cifar100.state_dict(), 'vgg16_cifar100_40ep_1.pt')

# Second model:
random.seed(8)
np.random.seed(8)
torch.manual_seed(8)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(8)

vgg16_cifar100 = models.vgg16_bn(pretrained=False, **{'num_classes': 100})
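# --- Aside (a throwaway check, independent of the VGG runs above): the schedule produced
# --- by MultiStepLR(milestones=[20, 30], gamma=0.1) on an initial lr of 0.01 can be
# --- verified on a dummy optimizer by stepping it and reading get_last_lr() (PyTorch >= 1.4).
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

probe_optimizer = optim.SGD(nn.Linear(2, 2).parameters(), lr=0.01, momentum=0.9)
probe_scheduler = MultiStepLR(probe_optimizer, milestones=[20, 30], gamma=0.1)
for _ in range(35):
    probe_optimizer.step()   # placeholder update so the step order matches training
    probe_scheduler.step()
print(probe_scheduler.get_last_lr())   # [0.0001] once both milestones have passed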
def train(args, dataset_train, rnn, output, node_f_gen=None, edge_f_gen=None): # check if load existing model if args.load: fname = args.model_save_path + args.fname + 'lstm_' + str( args.load_epoch) + '.dat' rnn.load_state_dict(torch.load(fname)) fname = args.model_save_path + args.fname + 'output_' + str( args.load_epoch) + '.dat' output.load_state_dict(torch.load(fname)) args.lr = 0.00001 epoch = args.load_epoch print('model loaded!, lr: {}'.format(args.lr)) else: epoch = 1 # initialize optimizer optimizer_rnn = optim.Adam(list(rnn.parameters()), lr=args.lr) optimizer_output = optim.Adam(list(output.parameters()), lr=args.lr) scheduler_rnn = MultiStepLR(optimizer_rnn, milestones=args.milestones, gamma=args.lr_rate) scheduler_output = MultiStepLR(optimizer_output, milestones=args.milestones, gamma=args.lr_rate) # start main loop time_all = np.zeros(args.epochs) while epoch <= args.epochs: time_start = tm.time() # train if 'GraphRNN_VAE' in args.note: train_vae_epoch(epoch, args, rnn, output, dataset_train, optimizer_rnn, optimizer_output, scheduler_rnn, scheduler_output) elif 'GraphRNN_MLP' in args.note: train_mlp_epoch(epoch, args, rnn, output, dataset_train, optimizer_rnn, optimizer_output, scheduler_rnn, scheduler_output) elif 'GraphRNN_RNN' in args.note: train_rnn_epoch(epoch, args, rnn, output, dataset_train, optimizer_rnn, optimizer_output, scheduler_rnn, scheduler_output, node_f_gen, edge_f_gen) time_end = tm.time() time_all[epoch - 1] = time_end - time_start # test if epoch % args.epochs_test == 0 and epoch >= args.epochs_test_start: for sample_time in range(1, 4): G_pred = [] while len(G_pred) < args.test_total_size: if 'GraphRNN_VAE' in args.note: G_pred_step = test_vae_epoch( epoch, args, rnn, output, test_batch_size=args.test_batch_size, sample_time=sample_time) elif 'GraphRNN_MLP' in args.note: G_pred_step = test_mlp_epoch( epoch, args, rnn, output, test_batch_size=args.test_batch_size, sample_time=sample_time) elif 'GraphRNN_RNN' in args.note: G_pred_step = test_rnn_epoch( epoch, args, rnn, output, node_f_gen, test_batch_size=args.test_batch_size) G_pred.extend(G_pred_step) # save graphs fname = args.graph_save_path + args.fname_pred + str( epoch) + '_' + str(sample_time) + '.dat' save_graph_list(G_pred, fname) if 'GraphRNN_RNN' in args.note: break print('test done, graphs saved') # save model checkpoint if args.save: if epoch % args.epochs_save == 0: fname = args.model_save_path + args.fname + 'lstm_' + str( epoch) + '.dat' torch.save(rnn.state_dict(), fname) fname = args.model_save_path + args.fname + 'output_' + str( epoch) + '.dat' torch.save(output.state_dict(), fname) epoch += 1 np.save(args.timing_save_path + args.fname, time_all)
iterator.index_with(vocab) val_iterator = AdvancedBucketIterator( batch_size=2, sorting_keys=[("sentence", "num_tokens")], ) val_iterator.index_with(vocab) USE_CUDA = True if USE_CUDA: model = model.cuda() num_epochs = 30 learning_rate_scheduler = LearningRateWithoutMetricsWrapper( MultiStepLR(optimizer, [10, 20, 40], gamma=0.25, last_epoch=-1)) trainer = Trainer( model=model, optimizer=optimizer, iterator=iterator, validation_iterator=val_iterator, #train_dataset=dataset_train_no_punct + datasets['train'], train_dataset=dataset_all_no_punct + datasets['all'], # validation_dataset=datasets['val'], patience=10, num_epochs=num_epochs, learning_rate_scheduler=learning_rate_scheduler, model_save_interval=10, cuda_device=0) trainer.train()
print('Train epoch: {:.0f}, it: {:.0f}, loss: {:.4f}, loss_hr: {:.4f}, loss_img: {:.4f}, loss_cross: {:.4f}, loss_snr: {:.4f}'.format(epoch, batch_idx, loss, loss_hr, loss_img, loss_cross, loss_SNR)); def test(): net.eval() test_loss = 0; for (data, hr, fps, bvp, idx) in test_loader: data = Variable(data); hr = Variable(hr.view(-1,1)); data, hr = data.cuda(), hr.cuda(); feat_hr, feat_n, output, img_out, feat_hrf1, feat_nf1, hrf1, idx1, feat_hrf2, feat_nf2, hrf2, idx2, ecg, ecg1, ecg2 = net(data, epoch); loss = lossfunc_HR(output, hr); test_loss += loss.item(); begin_epoch = 1; scheduler = MultiStepLR(optimizer, milestones=[30,80], gamma=0.5) for epoch in range(begin_epoch, epoch_num + 1): if epoch > 20: train_dataset.transform = transforms.Compose([resize, toTensor]); train_dataset.VerticalFlip = False; train_loader = DataLoader(train_dataset, batch_size=batch_size_num, shuffle=True, num_workers=4); train(); test();
lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay, ) else: optimizer = optim.Adam(params, lr=args.learning_rate, weight_decay=args.weight_decay) if args.scheduler == "CosineAnnealing": scheduler = CosineAnnealingLR(optimizer=optimizer, T_max=args.max_epochs, eta_min=0) else: scheduler = MultiStepLR(optimizer, milestones=args.milestones, gamma=0.2) #############################################RESTART/RESTORE/RESUME################# restore_fields = { "model": model if not isinstance(model, nn.DataParallel) else model.module, "optimizer": optimizer, "scheduler": scheduler, } start_epoch = 0 if args.resume: resume_epoch = restore_model( restore_fields,
def tc_trans2(): global args, best_mae_error # load data dataset = CIFData(*args.data_options) collate_fn = collate_pool # obtain target value normalizer if args.task == 'classification': normalizer = Normalizer(torch.zeros(2)) normalizer.load_state_dict({'mean': 0., 'std': 1.}) else: if len(dataset) < 500: warnings.warn('Dataset has less than 500 data points. ' 'Lower accuracy is expected. ') sample_data_list = [dataset[i] for i in range(len(dataset))] else: sample_data_list = [ dataset[i] for i in sample(range(len(dataset)), 500) ] _, sample_target, _ = collate_pool(sample_data_list) normalizer = Normalizer(sample_target) # build model structures, _, _ = dataset[0] orig_atom_fea_len = structures[0].shape[-1] nbr_fea_len = structures[1].shape[-1] model_a = CrystalGraphConvNet( orig_atom_fea_len, nbr_fea_len, atom_fea_len=args.atom_fea_len, n_conv=args.n_conv, h_fea_len=args.h_fea_len, n_h=args.n_h, classification=True if args.task == 'classification' else False) model_b = CrystalGraphConvNet( orig_atom_fea_len, nbr_fea_len, atom_fea_len=args.atom_fea_len, n_conv=args.n_conv, h_fea_len=args.h_fea_len, n_h=args.n_h, classification=True if args.task == 'classification' else False) model = SimpleNN(in_feature=256, out_feature=1) # pretrained model path model_a_path = '../pre-trained/research-model/bulk_moduli-model_best.pth.tar' model_b_path = '../pre-trained/research-model/sps-model_best.pth.tar' # load latest model state ckpt_a = torch.load(model_a_path) ckpt_b = torch.load(model_b_path) # load model model_a.load_state_dict(ckpt_a['state_dict']) model_b.load_state_dict(ckpt_b['state_dict']) def get_activation_a(name, activation_a): def hook(model, input, output): activation_a[name] = output.detach() return hook def get_activation_b(name, activation_b): def hook(model, input, output): activation_b[name] = output.detach() return hook if args.cuda: model_a.cuda() model_b.cuda() model.cuda() activation_a = {} activation_b = {} # hook the activation function model_a.conv_to_fc.register_forward_hook( get_activation_a('conv_to_fc', activation_a)) model_b.conv_to_fc.register_forward_hook( get_activation_b('conv_to_fc', activation_b)) # define loss func and optimizer if args.task == 'classification': criterion = nn.NLLLoss() else: criterion = nn.MSELoss() if args.optim == 'SGD': optimizer = optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optim == 'Adam': optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.weight_decay) else: raise NameError('Only SGD or Adam is allowed as --optim') # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_mae_error = checkpoint['best_mae_error'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) normalizer.load_state_dict(checkpoint['normalizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) scheduler = MultiStepLR(optimizer, milestones=args.lr_milestones, gamma=0.1) X = torch.Tensor() T = torch.Tensor() for i in range(5): total_size = len(dataset) indices = list(range(total_size)) batch_size = args.batch_size num_workers = args.workers pin_memory = args.cuda if i == 0: train_sampler = SubsetRandomSampler(indices[:61]) test_sampler = SubsetRandomSampler(indices[-16:]) if i == 
1: x = indices[:45] y = x + indices[-16:] train_sampler = SubsetRandomSampler(y) test_sampler = SubsetRandomSampler(indices[45:-16]) if i == 2: x = indices[:29] y = x + indices[-32:] train_sampler = SubsetRandomSampler(y) test_sampler = SubsetRandomSampler(indices[29:-32]) if i == 3: x = indices[:13] y = x + indices[-48:] train_sampler = SubsetRandomSampler(y) test_sampler = SubsetRandomSampler(indices[13:-48]) if i == 4: y = indices[-64:] train_sampler = SubsetRandomSampler(y) test_sampler = SubsetRandomSampler(indices[:-64]) train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers, collate_fn=collate_fn, pin_memory=pin_memory) test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler, num_workers=num_workers, collate_fn=collate_fn, pin_memory=pin_memory) print(test_sampler) for epoch in range(args.start_epoch, args.epochs): # train for one epoch train(args, train_loader, model_a, model_b, model, activation_a, activation_b, criterion, optimizer, epoch, normalizer) # evaluate on validation set mae_error = validate(args, train_loader, model_a, model_b, model, activation_a, activation_b, criterion, normalizer) if mae_error != mae_error: print('Exit due to NaN') sys.exit(1) scheduler.step() # remember the best mae_error and save checkpoint if args.task == 'regression': is_best = mae_error < best_mae_error best_mae_error = min(mae_error, best_mae_error) else: is_best = mae_error > best_mae_error best_mae_error = max(mae_error, best_mae_error) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_mae_error': best_mae_error, 'optimizer': optimizer.state_dict(), 'normalizer': normalizer.state_dict(), 'args': vars(args) }, is_best, prop=args.property) # test best model print('---------Evaluate Model on Test Set---------------') best_checkpoint = torch.load('../result/' + args.property + '-model_best.pth.tar') model.load_state_dict(best_checkpoint['state_dict']) x, t = validate(args, test_loader, model_a, model_b, model, activation_a, activation_b, criterion, normalizer, test=True, tc=True) X = torch.cat((X, x), dim=0) T = torch.cat((T, t), dim=0) x, t = X.numpy(), T.numpy() n_max = max(np.max(x), np.max(t)) n_min = min(np.min(x), np.min(t)) a = np.linspace(n_min - abs(n_max), n_max + abs(n_max)) b = a plt.rcParams["font.family"] = "Times New Roman" plt.plot(a, b, color='blue') plt.scatter(t, x, marker=".", color='red', edgecolors='black') plt.xlim(n_min - abs(n_min), n_max + abs(n_min)) plt.ylim(n_min - abs(n_min), n_max + abs(n_min)) plt.title( "Thermal Conductivity Prediction by CGCNN with Combined Model Transfer Learning" ) plt.xlabel("observation") plt.ylabel("prediction") plt.show()
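# The fold construction above assembles each train/test index list by hand; since
# list.extend() mutates in place and returns None, the concatenation has to be done
# explicitly (as fixed above). A minimal sketch of the same rotating contiguous
# splits via a hypothetical helper, assuming the hard-coded 77-sample layout:
from torch.utils.data.sampler import SubsetRandomSampler


def rotating_split(indices, test_size, fold):
    """Slide a test block of `test_size` samples from the back of `indices`
    towards the front as `fold` increases; the rest is the train split."""
    stop = len(indices) - fold * test_size   # end of the test block
    start = max(stop - test_size, 0)         # start of the test block
    test_idx = indices[start:stop]
    train_idx = indices[:start] + indices[stop:]
    return train_idx, test_idx


# usage sketch mirroring the five folds above
all_indices = list(range(77))
for fold in range(5):
    train_idx, test_idx = rotating_split(all_indices, test_size=16, fold=fold)
    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(test_idx)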
def train(args, model, device, train_loader_creator, test_loader_creator, logger): criterion = torch.nn.CrossEntropyLoss().to(device) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) for task_idx, train_loader in enumerate(train_loader_creator.data_loaders): for param_group in optimizer.param_groups: param_group['lr'] = args.lr scheduler = MultiStepLR(optimizer, milestones=args.milestones, gamma=args.gamma) for epoch in range(1,args.epochs+1): model.train() losses = AverageMeter() acc = AverageMeter() batch_time = AverageMeter() data_time = AverageMeter() end = time.time() for batch_idx, (data, target) in enumerate(train_loader): data_time.update(time.time() - end) data, target = data.to(device), target.to(device) optimizer.zero_grad() _, output = model(data) loss = criterion(output, target) loss.backward() optimizer.step() it_acc = accuracy(output.data, target)[0] losses.update(loss.item(), data.size(0)) acc.update(it_acc.item(), data.size(0)) batch_time.update(time.time() - end) end = time.time() if batch_idx % args.log_interval == 0: logger.info('Train Task: {0} Epoch: [{1:3d}][{2:3d}/{3:3d}]\t' 'DTime {data_time.avg:.3f}\t' 'BTime {batch_time.avg:.3f}\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( task_idx+1, epoch, batch_idx, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, acc=acc)) scheduler.step() if epoch % args.test_interval == 0: test(args, model, device, test_loader_creator, logger) # plot_embedding_tsne(args, task_idx, test_loader_creator, model, device) if args.save_model: model_path = args.vis_base_dir.split('/')[-2] + 'T' + str(task_idx+1) + '.pt' if isinstance(model, torch.nn.DataParallel): torch.save(model.module.state_dict(), model_path) else: torch.save(model.state_dict(), model_path)
train_indices, valid_indices, test_indices, args) # margin and equilibrium margin = 0.35 equilibrium = 0.68 # OPTIM-LOSS optimizer_encoder = optim.RMSprop(params=net.encoder.parameters(), lr=lr, alpha=0.9, eps=1e-8, weight_decay=0, momentum=0, centered=False) optimizer_decoder = optim.RMSprop(params=net.decoder.parameters(), lr=lr, alpha=0.9, eps=1e-8, weight_decay=0, momentum=0, centered=False) optimizer_discriminator = optim.RMSprop(params=net.discriminator.parameters(), lr=lr, alpha=0.9, eps=1e-8, weight_decay=0, momentum=0, centered=False) Steps = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000] lr_encoder = MultiStepLR(optimizer_encoder, milestones=Steps, gamma=decay_lr) lr_decoder = MultiStepLR(optimizer_decoder, milestones=Steps, gamma=decay_lr) lr_discriminator = MultiStepLR(optimizer_discriminator, milestones=Steps, gamma=decay_lr) count_update_step = 0 for i in range(n_epochs): for j, sample_batched in enumerate(dataloader): net.train() # target and input are the same images data = sample_batched['image'] batch_x = data.cuda() # get output
def main(): """Perform training, validation, and testing, with checkpoint loading and saving""" # Build Model print("==> Building model..") base_model = ResNet34() if args.compress: model = FeatherNet( base_model, compress=args.compress, ) else: if args.lr != 0.1: print("Warning: Suggest setting base-model learning rate to 0.1") model = base_model # Enable GPU support print("==> Setting up device..") if torch.cuda.is_available(): print("Utilizing", torch.cuda.device_count(), "GPU(s)!") if torch.cuda.device_count() > 1: model = nn.DataParallel(model) DEV = torch.device("cuda:0") cuda_kwargs = {"num_workers": args.num_workers, "pin_memory": True} cudnn.benchmark = True else: print("Utilizing CPU!") DEV = torch.device("cpu") cuda_kwargs = {} model.to(DEV) # Create dataloaders print("==> Preparing data..") train_loader, valid_loader = get_train_valid_loader( data_dir=args.data_dir, batch_size=args.batch_size, valid_size=args.valid_size, **cuda_kwargs ) test_loader = get_test_loader(data_dir=args.data_dir, **cuda_kwargs) best_acc = 0 # best validation accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch save_display = False # Load checkpoint if args.resume: print("==> Resuming from checkpoint..") assert os.path.isdir("checkpoint"), "Error: no checkpoint directory found!" checkpoint = torch.load("./checkpoint/" + args.ckpt_name) model.load_state_dict(checkpoint["model"]) best_acc = checkpoint["acc"] start_epoch = checkpoint["epoch"] # Initialize optimizers and loss fn criterion = nn.CrossEntropyLoss() optimizer = optim.SGD( model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4 ) scheduler = MultiStepLR(optimizer, milestones=[100, 200], gamma=0.1) def train(epoch: int) -> None: """Train on CIFAR10 per epoch""" # maintain backward compatibility; get_last_lr requires PyTorch >= 1.4 last_lr = ( scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else scheduler.get_lr()[0] ) print( "\nEpoch: {} | Compression: {:.2f} | lr: {:<6}".format( epoch, args.compress, last_lr ) ) model.train() train_loss = 0 correct = 0 total = 0 for batch_idx, (inputs, targets) in enumerate(train_loader): inputs, targets = inputs.to(DEV), targets.to(DEV) optimizer.zero_grad() outputs = model(inputs) loss = criterion(outputs, targets) loss.backward() optimizer.step() train_loss += loss.item() _, predicted = outputs.max(1) total += targets.size(0) correct += predicted.eq(targets).sum().item() progress_bar( batch_idx, len(train_loader), "Loss: {:.3f} | Acc: {:.3f}% ({}/{})".format( train_loss / (batch_idx + 1), 100.0 * correct / total, correct, total, ), ) # Validation def validate(epoch: int) -> None: """Validate on CIFAR10 per epoch. Save best accuracy for checkpoint storing""" nonlocal best_acc nonlocal save_display model.eval() valid_loss = 0 correct = 0 total = 0 with torch.no_grad(): for batch_idx, (inputs, targets) in enumerate(valid_loader): inputs, targets = inputs.to(DEV), targets.to(DEV) outputs = model(inputs) loss = criterion(outputs, targets) valid_loss += loss.item() _, predicted = outputs.max(1) total += targets.size(0) correct += predicted.eq(targets).sum().item() progress_bar( batch_idx, len(valid_loader), "Loss: {:.3f} | Acc: {:.3f}% ({}/{})".format( valid_loss / (batch_idx + 1), 100.0 * correct / total, correct, total, ), ) # Save checkpoint. 
acc = 100.0 * correct / total save_display = acc > best_acc if acc > best_acc: state = { "model": model.state_dict(), "acc": acc, "epoch": epoch, } if not os.path.isdir("checkpoint"): os.mkdir("checkpoint") torch.save(state, "./checkpoint/" + args.ckpt_name) best_acc = acc # Testing def test(epoch: int) -> None: """Test on CIFAR10 per epoch.""" model.eval() test_loss = 0 correct = 0 total = 0 with torch.no_grad(): for batch_idx, (inputs, targets) in enumerate(test_loader): inputs, targets = inputs.to(DEV), targets.to(DEV) outputs = model(inputs) loss = criterion(outputs, targets) test_loss += loss.item() _, predicted = outputs.max(1) total += targets.size(0) correct += predicted.eq(targets).sum().item() progress_bar( batch_idx, len(test_loader), "Loss: {:.3f} | Acc: {:.3f}% ({}/{})".format( test_loss / (batch_idx + 1), 100.0 * correct / total, correct, total, ), ) # Train up to 300 epochs # *Displays* concurent performance on validation and test set while training, # but strictly uses validation set to determine early stopping print("==> Initiate Training..") for epoch in range(start_epoch, 300): train(epoch) validate(epoch) test(epoch) if save_display: print("Saving..") scheduler.step()
def __init__(self, config, model, trn_data, val_data=None): self.config = config self.model = model.cuda() self.trn_data = DataFetcher(trn_data) self.val_data = val_data #create the optimizer if config['optim'] == 'SGD': self.optimizer = SGD(model.parameters(), lr=config['lr'], momentum=config['momentum'], weight_decay=config['wd']) elif config['optim'] == 'AdamW': self.optimizer = AdamW( model.parameters(), lr=config['lr'], weight_decay=config['wd']) #momentum is default else: optim = config['optim'] raise Exception( f'Optimizer {optim} is not supported! Must be SGD or AdamW') #create the learning rate scheduler schedule = config['lr_policy'] if schedule == 'OneCycle': self.scheduler = OneCycleLR(self.optimizer, config['lr'], total_steps=config['iters']) elif schedule == 'MultiStep': self.scheduler = MultiStepLR(self.optimizer, milestones=config['lr_decay_epochs']) elif schedule == 'Poly': func = lambda iteration: (1 - (iteration / config['iters']) )**config['power'] self.scheduler = LambdaLR(self.optimizer, func) else: lr_policy = config['lr_policy'] raise Exception( f'Policy {lr_policy} is not supported! Must be OneCycle, MultiStep or Poly' ) #create the loss criterion if config['num_classes'] > 1: #load class weights if they were given in the config file if 'class_weights' in config: weight = torch.Tensor(config['class_weights']).float().cuda() else: weight = None self.criterion = nn.CrossEntropyLoss(weight=weight).cuda() else: self.criterion = nn.BCEWithLogitsLoss().cuda() #define train and validation metrics and class names class_names = config['class_names'] #make training metrics using the EMAMeter. this meter gives extra #weight to the most recent metric values calculated during training #this gives a better reflection of how well the model is performing #when the metrics are printed trn_md = { name: metric_lookup[name](EMAMeter()) for name in config['metrics'] } self.trn_metrics = ComposeMetrics(trn_md, class_names) self.trn_loss_meter = EMAMeter() #the only difference between train and validation metrics #is that we use the AverageMeter. 
this is because there are #no weight updates during evaluation, so all batches should #count equally val_md = { name: metric_lookup[name](AverageMeter()) for name in config['metrics'] } self.val_metrics = ComposeMetrics(val_md, class_names) self.val_loss_meter = AverageMeter() self.logging = config['logging'] #now, if we're resuming from a previous run we need to load #the state for the model, optimizer, and schedule and resume #the mlflow run (if there is one and we're using logging) if config['resume']: self.resume(config['resume']) elif self.logging: #if we're not resuming, but are logging, then we #need to setup mlflow with a new experiment #everytime that Trainer is instantiated we want to #end the current active run and let a new one begin mlflow.end_run() #extract the experiment name from config so that #we know where to save our files, if experiment name #already exists, we'll use it, otherwise we create a #new experiment mlflow.set_experiment(self.config['experiment_name']) #add the config file as an artifact mlflow.log_artifact(config['config_file']) #we don't want to add everything in the config #to mlflow parameters, we'll just add the most #likely to change parameters mlflow.log_param('lr_policy', config['lr_policy']) mlflow.log_param('optim', config['optim']) mlflow.log_param('lr', config['lr']) mlflow.log_param('wd', config['wd']) mlflow.log_param('bsz', config['bsz']) mlflow.log_param('momentum', config['momentum']) mlflow.log_param('iters', config['iters']) mlflow.log_param('epochs', config['epochs']) mlflow.log_param('encoder', config['encoder']) mlflow.log_param('finetune_layer', config['finetune_layer']) mlflow.log_param('pretraining', config['pretraining'])
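# A minimal standalone sketch of the 'Poly' policy wired up above: LambdaLR
# multiplies the optimizer's base lr by whatever the lambda returns, so the lr
# decays from config['lr'] towards 0 over config['iters'] steps (the numbers
# below are placeholders, not values from any config file):
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

iters, power = 1000, 0.9
optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
poly = LambdaLR(optimizer, lambda it: (1 - it / iters) ** power)

for it in range(iters):
    optimizer.step()   # one training iteration would go here
    poly.step()        # after n steps, lr = 0.1 * (1 - n / iters) ** 0.9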
def main(): # Views the training images and displays the distance on anchor-negative and anchor-positive # test_display_triplet_distance = False # print the experiment configuration print('\nCurrent time is \33[91m{}\33[0m.'.format(str(time.asctime()))) print('Parsed options: {}'.format(vars(args))) print('Number of Speakers: {}.\n'.format(train_dir.num_spks)) model_kwargs = { 'embedding_size': args.embedding_size, 'num_classes': train_dir.num_spks, 'input_dim': args.feat_dim, 'dropout_p': args.dropout_p } print('Model options: {}'.format(model_kwargs)) model = create_model(args.model, **model_kwargs) # model = ASTDNN(num_classes=train_dir.num_spks, input_dim=args.feat_dim, # embedding_size=args.embedding_size, # dropout_p=args.dropout_p) start_epoch = 0 if args.save_init: check_path = '{}/checkpoint_{}.pth'.format(args.check_path, start_epoch) torch.save(model, check_path) if args.resume: if os.path.isfile(args.resume): print('=> loading checkpoint {}'.format(args.resume)) checkpoint = torch.load(args.resume) start_epoch = checkpoint['epoch'] filtered = { k: v for k, v in checkpoint['state_dict'].items() if 'num_batches_tracked' not in k } model_dict = model.state_dict() model_dict.update(filtered) model.load_state_dict(model_dict) # try: model.dropout.p = args.dropout_p except: pass else: print('=> no checkpoint found at {}'.format(args.resume)) ce_criterion = nn.CrossEntropyLoss() if args.loss_type == 'soft': xe_criterion = None elif args.loss_type == 'asoft': ce_criterion = None model.classifier = AngleLinear(in_features=args.embedding_size, out_features=train_dir.num_spks, m=args.m) xe_criterion = AngleSoftmaxLoss(lambda_min=args.lambda_min, lambda_max=args.lambda_max) elif args.loss_type == 'center': xe_criterion = CenterLoss(num_classes=train_dir.num_spks, feat_dim=args.embedding_size) elif args.loss_type == 'amsoft': ce_criterion = None model.classifier = AdditiveMarginLinear(feat_dim=args.embedding_size, n_classes=train_dir.num_spks) xe_criterion = AMSoftmaxLoss(margin=args.margin, s=args.s) optimizer = create_optimizer(model.parameters(), args.optimizer, **opt_kwargs) if args.loss_type == 'center': optimizer = torch.optim.SGD([{ 'params': xe_criterion.parameters(), 'lr': args.lr * 5 }, { 'params': model.parameters() }], lr=args.lr, weight_decay=args.weight_decay, momentum=args.momentum) if args.finetune: if args.loss_type == 'asoft' or args.loss_type == 'amsoft': classifier_params = list(map(id, model.classifier.parameters())) rest_params = filter(lambda p: id(p) not in classifier_params, model.parameters()) optimizer = torch.optim.SGD( [{ 'params': model.classifier.parameters(), 'lr': args.lr * 5 }, { 'params': rest_params }], lr=args.lr, weight_decay=args.weight_decay, momentum=args.momentum) if args.scheduler == 'exp': scheduler = ExponentialLR(optimizer, gamma=args.gamma) else: milestones = args.milestones.split(',') milestones = [int(x) for x in milestones] milestones.sort() scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1) ce = [ce_criterion, xe_criterion] start = args.start_epoch + start_epoch print('Start epoch is : ' + str(start)) # start = 0 end = start + args.epochs train_loader = torch.utils.data.DataLoader(train_dir, batch_size=args.batch_size, collate_fn=PadCollate( dim=2, fix_len=False, min_chunk_size=250, max_chunk_size=450), shuffle=True, **kwargs) valid_loader = torch.utils.data.DataLoader( valid_dir, batch_size=int(args.batch_size / 2), collate_fn=PadCollate(dim=2, fix_len=False, min_chunk_size=250, max_chunk_size=450), shuffle=False, 
**kwargs) test_loader = torch.utils.data.DataLoader(test_dir, batch_size=args.test_batch_size, shuffle=False, **kwargs) # sitw_test_loader = torch.utils.data.DataLoader(sitw_test_dir, batch_size=args.test_batch_size, # shuffle=False, **kwargs) # sitw_dev_loader = torch.utils.data.DataLoader(sitw_dev_part, batch_size=args.test_batch_size, shuffle=False, # **kwargs) if args.cuda: model = model.cuda() for i in range(len(ce)): if ce[i] != None: ce[i] = ce[i].cuda() for epoch in range(start, end): # pdb.set_trace() print('\n\33[1;34m Current \'{}\' learning rate is '.format( args.optimizer), end='') for param_group in optimizer.param_groups: print('{:.5f} '.format(param_group['lr']), end='') print(' \33[0m') train(train_loader, model, ce, optimizer, epoch) test(test_loader, valid_loader, model, epoch) # sitw_test(sitw_test_loader, model, epoch) # sitw_test(sitw_dev_loader, model, epoch) scheduler.step() # exit(1) writer.close()
k, batch_size, epochs = args.k, args.batch_size, args.epochs # data prepare train_data = utils.CIFAR10Instance(root='data', train=True, transform=utils.train_transform, download=True) train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=8) memory_data = utils.CIFAR10Instance(root='data', train=True, transform=utils.test_transform, download=True) memory_loader = DataLoader(memory_data, batch_size=batch_size, shuffle=False, num_workers=8) test_data = utils.CIFAR10Instance(root='data', train=False, transform=utils.test_transform, download=True) test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=8) # model setup and optimizer config model = Model(feature_dim).to('cuda') optimizer = optim.SGD(model.parameters(), lr=0.03, momentum=0.9, weight_decay=5e-4) print("# trainable model parameters:", sum(param.numel() if param.requires_grad else 0 for param in model.parameters())) lr_scheduler = MultiStepLR(optimizer, milestones=[int(epochs * 0.6), int(epochs * 0.8)], gamma=0.1) # z as normalizer, init with None, c as num of train class, n as num of train data z, c, n = None, len(memory_data.classes), len(train_data) # init memory bank as unit random vector ---> [N, D] memory_bank = F.normalize(torch.randn(n, feature_dim), dim=-1) # training loop results = {'train_loss': [], 'test_acc@1': [], 'test_acc@5': []} best_acc = 0.0 for epoch in range(1, epochs + 1): train_loss = train(model, train_loader, optimizer) results['train_loss'].append(train_loss) test_acc_1, test_acc_5 = test(model, memory_loader, test_loader) results['test_acc@1'].append(test_acc_1) results['test_acc@5'].append(test_acc_5)
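# Minimal standalone sketch of how the MultiStepLR above behaves: with epochs=200
# the milestones land at 120 and 160, so the lr stays at 0.03 for the first 120
# epochs, drops to 0.003 after the 120th scheduler step, and to 0.0003 after the
# 160th (epochs=200 is just an example value; get_last_lr needs PyTorch >= 1.4):
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

epochs = 200
optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.03, momentum=0.9, weight_decay=5e-4)
lr_scheduler = MultiStepLR(optimizer, milestones=[int(epochs * 0.6), int(epochs * 0.8)], gamma=0.1)

for epoch in range(1, epochs + 1):
    optimizer.step()        # one epoch of training would go here
    lr_scheduler.step()
    if epoch in (120, 160):
        print('epoch', epoch, '-> lr', lr_scheduler.get_last_lr())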
# net = DenseNet121() # net = ResNeXt29_2x64d() # net = MobileNet() net = MobileNetV2() # net = DPN92() # net = ShuffleNetG2() # net = SENet18() if use_cuda: net.cuda() net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count())) cudnn.benchmark = True criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) scheduler = MultiStepLR(optimizer, milestones=[150,250], gamma=0.1) # Training def train(epoch): print('\nEpoch: %d' % epoch) net.train() train_loss = 0 correct = 0 total = 0 epoch_steps = stream.size() // BATCH for batch_idx in range(epoch_steps): meta, inputs = stream.next() targets = meta.labels.astype(np.int64) inputs = torch.from_numpy(inputs) targets = torch.from_numpy(targets)
def main(): dataset_train = Dataset(data_path=opt.data_path, augment=True) loader_train = DataLoader(dataset=dataset_train, num_workers=4, batch_size=opt.batch_size, shuffle=True) print("# of training samples: %d\n" % int(len(dataset_train))) # Build model model = fusion_can_multiscale(recurrent_iter=opt.recurrent_iter, use_GPU=opt.use_gpu) print_network(model) # loss function criterion1 = SSIM() if opt.use_gpu: model = model.cuda() criterion1.cuda() # Optimizer for PreNet optimizer = optim.Adam(model.parameters(), lr=opt.lr) scheduler = MultiStepLR(optimizer, milestones=opt.milestone, gamma=0.1) # record training writer = SummaryWriter(opt.save_path) # load the lastest model initial_epoch = findLastCheckpoint(save_dir=opt.save_path) if initial_epoch > 0: print('resuming by loading epoch %d' % initial_epoch) model.load_state_dict( torch.load( os.path.join(opt.save_path, 'net_epoch%d.pth' % initial_epoch))) # start training step = 0 for epoch in range(initial_epoch, opt.epochs): scheduler.step(epoch) #update learning rate for param_group in optimizer.param_groups: print('learning rate %f' % param_group["lr"]) ## epoch training start for i, (input_train, target_train) in enumerate(loader_train, 0): model.train() #training mode of model model.zero_grad() optimizer.zero_grad() input_train, target_train = Variable(input_train), Variable( target_train) if opt.use_gpu: input_train, target_train = input_train.cuda( ), target_train.cuda() out_train = model(input_train) pixel_metric = criterion1(target_train, out_train) loss1 = -pixel_metric loss1.backward() optimizer.step() # training curve model.eval() #evaluation mode of model out_train = model(input_train) out_train = torch.clamp(out_train, 0., 1.) psnr_train = batch_PSNR(out_train, target_train, 1.) print( "[epoch %d][%d/%d] ssim_loss: %.4f, pixel_metric: %.4f, PSNR: %.4f" % (epoch + 1, i + 1, len(loader_train), loss1.item(), pixel_metric.item(), psnr_train)) if step % 10 == 0: # Log the scalar values writer.add_scalar('SSIM_loss', loss1.item(), step) #writer.add_scalar('learning_rate', loss2.item(), step) writer.add_scalar('PSNR on training data', psnr_train, step) step += 1 ## epoch training end # log the images model.eval() out_train = model(input_train) out_train = torch.clamp(out_train, 0., 1.) im_target = utils.make_grid(target_train.data, nrow=8, normalize=True, scale_each=True) im_input = utils.make_grid(input_train.data, nrow=8, normalize=True, scale_each=True) im_derain = utils.make_grid(out_train.data, nrow=8, normalize=True, scale_each=True) writer.add_image('Clean image', im_target, epoch + 1) writer.add_image('Rainy image', im_input, epoch + 1) writer.add_image('Derained image', im_derain, epoch + 1) # save model torch.save(model.state_dict(), os.path.join(opt.save_path, 'net_latest.pth')) if epoch % opt.save_freq == 0: torch.save( model.state_dict(), os.path.join(opt.save_path, 'net_epoch%d.pth' % (epoch + 1)))
def train(self, epoch, trainloader): self.model.train() train_loss = AverageMeter() prec = AverageMeter() # Declare optimizer. params = self.master_params if self.fp16_mode else self.model.parameters() optimizer = optim.SGD(params, self.args.lr, momentum=self.args.momentum, weight_decay=self.args.weight_decay) # learning rate scheduler scheduler = MultiStepLR(optimizer, milestones=[80, 120, 160, 180], gamma=0.1) # If the epoch is less than 5 and warmup is enabled, use warmup; otherwise use the scheduler. if epoch < 5 and self.args.warm_up: lr = self.warmup_learning_rate(self.args.lr, self.args.epochs, epoch, len(trainloader)) for param_group in optimizer.param_groups: param_group['lr'] = lr else: scheduler.step(epoch=epoch) # Loss criterion is in FP32. criterion = nn.CrossEntropyLoss() with trange(len(trainloader)) as t: for idx, (inputs, targets) in enumerate(trainloader): if self.train_on_gpu: inputs, targets = inputs.cuda(), targets.cuda() self.model.zero_grad() outputs = self.model(inputs) # We calculate the loss in FP32 since reduction ops can be inaccurate in FP16. loss = criterion(outputs, targets) # Sometimes the loss may become too small to be represented in FP16, so we scale the losses by a large power of 2 (2**7 here). if self.loss_scaling: loss = loss * self._LOSS_SCALE # Calculate the gradients loss.backward() if self.fp16_mode: # Move the calculated gradients to the master params so that we can apply the gradient update in FP32. self.model_grads_to_master_grads(self.model_params, self.master_params) if self.loss_scaling: # If we scaled our losses, now is a good time to scale them back, since our gradients are in FP32. for params in self.master_params: params.grad.data = params.grad.data / self._LOSS_SCALE # Apply weight update in FP32. optimizer.step() # Copy the updated weights back to the FP16 model weights. self.master_params_to_model_params(self.model_params, self.master_params) else: optimizer.step() # Only un-scale the reported loss if loss scaling was actually applied. train_loss.update(loss.item() / self._LOSS_SCALE if self.loss_scaling else loss.item(), inputs.size(0)) top1 = accuracy(outputs, targets)[0] prec.update(top1.item(), inputs.size(0)) metrics = { 'Epoch': f'{epoch + 1}', 'Loss': '%.2f' % train_loss.avg, 'Acc': '%.1f' % prec.avg, 'LR': '%.4f' % get_optim_lr(optimizer) } t.set_postfix(metrics) t.update() t.close() self.history['loss'].append(train_loss.avg) self.history['acc'].append(prec.avg)
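# The method above warms the lr up by hand for the first five epochs and then jumps
# MultiStepLR forward with scheduler.step(epoch=epoch). A minimal alternative sketch
# (assuming PyTorch >= 1.10, where LinearLR and SequentialLR are available) that
# chains the warmup and the multi-step decay into one scheduler stepped once per
# epoch; the linear ramp shape is an assumption, and note that the MultiStepLR
# milestones are counted from the hand-off epoch, so they may need adjusting to
# match the manual schedule exactly:
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LinearLR, MultiStepLR, SequentialLR

optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1, momentum=0.9, weight_decay=5e-4)
warmup = LinearLR(optimizer, start_factor=0.2, total_iters=5)               # 5-epoch ramp-up
decay = MultiStepLR(optimizer, milestones=[80, 120, 160, 180], gamma=0.1)   # counted from the hand-off
scheduler = SequentialLR(optimizer, schedulers=[warmup, decay], milestones=[5])

for epoch in range(200):
    optimizer.step()   # one epoch of training would go here
    scheduler.step()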
def main(factor): global meter_loss global meter_psnr global scheduler global engine global epoch_num global psnr_value global loss_value global train_loader global val_loader global model global criterion global UPSCALE_FACTOR parser = argparse.ArgumentParser(description='Super Resolution Training') parser.add_argument('--upscale_factor', default=3, type=int, help='super resolution upscale factor') parser.add_argument('--num_epochs', default=100, type=int, help='super resolution epochs number') opt = parser.parse_args() UPSCALE_FACTOR = opt.upscale_factor NUM_EPOCHS = opt.num_epochs if factor != 3: UPSCALE_FACTOR = factor train_set = DatasetFromFolder('data/train', upscale_factor=UPSCALE_FACTOR, input_transform=transforms.ToTensor(), target_transform=transforms.ToTensor()) val_set = DatasetFromFolder('data/val', upscale_factor=UPSCALE_FACTOR, input_transform=transforms.ToTensor(), target_transform=transforms.ToTensor()) train_loader = DataLoader(dataset=train_set, num_workers=0, batch_size=64, shuffle=True) val_loader = DataLoader(dataset=val_set, num_workers=0, batch_size=64, shuffle=False) model = SPCNNet(upscale_factor=UPSCALE_FACTOR) criterion = nn.MSELoss() if torch.cuda.is_available(): model = model.cuda() criterion = criterion.cuda() print('# upscale factor:', UPSCALE_FACTOR) print('# parameters:', sum(param.numel() for param in model.parameters())) optimizer = optim.Adam(model.parameters(), lr=1e-3) scheduler = MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1) engine = Engine() meter_loss = tnt.meter.AverageValueMeter() meter_psnr = PSNRMeter() epoch_num = [] psnr_value = [] loss_value = [] engine.hooks['on_sample'] = on_sample engine.hooks['on_forward'] = on_forward engine.hooks['on_start_epoch'] = on_start_epoch engine.hooks['on_end_epoch'] = on_end_epoch engine.train(processor, train_loader, maxepoch=NUM_EPOCHS, optimizer=optimizer) plt.plot(epoch_num, psnr_value, lw=2, ls='-', label="PSNR--x"+str(UPSCALE_FACTOR), color="r", marker="+") plt.xlabel("epoch time(s)", fontsize=16, horizontalalignment="right") plt.ylabel("PSNR value", fontsize=16, horizontalalignment="right") plt.legend() plt.savefig('D:\大三上\数字图像处理\SR_Project\plots\PSNRx'+str(UPSCALE_FACTOR)+'.png') plt.show() plt.plot(epoch_num, loss_value, lw=2, ls='-', label="Loss--x"+str(UPSCALE_FACTOR), color="r", marker="+") plt.xlabel("epoch time(s)", fontsize=16, horizontalalignment="right") plt.ylabel("Loss value", fontsize=16, horizontalalignment="right") plt.legend() plt.savefig('D:\大三上\数字图像处理\SR_Project\plots\LOSSx'+str(UPSCALE_FACTOR)+'.png') plt.show()
def train(ds, fold, train_idx, val_idx, conf, val_ds=None, transforms=None, val_transforms=None): if conf.model_fqn.endswith('SeResnext50_32d4d_upsample'): model = dynamic_load(conf.model_fqn)( num_classes=conf.num_classes, num_channels=conf.num_channels, pretrained_file=(conf.pretrained_model if 'pretrained_model' in conf else None), ) else: model = dynamic_load(conf.model_fqn)( num_classes=conf.num_classes, num_channels=conf.num_channels, ) # save_path = u.prefix_path() + f'/working/sp5r2/models/weights/{conf.modelname}/fold{fold}' save_path = f'/wdata/working/sp5r2/models/weights/{conf.modelname}/fold{fold}' Path(save_path).mkdir(parents=True, exist_ok=True) # tfb_path = u.prefix_path() + f'/working/sp5r2/models/logs/{conf.modelname}/fold{fold}' tfb_path = f'/wdata/working/sp5r2/models/logs/{conf.modelname}/fold{fold}' Path(tfb_path).mkdir(parents=True, exist_ok=True) optimizer = dynamic_load(conf.optimizer_fqn) estimator = Estimator(model, optimizer, save_path, config=conf) estimator.lr_scheduler = MultiStepLR(estimator.optimizer, conf.lr_steps, gamma=conf.lr_gamma) if 'scheduler' in conf: scheduler_class = dynamic_load(conf.scheduler) if conf.scheduler.endswith('CosineAnnealingLR'): conf.scheduler_params['optimizer'] = estimator.optimizer estimator.lr_scheduler = scheduler_class(**conf.scheduler_params) callbacks = [ ModelSaver(1, ("fold" + str(fold) + "_best.pth"), best_only=True), ModelSaver(1, ("fold" + str(fold) + "_last.pth"), best_only=False), CheckpointSaver(1, ("fold" + str(fold) + "_checkpoint.pth")), CheckpointSaver( 1, ("fold" + str(fold) + "_ep{epoch}_{loss}_checkpoint.pth")), TensorBoard(tfb_path), ] if 'early_stopper_patience' in conf: callbacks.append(EarlyStopper(conf.early_stopper_patience)) trainer = PytorchTrain(estimator, conf=conf, fold=fold, callbacks=callbacks, no_eval_period=conf.get('no_eval_period', 0)) train_dataset = TrainDataset(ds, train_idx, conf, transforms=transforms, verbose=False) train_loader = PytorchDataLoader(train_dataset, batch_size=conf.batch_size, shuffle=True, drop_last=True, num_workers=conf.num_workers, pin_memory=True) val_dataset = ValDataset(val_ds if val_ds is not None else ds, val_idx, conf, transforms=val_transforms) val_loader = PytorchDataLoader( val_dataset, batch_size=conf.batch_size if not conf.ignore_target_size else 1, shuffle=False, drop_last=False, num_workers=conf.num_workers, pin_memory=True) trainer.fit(train_loader, val_loader, conf.nb_epoch)
center_variance=0.1, size_variance=0.2, device=DEVICE) optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) logging.info( f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, " + f"Extra Layers learning rate: {extra_layers_lr}.") if args.scheduler == 'multi-step': logging.info("Uses MultiStepLR scheduler.") milestones = [int(v.strip()) for v in args.milestones.split(",")] scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1, last_epoch=last_epoch) elif args.scheduler == 'cosine': logging.info("Uses CosineAnnealingLR scheduler.") scheduler = CosineAnnealingLR(optimizer, args.t_max, last_epoch=last_epoch) else: logging.fatal(f"Unsupported Scheduler: {args.scheduler}.") parser.print_help(sys.stderr) sys.exit(1) logging.info(f"Start training from epoch {last_epoch + 1}.") for epoch in range(last_epoch + 1, args.num_epochs): scheduler.step() train(train_loader,
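# Building MultiStepLR with last_epoch != -1 (as above when resuming) requires every
# param_group to already carry an 'initial_lr' entry, otherwise the constructor
# raises a KeyError; that entry is normally restored through the optimizer and
# scheduler state_dicts. A minimal sketch of the two common resume paths (the
# milestone values and checkpoint layout below are assumptions, not this script's):
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.01, momentum=0.9)
last_epoch = 79  # e.g. recovered from the checkpoint

# Option 1: make sure 'initial_lr' exists, then rebuild the scheduler mid-schedule.
for group in optimizer.param_groups:
    group.setdefault('initial_lr', group['lr'])
scheduler = MultiStepLR(optimizer, milestones=[80, 120], gamma=0.1, last_epoch=last_epoch)

# Option 2 (usually simpler): build it fresh and restore its saved state instead.
# scheduler = MultiStepLR(optimizer, milestones=[80, 120], gamma=0.1)
# scheduler.load_state_dict(checkpoint['scheduler'])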
def train_model_residual_lowlight_rdn(): device = DEVICE # prepare the data train_set = HsiCubicTrainDataset('./data/train_lowlik04/') #print('trainset32 training example:', len(train_set32)) #train_set = HsiCubicTrainDataset('./data/train_lowlight/') #train_set_64 = HsiCubicTrainDataset('./data/train_lowlight_patchsize64/') #train_set_list = [train_set32, train_set_64] #train_set = ConcatDataset(train_set_list)  # the samples must all be the same size, otherwise concatenation fails print('total training examples:', len(train_set)) train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True) # load the test label data mat_src_path = './data/test_lowlight/origin/soup_bigcorn_orange_1ms.mat' test_label_hsi = scio.loadmat(mat_src_path)['label'] # load the test data batch_size = 1 #test_data_dir = './data/test_lowlight/cuk12/' test_data_dir = './data/test_lowlight/cuk04/' test_set = HsiCubicLowlightTestDataset(test_data_dir) test_dataloader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False) batch_size, channel, width, height = next(iter(test_dataloader))[0].shape band_num = len(test_dataloader) denoised_hsi = np.zeros((width, height, band_num)) save_model_path = './checkpoints/hsirnd_k04' if not os.path.exists(save_model_path): os.mkdir(save_model_path) # create the model net = HSIRDNECA(K) init_params(net) net = nn.DataParallel(net).to(device) #net = net.to(device) # create the optimizer #hsid_optimizer = optim.Adam(net.parameters(), lr=INIT_LEARNING_RATE, betas=(0.9, 0.999)) hsid_optimizer = optim.Adam(net.parameters(), lr=INIT_LEARNING_RATE) scheduler = MultiStepLR(hsid_optimizer, milestones=[200, 400], gamma=0.5) # define the loss function #criterion = nn.MSELoss() best_psnr = 0 is_resume = RESUME # resume training if is_resume: path_chk_rest = dir_utils.get_last_path(save_model_path, 'model_latest.pth') model_utils.load_checkpoint(net, path_chk_rest) start_epoch = model_utils.load_start_epoch(path_chk_rest) + 1 model_utils.load_optim(hsid_optimizer, path_chk_rest) best_psnr = model_utils.load_best_psnr(path_chk_rest) for i in range(1, start_epoch): scheduler.step() new_lr = scheduler.get_lr()[0] print( '------------------------------------------------------------------------------' ) print("==> Resuming Training with learning rate:", new_lr) print( '------------------------------------------------------------------------------' ) global tb_writer tb_writer = get_summary_writer(log_dir='logs') gen_epoch_loss_list = [] cur_step = 0 first_batch = next(iter(train_loader)) best_epoch = 0 best_iter = 0 if not is_resume: start_epoch = 1 num_epoch = 600 mpsnr_list = [] for epoch in range(start_epoch, num_epoch + 1): epoch_start_time = time.time() scheduler.step() print('epoch = ', epoch, 'lr={:.6f}'.format(scheduler.get_lr()[0])) print(scheduler.get_lr()) gen_epoch_loss = 0 net.train() #for batch_idx, (noisy, label) in enumerate([first_batch] * 300): for batch_idx, (noisy, cubic, label) in enumerate(train_loader): #print('batch_idx=', batch_idx) noisy = noisy.to(device) label = label.to(device) cubic = cubic.to(device) hsid_optimizer.zero_grad() #denoised_img = net(noisy, cubic) #loss = loss_fuction(denoised_img, label) residual = net(noisy, cubic) alpha = 0.8 loss = recon_criterion(residual, label - noisy) #loss = alpha*recon_criterion(residual, label-noisy) + (1-alpha)*loss_function_mse(residual, label-noisy) #loss = recon_criterion(residual, label-noisy) loss.backward() # compute gradients hsid_optimizer.step() # update parameters gen_epoch_loss += loss.item() if cur_step % display_step == 0: if cur_step > 0: print( f"Epoch {epoch}: Step {cur_step}: Batch_idx {batch_idx}: MSE loss: {loss.item()}" ) 
else: print("Pretrained initial state") tb_writer.add_scalar("MSE loss", loss.item(), cur_step) #step ++,每一次循环,每一个batch的处理,叫做一个step cur_step += 1 gen_epoch_loss_list.append(gen_epoch_loss) tb_writer.add_scalar("mse epoch loss", gen_epoch_loss, epoch) #scheduler.step() #print("Decaying learning rate to %g" % scheduler.get_last_lr()[0]) torch.save( { 'gen': net.state_dict(), 'gen_opt': hsid_optimizer.state_dict(), }, f"{save_model_path}/hsid_rdn_eca_l1_loss_600epoch_patchsize32_{epoch}.pth" ) #测试代码 net.eval() psnr_list = [] for batch_idx, (noisy_test, cubic_test, label_test) in enumerate(test_dataloader): noisy_test = noisy_test.type(torch.FloatTensor) label_test = label_test.type(torch.FloatTensor) cubic_test = cubic_test.type(torch.FloatTensor) noisy_test = noisy_test.to(DEVICE) label_test = label_test.to(DEVICE) cubic_test = cubic_test.to(DEVICE) with torch.no_grad(): residual = net(noisy_test, cubic_test) denoised_band = noisy_test + residual denoised_band_numpy = denoised_band.cpu().numpy().astype( np.float32) denoised_band_numpy = np.squeeze(denoised_band_numpy) denoised_hsi[:, :, batch_idx] = denoised_band_numpy if batch_idx == 49: residual_squeezed = torch.squeeze(residual, axis=0) denoised_band_squeezed = torch.squeeze(denoised_band, axis=0) label_test_squeezed = torch.squeeze(label_test, axis=0) noisy_test_squeezed = torch.squeeze(noisy_test, axis=0) tb_writer.add_image(f"images/{epoch}_restored", denoised_band_squeezed, 1, dataformats='CHW') tb_writer.add_image(f"images/{epoch}_residual", residual_squeezed, 1, dataformats='CHW') tb_writer.add_image(f"images/{epoch}_label", label_test_squeezed, 1, dataformats='CHW') tb_writer.add_image(f"images/{epoch}_noisy", noisy_test_squeezed, 1, dataformats='CHW') test_label_current_band = test_label_hsi[:, :, batch_idx] psnr = PSNR(denoised_band_numpy, test_label_current_band) psnr_list.append(psnr) mpsnr = np.mean(psnr_list) mpsnr_list.append(mpsnr) denoised_hsi_trans = denoised_hsi.transpose(2, 0, 1) test_label_hsi_trans = test_label_hsi.transpose(2, 0, 1) mssim = SSIM(denoised_hsi_trans, test_label_hsi_trans) sam = SAM(denoised_hsi_trans, test_label_hsi_trans) #计算pnsr和ssim print("=====averPSNR:{:.3f}=====averSSIM:{:.4f}=====averSAM:{:.3f}". format(mpsnr, mssim, sam)) tb_writer.add_scalars("validation metrics", { 'average PSNR': mpsnr, 'average SSIM': mssim, 'avarage SAM': sam }, epoch) #通过这个我就可以看到,那个epoch的性能是最好的 #保存best模型 if mpsnr > best_psnr: best_psnr = mpsnr best_epoch = epoch best_iter = cur_step torch.save( { 'epoch': epoch, 'gen': net.state_dict(), 'gen_opt': hsid_optimizer.state_dict(), }, f"{save_model_path}/hsid_rdn_eca_l1_loss_600epoch_patchsize32_best.pth" ) print( "[epoch %d it %d PSNR: %.4f --- best_epoch %d best_iter %d Best_PSNR %.4f]" % (epoch, cur_step, mpsnr, best_epoch, best_iter, best_psnr)) print( "------------------------------------------------------------------" ) print("Epoch: {}\tTime: {:.4f}\tLoss: {:.4f}\tLearningRate {:.6f}". format(epoch, time.time() - epoch_start_time, gen_epoch_loss, INIT_LEARNING_RATE)) print( "------------------------------------------------------------------" ) #保存当前模型 torch.save( { 'epoch': epoch, 'gen': net.state_dict(), 'gen_opt': hsid_optimizer.state_dict(), 'best_psnr': best_psnr, }, os.path.join(save_model_path, "model_latest.pth")) mpsnr_list_numpy = np.array(mpsnr_list) np.save(os.path.join(save_model_path, "mpsnr_per_epoch.npy"), mpsnr_list_numpy) tb_writer.close()
def main(args): model = load_config(args.model) dataset = load_config(args.dataset) device = torch.device('cuda' if model['common']['cuda'] else 'cpu') if model['common']['cuda'] and not torch.cuda.is_available(): sys.exit('Error: CUDA requested but not available') # if args.batch_size < 2: # sys.exit('Error: PSPNet requires more than one image for BatchNorm in Pyramid Pooling') os.makedirs(model['common']['checkpoint'], exist_ok=True) num_classes = len(dataset['common']['classes']) net = UNet(num_classes).to(device) if args.resume: path = os.path.join(model['common']['checkpoint'], args.resume) cuda = model['common']['cuda'] def map_location(storage, _): return storage.cuda() if cuda else storage.cpu() chkpt = torch.load(path, map_location=map_location) net.load_state_dict(chkpt) resume_at_epoch = int(args.resume[11:16]) else: resume_at_epoch = 0 if model['common']['cuda']: torch.backends.cudnn.benchmark = True net = DataParallel(net) optimizer = SGD(net.parameters(), lr=model['opt']['lr'], momentum=model['opt']['momentum']) scheduler = MultiStepLR(optimizer, milestones=model['opt']['milestones'], gamma=model['opt']['gamma']) weight = torch.Tensor(dataset['weights']['values']) for i in range(resume_at_epoch): scheduler.step() criterion = CrossEntropyLoss2d(weight=weight).to(device) # criterion = FocalLoss2d(weight=weight).to(device) train_loader, val_loader = get_dataset_loaders(model, dataset) num_epochs = model['opt']['epochs'] history = collections.defaultdict(list) for epoch in range(resume_at_epoch, num_epochs): print('Epoch: {}/{}'.format(epoch + 1, num_epochs)) train_hist = train(train_loader, num_classes, device, net, optimizer, scheduler, criterion) print('Train loss: {:.4f}, mean IoU: {:.4f}'.format(train_hist['loss'], train_hist['iou'])) for k, v in train_hist.items(): history['train ' + k].append(v) val_hist = validate(val_loader, num_classes, device, net, criterion) print('Validate loss: {:.4f}, mean IoU: {:.4f}'.format(val_hist['loss'], val_hist['iou'])) for k, v in val_hist.items(): history['val ' + k].append(v) visual = 'history-{:05d}-of-{:05d}.png'.format(epoch + 1, num_epochs) plot(os.path.join(model['common']['checkpoint'], visual), history) checkpoint = 'checkpoint-{:05d}-of-{:05d}.pth'.format(epoch + 1, num_epochs) torch.save(net.state_dict(), os.path.join(model['common']['checkpoint'], checkpoint))
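# The resume path above replays the schedule with a loop of scheduler.step() calls.
# An equivalent, minimal sketch that checkpoints and restores the scheduler state
# directly instead (the checkpoint keys below are assumptions, not the format this
# script writes):
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

net_params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = SGD(net_params, lr=0.01, momentum=0.9)
scheduler = MultiStepLR(optimizer, milestones=[10, 20], gamma=0.1)

# at save time
torch.save({'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()}, 'checkpoint.pth')

# at resume time
chkpt = torch.load('checkpoint.pth', map_location='cpu')
optimizer.load_state_dict(chkpt['optimizer'])
scheduler.load_state_dict(chkpt['scheduler'])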