def main():
    opt = get_opt()
    print(opt)
    print("GMM: Start to %s, named: %s!" % (opt.stage, "GMM"))

    # Dataset setup
    dataset = Dataset(opt, "GMM")
    dataset_loader = DataLoader(opt, dataset)
    model = GMM(opt)

    if opt.stage == 'train':
        # Optionally resume from an existing checkpoint before training
        if opt.checkpoint != '' and os.path.exists(opt.checkpoint):
            load_checkpoint(model, opt.checkpoint)
        train_gmm(opt, dataset_loader, model)
        save_checkpoint(
            model,
            os.path.join(opt.checkpoint_dir, opt.name, 'gmm_trained.pth'))
    elif opt.stage == 'test':
        load_checkpoint(model, opt.checkpoint)
        with torch.no_grad():
            test_gmm(opt, dataset_loader, model)
    else:
        raise NotImplementedError('Please input train or test stage')

    print('Finished %s stage, named: %s!' % (opt.stage, opt.name))
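# The GMM script above calls load_checkpoint/save_checkpoint without defining
# them. A minimal sketch, assuming they simply wrap state_dict serialization to
# disk; the project's real helpers may store additional metadata.
import os
import torch


def save_checkpoint(model, save_path):
    # Create the target directory if needed, then persist only the weights.
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    torch.save(model.state_dict(), save_path)


def load_checkpoint(model, checkpoint_path):
    # Restore weights in place; map_location keeps this usable on CPU-only hosts.
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(checkpoint_path)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))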
def main(use_cuda: bool, data_dirs: Union[str, Sequence[str]],
         weights: Optional[Sequence[Number]], ckpt_root: str, latent_dim: int,
         num_epochs: int, batch_size: int, save: bool, resume: bool, plot: bool):
    device = torch.device('cuda' if use_cuda else 'cpu')

    if isinstance(data_dirs, str):
        data_dirs = [data_dirs]
    dataset_names = [os.path.split(data_dir)[-1] for data_dir in data_dirs]
    ckpt_name = spec_util.format_setup_spec('VAE', latent_dim, dataset_names)
    print(f"Training {ckpt_name}...")
    ckpt_dir = None if ckpt_root is None else os.path.join(ckpt_root, ckpt_name)

    train_set = data_util.get_dataset(data_dirs, weights, train=True)
    test_set = data_util.get_dataset(data_dirs, weights, train=False)
    test_batch_size = 32
    dl_kwargs = dict(num_workers=1, pin_memory=True) if use_cuda else {}
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, **dl_kwargs)
    test_loader = DataLoader(test_set, batch_size=test_batch_size, shuffle=True, **dl_kwargs)
    num_batches = len(train_loader.dataset) // train_loader.batch_size

    model = vae.VAE(latent_dim)
    trainer = vae.Trainer(model, beta=4.)
    trainer.to(device)
    test_iterator = iter(test_loader)

    start_epoch = -1
    if resume:
        try:
            start_epoch = load_checkpoint(trainer, ckpt_dir)
            if plot:
                test(model, next(test_iterator)[0])
        except ValueError:
            print(f"No checkpoint to resume from in {ckpt_dir}")
        except FileNotFoundError:
            print(f"Invalid checkpoint directory: {ckpt_dir}")
    elif save:
        if os.path.exists(ckpt_dir):
            print(f"Clearing existing checkpoints in {ckpt_dir}")
            for filename in os.listdir(ckpt_dir):
                os.remove(os.path.join(ckpt_dir, filename))

    for epoch in range(start_epoch + 1, num_epochs):
        trainer.train()
        for batch_idx, (data, _) in enumerate(train_loader):
            verbose = batch_idx % 10 == 0
            if verbose:
                print(f"[{epoch}/{num_epochs}: {batch_idx:3d}/{num_batches:3d}] ", end='')
            real_data = data.to(device).unsqueeze(1).float() / 255.
            trainer.step(real_data, verbose)
        if save:
            save_checkpoint(trainer, ckpt_dir, epoch)
        if plot:
            test(model, next(test_iterator)[0])
def train_gmm(opt, loader, model):
    model.cuda()
    model.train()

    criterionL1 = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.5, 0.999))
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: 1.0 - max(0, step - opt.max_step) / float(
            (opt.max_step // 2) + 1))

    for step in range(opt.max_step):
        iter_start_time = time.time()
        inputs = loader.next_batch()

        agnostic = inputs['agnostic'].cuda()
        c = inputs['cloth'].cuda()
        im_c = inputs['parse_cloth'].cuda()

        # Predict the sampling grid and warp the in-shop cloth with it
        grid, theta = model(agnostic, c)
        warped_cloth = F.grid_sample(c, grid, padding_mode='border')

        loss = criterionL1(warped_cloth, im_c)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule

        # Display a loss update every 50 steps
        if (step + 1) % 50 == 0:
            t = time.time() - iter_start_time
            print('step: %8d, time: %.3f, loss: %.4f' % (step + 1, t, loss.item()))

        # Save the model parameters every 500 steps so training can be resumed after a crash
        if (step + 1) % 500 == 0:
            save_checkpoint(
                model,
                os.path.join(opt.checkpoint_dir, opt.name, 'step_%06d.pth' % (step + 1)))
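# train_gmm's central operation is F.grid_sample, which warps the in-shop cloth
# with the sampling grid predicted by the GMM. A self-contained illustration of
# the same call using an identity affine grid (hypothetical tensor sizes), so
# the warped output reproduces the input:
import torch
import torch.nn.functional as F

# One fake RGB "cloth" image, 256x192, values in [-1, 1].
c = torch.rand(1, 3, 256, 192) * 2 - 1

# Identity affine transform -> identity sampling grid of shape (N, H, W, 2).
theta = torch.tensor([[[1.0, 0.0, 0.0],
                       [0.0, 1.0, 0.0]]])
grid = F.affine_grid(theta, size=c.shape, align_corners=False)

# Same call as in train_gmm: sample c at the grid locations, clamping
# out-of-range coordinates to the border.
warped = F.grid_sample(c, grid, padding_mode='border', align_corners=False)
assert torch.allclose(warped, c, atol=1e-4)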
if arg.arch == 'vgg':
    input_size = 25088
    model = models.vgg16(pretrained=True)
elif arg.arch == 'densenet':
    input_size = 1024
    model = models.densenet121(pretrained=True)

# Freeze the pretrained feature extractor and train only the new classifier head
for param in model.parameters():
    param.requires_grad = False

model.classifier = nn.Sequential(nn.Linear(input_size, arg.hidden_units),
                                 nn.ReLU(),
                                 nn.Dropout(0.5),
                                 nn.Linear(arg.hidden_units, 1000),
                                 nn.LogSoftmax(dim=1))

criterion = nn.NLLLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=arg.learning_rate)
model.to(device)

print('First Step: Validation')
validate(model, validloader, device, criterion)

print('Second Step: Training')
train_classifer(model, trainloader, arg.epochs, device, optimizer, criterion, validloader)

print('Third Step: Testing')
classifier_test(model, device, testloader, criterion)

print('Fourth Step: Saving Checkpoint')
save_checkpoint(model, train_data, arg.arch, arg.epochs, arg.learning_rate,
                arg.hidden_units, input_size)

print('Finish')
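# The validate helper called above is not defined in this snippet. A minimal
# sketch that is consistent with the call site, assuming it reports average
# NLL loss and top-1 accuracy over the validation loader:
import torch


def validate(model, validloader, device, criterion):
    model.eval()
    total_loss, correct, count = 0.0, 0, 0
    with torch.no_grad():
        for images, labels in validloader:
            images, labels = images.to(device), labels.to(device)
            log_ps = model(images)  # LogSoftmax output from the classifier head
            total_loss += criterion(log_ps, labels).item() * labels.size(0)
            correct += (log_ps.argmax(dim=1) == labels).sum().item()
            count += labels.size(0)
    model.train()
    print('Validation loss: %.3f, accuracy: %.3f' % (total_loss / count, correct / count))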
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    prepare_seed(args.rand_seed)

    logstr = 'seed-{:}-time-{:}'.format(args.rand_seed, time_for_file())
    logger = Logger(args.save_path, logstr)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python version : {}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow version : {}".format(PIL.__version__))
    logger.log("PyTorch version : {}".format(torch.__version__))
    logger.log("cuDNN version : {}".format(torch.backends.cudnn.version()))

    # General data augmentation
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    assert args.arg_flip == False, 'The flip is : {}, rotate is {}'.format(args.arg_flip, args.rotate_max)
    train_transform = [transforms.PreCrop(args.pre_crop_expand)]
    train_transform += [transforms.TrainScale2WH((args.crop_width, args.crop_height))]
    train_transform += [transforms.AugScale(args.scale_prob, args.scale_min, args.scale_max)]
    # if args.arg_flip:
    #     train_transform += [transforms.AugHorizontalFlip()]
    if args.rotate_max:
        train_transform += [transforms.AugRotate(args.rotate_max)]
    train_transform += [transforms.AugCrop(args.crop_width, args.crop_height, args.crop_perturb_max, mean_fill)]
    train_transform += [transforms.ToTensor(), normalize]
    train_transform = transforms.Compose(train_transform)
    eval_transform = transforms.Compose(
        [transforms.PreCrop(args.pre_crop_expand),
         transforms.TrainScale2WH((args.crop_width, args.crop_height)),
         transforms.ToTensor(), normalize])
    assert (args.scale_min + args.scale_max) / 2 == args.scale_eval, \
        'The scale is not ok : {},{} vs {}'.format(args.scale_min, args.scale_max, args.scale_eval)

    # Load the model configuration
    model_config = load_configure(args.model_config, logger)
    args.sigma = args.sigma * args.scale_eval
    logger.log('Real Sigma : {:}'.format(args.sigma))

    # Training dataset
    train_data = GeneralDataset(train_transform, args.sigma, model_config.downsample,
                                args.heatmap_type, args.data_indicator)
    train_data.load_list(args.train_lists, args.num_pts, True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers,
                                               pin_memory=True)

    # Evaluation dataloaders
    eval_loaders = []
    if args.eval_ilists is not None:
        for eval_ilist in args.eval_ilists:
            eval_idata = GeneralDataset(eval_transform, args.sigma, model_config.downsample,
                                        args.heatmap_type, args.data_indicator)
            eval_idata.load_list(eval_ilist, args.num_pts, True)
            eval_iloader = torch.utils.data.DataLoader(eval_idata, batch_size=args.batch_size,
                                                       shuffle=False, num_workers=args.workers,
                                                       pin_memory=True)
            eval_loaders.append((eval_iloader, False))

    # Define the network
    logger.log('configure : {:}'.format(model_config))
    net = obtain_model(model_config, args.num_pts + 1)
    assert model_config.downsample == net.downsample, \
        'downsample is not correct : {} vs {}'.format(model_config.downsample, net.downsample)
    logger.log("=> network :\n {}".format(net))

    logger.log('Training-data : {:}'.format(train_data))
    for i, (eval_loader, is_video) in enumerate(eval_loaders):
        logger.log('The [{:2d}/{:2d}]-th testing-data [{:}] = {:}'.format(
            i, len(eval_loaders), 'video' if is_video else 'image', eval_loader.dataset))
    logger.log('arguments : {:}'.format(args))
    opt_config = load_configure(args.opt_config, logger)

    if hasattr(net, 'specify_parameter'):
        net_param_dict = net.specify_parameter(opt_config.LR, opt_config.Decay)
    else:
        net_param_dict = net.parameters()

    optimizer, scheduler, criterion = obtain_optimizer(net_param_dict, opt_config, logger)
    logger.log('criterion : {:}'.format(criterion))
    net, criterion = net.cuda(), criterion.cuda()
    net = torch.nn.DataParallel(net)

    last_info = logger.last_info()
    if last_info.exists():
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(str(last_info))
        start_epoch = last_info['epoch'] + 1
        checkpoint = torch.load(last_info['last_checkpoint'])
        assert last_info['epoch'] == checkpoint['epoch'], \
            'Last-Info is not right {:} vs {:}'.format(last_info, checkpoint['epoch'])
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        logger.log("=> load-ok checkpoint '{:}' (epoch {:}) done".format(logger.last_info(), checkpoint['epoch']))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch = 0

    if args.eval_once:
        logger.log("=> only evaluate the model once")
        eval_results = eval_all(args, eval_loaders, net, criterion, 'eval-once', logger, opt_config)
        logger.close()
        return

    # Main training and evaluation loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, opt_config.epochs):
        scheduler.step()
        need_time = convert_secs2time(epoch_time.avg * (opt_config.epochs - epoch), True)
        epoch_str = 'epoch-{:03d}-{:03d}'.format(epoch, opt_config.epochs)
        LRs = scheduler.get_lr()
        logger.log('\n==>>{:s} [{:s}], [{:s}], LR : [{:.5f} ~ {:.5f}], Config : {:}'.format(
            time_string(), epoch_str, need_time, min(LRs), max(LRs), opt_config))

        # Train for one epoch
        train_loss, train_nme = train(args, train_loader, net, criterion, optimizer, epoch_str, logger, opt_config)
        # Log the results
        logger.log('==>>{:s} Train [{:}] Average Loss = {:.6f}, NME = {:.2f}'.format(
            time_string(), epoch_str, train_loss, train_nme * 100))

        # Remember best prec@1 and save checkpoint
        save_path = save_checkpoint({
            'epoch': epoch,
            'args': deepcopy(args),
            'arch': model_config.arch,
            'state_dict': net.state_dict(),
            'scheduler': scheduler.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, str(logger.path('model') / '{:}-{:}.pth'.format(model_config.arch, epoch_str)), logger)

        last_info = save_checkpoint({
            'epoch': epoch,
            'last_checkpoint': save_path,
        }, str(logger.last_info()), logger)

        eval_results = eval_all(args, eval_loaders, net, criterion, epoch_str, logger, opt_config)

        # Measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.close()
    verbose=True)

if opts["cuda"] == 1:
    # The context and seg net were already moved
    to_cuda([tms["score_module"], tms["lfn"]])

# Setup visualization
sp = None
if opts["visualize_epochs"]:
    import visdom
    import plot_voxels as pv

    vis = visdom.Visdom(server="http://localhost")
    sp = pv.SchematicPlotter(vis)
    run_visualization(sp, tms, dataset, opts, None, 2, 1, False)

# Run training
for i in range(opts["nepoch"]):
    train_loss, train_error = train_epoch(tms, dataloader, opts)
    pretty_log("train loss {:<5.4f} error {:<5.2f} {}".format(
        train_loss, train_error * 100, i))
    if opts["checkpoint"] != "":
        metadata = {
            "epoch": i,
            "train_loss": train_loss,
            "train_error": train_error,
        }
        models.save_checkpoint(tms, metadata, opts, opts["checkpoint"])
    if opts["visualize_epochs"]:
        run_visualization(sp, tms, dataset, opts, opts["checkpoint"], 2, 1, False)
def main(args=None):
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    print('--------------Arguments----------------')
    print('data_type : ', args.data_type)
    print('learning_rate : ', args.learning_rate)
    print('validation : ', args.validation)
    print('epochs : ', args.epochs)
    print('keep_train : ', args.keep_train)
    print('pretrain_imagenet : ', args.pretrain_imagenet)
    print('train_batch_size : ', args.train_batch_size)
    print('val_batch_size : ', args.val_batch_size)
    print('dropouts : ', args.dropouts)
    print('weighted_loss : ', args.weighted_loss)
    print('---------------------------------------')

    mean = nml_cfg.mean
    std = nml_cfg.std

    # Make snapshot directory
    tools.directoryMake(path_cfg.snapshot_root_path)

    train_dir = os.path.join(path_cfg.data_root_path, args.data_type)
    val_dir = os.path.join(path_cfg.data_root_path, 'val', args.data_type)

    # Make train data_loader
    train_data = ds.ImageFolder(
        train_dir,
        transforms.Compose([
            transforms.Resize(299),
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            # transforms.RandomVerticalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]))
    num_of_class = len(os.listdir(train_dir))
    train_loader = data.DataLoader(train_data,
                                   batch_size=args.train_batch_size,
                                   shuffle=True,
                                   drop_last=False)

    # Make validation data_loader
    if args.validation:
        val_data = ds.ImageFolder(
            val_dir,
            transforms.Compose(
                [transforms.ToTensor(),
                 transforms.Normalize(mean, std)]))
        num_of_val_class = len(os.listdir(val_dir))
        val_loader = data.DataLoader(val_data,
                                     batch_size=args.val_batch_size,
                                     shuffle=False,
                                     drop_last=False)

    print('----------------Data-------------------')
    print('num_of_class : ', num_of_class)
    print('num_of_images : ', len(train_data))
    print('---------------------------------------\n\n')

    class_list = train_data.classes

    # Make class weights for the (optionally) weighted loss
    weight = make_weight(train_dir, class_list, args.weighted_loss)

    for model_idx, model_name in enumerate(model_name_list):
        save_model_name = model_name + '_' + args.data_type
        CNN_model, CNN_optimizer, CNN_criterion, CNN_scheduler = model_setter(
            model_name,
            weight,
            learning_rate=args.learning_rate,
            output_size=num_of_class,
            usePretrained=args.pretrain_imagenet,
            dropouts=args.dropouts)

        if args.keep_train:
            CNN_model = models.load_checkpoint(CNN_model, save_model_name)

        best_prec = 0
        for epoch in range(args.epochs):
            prec = train(train_loader, CNN_model, CNN_criterion, CNN_optimizer, epoch)
            if args.validation:
                prec = val(val_loader, CNN_model, CNN_criterion)

            # Learning rate scheduler
            CNN_scheduler.step()

            # Model weights are saved based on their validation performance
            is_best = prec > best_prec
            best_prec = max(prec, best_prec)
            models.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': CNN_model.state_dict(),
                    'best_prec1': best_prec,
                }, is_best, save_model_name)

        print('Best Performance : ', best_prec)
        print('\n\n\n')
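# make_weight is not shown above. A hypothetical sketch consistent with the call
# site, assuming it returns inverse-frequency class weights (one per class
# folder in train_dir) for a weighted loss, or None when weighting is disabled:
import os
import torch


def make_weight(train_dir, class_list, weighted_loss):
    if not weighted_loss:
        return None
    # Count samples per class folder, then weight each class inversely to its frequency.
    counts = torch.tensor(
        [len(os.listdir(os.path.join(train_dir, cls))) for cls in class_list],
        dtype=torch.float)
    return counts.sum() / (len(class_list) * counts.clamp(min=1))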
def main(args=None):
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    print('--------------Arguments----------------')
    print('data_type : ', args.data_type)
    print('learning_rate : ', args.learning_rate)
    print('validation : ', args.validation)
    print('epochs : ', args.epochs)
    print('rolling_effect : ', args.rolling_effect)
    print('rolling_weight_path : ', args.rolling_weight_path)
    print('keep_train : ', args.keep_train)
    print('pretrain_imagenet : ', args.pretrain_imagenet)
    print('train_batch_size : ', args.train_batch_size)
    print('val_batch_size : ', args.val_batch_size)
    print('shuffle_pickle : ', args.shuffle_pickle)
    print('remove_pickle : ', args.remove_pickle)
    print('dropouts : ', args.dropouts)
    print('---------------------------------------')

    mean = nml_cfg.mean
    std = nml_cfg.std

    # Make snapshot directory
    tools.directoryMake(path_cfg.snapshot_root_path)

    # Divide the pickle file, create new files, and set the train/val paths separately
    if args.validation:
        train_dir, val_dir = tools.divideDataset(
            os.path.join(path_cfg.data_root_path, args.data_type), args.shuffle_pickle)
    else:
        train_dir = os.path.join(path_cfg.data_root_path, args.data_type)
        val_dir = os.path.join(path_cfg.data_root_path, args.data_type)

    # Make train and val data_loader
    train_data = dataload_landmark.Dataload_CNN(
        train_dir, path_cfg.train_val_set_dir,
        transforms.Compose([
            transforms.Resize(299),
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ]))
    num_of_class = train_data.nClasses()
    train_loader = data.DataLoader(train_data,
                                   batch_size=args.train_batch_size,
                                   shuffle=True,
                                   drop_last=False)

    print('----------------Data-------------------')
    print('num_of_class : ', num_of_class)
    print('num_of_images : ', len(train_data))
    print('---------------------------------------\n\n')

    for model_idx, model_name in enumerate(model_name_list):
        if args.rolling_effect:
            save_model_name = model_name + '_' + args.data_type + '_rew'
        else:
            save_model_name = model_name + '_' + args.data_type

        # To apply the rolling effect
        rw_path = os.path.join(path_cfg.snapshot_root_path, args.rolling_weight_path)
        if args.rolling_effect and os.path.exists(rw_path):
            print('Rolling Effect is applied.')
            # Load model weights trained on rolling data
            CNN_model, CNN_optimizer, CNN_criterion, CNN_scheduler = models.rollingWeightLoader(
                rw_path, model_name, args.learning_rate, num_of_class, args.dropouts)
        else:
            print('Rolling Effect is not applied.')
            # Scratch model
            CNN_model, CNN_optimizer, CNN_criterion, CNN_scheduler = models.model_setter(
                model_name,
                learning_rate=args.learning_rate,
                output_size=num_of_class,
                usePretrained=args.pretrain_imagenet,
                dropouts=args.dropouts)
            print('Scratch model')

        # Keep training from the previous epoch
        if args.keep_train:
            checkpoint_path = os.path.join(path_cfg.snapshot_root_path,
                                           save_model_name + '.pth.tar')
            if os.path.exists(checkpoint_path):
                print('Keep training from the previous epoch')
                checkpoint = torch.load(checkpoint_path)
                CNN_model.load_state_dict(checkpoint['state_dict'])

        best_prec1 = 0
        for epoch in range(args.epochs):
            prec_train = train(train_loader, CNN_model, CNN_criterion, CNN_optimizer, epoch)

            # Learning rate scheduler
            CNN_scheduler.step()
            prec1 = prec_train

            # Model weights are saved based on their validation performance
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            models.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': CNN_model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best, save_model_name)

        print('Best Performance : ', best_prec1)
        print('\n\n\n')

    if args.validation and args.remove_pickle:
        tools.remove_files(train_dir)
        tools.remove_files(val_dir)
    opts=opts, collate_fxn=lambda x: torch.cat(x))

# Setup the model and optimizer
model = models.ValueNet(opts)
model_dict = {"value_model": model}
if opts.cuda == 1:
    to_cuda([model, lfn])
optimizer = models.get_optim(model.parameters(), opts)

# Load checkpoint if it exists
if opts.checkpoint != "":
    models.load_checkpoint(model_dict, optimizer, opts.checkpoint, opts)
else:
    models.check_and_print_opts(opts, None)

# Run the train loop
for i in range(opts.nepoch):
    train_loss, train_error = train_epoch(model, lfn, optimizer, train_dataloader, opts)
    pretty_log("train loss {:<5.4f} error {:<5.2f} {}".format(
        train_loss, train_error * 100, i))
    if opts.checkpoint != "":
        metadata = {
            "epoch": i,
            "train_loss": train_loss,
            "train_error": train_error,
        }
        models.save_checkpoint(model_dict, metadata, opts, optimizer, opts.checkpoint)