def from_name(cls, model_name, override_params=None):
    cls._check_model_name_is_valid(model_name)
    blocks_args, global_params = get_model_params(model_name, override_params)
    return cls(blocks_args, global_params)
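# Hedged usage sketch (not from the original repo): `from_name` validates the model
# name, resolves its block/global parameters, and instantiates the class. The class
# name `EfficientNet` and the override key below are assumptions for illustration.
model = EfficientNet.from_name('efficientnet-b0', override_params={'num_classes': 10})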
def train(args, checkpoint, mid_checkpoint_location, final_checkpoint_location, best_checkpoint_location,
          actfun, curr_seed, outfile_path, filename, fieldnames, curr_sample_size, device, num_params,
          curr_k=2, curr_p=1, curr_g=1, perm_method='shuffle'):
    """
    Runs training session for a given randomized model
    :param args: arguments for this job
    :param checkpoint: current checkpoint
    :param mid_checkpoint_location: output location for the per-epoch (mid-training) checkpoint
    :param final_checkpoint_location: output location for the final checkpoint
    :param best_checkpoint_location: output location for the best (highest val accuracy) checkpoint
    :param actfun: activation function currently being used
    :param curr_seed: seed being used by current job
    :param outfile_path: path to save outputs from training session
    :param filename: name of the output file
    :param fieldnames: column names for output file
    :param curr_sample_size: number of training samples for this iteration
    :param device: reference to CUDA device for GPU support
    :param num_params: number of parameters in the network
    :param curr_k: k value for this iteration
    :param curr_p: p value for this iteration
    :param curr_g: g value for this iteration
    :param perm_method: permutation strategy for our network
    :return:
    """

    resnet_ver = args.resnet_ver
    resnet_width = args.resnet_width
    num_epochs = args.num_epochs

    actfuns_1d = ['relu', 'abs', 'swish', 'leaky_relu', 'tanh']
    if actfun in actfuns_1d:
        curr_k = 1

    kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}

    if args.one_shot:
        util.seed_all(curr_seed)
        model_temp, _ = load_model(args.model, args.dataset, actfun, curr_k, curr_p, curr_g,
                                   num_params=num_params, perm_method=perm_method, device=device,
                                   resnet_ver=resnet_ver, resnet_width=resnet_width,
                                   verbose=args.verbose)
        util.seed_all(curr_seed)
        dataset_temp = util.load_dataset(args, args.model, args.dataset, seed=curr_seed,
                                         validation=True, batch_size=args.batch_size,
                                         train_sample_size=curr_sample_size, kwargs=kwargs)
        curr_hparams = hparams.get_hparams(args.model, args.dataset, actfun, curr_seed,
                                           num_epochs, args.search, args.hp_idx, args.one_shot)
        optimizer = optim.Adam(model_temp.parameters(),
                               betas=(curr_hparams['beta1'], curr_hparams['beta2']),
                               eps=curr_hparams['eps'],
                               weight_decay=curr_hparams['wd'])
        start_time = time.time()
        oneshot_fieldnames = fieldnames if args.search else None
        oneshot_outfile_path = outfile_path if args.search else None
        lr = util.run_lr_finder(args, model_temp, dataset_temp[0], optimizer, nn.CrossEntropyLoss(),
                                val_loader=dataset_temp[3], show=False, device=device,
                                fieldnames=oneshot_fieldnames, outfile_path=oneshot_outfile_path,
                                hparams=curr_hparams)
        curr_hparams = {}
        print("Time to find LR: {}\n LR found: {:3e}".format(time.time() - start_time, lr))
    else:
        curr_hparams = hparams.get_hparams(args.model, args.dataset, actfun, curr_seed,
                                           num_epochs, args.search, args.hp_idx)
        lr = curr_hparams['max_lr']

    criterion = nn.CrossEntropyLoss()

    model, model_params = load_model(args.model, args.dataset, actfun, curr_k, curr_p, curr_g,
                                     num_params=num_params, perm_method=perm_method, device=device,
                                     resnet_ver=resnet_ver, resnet_width=resnet_width,
                                     verbose=args.verbose)
    util.seed_all(curr_seed)
    model.apply(util.weights_init)
    util.seed_all(curr_seed)
    dataset = util.load_dataset(args, args.model, args.dataset, seed=curr_seed,
                                validation=args.validation, batch_size=args.batch_size,
                                train_sample_size=curr_sample_size, kwargs=kwargs)
    loaders = {
        'aug_train': dataset[0],
        'train': dataset[1],
        'aug_eval': dataset[2],
        'eval': dataset[3],
    }
    sample_size = dataset[4]
    batch_size = dataset[5]

    if args.one_shot:
        optimizer = optim.Adam(model_params)
        scheduler = OneCycleLR(optimizer,
                               max_lr=lr,
                               epochs=num_epochs,
                               steps_per_epoch=int(math.floor(sample_size / batch_size)),
                               cycle_momentum=False)
    else:
        optimizer = optim.Adam(model_params,
                               betas=(curr_hparams['beta1'], curr_hparams['beta2']),
                               eps=curr_hparams['eps'],
                               weight_decay=curr_hparams['wd'])
        scheduler = OneCycleLR(optimizer,
                               max_lr=curr_hparams['max_lr'],
                               epochs=num_epochs,
                               steps_per_epoch=int(math.floor(sample_size / batch_size)),
                               pct_start=curr_hparams['cycle_peak'],
                               cycle_momentum=False)

    epoch = 1
    if checkpoint is not None:
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        epoch = checkpoint['epoch']
        model.to(device)
        print("*** LOADED CHECKPOINT ***"
              "\n{}"
              "\nSeed: {}"
              "\nEpoch: {}"
              "\nActfun: {}"
              "\nNum Params: {}"
              "\nSample Size: {}"
              "\np: {}"
              "\nk: {}"
              "\ng: {}"
              "\nperm_method: {}".format(mid_checkpoint_location, checkpoint['curr_seed'],
                                         checkpoint['epoch'], checkpoint['actfun'],
                                         checkpoint['num_params'], checkpoint['sample_size'],
                                         checkpoint['p'], checkpoint['k'], checkpoint['g'],
                                         checkpoint['perm_method']))

    util.print_exp_settings(curr_seed, args.dataset, outfile_path, args.model, actfun,
                            util.get_model_params(model), sample_size, batch_size,
                            model.k, model.p, model.g, perm_method, resnet_ver, resnet_width,
                            args.optim, args.validation, curr_hparams)

    best_val_acc = 0

    if args.mix_pre_apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # ---- Start Training
    while epoch <= num_epochs:

        if args.check_path != '':
            torch.save({'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'curr_seed': curr_seed,
                        'epoch': epoch,
                        'actfun': actfun,
                        'num_params': num_params,
                        'sample_size': sample_size,
                        'p': curr_p, 'k': curr_k, 'g': curr_g,
                        'perm_method': perm_method
                        }, mid_checkpoint_location)

        util.seed_all((curr_seed * args.num_epochs) + epoch)
        start_time = time.time()
        if args.mix_pre:
            scaler = torch.cuda.amp.GradScaler()

        # ---- Training
        model.train()
        total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
        for batch_idx, (x, targetx) in enumerate(loaders['aug_train']):
            # print(batch_idx)
            x, targetx = x.to(device), targetx.to(device)
            optimizer.zero_grad()
            if args.mix_pre:
                with torch.cuda.amp.autocast():
                    output = model(x)
                    train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                scaler.scale(train_loss).backward()
                scaler.step(optimizer)
                scaler.update()
            elif args.mix_pre_apex:
                output = model(x)
                train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
            else:
                output = model(x)
                train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                train_loss.backward()
                optimizer.step()
            if args.optim == 'onecycle' or args.optim == 'onecycle_sgd':
                scheduler.step()
            _, prediction = torch.max(output.data, 1)
            num_correct += torch.sum(prediction == targetx.data)
            num_total += len(prediction)
        epoch_aug_train_loss = total_train_loss / n
        epoch_aug_train_acc = num_correct * 1.0 / num_total

        alpha_primes = []
        alphas = []
        if model.actfun == 'combinact':
            for i, layer_alpha_primes in enumerate(model.all_alpha_primes):
                curr_alpha_primes = torch.mean(layer_alpha_primes, dim=0)
                curr_alphas = F.softmax(curr_alpha_primes, dim=0).data.tolist()
                curr_alpha_primes = curr_alpha_primes.tolist()
                alpha_primes.append(curr_alpha_primes)
                alphas.append(curr_alphas)

        model.eval()
        with torch.no_grad():
            total_val_loss, n, num_correct, num_total = 0, 0, 0, 0
            for batch_idx, (y, targety) in enumerate(loaders['aug_eval']):
                y, targety = y.to(device), targety.to(device)
                output = model(y)
                val_loss = criterion(output, targety)
                total_val_loss += val_loss
                n += 1
                _, prediction = torch.max(output.data, 1)
                num_correct += torch.sum(prediction == targety.data)
                num_total += len(prediction)
            epoch_aug_val_loss = total_val_loss / n
            epoch_aug_val_acc = num_correct * 1.0 / num_total

            total_val_loss, n, num_correct, num_total = 0, 0, 0, 0
            for batch_idx, (y, targety) in enumerate(loaders['eval']):
                y, targety = y.to(device), targety.to(device)
                output = model(y)
                val_loss = criterion(output, targety)
                total_val_loss += val_loss
                n += 1
                _, prediction = torch.max(output.data, 1)
                num_correct += torch.sum(prediction == targety.data)
                num_total += len(prediction)
            epoch_val_loss = total_val_loss / n
            epoch_val_acc = num_correct * 1.0 / num_total

        lr_curr = 0
        for param_group in optimizer.param_groups:
            lr_curr = param_group['lr']
        print(
            " Epoch {}: LR {:1.5f} ||| aug_train_acc {:1.4f} | val_acc {:1.4f}, aug {:1.4f} ||| "
            "aug_train_loss {:1.4f} | val_loss {:1.4f}, aug {:1.4f} ||| time = {:1.4f}"
            .format(epoch, lr_curr, epoch_aug_train_acc, epoch_val_acc, epoch_aug_val_acc,
                    epoch_aug_train_loss, epoch_val_loss, epoch_aug_val_loss,
                    (time.time() - start_time)), flush=True
        )

        if args.hp_idx is None:
            hp_idx = -1
        else:
            hp_idx = args.hp_idx

        epoch_train_loss = 0
        epoch_train_acc = 0
        if epoch == num_epochs:
            with torch.no_grad():
                total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
                for batch_idx, (x, targetx) in enumerate(loaders['aug_train']):
                    x, targetx = x.to(device), targetx.to(device)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                    total_train_loss += train_loss
                    n += 1
                    _, prediction = torch.max(output.data, 1)
                    num_correct += torch.sum(prediction == targetx.data)
                    num_total += len(prediction)
                epoch_aug_train_loss = total_train_loss / n
                epoch_aug_train_acc = num_correct * 1.0 / num_total

                total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
                for batch_idx, (x, targetx) in enumerate(loaders['train']):
                    x, targetx = x.to(device), targetx.to(device)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                    total_train_loss += train_loss
                    n += 1
                    _, prediction = torch.max(output.data, 1)
                    num_correct += torch.sum(prediction == targetx.data)
                    num_total += len(prediction)
                epoch_train_loss = total_train_loss / n  # fixed: was total_val_loss, but this loop accumulates total_train_loss
                epoch_train_acc = num_correct * 1.0 / num_total

        # Outputting data to CSV at end of epoch
        with open(outfile_path, mode='a') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=fieldnames, lineterminator='\n')
            writer.writerow({'dataset': args.dataset,
                             'seed': curr_seed,
                             'epoch': epoch,
                             'time': (time.time() - start_time),
                             'actfun': model.actfun,
                             'sample_size': sample_size,
                             'model': args.model,
                             'batch_size': batch_size,
                             'alpha_primes': alpha_primes,
                             'alphas': alphas,
                             'num_params': util.get_model_params(model),
                             'var_nparams': args.var_n_params,
                             'var_nsamples': args.var_n_samples,
                             'k': curr_k,
                             'p': curr_p,
                             'g': curr_g,
                             'perm_method': perm_method,
                             'gen_gap': float(epoch_val_loss - epoch_train_loss),
                             'aug_gen_gap': float(epoch_aug_val_loss - epoch_aug_train_loss),
                             'resnet_ver': resnet_ver,
                             'resnet_width': resnet_width,
                             'epoch_train_loss': float(epoch_train_loss),
                             'epoch_train_acc': float(epoch_train_acc),
                             'epoch_aug_train_loss': float(epoch_aug_train_loss),
                             'epoch_aug_train_acc': float(epoch_aug_train_acc),
                             'epoch_val_loss': float(epoch_val_loss),
                             'epoch_val_acc': float(epoch_val_acc),
                             'epoch_aug_val_loss': float(epoch_aug_val_loss),
                             'epoch_aug_val_acc': float(epoch_aug_val_acc),
                             'hp_idx': hp_idx,
                             'curr_lr': lr_curr,
                             'found_lr': lr,
                             'hparams': curr_hparams,
                             'epochs': num_epochs,
                             })

        epoch += 1
        if args.optim == 'rmsprop':
            scheduler.step()

        if args.checkpoints:
            if epoch_val_acc > best_val_acc:
                best_val_acc = epoch_val_acc
                torch.save({'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            'curr_seed': curr_seed,
                            'epoch': epoch,
                            'actfun': actfun,
                            'num_params': num_params,
                            'sample_size': sample_size,
                            'p': curr_p, 'k': curr_k, 'g': curr_g,
                            'perm_method': perm_method
                            }, best_checkpoint_location)

    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'curr_seed': curr_seed,
                'epoch': epoch,
                'actfun': actfun,
                'num_params': num_params,
                'sample_size': sample_size,
                'p': curr_p, 'k': curr_k, 'g': curr_g,
                'perm_method': perm_method
                }, final_checkpoint_location)
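# Hedged sketch (illustrative, not part of the training code): the loop above writes a
# dict checkpoint to mid_checkpoint_location every epoch; resuming amounts to loading
# that dict and passing it back in as `checkpoint`. The path below is hypothetical.
import torch

ckpt = torch.load('checkpoints/mid.pth', map_location='cpu')
print(ckpt['epoch'], ckpt['actfun'], ckpt['p'], ckpt['k'], ckpt['g'], ckpt['perm_method'])
# train(args, ckpt, ...)  # then call train() again with this checkpoint to resume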
def train_network(model):
    n_channels, depth, z_dim, n_hid_first, lam, L = get_model_params(model)
    batch_size, num_epochs, learning_rate = get_training_params(model)

    data = TrainData(batch_size)
    input_var = T.matrix('inputs')

    # Create VAE model
    l_z_mean, l_z_logsigma, l_x_mean_list, l_x_logsigma_list, l_x_list, l_x = \
        build_vae(input_var, n_channels=n_channels, depth=depth, z_dim=z_dim,
                  n_hid_first=n_hid_first, L=L)

    def build_loss(deterministic):
        layer_outputs = nn.layers.get_output([l_z_mean, l_z_logsigma] + l_x_mean_list + l_x_logsigma_list,
                                             deterministic=deterministic)
        z_mean = layer_outputs[0]
        z_ls = layer_outputs[1]
        x_mean = layer_outputs[2:2 + L]
        x_logsigma = layer_outputs[2 + L:2 + 2 * L]

        # Loss function: - log p(x|z) + KL_div
        kl_div = lam * 0.5 * T.sum(T.exp(2 * z_ls) + T.sqr(z_mean) - 1 - 2 * z_ls)
        logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls)
                     for mu, ls in zip(x_mean, x_logsigma)) / L
        prediction = x_mean[0] if deterministic else T.sum(x_mean, axis=0) / L
        loss = -logpxz + kl_div
        return loss, prediction

    loss, _ = build_loss(deterministic=False)
    test_loss, test_prediction = build_loss(deterministic=True)

    # ADAM updates
    params = nn.layers.get_all_params(l_x, trainable=True)
    updates = nn.updates.adam(loss, params, learning_rate=learning_rate)
    train_fn = theano.function([input_var], loss, updates=updates)
    val_fn = theano.function([input_var], test_loss)

    previous_val_err_1 = float('inf')
    previous_val_err_2 = float('inf')
    for epoch in range(num_epochs):
        train_err = 0.0
        epoch_size = 0
        start_time = time.time()
        for i in range(data.train_size):
            batch = data.next_batch()
            this_err = train_fn(batch)
            train_err += this_err
            epoch_size += batch.shape[0]

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("training loss: {:.6f}".format(train_err / epoch_size))

        val_err = 0.0
        val_size = 0
        test_data = data.validation_data()
        for i in range(data.validation_size):
            err = val_fn(test_data[i])
            val_err += err
            val_size += test_data[i].shape[0]
        print("validation loss: {:.6f}".format(val_err / val_size))

        # early stopping
        if val_err > previous_val_err_1 and val_err > previous_val_err_2:
            break
        else:
            previous_val_err_2 = previous_val_err_1
            previous_val_err_1 = val_err

        # save the parameters so they can be loaded for next time
        np.savez(model_path(model) + str(epoch), *nn.layers.get_all_param_values(l_x))

    # output samples
    samples = data.validation_samples()
    pred_fn = theano.function([input_var], test_prediction)
    X_pred = pred_fn(samples)
    for i in range(len(samples)):
        print(samples[i] - X_pred[i])
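# Hedged sketch: `log_likelihood` is not defined in this snippet. Assuming the usual
# diagonal-Gaussian reconstruction term for a VAE (the repo's definition may differ):
import numpy as np
import theano.tensor as T

def log_likelihood(x, mu, log_sigma):
    # log p(x | mu, sigma) for a diagonal Gaussian, summed over all elements
    return T.sum(-0.5 * np.log(2 * np.pi) - log_sigma
                 - 0.5 * T.sqr(x - mu) / T.exp(2 * log_sigma))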
def main():
    setup_default_logging()
    args, args_text = _parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    _logger.info('====================\n\n'
                 'Actfun: {}\n'
                 'LR: {}\n'
                 'Epochs: {}\n'
                 'p: {}\n'
                 'k: {}\n'
                 'g: {}\n'
                 'Extra channel multiplier: {}\n'
                 'Weight Init: {}\n'
                 '\n===================='.format(args.actfun, args.lr, args.epochs, args.p, args.k,
                                                 args.g, args.extra_channel_mult, args.weight_init))

    # ================================================================================= Loading models
    pre_model = create_model(
        args.model,
        pretrained=True,
        actfun='swish',
        num_classes=args.num_classes,
        drop_rate=args.drop,
        drop_connect_rate=args.drop_connect,  # DEPRECATED, use drop_path
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
        global_pool=args.gp,
        bn_tf=args.bn_tf,
        bn_momentum=args.bn_momentum,
        bn_eps=args.bn_eps,
        scriptable=args.torchscript,
        checkpoint_path=args.initial_checkpoint,
        p=args.p,
        k=args.k,
        g=args.g,
        extra_channel_mult=args.extra_channel_mult,
        weight_init_name=args.weight_init,
        partial_ho_actfun=args.partial_ho_actfun)
    pre_model_layers = list(pre_model.children())
    pre_model = torch.nn.Sequential(*pre_model_layers[:-1])
    pre_model.to(device)

    model = MLP.MLP(actfun=args.actfun,
                    input_dim=1280,
                    output_dim=args.num_classes,
                    k=args.k,
                    p=args.p,
                    g=args.g,
                    num_params=1_000_000,
                    permute_type='shuffle')
    model.to(device)

    # ================================================================================= Loading dataset
    util.seed_all(args.seed)
    if args.data == 'caltech101' and not os.path.exists('caltech101'):
        dir_root = r'101_ObjectCategories'
        dir_new = r'caltech101'
        dir_new_train = os.path.join(dir_new, 'train')
        dir_new_val = os.path.join(dir_new, 'val')
        dir_new_test = os.path.join(dir_new, 'test')
        if not os.path.exists(dir_new):
            os.mkdir(dir_new)
            os.mkdir(dir_new_train)
            os.mkdir(dir_new_val)
            os.mkdir(dir_new_test)
        for dir2 in os.listdir(dir_root):
            if dir2 != 'BACKGROUND_Google':
                curr_path = os.path.join(dir_root, dir2)
                new_path_train = os.path.join(dir_new_train, dir2)
                new_path_val = os.path.join(dir_new_val, dir2)
                new_path_test = os.path.join(dir_new_test, dir2)
                if not os.path.exists(new_path_train):
                    os.mkdir(new_path_train)
                if not os.path.exists(new_path_val):
                    os.mkdir(new_path_val)
                if not os.path.exists(new_path_test):
                    os.mkdir(new_path_test)

                train_upper = int(0.8 * len(os.listdir(curr_path)))
                val_upper = int(0.9 * len(os.listdir(curr_path)))
                curr_files_all = os.listdir(curr_path)
                curr_files_train = curr_files_all[:train_upper]
                curr_files_val = curr_files_all[train_upper:val_upper]
                curr_files_test = curr_files_all[val_upper:]
                for file in curr_files_train:
                    copyfile(os.path.join(curr_path, file), os.path.join(new_path_train, file))
                for file in curr_files_val:
                    copyfile(os.path.join(curr_path, file), os.path.join(new_path_val, file))
                for file in curr_files_test:
                    copyfile(os.path.join(curr_path, file), os.path.join(new_path_test, file))
        time.sleep(5)

    # create the train and eval datasets
    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        _logger.error('Training folder does not exist at: {}'.format(train_dir))
        exit(1)
    dataset_train = Dataset(train_dir)

    eval_dir = os.path.join(args.data, 'val')
    if not os.path.isdir(eval_dir):
        eval_dir = os.path.join(args.data, 'validation')
        if not os.path.isdir(eval_dir):
            _logger.error('Validation folder does not exist at: {}'.format(eval_dir))
            exit(1)
    dataset_eval = Dataset(eval_dir)

    # setup augmentation batch splits for contrastive loss or split bn
    num_aug_splits = 0
    if args.aug_splits > 0:
        assert args.aug_splits > 1, 'A split of 1 makes no sense'
        num_aug_splits = args.aug_splits

    # enable split bn (separate bn stats per batch-portion)
    if args.split_bn:
        assert num_aug_splits > 1 or args.resplit
        model = convert_splitbn_model(model, max(num_aug_splits, 2))

    # setup mixup / cutmix
    collate_fn = None
    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_args = dict(
            mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
            prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode,
            label_smoothing=args.smoothing, num_classes=args.num_classes)
        if args.prefetcher:
            assert not num_aug_splits  # collate conflict (need to support deinterleaving in collate mixup)
            collate_fn = FastCollateMixup(**mixup_args)
        else:
            mixup_fn = Mixup(**mixup_args)

    # create data loaders w/ augmentation pipeline
    train_interpolation = args.train_interpolation
    data_config = resolve_data_config(vars(args), model=model, verbose=args.local_rank == 0)
    if args.no_aug or not train_interpolation:
        train_interpolation = data_config['interpolation']
    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        no_aug=args.no_aug,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        re_split=args.resplit,
        scale=args.scale,
        ratio=args.ratio,
        hflip=args.hflip,
        vflip=args.vflip,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_aug_splits=num_aug_splits,
        interpolation=train_interpolation,
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
        pin_memory=args.pin_mem,
        use_multi_epochs_loader=args.use_multi_epochs_loader)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        crop_pct=data_config['crop_pct'],
        pin_memory=args.pin_mem,
    )

    # ================================================================================= Optimizer / scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)
    scheduler = OneCycleLR(optimizer,
                           max_lr=args.lr,
                           epochs=args.epochs,
                           steps_per_epoch=int(math.floor(len(dataset_train) / args.batch_size)),
                           cycle_momentum=False)

    # ================================================================================= Save file / checkpoints
    fieldnames = ['dataset', 'seed', 'epoch', 'time', 'actfun', 'model', 'batch_size',
                  'alpha_primes', 'alphas', 'num_params', 'k', 'p', 'g', 'perm_method',
                  'gen_gap', 'epoch_train_loss', 'epoch_train_acc', 'epoch_aug_train_loss',
                  'epoch_aug_train_acc', 'epoch_val_loss', 'epoch_val_acc', 'curr_lr',
                  'found_lr', 'epochs']
    filename = 'out_{}_{}_{}_{}'.format(datetime.date.today(), args.actfun, args.data, args.seed)
    outfile_path = os.path.join(args.output, filename) + '.csv'
    checkpoint_path = os.path.join(args.check_path, filename) + '.pth'
    if not os.path.exists(outfile_path):
        with open(outfile_path, mode='w') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=fieldnames, lineterminator='\n')
            writer.writeheader()

    epoch = 1
    checkpoint = torch.load(checkpoint_path) if os.path.exists(checkpoint_path) else None
    if checkpoint is not None:
        pre_model.load_state_dict(checkpoint['pre_model_state_dict'])
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        epoch = checkpoint['epoch']
        pre_model.to(device)
        model.to(device)
        print("*** LOADED CHECKPOINT ***"
              "\n{}"
              "\nSeed: {}"
              "\nEpoch: {}"
              "\nActfun: {}"
              "\np: {}"
              "\nk: {}"
              "\ng: {}"
              "\nperm_method: {}".format(checkpoint_path, checkpoint['curr_seed'],
                                         checkpoint['epoch'], checkpoint['actfun'],
                                         checkpoint['p'], checkpoint['k'], checkpoint['g'],
                                         checkpoint['perm_method']))

    args.mix_pre_apex = False
    if args.control_amp == 'apex':
        args.mix_pre_apex = True
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # ================================================================================= Training
    while epoch <= args.epochs:

        if args.check_path != '':
            torch.save({'pre_model_state_dict': pre_model.state_dict(),
                        'model_state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'curr_seed': args.seed,
                        'epoch': epoch,
                        'actfun': args.actfun,
                        'p': args.p, 'k': args.k, 'g': args.g,
                        'perm_method': 'shuffle'
                        }, checkpoint_path)

        util.seed_all((args.seed * args.epochs) + epoch)
        start_time = time.time()
        args.mix_pre = False
        if args.control_amp == 'native':
            args.mix_pre = True
            scaler = torch.cuda.amp.GradScaler()

        # ---- Training
        model.train()
        total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
        for batch_idx, (x, targetx) in enumerate(loader_train):
            x, targetx = x.to(device), targetx.to(device)
            optimizer.zero_grad()
            if args.mix_pre:
                with torch.cuda.amp.autocast():
                    with torch.no_grad():
                        x = pre_model(x)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                scaler.scale(train_loss).backward()
                scaler.step(optimizer)
                scaler.update()
            elif args.mix_pre_apex:
                with torch.no_grad():
                    x = pre_model(x)
                output = model(x)
                train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
            else:
                with torch.no_grad():
                    x = pre_model(x)
                output = model(x)
                train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                train_loss.backward()
                optimizer.step()
            scheduler.step()
            _, prediction = torch.max(output.data, 1)
            num_correct += torch.sum(prediction == targetx.data)
            num_total += len(prediction)
        epoch_aug_train_loss = total_train_loss / n
        epoch_aug_train_acc = num_correct * 1.0 / num_total

        alpha_primes = []
        alphas = []
        if model.actfun == 'combinact':
            for i, layer_alpha_primes in enumerate(model.all_alpha_primes):
                curr_alpha_primes = torch.mean(layer_alpha_primes, dim=0)
                curr_alphas = F.softmax(curr_alpha_primes, dim=0).data.tolist()
                curr_alpha_primes = curr_alpha_primes.tolist()
                alpha_primes.append(curr_alpha_primes)
                alphas.append(curr_alphas)

        model.eval()
        with torch.no_grad():
            total_val_loss, n, num_correct, num_total = 0, 0, 0, 0
            for batch_idx, (y, targety) in enumerate(loader_eval):
                y, targety = y.to(device), targety.to(device)
                y = pre_model(y)
                output = model(y)
                val_loss = criterion(output, targety)
                total_val_loss += val_loss
                n += 1
                _, prediction = torch.max(output.data, 1)
                num_correct += torch.sum(prediction == targety.data)
                num_total += len(prediction)
            epoch_val_loss = total_val_loss / n
            epoch_val_acc = num_correct * 1.0 / num_total

        lr_curr = 0
        for param_group in optimizer.param_groups:
            lr_curr = param_group['lr']
        print(
            " Epoch {}: LR {:1.5f} ||| aug_train_acc {:1.4f} | val_acc {:1.4f} ||| "
            "aug_train_loss {:1.4f} | val_loss {:1.4f} ||| time = {:1.4f}"
            .format(epoch, lr_curr, epoch_aug_train_acc, epoch_val_acc,
                    epoch_aug_train_loss, epoch_val_loss, (time.time() - start_time)),
            flush=True)

        epoch_train_loss = 0
        epoch_train_acc = 0
        if epoch == args.epochs:
            with torch.no_grad():
                total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
                for batch_idx, (x, targetx) in enumerate(loader_train):
                    x, targetx = x.to(device), targetx.to(device)
                    x = pre_model(x)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                    total_train_loss += train_loss
                    n += 1
                    _, prediction = torch.max(output.data, 1)
                    num_correct += torch.sum(prediction == targetx.data)
                    num_total += len(prediction)
                epoch_aug_train_loss = total_train_loss / n
                epoch_aug_train_acc = num_correct * 1.0 / num_total

                total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
                for batch_idx, (x, targetx) in enumerate(loader_eval):
                    x, targetx = x.to(device), targetx.to(device)
                    x = pre_model(x)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                    total_train_loss += train_loss
                    n += 1
                    _, prediction = torch.max(output.data, 1)
                    num_correct += torch.sum(prediction == targetx.data)
                    num_total += len(prediction)
                epoch_train_loss = total_train_loss / n  # fixed: was total_val_loss, but this loop accumulates total_train_loss
                epoch_train_acc = num_correct * 1.0 / num_total

        # Outputting data to CSV at end of epoch
        with open(outfile_path, mode='a') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=fieldnames, lineterminator='\n')
            writer.writerow({'dataset': args.data,
                             'seed': args.seed,
                             'epoch': epoch,
                             'time': (time.time() - start_time),
                             'actfun': model.actfun,
                             'model': args.model,
                             'batch_size': args.batch_size,
                             'alpha_primes': alpha_primes,
                             'alphas': alphas,
                             'num_params': util.get_model_params(model),
                             'k': args.k,
                             'p': args.p,
                             'g': args.g,
                             'perm_method': 'shuffle',
                             'gen_gap': float(epoch_val_loss - epoch_train_loss),
                             'epoch_train_loss': float(epoch_train_loss),
                             'epoch_train_acc': float(epoch_train_acc),
                             'epoch_aug_train_loss': float(epoch_aug_train_loss),
                             'epoch_aug_train_acc': float(epoch_aug_train_acc),
                             'epoch_val_loss': float(epoch_val_loss),
                             'epoch_val_acc': float(epoch_val_acc),
                             'curr_lr': lr_curr,
                             'found_lr': args.lr,
                             'epochs': args.epochs,
                             })

        epoch += 1
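# Hedged sketch of the frozen-backbone pattern used above: the pretrained feature
# extractor runs under torch.no_grad() and only the small head is optimized.
# The modules and shapes below are illustrative, not the repository's.
import torch
import torch.nn as nn

backbone = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.AdaptiveAvgPool2d(1), nn.Flatten())
head = nn.Linear(8, 10)
optimizer = torch.optim.Adam(head.parameters())

x = torch.randn(4, 3, 32, 32)
target = torch.randint(0, 10, (4,))
with torch.no_grad():          # backbone features carry no autograd graph
    feats = backbone(x)
loss = nn.functional.cross_entropy(head(feats), target)
loss.backward()                # gradients reach only the head
optimizer.step()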
def _load_model(self, path):
    self._model = keras.models.load_model(path, compile=False,
                                          custom_objects={'Interp': Interp, 'relu6': relu6})
    (self.height, self.width, self.channels), _, self.sine_steering = get_model_params(self._model)
    self._model.summary()
    x = self._model.layers[-7].get_output_at(0)
    self.cut_model = Model(inputs=self._model.layers[-7].get_input_at(0), outputs=x)
def _load_model(self, path: str):
    # Uncomment for legacy models
    # model: tensorflow.python.keras.Model = get_hegemax_model(1, True)
    # model.load_weights(path)
    # self._model = model
    self._model = keras.models.load_model(path, compile=False,
                                          custom_objects={'Interp': Interp, 'relu6': relu6})
    (self.height, self.width, self.channels), self.sequence_length, self.sine_steering = \
        get_model_params(self._model)
    print("Image shape: " + str((self.height, self.width, self.channels)) +
          ", sequence length: " + str(self.sequence_length) +
          ", sine steering? " + str(self.sine_steering))
    # (tail of predict(args, train_params, test_params); earlier body not shown)
    with open(folder_name + '/params.json', 'w') as outfile:
        json.dump(test_params, outfile)
    save_predictions(folder_name, x, gt, probs, processor)


def save_predictions(path, x, gt, probs, processor):
    np.save(path + '/x', x)
    np.save(path + '/gt', gt)
    np.save(path + '/probs', probs)
    pickle.dump(processor, open(path + '/processor', 'wb'))  # pickle requires a binary-mode file


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("test_config_file", help="path to config file")
    parser.add_argument('model_paths', nargs='+', help="path to models")
    parser.add_argument("--split", help="train/test", choices=['train', 'test'], default='test')
    args = parser.parse_args()

    train_params = util.get_model_params(args.model_paths[0])  # FIXME: bug
    test_params = train_params.copy()
    test_new_params = json.load(open(args.test_config_file, 'r'))
    test_params.update(test_new_params)
    if "label_review" in test_new_params["data_path"]:
        assert (args.split == 'test')
    predict(args, train_params, test_params)
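# Hedged sketch (the helper name is hypothetical): reloading what save_predictions
# wrote. np.save appends '.npy' to each path; the processor was pickled in binary mode.
def load_predictions(path):
    x = np.load(path + '/x.npy')
    gt = np.load(path + '/gt.npy')
    probs = np.load(path + '/probs.npy')
    with open(path + '/processor', 'rb') as f:
        processor = pickle.load(f)
    return x, gt, probs, processor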