def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size):
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    # aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    while episode < num_episodes:
        if VideoSave and not capture:
            env.ScreenCapture(5)
            capture = True
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    # capture = False
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        """if episode > 600 and not capture:
            env.ScreenCapture(5)
            capture = True"""
        add_value(trajectories, val_func)        # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)    # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)        # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)          # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
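# A minimal, hypothetical launcher sketch for the PPO main() above: the flag names
# and defaults below are illustrative assumptions, not the script's actual CLI.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train policy with PPO')
    parser.add_argument('env_name', type=str, help="OpenAI Gym environment name, e.g. 'Hopper-v1'")
    parser.add_argument('-n', '--num_episodes', type=int, default=1000)
    parser.add_argument('-r', '--render', action='store_true')
    parser.add_argument('-g', '--gamma', type=float, default=0.995)
    parser.add_argument('-l', '--lam', type=float, default=0.98)
    parser.add_argument('-k', '--kl_targ', type=float, default=0.003)
    parser.add_argument('-b', '--batch_size', type=int, default=20)
    args = parser.parse_args()
    main(**vars(args))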
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='../data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='../data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ) else: model = models.__dict__[args.arch](num_classes=num_classes) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) criterion = nn.CrossEntropyLoss() if args.opt_method == "sgd": print("using SGD") optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=False) elif args.opt_method == "adam": print("using Adam") optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: raise Exception("Optimizer not supported") # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join( args.checkpoint, str(args.dataset) + '_' + str(args.arch) + '_mom_lr=' + str(args.lr) + '_m=' + str(args.momentum) + '_drop:' + str(args.drop) + '_seed=' + str(args.manualSeed) + '.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' 
        ])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda)
        print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc))
        return

    flag3 = 0
    # Train and val
    for epoch in range(start_epoch, args.epochs):
        if args.opt_method == "sgd":
            adjust_learning_rate(optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda)

        if flag3 == 0:
            # NOTE: train_init_loss / test_init_loss are not defined in this function;
            # presumably they are module-level values recorded before training starts.
            logger.append([0, train_init_loss, test_init_loss, 0, 0])
            flag3 = 1

        # append logger file
        logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc])

        # save model (note: only best_acc is tracked here; no checkpoint file is written)
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)

    logger.close()

    print('Best acc:')
    print(best_acc)
def main(): ''' set default hyperparams in default_hyperparams.py ''' parser = argparse.ArgumentParser() # Required arguments parser.add_argument('-d', '--dataset', choices=supported.datasets, required=True) parser.add_argument('--algorithm', required=True, choices=supported.algorithms) parser.add_argument( '--root_dir', required=True, help= 'The directory where [dataset]/data can be found (or should be downloaded to, if it does not exist).' ) parser.add_argument('--analyze_sample', default=1) # Dataset parser.add_argument( '--split_scheme', help= 'Identifies how the train/val/test split is constructed. Choices are dataset-specific.' ) parser.add_argument('--dataset_kwargs', nargs='*', action=ParseKwargs, default={}) parser.add_argument( '--download', default=False, type=parse_bool, const=True, nargs='?', help= 'If true, tries to downloads the dataset if it does not exist in root_dir.' ) parser.add_argument( '--frac', type=float, default=1.0, help= 'Convenience parameter that scales all dataset splits down to the specified fraction, for development purposes.' ) # Loaders parser.add_argument('--loader_kwargs', nargs='*', action=ParseKwargs, default={}) parser.add_argument('--train_loader', choices=['standard', 'group']) parser.add_argument('--uniform_over_groups', type=parse_bool, const=True, nargs='?') parser.add_argument('--distinct_groups', type=parse_bool, const=True, nargs='?') parser.add_argument('--n_groups_per_batch', type=int) parser.add_argument('--batch_size', type=int) parser.add_argument('--eval_loader', choices=['standard'], default='standard') # Model parser.add_argument('--model', choices=supported.models) parser.add_argument( '--model_kwargs', nargs='*', action=ParseKwargs, default={}, help= 'keyword arguments for model initialization passed as key1=value1 key2=value2' ) # Transforms parser.add_argument('--train_transform', choices=supported.transforms) parser.add_argument('--eval_transform', choices=supported.transforms) parser.add_argument( '--target_resolution', nargs='+', type=int, help= 'target resolution. for example --target_resolution 224 224 for standard resnet.' 
) parser.add_argument('--resize_scale', type=float) parser.add_argument('--max_token_length', type=int) # Objective parser.add_argument('--loss_function', choices=supported.losses) # Algorithm parser.add_argument('--groupby_fields', nargs='+') parser.add_argument('--group_dro_step_size', type=float) parser.add_argument('--coral_penalty_weight', type=float) parser.add_argument('--irm_lambda', type=float) parser.add_argument('--irm_penalty_anneal_iters', type=int) parser.add_argument('--algo_log_metric') parser.add_argument('--hsic_beta', type=float) parser.add_argument('--grad_penalty_lamb', type=float) parser.add_argument( '--params_regex', type=str, help='Regular expression specifying which gradients to penalize.') parser.add_argument('--label_cond', type=parse_bool, const=True, nargs='?', default=False) parser.add_argument('--dann_lamb', type=float) parser.add_argument('--dann_dc_name', type=str) # Model selection parser.add_argument('--val_metric') parser.add_argument('--val_metric_decreasing', type=parse_bool, const=True, nargs='?') # Optimization parser.add_argument('--n_epochs', type=int) parser.add_argument('--optimizer', choices=supported.optimizers) parser.add_argument('--lr', type=float) parser.add_argument('--weight_decay', type=float) parser.add_argument('--max_grad_norm', type=float) parser.add_argument('--optimizer_kwargs', nargs='*', action=ParseKwargs, default={}) # Scheduler parser.add_argument('--scheduler', choices=supported.schedulers) parser.add_argument('--scheduler_kwargs', nargs='*', action=ParseKwargs, default={}) parser.add_argument('--scheduler_metric_split', choices=['train', 'val'], default='val') parser.add_argument('--scheduler_metric_name') # Evaluation parser.add_argument('--evaluate_all_splits', type=parse_bool, const=True, nargs='?', default=True) parser.add_argument('--eval_splits', nargs='+', default=[]) parser.add_argument('--eval_only', type=parse_bool, const=True, nargs='?', default=False) parser.add_argument('--eval_epoch', default=None, type=int) parser.add_argument('--save_z', type=parse_bool, const=True, nargs='?', default=False) # Misc parser.add_argument('--device', type=int, default=0) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--log_dir', default='./logs') parser.add_argument('--log_every', default=50, type=int) parser.add_argument('--save_step', type=int) parser.add_argument('--save_best', type=parse_bool, const=True, nargs='?', default=True) parser.add_argument('--save_last', type=parse_bool, const=True, nargs='?', default=True) parser.add_argument('--no_group_logging', type=parse_bool, const=True, nargs='?') parser.add_argument('--use_wandb', type=parse_bool, const=True, nargs='?', default=False) parser.add_argument('--progress_bar', type=parse_bool, const=True, nargs='?', default=False) parser.add_argument('--resume', type=parse_bool, const=True, nargs='?', default=False) config = parser.parse_args() config = populate_defaults(config) # set device config.device = torch.device("cuda:" + str( config.device)) if torch.cuda.is_available() else torch.device("cpu") ## Initialize logs if os.path.exists(config.log_dir) and config.resume: resume = True mode = 'a' elif os.path.exists(config.log_dir) and config.eval_only: resume = False mode = 'a' else: resume = False mode = 'w' if not os.path.exists(config.log_dir): os.makedirs(config.log_dir) logger = Logger(os.path.join(config.log_dir, 'log.txt'), mode) # Record config log_config(config, logger) # Set random seed set_seed(config.seed) # Data full_dataset = 
supported.datasets[config.dataset]( root_dir=config.root_dir, download=config.download, split_scheme=config.split_scheme, **config.dataset_kwargs) # To implement data augmentation (i.e., have different transforms # at training time vs. test time), modify these two lines: train_transform = initialize_transform( transform_name=config.train_transform, config=config, dataset=full_dataset) eval_transform = initialize_transform(transform_name=config.eval_transform, config=config, dataset=full_dataset) train_grouper = CombinatorialGrouper(dataset=full_dataset, groupby_fields=config.groupby_fields) datasets = defaultdict(dict) for split in full_dataset.split_dict.keys(): if split == 'train': transform = train_transform verbose = True elif split == 'val': transform = eval_transform verbose = True else: transform = eval_transform verbose = False # Get subset datasets[split]['dataset'] = full_dataset.get_subset( split, frac=config.frac, transform=transform) if split == 'train': datasets[split]['loader'] = get_train_loader( loader=config.train_loader, dataset=datasets[split]['dataset'], batch_size=config.batch_size, uniform_over_groups=config.uniform_over_groups, grouper=train_grouper, distinct_groups=config.distinct_groups, n_groups_per_batch=config.n_groups_per_batch, **config.loader_kwargs) else: datasets[split]['loader'] = get_eval_loader( loader=config.eval_loader, dataset=datasets[split]['dataset'], grouper=train_grouper, batch_size=config.batch_size, **config.loader_kwargs) # Set fields datasets[split]['split'] = split datasets[split]['name'] = full_dataset.split_names[split] datasets[split]['verbose'] = verbose # Loggers datasets[split]['eval_logger'] = BatchLogger( os.path.join(config.log_dir, f'{split}_eval.csv'), mode=mode, use_wandb=(config.use_wandb and verbose)) datasets[split]['algo_logger'] = BatchLogger( os.path.join(config.log_dir, f'{split}_algo.csv'), mode=mode, use_wandb=(config.use_wandb and verbose)) if config.use_wandb: initialize_wandb(config) # Logging dataset info if config.no_group_logging and full_dataset.is_classification and full_dataset.y_size == 1: log_grouper = CombinatorialGrouper(dataset=full_dataset, groupby_fields=['y']) elif config.no_group_logging: log_grouper = None else: log_grouper = train_grouper log_group_data(datasets, log_grouper, logger) ## Initialize algorithm algorithm = initialize_algorithm(config=config, datasets=datasets, train_grouper=train_grouper) if config.eval_epoch is None: eval_model_path = os.path.join(config.log_dir, 'best_model.pth') else: eval_model_path = os.path.join(config.log_dir, f'{config.eval_epoch}_model.pth') best_epoch, best_val_metric = load(algorithm, eval_model_path) if config.eval_epoch is None: epoch = best_epoch else: epoch = config.eval_epoch results, z_splits, y_splits, c_splits = evaluate(algorithm=algorithm, datasets=datasets, epoch=epoch, general_logger=logger, config=config) include_test = config.evaluate_all_splits or 'test' in config.eval_splits logistics = all_logistics(z_splits, c_splits, y_splits, epoch=epoch, sample=int(config.analyze_sample), include_test=include_test) logistics['G0'] = results['id_val']['acc_avg'] logistics['G1'] = logistics['val_on_val'] logistics['G2'] = logistics['trainval_on_val'] logistics['G3'] = results['val']['acc_avg'] logistics['I0'] = logistics['c_train'] logistics['I1'] = logistics['c_val'] per_class = torch.tensor(list(logistics['c_perclass'].values())) logistics['I2'] = torch.mean(per_class).item() if include_test: logistics['G1_test'] = logistics['test_on_test'] 
        logistics['G2_test'] = logistics['traintest_on_test']
        logistics['G3_test'] = results['test']['acc_avg']
        logistics['I1_test'] = logistics['c_test']
        per_class = torch.tensor(list(logistics['c_perclass_test'].values()))
        logistics['I2_test'] = torch.mean(per_class).item()

    with open(os.path.join(config.log_dir, f'tests_epoch_{epoch}.pkl'), "wb") as f:
        pickle.dump(logistics, f)

    logger.close()
    for split in datasets:
        datasets[split]['eval_logger'].close()
        datasets[split]['algo_logger'].close()
def main(): global best_acc global mean global std if not os.path.isdir(args.out): mkdir_p(args.out) # Data print(f'==> Preparing AFLW') transform_train = transforms.Compose([ transforms.Resize((60,60)), ]) transform_val = transforms.Compose([ transforms.Resize((60,60)), ]) train_labeled_set, train_unlabeled_set, stat_labeled_set, train_val_set, val_set, test_set, mean, std = dataset.get_alfw(args.data_root, args.n_labeled, transform_train, transform_val) num_samples = args.val_iteration * args.batch_size sampler_x = RandomSampler(train_labeled_set, replacement=True, num_samples=num_samples) batch_sampler_x = BatchSampler(sampler_x, args.batch_size, drop_last=True) labeled_trainloader = data.DataLoader(train_labeled_set, batch_sampler=batch_sampler_x, num_workers=16, pin_memory=True) sampler_u = RandomSampler(train_unlabeled_set, replacement=True, num_samples=num_samples) batch_sampler_u = BatchSampler(sampler_u, args.batch_size, drop_last=True) unlabeled_trainloader = data.DataLoader(train_unlabeled_set, batch_sampler=batch_sampler_u, num_workers=16, pin_memory=True) val_loader = data.DataLoader(val_set, batch_size=args.batch_size, shuffle=False, num_workers=16, pin_memory=True) test_loader = data.DataLoader(test_set, batch_size=args.batch_size, shuffle=False, num_workers=16, pin_memory=True) sampler_trainval = RandomSampler(train_labeled_set, replacement=True, num_samples=num_samples) batch_sampler_trainval = BatchSampler(sampler_trainval, args.batch_size, drop_last=True) train_val_loader = data.DataLoader(train_labeled_set, batch_sampler=batch_sampler_trainval, num_workers=16, pin_memory=True) args.val_iteration=len(unlabeled_trainloader) # Model print("==> creating TCDCN") def create_model(ema=False): model = models.TCDCNN() model = model.cuda() if ema: for param in model.parameters(): param.detach_() return model model = create_model() ema_model = create_model(ema=True) target_model = create_model(ema=True) cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) train_criterion = nn.MSELoss() criterion = nn.MSELoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) ema_optimizer= WeightEMA(model, ema_model, alpha=args.ema_decay) target_optimizer = UpdateEma(model, target_model, alpha=args.ema_decay) start_epoch = 0 rampup_length = float(args.epochs) / float(args.batch_size) # Resume title = 'celeba-mtfl' if args.resume: # Load. print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' 
        args.out = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        ema_model.load_state_dict(checkpoint['ema_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # args.resume points at the checkpoint file, so the log file lives next to it in args.out
        logger = Logger(os.path.join(args.out, 'log.txt'), title=title, resume=True)
    else:
        logger = Logger(os.path.join(args.out, 'log.txt'), title=title)
        logger.set_names(['Train Loss', 'Train Loss X', 'Train Loss U', 'Train ME', 'Train FR',
                          'Valid Loss', 'Valid ME.', 'Valid FR', 'Test Loss', 'Test ME.', 'Test FR'])
    writer = SummaryWriter(args.out)
    step = 0
    test_accs = []
    best_test = 0  # test ME at the best-validation epoch; stays 0 if no epoch improves on a resumed best_acc

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        if (epoch + 1) % 100 == 0 and (epoch + 1) <= args.epochs:
            optimizer.param_groups[0]['lr'] *= 0.1
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        train_loss, train_loss_x, train_loss_u = train(labeled_trainloader, unlabeled_trainloader, model,
                                                       target_model, optimizer, ema_optimizer, target_optimizer,
                                                       train_criterion, epoch, use_cuda, rampup_length)
        _, train_me, train_fr = validate(labeled_trainloader, model, criterion, epoch, use_cuda, mode='Train Stats')
        val_loss, val_me, val_fr = validate(val_loader, ema_model, criterion, epoch, use_cuda, mode='Valid Stats')
        test_loss, test_me, test_fr = validate(test_loader, ema_model, criterion, epoch, use_cuda, mode='Test Stats ')

        step = args.batch_size * args.val_iteration * (epoch + 1)

        writer.add_scalar('losses/train_loss', train_loss, step)
        writer.add_scalar('losses/valid_loss', val_loss, step)
        writer.add_scalar('losses/test_loss', test_loss, step)

        writer.add_scalar('accuracy/train_me', train_me, step)
        writer.add_scalar('accuracy/val_me', val_me, step)
        writer.add_scalar('accuracy/test_me', test_me, step)

        writer.add_scalar('accuracy/train_fr', train_fr, step)
        writer.add_scalar('accuracy/val_fr', val_fr, step)
        writer.add_scalar('accuracy/test_fr', test_fr, step)

        # append logger file
        logger.append([train_loss, train_loss_x, train_loss_u, train_me, train_fr,
                       val_loss, val_me, val_fr, test_loss, test_me, test_fr])

        # save model (lower mean error is better, hence min())
        is_best = val_me < best_acc
        best_acc = min(val_me, best_acc)
        if is_best:
            best_test = test_me
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'ema_state_dict': ema_model.state_dict(),
            'me': val_me,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best)
        test_accs.append(test_me)

    logger.close()
    # writer.close()

    print('Best acc')
    print(best_test)
    print('Mean acc:')
    print(np.mean(test_accs[-20:]))
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.save_dir): mkdir_p(args.save_dir) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](dataset=args.dataset, depth=args.depth, reduction=args.reduction) print(model) if args.cuda: model.cuda() print(' Total params: %.2f' % (sum(p.numel() for p in model.parameters()))) with torch.cuda.device(0): net = model flops, params = get_model_complexity_info(net, (3, 32,32), as_strings=True, print_per_layer_stat=True) print('Flops: ' + flops) print('Params: ' + params) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' args.save_dir = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.save_dir, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.save_dir, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') with torch.no_grad(): test_loss, test_acc = test(testloader, model, criterion, start_epoch, args.cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch, args.gamma, args.schedule) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, args.cuda) with torch.no_grad(): test_loss, test_acc = test(testloader, model, criterion, epoch, args.cuda) # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, checkpoint=args.save_dir) logger.close() print('Best acc:') print(best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, block_name=args.block_name, ) else: model = models.__dict__[args.arch](num_classes=num_classes) print("Geometric LR: {}".format(args.geo_lr)) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) criterion = nn.CrossEntropyLoss() param_lr = GradientRatioScheduler.get_params_base_lr(model, args.lr) optimizer = optim.SGD(param_lr, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = GradientRatioScheduler(optimizer) print_model(model) input("Cont?") # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' 
args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Epoch', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.', 'Time', 'Learning Rate' ]) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(scheduler, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, max(scheduler.get_lr()))) st = time.time() train_loss, train_acc = train(trainloader, model, criterion, optimizer, scheduler, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append([ epoch, train_loss, test_loss, train_acc, test_acc, time.time() - st, scheduler.get_lr() ]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) if args.save_checkpoint_model: save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    class Jigsaw(object):
        # https://github.com/joe-siyuan-qiao/GUNN
        def __call__(self, tensor):
            tensor_1 = tensor.clone()
            tensor_2 = tensor.clone()
            # NOTE: this early return bypasses the jigsaw shuffling below, so the
            # transform simply stacks two copies of the image along the channel dim.
            # Remove it to restore the GUNN-style row/column permutation.
            return torch.cat([tensor_1, tensor_2], dim=0)
            h, w = tensor.size(1), tensor.size(2)
            d = random.randint(0, h - 1)
            if d > 0:
                tensor_u, tensor_d = tensor.narrow(1, 0, d), tensor.narrow(1, d, h - d)
                tensor_1.narrow(1, 0, h - d).copy_(tensor_d)
                tensor_1.narrow(1, h - d, d).copy_(tensor_u)
            d = random.randint(0, w - 1)
            if d > 0:
                tensor_l, tensor_r = tensor.narrow(2, 0, d), tensor.narrow(2, d, w - d)
                tensor_2.narrow(2, 0, w - d).copy_(tensor_r)
                tensor_2.narrow(2, w - d, d).copy_(tensor_l)
            return torch.cat([tensor_1, tensor_2], dim=0)

        def __repr__(self):
            return self.__class__.__name__ + '()'

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    train_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(traindir, transforms.Compose([
            transforms.RandomSizedCrop(args.image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
            Jigsaw(),
        ])),
        batch_size=args.train_batch, shuffle=True,
        num_workers=args.workers, pin_memory=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Scale(int(256.0 / 224.0 * args.image_size)),
            transforms.CenterCrop(args.image_size),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.test_batch, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    elif args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            baseWidth=args.base_width,
            cardinality=args.cardinality,
        )
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # both branches wrap the model in DataParallel; kept as in the original
    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model = torch.nn.DataParallel(model).cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    cudnn.benchmark = True
    print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # define loss function (criterion) and optimizer
    # `nr` is the project's neural-rejuvenation module providing the criterion/optimizer wrappers
    criterion = nr.CrossEntropyLoss()
    optimizer = nr.SGD(model, lr=args.lr, momentum=args.momentum,
                       weight_decay=args.weight_decay,
                       sparsity=args.nr_sparsity_start, zero_prb=args.nr_zero_prb)

    # Resume
    title = 'ImageNet-' + args.arch
    is_rejuvenated = False
    util_params_old = 1.0
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict'], strict=False) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) is_rejuvenated = checkpoint['is_rejuvenated'] util_params_old = checkpoint['util_params_old'] args.epochs = checkpoint['epochs'] args.schedule = checkpoint['schedule'] print('==> Resumed model size: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) if args.nr_bn_target > 0: print('==> Restoring BN to {:.2f}'.format(args.nr_bn_target)) model.module.bn_restore(args.nr_bn_target) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.', 'Valid 5 Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(val_loader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val if args.nr_target >= 0.999: is_rejuvenated = True epoch = start_epoch while epoch < args.epochs: adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc, test_acc_5 = test(val_loader, model, criterion, epoch, use_cuda) # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc, test_acc_5]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), 'is_rejuvenated': is_rejuvenated, 'util_params_old': util_params_old, 'epochs': args.epochs, 'schedule': args.schedule, 'sparsity': optimizer.sparsity, }, is_best, checkpoint=args.checkpoint) epoch = epoch + 1 if not is_rejuvenated: args.epochs = args.epochs + 1 for schedule_idx in range(len(args.schedule)): args.schedule[schedule_idx] = args.schedule[schedule_idx] + 1 live_params, all_params = optimizer.inspect(flops_flag=args.nr_flops_flag) util_params = float(live_params) / float(all_params) print(' | Utilization {} at {}: {} {}'.format(util_params, optimizer.sparsity, live_params, all_params)) if util_params <= args.nr_target: shutil.copyfile(os.path.join(args.checkpoint, 'checkpoint.pth.tar'), os.path.join(args.checkpoint, 'NR_{}_before_{}.pth.tar'.format(args.arch, args.nr_target))) model = model.module model.rejuvenate(flops_flag=args.nr_flops_flag) model = torch.nn.DataParallel(model).cuda() optimizer = nr.SGD(model, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, sparsity=0, zero_prb=args.nr_zero_prb) is_rejuvenated = True save_checkpoint({ 'epoch': epoch, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), 'is_rejuvenated': is_rejuvenated, 'util_params_old': util_params_old, 'epochs': args.epochs, 'schedule': args.schedule, 'sparsity': optimizer.sparsity, }, False, checkpoint=args.checkpoint, filename='NR_{}_{}.pth.tar'.format(args.arch, args.nr_target)) if args.nr_compress_only: return if util_params_old - util_params < args.nr_increase_thres: optimizer.sparsity += args.nr_increase_step util_params_old = util_params logger.close() # logger.plot() # 
savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
def main(): chkpoint_name = args.checkpoint[args.checkpoint.rfind('/') + 1:] # import pdb; pdb.set_trace() global writer writer = SummaryWriter(log_dir='/runs/' + chkpoint_name) global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # writer = # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ) else: model = models.__dict__[args.arch](num_classes=num_classes) # model = torch.nn.DataParallel(model).cuda() # model = model.cuda(torch.device('cuda:1')) model = model.cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) torch.save(model, 'tempolary.pth') new_model = torch.load('tempolary.pth') criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' 
]) print(model) import pdb pdb.set_trace() look_up_table = get_look_up_table(model) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) if DEBUG: # print(model) show_low_rank(model, input_size=[32, 32], criterion=ValueThreshold(t), type=args.type) print(' Start decomposition:') # set different threshold for model compression and test accuracy all_channs_ = [] thresholds = np.arange( 0.1, 1.0, 0.01).tolist() #[5e-2] if args.type != 'ND' else [0.85] sigma_criterion = ValueThreshold if args.type != 'ND' else EnergyThreshold T = np.array(thresholds) cr = np.zeros(T.shape) acc = np.zeros(T.shape) model_path = 'net.pth' torch.save(model, model_path) result = 'result.pth' if not args.retrain else 'result-retrain.pth' for i, t in enumerate(thresholds): test_model = torch.load(model_path) print('=====================================Threshold is', t) cr[i], channs_ = show_low_rank(test_model, look_up_table, input_size=[32, 32], criterion=sigma_criterion(t), type=args.type) all_channs_.append(channs_) test_model = f_decouple(test_model, look_up_table, criterion=sigma_criterion(t), train=False, stride_1_only=args.stride_1_only) #print(model) print(' Done! test decoupled model') test_loss, test_acc = test(testloader, test_model, criterion, start_epoch, use_cuda) print(' Test Loss : %.8f, Test Acc: %.2f' % (test_loss, test_acc)) acc[i] = test_acc if args.retrain: # retrain model finetune_epoch = 4 acc[i] = model_retrain(finetune_epoch, test_model, trainloader, \ testloader, criterion, look_up_table, use_cuda) torch.save(test_model, 'model.pth.tar') torch.save(OrderedDict([('acc', acc), ('cr', cr)]), result) print('compression ratio:') print(cr) print('accuracy:') print(acc) print(all_channs_) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(trainloader, model, criterion, optimizer, look_up_table, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint) and args.local_rank == 0: mkdir_p(args.checkpoint) args.distributed = True args.gpu = args.local_rank torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() print('world_size = ', args.world_size) assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) elif 'resnext' in args.arch: model = models.__dict__[args.arch]( baseWidth=args.base_width, cardinality=args.cardinality, ) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() flops, params = get_model_complexity_info(model, (224, 224), as_strings=False, print_per_layer_stat=False) print('Flops: %.3f' % (flops / 1e9)) print('Params: %.2fM' % (params / 1e6)) cudnn.benchmark = True # define loss function (criterion) and optimizer # criterion = nn.CrossEntropyLoss().cuda() criterion = SoftCrossEntropyLoss(label_smoothing=args.label_smoothing).cuda() model = model.cuda() args.lr = float(0.1 * float(args.train_batch*args.world_size)/256.) state['lr'] = args.lr optimizer = set_optimizer(model) #optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, keep_batchnorm_fp32=args.keep_batchnorm_fp32, loss_scale=args.loss_scale) #model = torch.nn.DataParallel(model).cuda() #model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) model = DDP(model, delay_allreduce=True) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'valf') #normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) data_aug_scale = (0.08, 1.0) train_dataset = datasets.ImageFolder(traindir, transforms.Compose([ transforms.RandomResizedCrop(224, scale = data_aug_scale), transforms.RandomHorizontalFlip(), # transforms.ToTensor(), # normalize, ])) val_dataset = datasets.ImageFolder(valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), # transforms.ToTensor(), # normalize, ])) train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate) val_loader = torch.utils.data.DataLoader( val_dataset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, collate_fn=fast_collate) # Resume title = 'ImageNet-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..', args.resume) assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' 
args.checkpoint = os.path.dirname(args.resume) #checkpoint = torch.load(args.resume, map_location=torch.device('cpu')) checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu)) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] # model may have more keys t = model.state_dict() c = checkpoint['state_dict'] for k in t: if k not in c: print('not in loading dict! fill it', k, t[k]) c[k] = t[k] model.load_state_dict(c) print('optimizer load old state') optimizer.load_state_dict(checkpoint['optimizer']) if args.local_rank == 0: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: if args.local_rank == 0: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(val_loader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): train_sampler.set_epoch(epoch) adjust_learning_rate(optimizer, epoch) if args.local_rank == 0: print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(val_loader, model, criterion, epoch, use_cuda) # save model if args.local_rank == 0: # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) if args.local_rank == 0: logger.close() print('Best acc:') print(best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.endswith('resnet_mapping'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, MaplabelInit = init_label, feature_num=num_classes, Stable=args.stable, InitFactor = args.Ifactor, Dmode ='Dx', Afun = 'Identity', Layeradjust = 'toLabel' ) else: model = models.__dict__[args.arch](num_classes=num_classes) #print(model) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) # Resume title = 'cifar-10-' + args.arch tfboard_dir=[] if args.tfboard_dir: tfboard_dir = args.tfboard_dir else: tfboard_dir = args.checkpoint criterion = nn.MSELoss() others = [] maplable = [] for name, p in model.named_parameters(): if 'maplabel' in name: print(p) maplable += [p] else: others += [p] # pfront = parmlist.remove(model.module.maplabel) optimizer = optim.SGD([{'params': others}, {'params': maplable, 'lr': 0}, ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' 
args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), tfboard_dir=tfboard_dir,title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), tfboard_dir=tfboard_dir,title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return b = args.b for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) #index = m.sample() #b=blist[index] print('\nEpoch: [%d | %d] LR: %f dir: %s' % (epoch + 1, args.epochs, state['lr'],args.checkpoint)) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda,b) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) info = { 'train_acc': train_acc, \ 'test_acc': test_acc} for tag, value in info.items(): logger.scalar_summary(tag,value,epoch+1) if use_mapping: tag = '/maplabel' logger.histo_summary(tag, model.module.maplabel.data.cpu().numpy(), epoch + 1) if model.module.maplabel.grad is not None: tag = '/maplabel/grad' logger.histo_summary(tag, model.module.maplabel.grad.data.cpu().numpy(), epoch + 1) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.jpg')) print('Best acc:') print(best_acc)
def main(): global best_acc # global record start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('===> Preparing dataset cifar10...') transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) dataloader = datasets.CIFAR10 num_classes = 10 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> Creating Model ResNet") model = resnet.ResNet() print("==> Create Successfully!") # print(model) if not args.fixbit: print(" Train both model and bitwise parameters...") else: print( " Train model with trained fixed bit number and parameters...") model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'cifar-10-' + 'ResNet-with-BIB' if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' 
        ])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda)
        print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        resnet.RecordActivation = False
        # NOTE: every branch below assigns the same value, so FirstEpoch is
        # effectively always False regardless of args.fixbit or the epoch index.
        resnet.FirstEpoch = False
        if not args.fixbit:
            if epoch == start_epoch:
                resnet.FirstEpoch = False
            else:
                resnet.FirstEpoch = False
        else:
            resnet.FirstEpoch = False

        adjust_learning_rate(optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda)

        # append logger file
        logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc])

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'acc': test_acc,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, checkpoint=args.checkpoint)

        if not args.fixbit:
            calculate_optimal_alpha.cal_quant()

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best acc:')
    print(best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_loader = torch.utils.data.DataLoader(datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomSizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])), batch_size=args.train_batch, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(datasets.ImageFolder( valdir, transforms.Compose([ transforms.Scale(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.test_batch, shuffle=False, num_workers=args.workers, pin_memory=True) # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) elif args.arch.startswith('resnext'): model = models.__dict__[args.arch]( baseWidth=args.base_width, cardinality=args.cardinality, ) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'ImageNet-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' ]) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(val_loader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(val_loader, model, criterion, epoch, use_cuda) # append logger file logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
def main(): global best_prec1, args args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() args.total_batch_size = args.world_size * args.batch_size if not os.path.isdir(args.checkpoint) and args.local_rank == 0: mkdir_p(args.checkpoint) if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if args.static_loss_scale != 1.0: if not args.fp16: print("Warning: if --fp16 is not used, static_loss_scale will be ignored.") # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() model = model.cuda() if args.fp16: model = network_to_half(model) if args.distributed: # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf # for the older version of APEX please use shared_param, for newer one it is delay_allreduce model = DDP(model, delay_allreduce=True) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale) # optionally resume from a checkpoint title = 'ImageNet-' + args.arch if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) if args.local_rank == 0: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: print("=> no checkpoint found at '{}'".format(args.resume)) else: if args.local_rank == 0: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.', 'Valid Top5.']) traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') if(args.arch == "inception_v3"): crop_size = 299 val_size = 320 # I chose this value arbitrarily, we can adjust. 
else: crop_size = 224 val_size = 256 pipe = HybridTrainPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=traindir, crop=crop_size, dali_cpu=args.dali_cpu) pipe.build() train_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) pipe = HybridValPipe(batch_size=args.batch_size, num_threads=8, device_id=args.local_rank, data_dir=valdir, crop=crop_size, size=val_size) pipe.build() val_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) if args.evaluate: validate(val_loader, model, criterion) return total_time = AverageMeter() for epoch in range(args.start_epoch, args.epochs): # train for one epoch adjust_learning_rate(optimizer, epoch,args) if args.local_rank == 0: print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, optimizer.param_groups[0]['lr'])) [train_loss, train_acc, avg_train_time] = train(train_loader, model, criterion, optimizer, epoch) total_time.update(avg_train_time) # evaluate on validation set [test_loss, prec1, prec5] = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint if args.local_rank == 0: # append logger file logger.append([optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc, prec1, prec5]) is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint({ 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best,checkpoint=args.checkpoint) if epoch == args.epochs - 1: print('##Top-1 {0}\n' '##Top-5 {1}\n' '##Perf {2}'.format(prec1, prec5, args.total_batch_size / total_time.avg)) # reset DALI iterators train_loader.reset() val_loader.reset() if args.local_rank == 0: logger.close()
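# A minimal sketch (not this repo's actual train()/validate() helpers) of how batches from
# DALIClassificationIterator are typically unpacked, assuming the standard DALI output layout:
# each batch is a list with one dict holding a GPU-resident "data" tensor and a "label" tensor.
def train_one_epoch_dali(loader, model, criterion, optimizer):
    model.train()
    for data in loader:
        inputs = data[0]["data"]                               # NCHW batch, already on the GPU
        targets = data[0]["label"].squeeze().cuda().long()     # labels arrive shaped [N, 1]
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
    loader.reset()                                             # DALI iterators must be reset every epoch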
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ if eval: print("Evaluating: ") evaluate(env_name, num_episodes) exit() killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ) #policy.restore_weights() ## ------------- #val_func.restore_weights() ## ------------- # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False print("Scaler vars,means: ") print(scaler.vars, scaler.means) for i in range(3): run_episode(env, policy, scaler, animate=True) #policy.save_weights() #val_func.save_weights() #WARNING: scaler is disabled logger.close() policy.close_sess() val_func.close_sess()
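# add_gae() above computes Generalized Advantage Estimation. A minimal, self-contained sketch of
# that calculation (an illustration, not the repo's implementation; it assumes the terminal value
# is bootstrapped as zero at the end of each episode):
import numpy as np

def gae(rewards, values, gamma, lam):
    """adv_t = sum_l (gamma*lam)^l * delta_{t+l}, with delta_t = r_t + gamma*V(s_{t+1}) - V(s_t)."""
    next_values = np.append(values[1:], 0.0)        # treat V(s_T) as 0 at episode end
    deltas = rewards + gamma * next_values - values
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages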
class Dispatcher(object): ''' nested crawl dispatcher ''' def __init__(self, queue, cache, st): self.queue = queue self.cache = cache self.keywords = st.args.keywords self.args = st.args self.conf = st.conf self.num_of_pages = 0 self.bytes_of_pages = 0 self.max_num_pages = st.args.num self.logger = Logger(self.conf['crawl']['crawl_log']) self.stats = CrawlStats(self.conf['crawl']['crawl_stats']) self.log_queue = Queue.Queue() self.end_page_log_item = Page('none', 0, 0) # shutdown signal self.shutdown = False def bulk_url_enqueue(self, urls): ''' add a list of URLs into the crawl queue ''' for url in urls: page = Page(url, depth=1, score=9) self.queue.en_queue(page) def store_page(self, page): ''' store cralwed page into persistent store ''' try: fileto = os.path.join(self.conf['crawl']['crawl_pages'], page.store) dirname = os.path.dirname(fileto) if not os.path.exists(dirname): os.makedirs(dirname) with open(fileto, 'wb') as fpw: fpw.write(page.content.encode('utf-8')) except Exception as e: print('Error writing back %s: %s' % (page.url, str(e))) def call_crawl_page(self, page): ''' thread function to be called ''' GenericPageCrawler(page, self.queue, self.cache, self.log_queue, self.keywords, self.args.fake) def run_page_crawler(self): ''' listen to crawler priority queue and crawl pages ''' worker = Worker(self.args.thread) while True: # get one item from the queue # initialize a generic crawler instance page = self.queue.de_queue() if page: self.stats.crawl_in_progress += 1 page.time_dequeue = time.time() worker.add_task(self.call_crawl_page, page) if self.stats.crawl_in_progress == self.max_num_pages: break worker.join() self.shutdown = True self.log_queue.put(self.end_page_log_item) def run_log_writter(self): ''' listen to log FIFO queue and write back crawl logs ''' page_end = self.end_page_log_item while True: # get one log item from the queue # 1. write back crawl page meta info including page link, size, depth, score, etc. # 2. write crawled page back to the persistent storage page = self.log_queue.get() if page and page != page_end: page_size = page.size self.stats.update_crawl_bytes(page_size) self.store_page(page) self.logger.log(page) if page.error == 1: self.stats.update_crawl_count(1, success=False) else: self.stats.update_crawl_count(1, success=True) if page == page_end: break def run_progress_reporter(self): ''' report crawling progress every N seconds N is based on configuration or argument -i (interval) reporting metrics include: - # of pages crawled. - successful rate. - # % finished. - # pages / sec - # bytes / sec ''' # generate current progress report def progress_report_current(): crawl_total = self.max_num_pages crawled_pages = self.stats.crawled_pages progress = '%d/%d pages crawled. [%d%%] finished' % \ (crawled_pages, crawl_total, (100*crawled_pages/crawl_total)) return progress # generate current progress report END try: interval = self.conf['report']['interval'] except Exception: interval = 3 while True: print(time.ctime() + '\t' + progress_report_current()) time.sleep(interval) if self.shutdown: break # print finish statistics print(time.ctime() + '\t' + progress_report_current()) print('') self.stats.write_crawl_stats(sys.stdout) print('crawl finished.') def run(self): ''' run the dispatcher ''' # crawl google web search engine gs = GoogleWebCrawler(self.keywords, self.args.fake) urls = gs.query() if not urls and gs.error > 0: print('Network Error. 
Please check your network connection.') return if not urls: bs = BingWebCrawler(self.keywords, self.args.fake) urls = bs.query() if not urls: print('Seed crawl failed. Please check your network connection or contact the author.') return self.bulk_url_enqueue(urls) # launch the crawler thread t_crawler = threading.Thread(target=self.run_page_crawler) t_crawler.daemon = True t_crawler.start() # launch the log writer thread t_logger = threading.Thread(target=self.run_log_writter) t_logger.daemon = True t_logger.start() # launch the progress reporter t_reporter = threading.Thread(target=self.run_progress_reporter) t_reporter.daemon = True t_reporter.start() # wait for the workers to finish t_crawler.join() t_logger.join() # finalize statistical metrics self.stats.finalize() # close reporter t_reporter.join() # close logger self.logger.close()
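# The Dispatcher above stops its log-writer thread by pushing a sentinel Page (end_page_log_item)
# into the FIFO queue. A generic sketch of that producer/consumer sentinel pattern, using Python 3's
# queue module and illustrative names only:
import queue
import threading

SENTINEL = object()

def consumer(q):
    while True:
        item = q.get()
        if item is SENTINEL:          # sentinel seen: stop consuming
            break
        print('processed', item)

q = queue.Queue()
t = threading.Thread(target=consumer, args=(q,))
t.daemon = True
t.start()
for i in range(3):
    q.put(i)
q.put(SENTINEL)                       # tell the consumer to exit
t.join()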
def main(): # prepare dir if not os.path.exists('./snapshots'): os.mkdir('./snapshots') if not os.path.exists('./logdir'): os.mkdir('./logdir') global args, best_prec1 args = parser.parse_args() if args.seed is not None: random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True warnings.warn('You have chosen to seed training. ' 'This will turn on the CUDNN deterministic setting, ' 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') args.distributed = args.world_size > 1 if args.distributed: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size) # create model print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](width_mult=args.width_mult) if not args.distributed: if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() else: model.cuda() model = torch.nn.parallel.DistributedDataParallel(model) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint title = 'ImageNet-' + args.arch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) args.checkpoint = os.path.dirname(args.resume) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: print("=> no checkpoint found at '{}'".format(args.resume)) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' ]) cudnn.benchmark = True # Data loading code if args.data_backend == 'pytorch': get_train_loader = get_pytorch_train_loader get_val_loader = get_pytorch_val_loader elif args.data_backend == 'dali-gpu': get_train_loader = get_dali_train_loader(dali_cpu=False) get_val_loader = get_dali_val_loader() elif args.data_backend == 'dali-cpu': get_train_loader = get_dali_train_loader(dali_cpu=True) get_val_loader = get_dali_val_loader() train_loader, train_loader_len = get_train_loader( args.data, args.batch_size, workers=args.workers, input_size=args.input_size) val_loader, val_loader_len = get_val_loader(args.data, args.batch_size, workers=args.workers, input_size=args.input_size) if args.evaluate: from collections import OrderedDict if os.path.isfile(args.weight): print("=> loading pretrained weight '{}'".format(args.weight)) source_state = torch.load(args.weight) target_state = OrderedDict() for k, v in source_state.items(): if k[:7] != 'module.': k = 'module.' 
+ k target_state[k] = v model.load_state_dict(target_state) else: print("=> no weight found at '{}'".format(args.weight)) validate(val_loader, val_loader_len, model, criterion) return # visualization writer = SummaryWriter(os.path.join(args.checkpoint, 'logs')) for epoch in range(args.start_epoch, args.epochs): t1 = time.time() if args.distributed: train_sampler.set_epoch(epoch) # train for one epoch train_loss, train_acc = train(train_loader, train_loader_len, model, criterion, optimizer, epoch) # evaluate on validation set val_loss, prec1 = validate(val_loader, val_loader_len, model, criterion) elapse = time.time() - t1 h, m, s = eta_time(elapse, args.epochs - epoch - 1) lr = optimizer.param_groups[0]['lr'] # append logger file logger.append([lr, train_loss, val_loss, train_acc, prec1]) # tensorboardX writer.add_scalar('learning rate', lr, epoch + 1) writer.add_scalars('loss', { 'train loss': train_loss, 'validation loss': val_loss }, epoch + 1) writer.add_scalars('accuracy', { 'train accuracy': train_acc, 'validation accuracy': prec1 }, epoch + 1) if prec1 > best_prec1: best_prec1 = max(prec1, best_prec1) torch.save(obj={ 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, f=os.path.join(args.checkpoint, 'model.pth.tar')) print( '\n[Epoch: {:.0f} | {:.0f}], val: loss={:.6}, top1={:.6}, best=\033[31m{:.6}\033[0m, elapse={:.2f}min, eta={:.0f}h {:.0f}m {:.0f}s' .format(epoch + 1, args.epochs, val_loss, prec1, best_prec1, elapse / 60, h, m, s)) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) writer.close() print('Best accuracy:', best_prec1)
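# The evaluate branch above prepends 'module.' so a plain checkpoint loads into the
# DataParallel-wrapped model; the reverse direction comes up just as often. A minimal sketch
# (illustrative helper, not part of this script):
from collections import OrderedDict

def strip_module_prefix(state_dict):
    """Convert a DataParallel checkpoint so it loads into an unwrapped model."""
    out = OrderedDict()
    for k, v in state_dict.items():
        out[k[len('module.'):] if k.startswith('module.') else k] = v
    return out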
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) print(os.path.isdir(args.checkpoint)) # Data transform = get_transforms(input_size=args.image_size, test_size=args.image_size, backbone=None) print('==> Preparing dataset %s' % args.trainroot) trainset = dataset.Dataset(root=args.trainroot, transform=transform['val_train']) train_loader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers, pin_memory=True) valset = dataset.TestDataset(root=args.valroot, transform=transform['val_test']) val_loader = data.DataLoader(valset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers, pin_memory=True) model = make_model(args) if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = get_optimizer(model, args) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=5, verbose=False) # Resume title = 'ImageNet-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.module.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' 
]) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(val_loader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, optimizer.param_groups[0]['lr'])) train_loss, train_acc, train_5 = train(train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc, test_5 = test(val_loader, model, criterion, epoch, use_cuda) scheduler.step(test_loss) # append logger file logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) print( 'train_loss:%f, val_loss:%f, train_acc:%f, train_5:%f, val_acc:%f, val_5:%f' % (train_loss, test_loss, train_acc, train_5, test_acc, test_5)) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) if len(args.gpu_id) > 1: save_checkpoint( { 'fold': 0, 'epoch': epoch + 1, 'state_dict': model.module.state_dict(), 'train_acc': train_acc, 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, single=True, checkpoint=args.checkpoint) else: save_checkpoint( { 'fold': 0, 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'train_acc': train_acc, 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, single=True, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
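# The script above steps ReduceLROnPlateau(factor=0.2, patience=5) with the validation loss.
# A tiny, self-contained illustration of that schedule (dummy parameter, a metric that never
# improves), showing when the learning rate actually drops:
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=5)

for epoch in range(10):
    scheduler.step(1.0)                                   # metric stuck at 1.0
    print(epoch, optimizer.param_groups[0]['lr'])         # falls to 0.02 once patience is exhausted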
def main(): global best_acc, num_classes start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data transform_train = transforms.Compose([ transforms.ToPILImage(), transforms.RandomCrop(64, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToPILImage(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) restrict = [int(part) for part in args.restrict.split(",") ] if args.restrict else None num_classes = len(restrict) if restrict else 1000 trainset = train_imagenet64(args.dataset, transform=transform_train, filter=restrict) testset = test_imagenet64(args.dataset, transform=transform_test, filter=restrict) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](num_classes=num_classes) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' ]) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root=args.datapath, train=True, download=False, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root=args.datapath, train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ) else: model = models.__dict__[args.arch](num_classes=num_classes) modeltype = type(model).__name__ model,classifier = decomposeModel(model) classifier = classifier.cuda() model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) reg_net = None optimizer_reg = optim.SGD(classifier.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.mtype == 'baseline': args.dcl_refsize = 0 reg_net = regressor.Net(classifier, optimizer_reg, ref_size=args.dcl_refsize, backendtype=modeltype) elif args.mtype == 'dcl': reg_net = regressor.Net(classifier, optimizer_reg, ref_size=args.dcl_refsize, backendtype=modeltype, dcl_offset=args.dcl_offset, dcl_window=args.dcl_window) elif args.mtype == 'gem': reg_net = gem.Net(classifier, optimizer_reg, n_memories=args.gem_memsize, backendtype=modeltype) print(args) # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' 
args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Err.', 'Valid Err.', 'Cos Bef.', 'Cos Aft.', 'Mag']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate_two(optimizer, optimizer_reg, epoch) print('Epoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_top1_acc, train_top5_acc, cong_bf, cong_af, mag = train(trainloader, model, criterion, optimizer, epoch, use_cuda, reg_net=reg_net) test_loss, test_top1_acc, test_top5_acc = test(testloader, model, criterion, epoch, use_cuda, reg_net=reg_net) print('train_loss: {:.4f}, train_top1_err: {:.2f}, train_top5_err: {:.2f}, test_loss: {:.4f}, test_top1_err: {:.2f}, test_top5_err: {:.2f}, cong_bf: {:.4f}, cong_af: {:.4f}, mag: {:.4f}'.format( train_loss, 100-train_top1_acc, 100-train_top5_acc, test_loss, 100-test_top1_acc, 100-test_top5_acc, cong_bf, cong_af, mag)) # append logger file logger.append([state['lr'], train_loss, test_loss, 100-train_top1_acc, 100-test_top1_acc, cong_bf, cong_af, mag]) # save model is_best = test_top1_acc > best_acc best_acc = max(test_top1_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'classifier_state_dict': reg_net.state_dict(), 'acc': test_top1_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), 'cong_bf': cong_bf, 'cong_af': cong_af, 'mag': mag, }, is_best, checkpoint=args.checkpoint) logger.close() print('Best err:') print(100-best_acc)
def main(): global best_acc if not os.path.isdir(args.out): mkdir_p(args.out) # Data print(f'==> Preparing cifar10') transform_train = transforms.Compose([ dataset.RandomPadandCrop(32), dataset.RandomFlip(), dataset.ToTensor(), ]) transform_val = transforms.Compose([ dataset.ToTensor(), ]) train_labeled_set, train_unlabeled_set, val_set, test_set = dataset.get_cifar10( './data', args.n_labeled, transform_train=transform_train, transform_val=transform_val) labeled_trainloader = data.DataLoader(train_labeled_set, batch_size=args.batch_size, shuffle=True, num_workers=0, drop_last=True) unlabeled_trainloader = data.DataLoader(train_unlabeled_set, batch_size=args.batch_size, shuffle=True, num_workers=0, drop_last=True) val_loader = data.DataLoader(val_set, batch_size=args.batch_size, shuffle=False, num_workers=0) test_loader = data.DataLoader(test_set, batch_size=args.batch_size, shuffle=False, num_workers=0) # Model print("==> creating WRN-28-2") def create_model(ema=False): model = models.WideResNet(num_classes=10) model = model.to(dev) if ema: for param in model.parameters(): param.detach_() return model model = create_model() ema_model = create_model(ema=True) cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) train_criterion = SemiLoss() criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=args.lr) ema_optimizer = WeightEMA(model, ema_model, alpha=args.ema_decay) start_epoch = 0 # Resume title = 'noisy-cifar-10' if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.out = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) ema_model.load_state_dict(checkpoint['ema_state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.out, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.out, 'log.txt'), title=title) logger.set_names([ 'Train Loss', 'Train Loss X', 'Train Loss U', 'Valid Loss', 'Valid Acc.', 'Test Loss', 'Test Acc.' 
]) writer = SummaryWriter(args.out) step = 0 test_accs = [] # Train and val for epoch in range(start_epoch, args.epochs): print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_loss_x, train_loss_u = train( labeled_trainloader, unlabeled_trainloader, model, optimizer, ema_optimizer, train_criterion, epoch, use_cuda) _, train_acc = validate(labeled_trainloader, ema_model, criterion, epoch, use_cuda, mode='Train Stats') val_loss, val_acc = validate(val_loader, ema_model, criterion, epoch, use_cuda, mode='Valid Stats') test_loss, test_acc = validate(test_loader, ema_model, criterion, epoch, use_cuda, mode='Test Stats ') step = args.train_iteration * (epoch + 1) writer.add_scalar('losses/train_loss', train_loss, step) writer.add_scalar('losses/valid_loss', val_loss, step) writer.add_scalar('losses/test_loss', test_loss, step) writer.add_scalar('accuracy/train_acc', train_acc, step) writer.add_scalar('accuracy/val_acc', val_acc, step) writer.add_scalar('accuracy/test_acc', test_acc, step) # append logger file logger.append([ train_loss, train_loss_x, train_loss_u, val_loss, val_acc, test_loss, test_acc ]) # save model is_best = val_acc > best_acc best_acc = max(val_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'ema_state_dict': ema_model.state_dict(), 'acc': val_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best) test_accs.append(test_acc) logger.close() writer.close() print('Best acc:') print(best_acc) print('Mean acc:') print(np.mean(test_accs[-20:]))
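# The MixMatch-style loop above evaluates ema_model, whose weights are an exponential moving
# average maintained by WeightEMA. A minimal sketch of such an EMA update (the repo's own class
# may additionally fold weight decay into the online weights):
import torch

class SimpleWeightEMA(object):
    def __init__(self, model, ema_model, alpha=0.999):
        self.params = list(model.parameters())
        self.ema_params = list(ema_model.parameters())
        self.alpha = alpha

    @torch.no_grad()
    def step(self):
        for p, ema_p in zip(self.params, self.ema_params):
            ema_p.mul_(self.alpha).add_(p, alpha=1.0 - self.alpha)   # ema = a*ema + (1-a)*w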
def main(): global BEST_ACC, LR_STATE start_epoch = cfg.CLS.start_epoch # start from epoch 0 or last checkpoint epoch # Create ckpt folder if not os.path.isdir(cfg.CLS.ckpt): mkdir_p(cfg.CLS.ckpt) if args.cfg_file is not None and not cfg.CLS.evaluate: shutil.copyfile(args.cfg_file, os.path.join(cfg.CLS.ckpt, args.cfg_file.split('/')[-1])) # Dataset and Loader normalize = transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std) train_aug = [transforms.RandomResizedCrop(cfg.CLS.crop_size), transforms.RandomHorizontalFlip()] if len(cfg.CLS.rotation) > 0: train_aug.append(transforms.RandomRotation(cfg.CLS.rotation)) if len(cfg.CLS.pixel_jitter) > 0: train_aug.append(RandomPixelJitter(cfg.CLS.pixel_jitter)) if cfg.CLS.grayscale > 0: train_aug.append(transforms.RandomGrayscale(cfg.CLS.grayscale)) train_aug.append(transforms.ToTensor()) train_aug.append(normalize) traindir = os.path.join(cfg.CLS.data_root, cfg.CLS.train_folder) train_loader = torch.utils.data.DataLoader( datasets.ImageFolder(traindir, transforms.Compose(train_aug)), batch_size=cfg.CLS.train_batch, shuffle=True, num_workers=cfg.workers, pin_memory=True) if cfg.CLS.validate or cfg.CLS.evaluate: valdir = os.path.join(cfg.CLS.data_root, cfg.CLS.val_folder) val_loader = torch.utils.data.DataLoader( datasets.ImageFolder(valdir, transforms.Compose([ transforms.Resize(cfg.CLS.base_size), transforms.CenterCrop(cfg.CLS.crop_size), transforms.ToTensor(), normalize, ])), batch_size=cfg.CLS.test_batch, shuffle=False, num_workers=cfg.workers, pin_memory=True) if cfg.CLS.teacher_pretrained: if cfg.CLS.torchvision_pretrain: #teacher_model = torchvision.models.resnet50(pretrained=True) T_MODEL = eval(cfg.CLS.torchvision_pretrain) print('Teacher pretrained model init done!!!') else: print('Teacher pretrained model soeaver done!') T_MODEL = models.__dict__[cfg.CLS.teacher_arch]() t_checkpoint = torch.load(cfg.CLS.teacher_weights) try: teacher_weight_dict = t_checkpoint['state_dict'] except: teacher_weight_dict = t_checkpoint teacher_model_dict = T_MODEL.state_dict() if cfg.CLS.kd_loss: loss_kd = loss_fn_kd # Create model if cfg.CLS.teacher_pretrained: model = models.__dict__[cfg.CLS.arch](teacher_model = T_MODEL) else: model = models.__dict__[cfg.CLS.arch]() print(model) # Calculate FLOPs & Param #n_flops, n_convops, n_params = measure_model(model, cfg.CLS.crop_size, cfg.CLS.crop_size) #print('==> FLOPs: {:.4f}M, Conv_FLOPs: {:.4f}M, Params: {:.4f}M'. 
# format(n_flops / 1e6, n_convops / 1e6, n_params / 1e6)) #del model #if cfg.CLS.teacher_pretrained: # model = models.__dict__[cfg.CLS.arch](teacher_model = T_MODEL) #else: # model = models.__dict__[cfg.CLS.arch]() # Load pre-train model if cfg.CLS.pretrained: print("==> Using pre-trained model '{}'".format(cfg.CLS.pretrained)) pretrained_dict = torch.load(cfg.CLS.pretrained) try: pretrained_dict = pretrained_dict['state_dict'] except: pretrained_dict = pretrained_dict model_dict = model.state_dict() updated_dict, match_layers, mismatch_layers = weight_filler(pretrained_dict, model_dict) model_dict.update(updated_dict) model.load_state_dict(model_dict) else: print("==> Creating model '{}'".format(cfg.CLS.arch)) if cfg.CLS.teacher_pretrained: print("==> Using pre-trained teacher model '{}'".format(cfg.CLS.teacher_pretrained)) model_dict = model.state_dict() t_updated_dict, t_match_layers, t_mismatch_layers = weight_filler(teacher_weight_dict, model_dict) model_dict.update(t_updated_dict) model.load_state_dict(model_dict) # Define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() if cfg.CLS.pretrained: def param_filter(param): return param[1] new_params = map(param_filter, filter(lambda p: p[0] in mismatch_layers, model.named_parameters())) base_params = map(param_filter, filter(lambda p: p[0] in match_layers, model.named_parameters())) model_params = [{'params': base_params}, {'params': new_params, 'lr': cfg.CLS.base_lr * 10}] elif cfg.CLS.teacher_pretrained: def param_filter(param): return param[1] t_new_params = map(param_filter, filter(lambda p: p[0] in t_mismatch_layers, model.named_parameters())) t_base_params = map(param_filter, filter(lambda p: p[0] in t_match_layers, model.named_parameters())) model_params = [{'params': t_base_params}, {'params': t_new_params}] else: model_params = model.parameters() model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True optimizer = optim.SGD(model_params, lr=cfg.CLS.base_lr, momentum=cfg.CLS.momentum, weight_decay=cfg.CLS.weight_decay) # Evaluate model if cfg.CLS.evaluate: print('\n==> Evaluation only') test_loss, test_top1, test_top5 = test(val_loader, model, criterion, start_epoch, USE_CUDA) print('==> Test Loss: {:.8f} | Test_top1: {:.4f}% | Test_top5: {:.4f}%'.format(test_loss, test_top1, test_top5)) return # Resume training title = 'Pytorch-CLS-' + cfg.CLS.arch if cfg.CLS.resume: # Load checkpoint. print("==> Resuming from checkpoint '{}'".format(cfg.CLS.resume)) assert os.path.isfile(cfg.CLS.resume), 'Error: no checkpoint directory found!' 
checkpoint = torch.load(cfg.CLS.resume) BEST_ACC = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(cfg.CLS.ckpt, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(cfg.CLS.ckpt, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) # Train and val for epoch in range(start_epoch, cfg.CLS.epochs): print('\nEpoch: [{}/{}] | LR: {:.8f}'.format(epoch + 1, cfg.CLS.epochs, LR_STATE)) if cfg.CLS.kd_loss: print('kd_train init done!!') alpha = cfg.CLS.alpha temperature = cfg.CLS.temperature train_loss, train_acc = train_kd(train_loader, model, loss_kd, optimizer, epoch, alpha, temperature, USE_CUDA) else: train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, USE_CUDA) if cfg.CLS.validate: test_loss, test_top1, test_top5 = test(val_loader, model, criterion, epoch, USE_CUDA) else: test_loss, test_top1, test_top5 = 0.0, 0.0, 0.0 # Append logger file logger.append([LR_STATE, train_loss, test_loss, train_acc, test_top1]) # Save model save_checkpoint(model, optimizer, test_top1, epoch) # Draw curve try: draw_curve(cfg.CLS.arch, cfg.CLS.ckpt) print('==> Success saving log curve...') except: print('==> Saving log curve error...') logger.close() try: savefig(os.path.join(cfg.CLS.ckpt, 'log.eps')) shutil.copyfile(os.path.join(cfg.CLS.ckpt, 'log.txt'), os.path.join(cfg.CLS.ckpt, 'log{}.txt'.format( datetime.datetime.now().strftime('%Y%m%d%H%M%S')))) except: print('Copy log error.') print('==> Training Done!') print('==> Best acc: {:.4f}%'.format(BEST_ACC))
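# train_kd() above blends a hard cross-entropy term with a soft-target distillation term controlled
# by alpha and temperature. A minimal sketch of a Hinton-style loss_fn_kd (the exact weighting used
# by this repo is an assumption):
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, targets, alpha, temperature):
    t = temperature
    soft = F.kl_div(F.log_softmax(student_logits / t, dim=1),
                    F.softmax(teacher_logits / t, dim=1),
                    reduction='batchmean') * (alpha * t * t)   # T^2 keeps soft-target gradients on scale
    hard = F.cross_entropy(student_logits, targets) * (1.0 - alpha)
    return soft + hard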
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, block_name=args.block_name, ) elif args.arch.startswith('preresnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, block_name=args.block_name, ) else: print('Model is specified wrongly - Use standard model') model = models.__dict__[args.arch](num_classes=num_classes) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) criterion = nn.CrossEntropyLoss() if args.optimizer.lower() == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'radam': optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adamw': optimizer = AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay, warmup=args.warmup) elif args.optimizer.lower() == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay) elif args.optimizer.lower() == 'srsgd': iter_count = 1 optimizer = SRSGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, iter_count=iter_count, restarting_iter=args.restart_schedule[0]) # Resume title = 'cifar-10-' + args.arch logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) schedule_index = 1 # Resume title = '%s-'%args.dataset + args.arch if args.resume: # Load checkpoint. 
print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) if args.optimizer.lower() == 'srsgd': iter_count = optimizer.param_groups[0]['iter_count'] schedule_index = checkpoint['schedule_index'] state['lr'] = optimizer.param_groups[0]['lr'] logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): if args.optimizer.lower() == 'srsgd': if epoch in args.schedule: optimizer = SRSGD(model.parameters(), lr=args.lr * (args.gamma**schedule_index), weight_decay=args.weight_decay, iter_count=iter_count, restarting_iter=args.restart_schedule[schedule_index]) schedule_index += 1 else: adjust_learning_rate(optimizer, epoch) logger.file.write('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) if args.optimizer.lower() == 'srsgd': train_loss, train_acc, iter_count = train(trainloader, model, criterion, optimizer, epoch, use_cuda, logger) else: train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda, logger) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda, logger) # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) writer.add_scalars('train_loss', {args.model_name: train_loss}, epoch) writer.add_scalars('test_loss', {args.model_name: test_loss}, epoch) writer.add_scalars('train_acc', {args.model_name: train_acc}, epoch) writer.add_scalars('test_acc', {args.model_name: test_acc}, epoch) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'schedule_index': schedule_index, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, epoch, checkpoint=args.checkpoint) logger.file.write('Best acc:%f'%best_acc) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc) with open("./all_results.txt", "a") as f: fcntl.flock(f, fcntl.LOCK_EX) f.write("%s\n"%args.checkpoint) f.write("best_acc %f\n\n"%best_acc) fcntl.flock(f, fcntl.LOCK_UN)
with torch.no_grad(): end = time.time() for i, (data, target) in enumerate(test_loader): data = data.cuda() target = target.cuda() # compute output pred = classifier(model(data)) loss = F.cross_entropy(pred, target, reduction='mean') # measure accuracy and record loss top1, = accuracy(pred, target, topk=(1,)) losses.update(loss.item(), data.size(0)) acc.update(top1.item(), data.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: logger.info('Test: [{0}/{1}] Time {btime.val:.3f} (avg={btime.avg:.3f}) ' 'Test Loss {loss.val:.3f} (avg={loss.avg:.3f}) ' 'Acc {acc.val:.3f} (avg={acc.avg:.3f})' \ .format(i, len(test_loader), btime=batch_time, loss=losses, acc=acc)) logger.info(' * Accuracy {acc.avg:.5f}'.format(acc=acc)) return acc.avg # Train and evaluate the model main() writer.close() logger.close()
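# The evaluation loop above tracks batch_time, losses and acc with AverageMeter-style objects that
# expose .val and .avg. A minimal sketch of such a meter, for reference:
class AverageMeter(object):
    """Keeps the latest value and the running average of a scalar."""
    def __init__(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count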
def main(): best_acc = 0 start_epoch = args.start_epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) trainloader = getdata(args, train=True) testloader = getdata(args, train=False) model = VGG(args.attention, args.nclass) if args.gpu: if torch.cuda.is_available(): model = model.cuda() cudnn.benchmark = True else: print( 'There is no cuda available on this machine use cpu instead.') args.gpu = False criterion = nn.CrossEntropyLoss() optimizer = '' if args.optimizer == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer == 'adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: print(args.optimizer, 'is not correct') return title = 'cifar-10-' + args.attention if args.evaluate: print('\nEvaluation only') assert os.path.isfile( args.evaluate), 'Error: no checkpoint directory found!' checkpoint = torch.load(args.evaluate) model.load_state_dict(checkpoint['state_dict']) test_loss, test_acc = test(model, testloader, criterion, args.gpu) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, state['attention'] + '-' + 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, state['attention'] + '-' + 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' ]) for epoch in range(start_epoch, args.epochs): start_time = time.time() adjust_learning_rate(optimizer, epoch) train_loss, train_acc = train(model, trainloader, criterion, optimizer, epoch, args.gpu) test_loss, test_acc = test(model, testloader, criterion, args.gpu) if sys.version[0] == '3': train_acc = train_acc.cpu().numpy().tolist()[0] test_acc = test_acc.cpu().numpy().tolist()[0] logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), 'attention': state['attention'], }, is_best, checkpoint=args.checkpoint) print(time.time() - start_time) print( "epoch: {:3d}, lr: {:.8f}, train-loss: {:.3f}, test-loss: {:.3f}, train-acc: {:2.3f}, test_acc:, {:2.3f}" .format(epoch, state['lr'], train_loss, test_loss, train_acc, test_acc)) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, state['attention'] + '-' + 'log.eps')) print('Best acc:', best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch os.makedirs(args.save_dir, exist_ok=True) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ #frequence_func, transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ #frequence_func, transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ) else: model = models.__dict__[args.arch](num_classes=num_classes) model.cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'cifar-10-' + args.arch if args.init_pth is None: save_checkpoint({'state_dict': model.state_dict()}, False, checkpoint=args.save_dir, filename='init.pth.tar') else: init_checkpoint = torch.load(args.init_pth) model.load_state_dict(init_checkpoint['state_dict']) if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' 
args.save_dir = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.save_dir, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.save_dir, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return train_losses = [] train_acces = [] test_losses = [] test_acces = [] # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) train_losses.append(train_loss) train_acces.append(train_acc) test_acces.append(test_acc) test_losses.append(test_loss) # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, checkpoint=args.save_dir) logger.close() sns.lineplot(x=range(len(test_acces)), y=test_acces, color='red',dashes=True) plt.xlabel("episode") plt.ylabel("accuracy") plt.legend() plt.savefig(os.path.join(args.save_dir,"test.png")) print('Best acc:') print(best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # load data print('==> Preparing dataset %s' % args.dataset) features, landmarks_g, landmarks_h, labels = pickle_2_img_and_landmark_ag( args.dataset_path) num_classes = 6 # Model print("==> creating model '{}'".format(args.arch)) # model = ResNetAndGCN(20, num_classes=num_classes) model = ModelAGATT(2, 36, 6, {}, False, dropout=0.3) # model = torch.nn.DataParallel(model).cuda() model = model.cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) # print(' resnet params: %.2fM' % (sum(p.numel() for p in model.resnet.parameters())/1000000.0)) # print(' stgcn params: %.2fM' % (sum(p.numel() for p in model.st_gcn.parameters())/1000000.0)) criterion = nn.CrossEntropyLoss() # per-layer optimization: separate lr / weight decay for the backbone parameter groups # resnet_para = [model.conv1.parameters(), model.layer1.parameters(), model.layer2.parameters(), model.layer3.parameters(), model.layer4.parameters()] # optimizer = optim.SGD([ # {'params': model.gcn11.parameters()}, # {'params': model.gcn12.parameters()}, # {'params': model.gcn21.parameters()}, # {'params': model.gcn22.parameters()}, # {'params': model.gcn31.parameters()}, # {'params': model.gcn32.parameters()}, # {'params': model.fc.parameters()}, # {'params': model.conv1.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.bn1.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.layer1.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.layer2.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.layer3.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.layer4.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) # Resume title = 'ckp-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log_stat.log'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log_stat.log'), title=title) logger.set_names([ 'fold_num', 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.'
]) # logging logging.basicConfig(level=logging.DEBUG, filename=os.path.join(args.checkpoint, 'log_info.log'), filemode='a+', format="%(asctime)-15s %(levelname)-8s %(message)s") # log configuration logging.info('-' * 10 + 'configuration' + '*' * 10) for arg in vars(args): logging.info((arg, str(getattr(args, arg)))) acc_fold = [] reset_lr = state['lr'] for f_num in range(args.folds): state['lr'] = reset_lr model.reset_all_weights() # optimizer = optim.SGD([ # {'params': model.gcn11.parameters()}, # {'params': model.gcn12.parameters()}, # {'params': model.gcn21.parameters()}, # {'params': model.gcn22.parameters()}, # {'params': model.gcn31.parameters()}, # {'params': model.gcn32.parameters()}, # {'params': model.fc.parameters()}, # {'params': model.conv1.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.bn1.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.layer1.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.layer2.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.layer3.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # {'params': model.layer4.parameters(), 'lr': 0.005, 'weight_decay': 5e-3}, # ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) print(args.lr) # save each fold's acc and reset configuration average_acc = 0 best_acc = 0 # 10-fold cross validation train_x, train_lm_g, train_lm_h, train_y = [], [], [], [] test_x, test_lm_g, test_lm_h, test_y = [], [], [], [] for id_fold in range(args.folds): if id_fold == f_num: test_x = features[id_fold] test_lm_g = landmarks_g[id_fold] test_lm_h = landmarks_h[id_fold] test_y = labels[id_fold] else: train_x = train_x + features[id_fold] train_lm_g = train_lm_g + landmarks_g[id_fold] train_lm_h = train_lm_h + landmarks_h[id_fold] train_y = train_y + labels[id_fold] # convert array to tensor train_x = torch.tensor(train_x, dtype=torch.float) / 255.0 #(b_s, 128, 128) train_x = train_x.unsqueeze(1) #(b_s, 1, 128, 128) train_lm_g = np.stack(train_lm_g) train_lm_h = np.stack(train_lm_h) # only the raw landmark coordinates are needed, no normalization # train_lm_g = (train_lm_g - np.mean(train_lm_g, axis=0)) / np.std(train_lm_g, axis=0) # train_lm_g = (train_lm_g - np.amin(train_lm_g)) / np.amax(train_lm_g) - np.amin(train_lm_g) # train_lm_h = (train_lm_h - np.mean(train_lm_h, axis=0)) / np.std(train_lm_h, axis=0) train_lm_g = torch.tensor(train_lm_g, dtype=torch.float) train_lm_h = torch.tensor(train_lm_h, dtype=torch.float) # train_lm = train_lm.unsqueeze(2) test_x = torch.tensor(test_x, dtype=torch.float) / 255.0 test_x = test_x.unsqueeze(1) # only the raw landmark coordinates are needed, no normalization # test_lm_g = (test_lm_g - np.mean(test_lm_g, axis=0)) / np.std(test_lm_g, axis=0) # test_lm_g = (test_lm_g - np.amin(test_lm_g)) / np.amax(test_lm_g) - np.amin(test_lm_g) # test_lm_h = (test_lm_h - np.mean(test_lm_h, axis=0)) / np.std(test_lm_h, axis=0) test_lm_g = torch.tensor(test_lm_g, dtype=torch.float) test_lm_h = torch.tensor(test_lm_h, dtype=torch.float) # test_lm = test_lm.unsqueeze(2) train_y, test_y = torch.tensor(train_y), torch.tensor(test_y) train_dataset = torch.utils.data.TensorDataset(train_x, train_lm_g, train_lm_h, train_y) train_iter = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.train_batch, shuffle=True) test_dataset = torch.utils.data.TensorDataset(test_x, test_lm_g, test_lm_h, test_y) test_iter
= torch.utils.data.DataLoader(dataset=test_dataset, batch_size=args.test_batch, shuffle=False) # check the fold order print(len(test_dataset)) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(train_x + test_x, train_y + test_y, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) continue # show plt # plt.show(block=False) # Train and val for epoch in range(start_epoch, args.epochs): # adjust the learning rate at the scheduled epochs adjust_learning_rate(optimizer, epoch) # print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, optimizer.param_groups[0]['lr'])) train_loss, train_acc = train(train_iter, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(test_iter, model, criterion, epoch, use_cuda) # append logger file logger.append([ f_num, state['lr'], train_loss, test_loss, train_acc, test_acc ]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, f_num, checkpoint=args.checkpoint) # compute average acc acc_fold.append(best_acc) average_acc = sum(acc_fold) / len(acc_fold) logging.info('fold: %d, best_acc: %.2f, average_acc: %.2f' % (f_num, best_acc, average_acc)) logger.close() # logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) logging.info('acc_fold: ' + str(acc_fold)) print('average acc:') print(average_acc)
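For reference, a minimal standalone sketch of the leave-one-fold-out split performed by the loop above: each iteration holds out one fold as the test set and concatenates the rest for training. The helper name `make_fold_split` and the toy data are illustrative only, not part of the script.

def make_fold_split(folds, held_out):
    """Return (train, test): fold `held_out` is the test set, the others are concatenated."""
    test = folds[held_out]
    train = []
    for i, fold in enumerate(folds):
        if i != held_out:
            train = train + fold  # same list-concatenation pattern as the script
    return train, test

if __name__ == "__main__":
    folds = [[0, 1], [2, 3], [4, 5]]  # toy 3-fold data
    for k in range(len(folds)):
        train, test = make_fold_split(folds, k)
        print(k, train, test)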
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 elif args.dataset == 'cifar100': dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=1) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=1) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ratio=args.ratio ) else: model = models.__dict__[args.arch](num_classes=num_classes, ratio=args.ratio) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.evaluate: print('\nEvaluation train') checkpoint = torch.load(args.save_path + '/' + 'model_best.pth.tar') model.load_state_dict(checkpoint['state_dict']) print('\nEvaluation test') test_loss, test_top1, test_top5 = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test top1: %.2f, Test top5: %.2f' % (test_loss, test_top1, test_top5)) return if args.finetune: # Load checkpoint. print('==> finetune a pretrained model..') print(args.model_path) assert os.path.isfile(args.model_path), 'Error: no checkpoint directory found!' checkpoint = torch.load(args.model_path) model.load_state_dict(checkpoint['state_dict'], strict=False) # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') print(args.save_path + '/' + 'checkpoint.pth.tar') assert os.path.isfile(args.save_path + '/' + 'checkpoint.pth.tar'), 'Error: no checkpoint directory found!'
checkpoint = torch.load(args.save_path + '/' + 'checkpoint.pth.tar') best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] # recover learning rate for epoch in args.schedule: if start_epoch > epoch: state['lr'] *= args.gamma model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.save_path, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.save_path, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train top1', 'Train top5', 'Valid top1.', 'Valid top5']) # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_top1, train_top5 = train(trainloader, model, criterion, optimizer, epoch, use_cuda, args) test_loss, test_top1, test_top5 = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append([state['lr'], train_loss, test_loss, train_top1, train_top5, test_top1, test_top5]) print(args.save_path) # save model is_best = test_top1 > best_acc best_acc = max(test_top1, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_top1, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, save_path=args.save_path) logger.close() print('Best acc:') print(best_acc)
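A small self-contained sketch of the learning-rate recovery performed in the resume branch above: the base LR is decayed by gamma once for every scheduled milestone that the checkpointed run has already passed. The function name `recover_lr` and the numeric values are hypothetical.

def recover_lr(base_lr, schedule, gamma, start_epoch):
    """Recompute the step-decayed LR a resumed run should start from."""
    lr = base_lr
    for milestone in schedule:
        if start_epoch > milestone:
            lr *= gamma
    return lr

print(recover_lr(0.1, [80, 120], 0.1, start_epoch=100))  # 0.01: one milestone passed
print(recover_lr(0.1, [80, 120], 0.1, start_epoch=150))  # 0.001: both milestones passed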
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) #158 dd #My dataset create ############################################## def make_datapath_list(phase="train"): rootpath = "./dataset/CUB-200-2011_rename_fixations" #rootpath = "./" target_path = os.path.join(rootpath, phase, "**", "*.jpg") # the first ** is the class directory path_list = [] # use glob to collect file paths down to the subdirectories for path in glob.glob( "./dataset/CUB-200-2011_rename_fixations/*/*.jpg"): # changes with the directory depth #for path in glob.glob("./dataset/CUB-200-2011_rename_fixations/*.jpg"): path_list.append(path) return path_list #001.Black_footed_Albatross class ImageTransform(): def __init__(self): self.data_transform = transforms.Compose([ transforms.Resize((256, 256)), transforms.RandomHorizontalFlip(p=0.5), transforms.RandomVerticalFlip(p=0.5), transforms.ToTensor(), transforms.Normalize((0.1948, 0.2155, 0.1589), (0.2333, 0.2278, 0.26106)) ]) def __call__(self, img): return self.data_transform(img) ############################################ class Dataset(data.Dataset): def __init__(self, file_list, transform=None): self.file_list = file_list self.file_transform = transform # instance of the preprocessing class def __len__(self): return len(self.file_list) def __getitem__(self, index): img_path = self.file_list[index] # path of the index-th sample img = Image.open(img_path) # load the index-th image #import pdb;pdb.set_trace() img_transformed = self.file_transform(img) # convert to a tensor via the preprocessing class #import pdb;pdb.set_trace label = img_path.split("/")[3] # adjust this list index to the directory depth (3 here) label = label.split(".")[0] # drop everything after the dot label = int(label) # convert the label to an integer #import pdb;pdb.set_trace #label = convert(label) # convert the class directory string to a number ################################ csv_path = img_path.split("/")[4] csv_path = csv_path.split(".")[0] csv_path = inclusive_index(csv_list, csv_path) #return img_transformed, label, img_path, csv_path return img_transformed, label, csv_path trainval_dataset = Dataset(file_list=make_datapath_list(phase="train"), transform=ImageTransform()) csv_list = glob.glob( "./dataset/CUB-200-2011_rename_fixations/*/*_fixtaions.csv") #csv_list = glob.glob("./dataset/CUB-200-2011_rename_fixations/001.Black_footed_Albatross/*_fixtaions.csv") n_train = int(len(csv_list) * 0.7) n_val = len(csv_list) - n_train def inclusive_index(lst, purpose): for i, e in enumerate(lst): if purpose in e: return i raise IndexError batch_size = 8 #train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # split into train and val train_dataset, val_dataset = torch.utils.data.random_split( trainval_dataset, [n_train, n_val]) #trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True) testloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True) # Model print("==> creating model '{}'".format(args.arch)) model = vgg_11.vgg11( ) # instantiate the model; this simplified version consists of 3 conv layers, 1 ConvLSTM cell, and a fully connected layer model.load_state_dict( torch.load("./models/cifar/pretrain/vgg11-bbd30ac9.pth"), strict=False) #import pdb;pdb.set_trace() #model.load_state_dict(torch.load("./models/cifar/pretrain/vgg19-dcbb9e9d.pth"), strict=False) model = torch.nn.DataParallel(model).to(device=device) cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) criterion = nn.CrossEntropyLoss() optimizer =
optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' ]) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda) #import pdb;pdb.set_trace() test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) #is_bets = 0 #bets_acc = 0 ######################################### # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
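A standalone sketch of the label parsing done in `Dataset.__getitem__` above, assuming paths of the form `./dataset/<split-root>/NNN.ClassName/image.jpg`; the helper name `label_from_path` and the example path are made up for illustration.

def label_from_path(img_path, depth=3):
    """Class id is the numeric prefix of the class directory, e.g. '001.Black_footed_Albatross' -> 1."""
    class_dir = img_path.split("/")[depth]   # pick the class directory component
    return int(class_dir.split(".")[0])      # keep only the numeric prefix

print(label_from_path("./dataset/CUB-200-2011_rename_fixations/001.Black_footed_Albatross/img_0001.jpg"))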
def main(): global best_acc, state start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) # if args.dataset == 'cifar10': # dataloader = datasets.CIFAR10 # num_classes = 10 # else: # dataloader = datasets.CIFAR100 # num_classes = 100 # trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) # trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) # testset = dataloader(root='./data', train=False, download=False, transform=transform_test) # testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) import cifar_custom if args.dataset == 'cifar10': dataloader = cifar_custom.CIFAR10 num_classes = 10 else: dataloader = cifar_custom.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train, imbalance=True) ############## oversample minority classes ################## import numpy as np targets = trainset.train_labels class_count = np.unique(targets, return_counts=True)[1] weight = 1. / class_count sample_weights = torch.from_numpy(weight[list(map(int, targets))]) sampler = torch.utils.data.sampler.WeightedRandomSampler( sample_weights, len(sample_weights)) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, num_workers=args.workers, sampler=sampler) ############################################################## # trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ) else: model = models.__dict__[args.arch](num_classes=num_classes) ################################ import math class GLayer(nn.Module): def __init__(self, num_classes=10, feat_dim=2, use_gpu=True, scale=1): super(GLayer, self).__init__() self.centers = nn.Parameter(torch.Tensor( num_classes, feat_dim, )) self.reset_parameters() self.sigma = 10 self.scale = scale self.bn = nn.BatchNorm1d(feat_dim) def reset_parameters(self): stdv = 1. 
/ math.sqrt(self.centers.size(1)) self.centers.data.uniform_(-stdv, stdv) def forward(self, x): # x = self.bn(x) x_norm = (x**2).sum(1).view(-1, 1) y = self.centers y_t = torch.transpose(y, 0, 1) y_norm = (y**2).sum(1).view(1, -1) dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t) dist = torch.clamp(dist, 0.0, 1e2) # return 1.0/(1.0+dist) sim = self.scale * torch.exp(-dist / self.sigma) return sim class FullModel(nn.Module): def __init__(self, base_model, g_layer, linear_layer): super(FullModel, self).__init__() self.base = base_model # self.lin = linear_layer self.gl = g_layer def forward(self, x): x = self.base(x) x = x.view(x.size(0), -1) return self.gl(x) base_model = nn.Sequential(*list(model.children())[:-1]) head = GLayer(num_classes=num_classes, feat_dim=342).cuda() # model=nn.Sequential(model,HeadNet) linear_layer = nn.Linear(342, num_classes).cuda() model = FullModel(base_model, head, linear_layer) ################################ model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) ################################ class HingeLossSim(nn.Module): def __init__(self, num_classes=10, use_gpu=True, margin=1): super(HingeLossSim, self).__init__() self.num_classes = num_classes self.use_gpu = use_gpu self.margin = margin def forward(self, x, labels): batch_size = x.size(0) classes = torch.arange(self.num_classes).long() if self.use_gpu: classes = classes.cuda() labels = labels.unsqueeze(1).expand(batch_size, self.num_classes) mask = labels.eq(classes.expand(batch_size, self.num_classes)) neg = x[~mask].view(batch_size, -1) pos = x[mask].unsqueeze(1).expand(neg.size()) # return torch.mean(torch.relu(neg-pos+self.margin)) return torch.mean(torch.relu(neg - pos + self.margin)) ################################ criterion = nn.CrossEntropyLoss() # criterion_xent = nn.CrossEntropyLoss() criterion_xent = nn.MultiMarginLoss(p=2, margin=1) # criterion_xent = HingeLossSim() optimizer_model = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, momentum=args.momentum) # optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer_model.load_state_dict(checkpoint['optimizer_model']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' 
]) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion_xent, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer_model, epoch) print('\nEpoch: [%d | %d] LR: %f ' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(trainloader, model, criterion_xent, criterion, optimizer_model, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion_xent, criterion, epoch, use_cuda) # append logger file logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer_model': optimizer_model.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
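A quick standalone check of the distance computation in `GLayer.forward` above: squared Euclidean distances to the class centers via ||x||^2 + ||y||^2 - 2xy^T, clamped and then mapped through a Gaussian kernel exp(-d / sigma). The feature dimension, number of centers, and batch size are toy values, not the script's.

import torch

x = torch.randn(4, 8)                          # batch of 4 features
centers = torch.randn(3, 8)                    # 3 class centers
x_norm = (x ** 2).sum(1).view(-1, 1)
y_norm = (centers ** 2).sum(1).view(1, -1)
dist = x_norm + y_norm - 2.0 * x @ centers.t()
dist = torch.clamp(dist, 0.0, 1e2)
sim = torch.exp(-dist / 10.0)                  # sigma = 10, scale = 1 as in the script
print(torch.allclose(dist, torch.cdist(x, centers) ** 2, atol=1e-4))  # same quantity, sanity check
print(sim.shape)                               # (4, 3) class-similarity scores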
def main(): global best_acc start_epoch = args.start_epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if args.data_augmentation: transform_train = transforms.Compose([ transforms.RandomHorizontalFlip(), RandomVerticalFlip(), transforms.Scale(256), transforms.RandomSizedCrop(224), ColorJitter(0.5, 0.5, 0.5), RandomRotation(30), RandomAffine(30), transforms.ToTensor(), normalize ]) else: transform_train = transforms.Compose([ transforms.RandomSizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize ]) #trainset = Dataset(root=args.root, filename=args.train_list, transform=transform_train) trainset = ImageWeightSampling(args.root, args.train_list, transform=transform_train) testset = Dataset(root=args.root_val, filename=args.val_list, transform=transforms.Compose([ transforms.Scale(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize ])) #sampler = torch.utils.data.sampler.WeightedRandomSampler(trainset.sample_weights, len(trainset.sample_weights)) #train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.train_epoch,sampler=sampler, num_workers=args.workers, pin_memory=True) train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.train_epoch, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(testset, batch_size=args.test_epoch, shuffle=False, num_workers=args.workers, pin_memory=True) #create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) #check if pretrained model exists pretrained_filename = os.path.join('./pretrained', args.arch) assert os.path.isfile( pretrained_filename ), 'Error: no pretrained checkpoint directory found!' 
model = models.__dict__[args.arch](pretrained=True, num_classes=args.num_classes) elif args.arch.startswith('resnext') or args.arch.startswith('se_resnext'): print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](baseWidth=args.base_width, cardinality=args.cardinality, num_classes=args.num_classes, attention=args.attention) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](num_classes=args.num_classes) if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) if args.weight_loss: assert os.path.isfile( args.weight_file ), 'Error: weight-loss training requested but weight file not found' weights = json.load(open(args.weight_file)) weight_ = [] for i in range(len(weights.keys())): weight_.append(weights[str(i)]) weight_ = np.array(weight_, dtype=np.float32) weight_ = weight_ / sum(weight_) * (args.num_classes + 0.0) * 0.5 + 0.5 print(weight_) weight_ = torch.from_numpy(weight_) if args.weight_loss: #criterion = nn.CrossEntropyLoss(weight=weight_,reduce=False).cuda() criterion = nn.CrossEntropyLoss(weight=weight_).cuda() elif args.mix_loss: center = np.loadtxt('center.txt') criterion = MixLoss(torch.from_numpy(center).cuda()).cuda() else: criterion = nn.CrossEntropyLoss().cuda() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) #Resume title = 'Imagenet-' + args.arch if args.resume: print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' ]) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(val_loader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(val_loader, model, criterion, epoch, use_cuda) #append logger file logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) #save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint(epoch, { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc: ') print(best_acc)
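A hedged sketch of the class-weight construction used for the weighted CrossEntropyLoss above: raw per-class weights are rescaled to sum to the number of classes and then squashed toward 1 (0.5 * w + 0.5), so no class is ever weighted to zero. The raw weights and batch below are invented; only the rescaling expression follows the script.

import numpy as np
import torch
import torch.nn as nn

raw = np.array([1.0, 4.0, 0.5, 2.5], dtype=np.float32)   # e.g. inverse class frequencies
num_classes = len(raw)
weight = raw / raw.sum() * num_classes * 0.5 + 0.5        # same expression as the script
print(weight)                                              # stays in a moderate range around 1

criterion = nn.CrossEntropyLoss(weight=torch.from_numpy(weight))
logits = torch.randn(8, num_classes)
labels = torch.randint(0, num_classes, (8,))
print(criterion(logits, labels))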
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root=args.data_root, train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root=args.data_root, train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ) else: model = models.__dict__[args.arch](num_classes=num_classes) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' 
args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.save_dir): mkdir_p(args.save_dir) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) model_ref = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) model_ref = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) model_ref = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ) model_ref = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, ) else: model = models.__dict__[args.arch](num_classes=num_classes) model_ref = models.__dict__[args.arch](num_classes=num_classes) model = torch.nn.DataParallel(model).cuda() model_ref = torch.nn.DataParallel(model_ref).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # default is 0.001 # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Getting reference model from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' checkpoint = torch.load(args.resume) model_ref.load_state_dict(checkpoint['state_dict']) logger = Logger(os.path.join(args.save_dir, 'log_scratch.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.' 
]) # set some weights to zero, according to model_ref --------------------------------- for m, m_ref in zip(model.modules(), model_ref.modules()): if isinstance(m, nn.Conv2d): weight_copy = m_ref.weight.data.abs().clone() mask = weight_copy.gt(0).float().cuda() n = mask.sum() / float(m.in_channels) m.weight.data.normal_(0, math.sqrt(2. / n)) m.weight.data.mul_(mask) # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) num_parameters = get_conv_zero_param(model) print('Zero parameters: {}'.format(num_parameters)) num_parameters = sum( [param.nelement() for param in model.parameters()]) print('Parameters: {}'.format(num_parameters)) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append( [state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.save_dir) logger.close() print('Best acc:') print(best_acc)
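A standalone sketch of the masked re-initialization loop above: weights are re-drawn from a normal distribution but forced to zero wherever the pruned reference model already has zeros, so the scratch model keeps the reference sparsity pattern. The layer sizes and the pruning threshold are toy values, and the example runs on CPU rather than calling .cuda().

import math
import torch
import torch.nn as nn

ref = nn.Conv2d(3, 8, 3)
with torch.no_grad():
    ref.weight[ref.weight.abs() < 0.05] = 0.0         # pretend this conv was magnitude-pruned

scratch = nn.Conv2d(3, 8, 3)
mask = ref.weight.data.abs().gt(0).float()             # 1 where the reference kept a weight
n = mask.sum() / float(scratch.in_channels)            # fan-in style scaling, as in the script
scratch.weight.data.normal_(0, math.sqrt(2. / n))
scratch.weight.data.mul_(mask)                         # zero out the pruned positions

print('kept weights: %d / %d' % (int(mask.sum()), mask.numel()))
print('zeros preserved:', bool((scratch.weight.data[mask == 0] == 0).all()))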
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_loader = torch.utils.data.DataLoader( datasets.ImageFolder(traindir, transforms.Compose([ transforms.RandomSizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])), batch_size=args.train_batch, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader( datasets.ImageFolder(valdir, transforms.Compose([ transforms.Scale(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.test_batch, shuffle=False, num_workers=args.workers, pin_memory=True) # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) elif args.arch.startswith('resnext'): model = models.__dict__[args.arch]( baseWidth=args.base_width, cardinality=args.cardinality, ) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Resume title = 'ImageNet-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(val_loader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(val_loader, model, criterion, epoch, use_cuda) # append logger file logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
def main(): global best_top1, best_top5 args.world_size = 1 start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') crop_size = 224 val_size = 256 pipe = HybridTrainPipe(batch_size=args.train_batch, num_threads=args.workers, device_id=args.local_rank, data_dir=traindir, crop=crop_size, dali_cpu=args.dali_cpu) pipe.build() train_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) pipe = HybridValPipe(batch_size=args.test_batch, num_threads=args.workers, device_id=args.local_rank, data_dir=valdir, crop=crop_size, size=val_size) pipe.build() val_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) elif args.arch.startswith('resnext'): model = models.__dict__[args.arch]( baseWidth=args.base_width, cardinality=args.cardinality, ) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() if args.optimizer.lower() == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer.lower() == 'adamw': optimizer = AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay, warmup = 0) elif args.optimizer.lower() == 'radam': optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay) elif args.optimizer.lower() == 'lsadam': optimizer = LSAdamW(model.parameters(), lr=args.lr*((1.+4.*args.sigma)**(0.25)), betas=(args.beta1, args.beta2), weight_decay=args.weight_decay, sigma=args.sigma) elif args.optimizer.lower() == 'lsradam': sigma = 0.1 optimizer = LSRAdam(model.parameters(), lr=args.lr*((1.+4.*args.sigma)**(0.25)), betas=(args.beta1, args.beta2), weight_decay=args.weight_decay, sigma=args.sigma) elif args.optimizer.lower() == 'srsgd': iter_count = 1 optimizer = SGD_Adaptive(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, iter_count=iter_count, restarting_iter=args.restart_schedule[0]) elif args.optimizer.lower() == 'sradam': iter_count = 1 optimizer = SRNAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, restarting_iter=args.restart_schedule[0]) elif args.optimizer.lower() == 'sradamw': iter_count = 1 optimizer = SRAdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, warmup = 0, restarting_iter=args.restart_schedule[0]) elif args.optimizer.lower() == 'srradam': #NOTE: need to double-check this iter_count = 1 optimizer = SRRAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, warmup = 0, restarting_iter=args.restart_schedule[0]) # Resume title = 'ImageNet-' + args.arch if args.resume: # Load checkpoint. 
print('==> Resuming from checkpoint..') assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!' args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_top1 = checkpoint['best_top1'] best_top5 = checkpoint['best_top5'] start_epoch = checkpoint['epoch'] - 1 model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) iter_count = optimizer.param_groups[0]['iter_count'] logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Top1', 'Valid Top1', 'Train Top5', 'Valid Top5']) logger.file.write(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters())/1000000.0)) if args.evaluate: logger.file.write('\nEvaluation only') test_loss, test_top1, test_top5 = test(val_loader, model, criterion, start_epoch, use_cuda, logger) logger.file.write(' Test Loss: %.8f, Test Top1: %.2f, Test Top5: %.2f' % (test_loss, test_top1, test_top5)) return # Train and val schedule_index = 1 for epoch in range(start_epoch, args.epochs): if args.optimizer.lower() == 'srsgd': if epoch in args.schedule: optimizer = SGD_Adaptive(model.parameters(), lr=args.lr * (args.gamma**schedule_index), weight_decay=args.weight_decay, iter_count=iter_count, restarting_iter=args.restart_schedule[schedule_index]) schedule_index += 1 elif args.optimizer.lower() == 'sradam': if epoch in args.schedule: optimizer = SRNAdam(model.parameters(), lr=args.lr * (args.gamma**schedule_index), betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, restarting_iter=args.restart_schedule[schedule_index]) schedule_index += 1 elif args.optimizer.lower() == 'sradamw': if epoch in args.schedule: optimizer = SRAdamW(model.parameters(), lr=args.lr * (args.gamma**schedule_index), betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, warmup = 0, restarting_iter=args.restart_schedule[schedule_index]) schedule_index += 1 elif args.optimizer.lower() == 'srradam': if epoch in args.schedule: optimizer = SRRAdam(model.parameters(), lr=args.lr * (args.gamma**schedule_index), betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, warmup = 0, restarting_iter=args.restart_schedule[schedule_index]) schedule_index += 1 else: adjust_learning_rate(optimizer, epoch) logger.file.write('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) if args.optimizer.lower() == 'srsgd' or args.optimizer.lower() == 'sradam' or args.optimizer.lower() == 'sradamw' or args.optimizer.lower() == 'srradam': train_loss, train_top1, train_top5, iter_count = train(train_loader, model, criterion, optimizer, epoch, use_cuda, logger) else: train_loss, train_top1, train_top5 = train(train_loader, model, criterion, optimizer, epoch, use_cuda, logger) test_loss, test_top1, test_top5 = test(val_loader, model, criterion, epoch, use_cuda, logger) # append logger file logger.append([state['lr'], train_loss, test_loss, train_top1, test_top1, train_top5, test_top5]) writer.add_scalars('train_loss', {args.model_name: train_loss}, epoch) writer.add_scalars('test_loss', {args.model_name: test_loss}, epoch) writer.add_scalars('train_top1', {args.model_name: train_top1}, epoch) writer.add_scalars('test_top1', {args.model_name: test_top1}, epoch) writer.add_scalars('train_top5', {args.model_name: train_top5}, epoch) 
writer.add_scalars('test_top5', {args.model_name: test_top5}, epoch) # save model is_best = test_top1 > best_top1 best_top1 = max(test_top1, best_top1) best_top5 = max(test_top5, best_top5) save_checkpoint({ 'epoch': epoch + 1, 'schedule_index': schedule_index, 'state_dict': model.state_dict(), 'top1': test_top1, 'top5': test_top5, 'best_top1': best_top1, 'best_top5': best_top5, 'optimizer' : optimizer.state_dict(), }, is_best, epoch, checkpoint=args.checkpoint) # reset DALI iterators train_loader.reset() val_loader.reset() logger.file.write('Best top1: %f'%best_top1) logger.file.write('Best top5: %f'%best_top5) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best top1: %f'%best_top1) print('Best top5: %f'%best_top5) with open("./all_results_imagenet.txt", "a") as f: fcntl.flock(f, fcntl.LOCK_EX) f.write("%s\n"%args.checkpoint) f.write("best_top1 %f, best_top5 %f\n\n"%(best_top1,best_top5)) fcntl.flock(f, fcntl.LOCK_UN)
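A minimal sketch of the milestone-driven restart schedule used for the SR* optimizers above: at each epoch in `schedule` the optimizer is rebuilt with the base LR decayed by another factor of gamma and the next restarting_iter taken from `restart_schedule`. This toy loop only tracks the values and does not construct the project's SRSGD/SRAdamW classes; all numbers are illustrative.

base_lr, gamma = 0.1, 0.1
schedule = [30, 60, 80]
restart_schedule = [40, 20, 10, 5]

schedule_index = 1
lr, restart_iter = base_lr, restart_schedule[0]
for epoch in range(90):
    if epoch in schedule:
        lr = base_lr * (gamma ** schedule_index)          # one more decay per milestone
        restart_iter = restart_schedule[schedule_index]    # shorter restart cycle later in training
        schedule_index += 1
        print('epoch %d: lr=%g, restarting_iter=%d' % (epoch, lr, restart_iter))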