def migrate_models(model, target_model, best_epoch, model_name='marvis_mobilenet_multi_gpu'):
    """Copy the weights of a pre-trained model into a new model and save it.

    This code snippet is meant to adapt a pre-trained model to a new model
    containing buffers. The parameters and buffers of every Conv2d, Linear and
    BatchNorm2d module of ``model`` are cloned into the module found at the
    same position in ``target_model``; the result is written with
    ``save_checkpoint`` under ``./trained_models/<model_name>``.

    Args:
        model: source network holding the trained weights.
        target_model: destination network. It is wrapped in
            ``torch.nn.DataParallel`` when the global ``args.gpus`` is not None.
        best_epoch: value stored under ``'epoch'`` in the checkpoint.
        model_name: sub-directory of ``./trained_models`` receiving the checkpoint.

    Raises:
        IndexError: if ``target_model`` has fewer matching modules than ``model``.
    """
    migratable = (torch.nn.Conv2d, torch.nn.Linear, torch.nn.BatchNorm2d)

    module_list = [m for m in model.modules() if isinstance(m, migratable)]
    if args.gpus is not None:
        target_model = torch.nn.DataParallel(target_model, args.gpus)
    target_module_list = [m for m in target_model.modules() if isinstance(m, migratable)]

    # Modules are paired purely by their position in the traversal order.
    for idx, m in enumerate(module_list):
        for p in m._parameters:
            if m._parameters[p] is not None:
                target_module_list[idx]._parameters[p].data = m._parameters[p].data.clone()
        for b in m._buffers:  # For batchnorm stats
            if m._buffers[b] is not None:
                target_module_list[idx]._buffers[b].data = m._buffers[b].data.clone()

    save_dir = os.path.join('./trained_models', model_name)
    if not os.path.isdir(save_dir):
        # makedirs (not mkdir) so a missing './trained_models' parent is created too.
        os.makedirs(save_dir)

    save_checkpoint(
        {
            'epoch': best_epoch,
            'model': args.model,
            'config': args.model_config,
            'state_dict': target_model.state_dict(),
            # NOTE(review): 'best_prec1' is filled with the epoch number, not a
            # precision value -- looks unintended; confirm with the caller.
            'best_prec1': best_epoch
        },
        True,
        path=save_dir)
def main():
    """Train or evaluate a model according to the parsed command-line arguments.

    Parses ``parser`` into the global ``args``, prepares the results directory,
    logging and the model, and optionally loads a checkpoint given by
    ``args.evaluate`` or ``args.resume``. With ``args.evaluate`` set, a single
    validation pass is run and the function returns. Otherwise the model is
    trained from ``args.start_epoch`` to ``args.epochs``; after every epoch a
    checkpoint is saved and the results log and plots are updated.
    """
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # Compare strings by value: `is ''` relies on interning and is not guaranteed.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '')
    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    logging.info("creating model %s", args.model)
    model = models.__dict__[args.model]
    model_config = {'input_size': args.input_size, 'dataset': args.dataset}

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            # A directory means a previous run: reload its results and best model.
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file)
            args.start_epoch = checkpoint['epoch'] - 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # Data loading code
    default_transform = {
        'train': get_transform(args.dataset,
                               input_size=args.input_size, augment=True),
        'eval': get_transform(args.dataset,
                              input_size=args.input_size, augment=False)
    }
    # The model may override the transforms and the optimization regime.
    transform = getattr(model, 'input_transform', default_transform)
    regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])

    # define loss function (criterion) and optimizer
    criterion = getattr(model, 'criterion', nn.CrossEntropyLoss)()
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    val_data = get_dataset(args.dataset, 'val', transform['eval'])
    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, 0)
        return

    train_data = get_dataset(args.dataset, 'train', transform['train'])
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    optimizer = OptimRegime(model.parameters(), regime)
    logging.info('training regime: %s', regime)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_prec1, train_prec5 = train(
            train_loader, model, criterion, epoch, optimizer)

        # evaluate on validation set
        val_loss, val_prec1, val_prec5 = validate(
            val_loader, model, criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'regime': regime
            },
            is_best,
            path=save_path)
        logging.info('\n Epoch: {0}\t'
                     'Training Loss {train_loss:.4f} \t'
                     'Training Prec@1 {train_prec1:.3f} \t'
                     'Training Prec@5 {train_prec5:.3f} \t'
                     'Validation Loss {val_loss:.4f} \t'
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \n'
                     .format(epoch + 1, train_loss=train_loss, val_loss=val_loss,
                             train_prec1=train_prec1, val_prec1=val_prec1,
                             train_prec5=train_prec5, val_prec5=val_prec5))

        results.add(epoch=epoch + 1, train_loss=train_loss, val_loss=val_loss,
                    train_error1=100 - train_prec1, val_error1=100 - val_prec1,
                    train_error5=100 - train_prec5, val_error5=100 - val_prec5)
        results.plot(x='epoch', y=['train_loss', 'val_loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['train_error1', 'val_error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['train_error5', 'val_error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        results.save()
def main():
    """Train or evaluate a model, optionally with distributed data parallelism.

    Parses ``parser`` into the global ``args``, initialises the process group
    when ``args.local_rank >= 0`` or ``args.world_size > 1``, then sets up the
    results directory, logging, model, criterion, optimizer regime and
    ``Trainer``. With ``args.evaluate`` set, one validation pass is run, its
    result is logged and the function returns. Otherwise the model is trained
    from ``args.start_epoch`` to ``args.epochs``. Ranks with
    ``args.local_rank > 0`` in a distributed run skip checkpointing, result
    logging and plotting.
    """
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # Compare strings by value: `is ''` relies on interning and is not guaranteed.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)

    args.distributed = args.local_rank >= 0 or args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    if not os.path.exists(save_path) and not (args.distributed and args.local_rank > 0):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            # A directory means a previous run: reload its results and best model.
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file)
            if args.start_epoch < 0:  # not explicitly set
                args.start_epoch = checkpoint['epoch'] - 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])

    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime, use_float_copy='half' in args.dtype)

    trainer = Trainer(model, criterion, optimizer,
                      device_ids=args.device_ids, device=args.device, dtype=dtype,
                      distributed=args.distributed, local_rank=args.local_rank,
                      grad_clip=args.grad_clip, print_freq=args.print_freq,
                      adapt_grad_norm=args.adapt_grad_norm)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={
                              'datasets_path': args.datasets_dir,
                              'name': args.dataset,
                              'split': 'val',
                              'augment': False,
                              'input_size': args.input_size,
                              'batch_size': args.eval_batch_size,
                              'shuffle': False,
                              'num_workers': args.workers,
                              'pin_memory': True,
                              'drop_last': False
                          })

    if args.evaluate:
        eval_results = trainer.validate(val_data.get_loader())
        logging.info(eval_results)
        return

    # Training Data loading code
    train_data = DataRegime(getattr(model, 'data_regime', None),
                            defaults={
                                'datasets_path': args.datasets_dir,
                                'name': args.dataset,
                                'split': 'train',
                                'augment': True,
                                'input_size': args.input_size,
                                'batch_size': args.batch_size,
                                'shuffle': True,
                                'num_workers': args.workers,
                                'pin_memory': True,
                                'drop_last': True,
                                'distributed': args.distributed,
                                'duplicates': args.duplicates,
                                'cutout': {'holes': 1, 'length': 16} if args.cutout else None
                            })

    logging.info('optimization regime: %s', optim_regime)
    # A negative start_epoch means "not explicitly set"; clamp it before use.
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)
    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(),
                                      duplicates=train_data.get('duplicates'),
                                      chunk_batch=args.chunk_batch)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())

        # Only rank 0 (or a non-distributed run) checkpoints and reports.
        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1
            },
            is_best,
            path=save_path)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results, val=val_results))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        results.add(**values)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()
def main_worker(args):
    """Train or evaluate a ResNet configured for the dropout experiments.

    Sets up (optionally distributed) execution, the results directory, logging,
    the ``resnet`` model, criterion, optimizer regime and ``Trainer``. With
    ``args.evaluate`` set, one validation pass is run, its result is logged and
    the function returns. Otherwise the model is trained from
    ``args.start_epoch`` to ``args.epochs``. Ranks with ``args.local_rank > 0``
    in a distributed run skip checkpointing, result logging and plotting.

    Args:
        args: parsed argument namespace; several fields (``save``,
            ``results_dir``, ``distributed``, ``local_rank``, ``world_size``,
            ``device_ids``, ``eval_batch_size``, ``start_epoch``) are updated
            in place.
    """
    global best_prec1, dtype
    best_prec1 = 0
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # Compare strings by value: `is ''` relies on interning and is not guaranteed.
    if args.save == '':
        args.save = time_stamp
    save_path = path.join(args.results_dir, args.save)

    args.distributed = args.local_rank >= 0 or args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    # Every rank creates the directory, dumps its args and logs (dummy=False).
    if not path.exists(save_path):
        makedirs(save_path)
    dump_args(args, path.join(save_path, 'args.txt'))

    setup_logging(path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=False)

    results_path = path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # All parameters to the model should be passed via this dict.
    model_config = {
        'dataset': args.dataset,
        'dp_type': args.dropout_type,
        'dp_percentage': args.dropout_perc,
        'dropout': args.drop_rate,
        'device': args.device
    }

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    # create Resnet model
    model = resnet(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # NOTE: checkpoint loading for args.evaluate / args.resume is disabled in
    # this variant -- no checkpoint is loaded into the model or the optimizer
    # here; args.resume only affects how logging is set up.

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])

    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime, use_float_copy='half' in args.dtype)

    trainer = Trainer(model, criterion, optimizer,
                      device_ids=args.device_ids, device=args.device, dtype=dtype,
                      distributed=args.distributed, local_rank=args.local_rank,
                      mixup=args.mixup, loss_scale=args.loss_scale,
                      grad_clip=args.grad_clip, print_freq=args.print_freq,
                      adapt_grad_norm=args.adapt_grad_norm)
    if args.tensorwatch:
        trainer.set_watcher(filename=path.abspath(path.join(save_path, 'tensorwatch.log')),
                            port=args.tensorwatch_port)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={
                              'datasets_path': args.datasets_dir,
                              'name': args.dataset,
                              'split': 'val',
                              'augment': False,
                              'input_size': args.input_size,
                              'batch_size': args.eval_batch_size,
                              'shuffle': False,
                              'num_workers': args.workers,
                              'pin_memory': True,
                              'drop_last': False
                          })

    if args.evaluate:
        eval_results = trainer.validate(val_data.get_loader())
        logging.info(eval_results)
        return

    # Training Data loading code
    train_data_defaults = {
        'datasets_path': args.datasets_dir,
        'name': args.dataset,
        'split': 'train',
        'augment': True,
        'input_size': args.input_size,
        'batch_size': args.batch_size,
        'shuffle': True,
        'num_workers': args.workers,
        'pin_memory': True,
        'drop_last': True,
        'distributed': args.distributed,
        'duplicates': args.duplicates,
        'autoaugment': args.autoaugment,
        'cutout': {'holes': 1, 'length': 16} if args.cutout else None
    }

    if hasattr(model, 'sampled_data_regime'):
        # Each entry pairs a sampling probability with overrides of the defaults.
        sampled_data_regime = model.sampled_data_regime
        probs, regime_configs = zip(*sampled_data_regime)
        regimes = []
        for config in regime_configs:
            defaults = {**train_data_defaults}
            defaults.update(config)
            regimes.append(DataRegime(None, defaults=defaults))
        train_data = SampledDataRegime(regimes, probs)
    else:
        train_data = DataRegime(getattr(model, 'data_regime', None),
                                defaults=train_data_defaults)

    logging.info('optimization regime: %s', optim_regime)
    # A negative start_epoch means "not explicitly set"; clamp it before use.
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)
    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(),
                                      chunk_batch=args.chunk_batch)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())

        # Only rank 0 (or a non-distributed run) checkpoints and reports.
        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)

        if args.drop_optim_state:
            optim_state_dict = None
        else:
            optim_state_dict = optimizer.state_dict()

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'optim_state_dict': optim_state_dict,
                'best_prec1': best_prec1
            },
            is_best,
            path=save_path,
            save_all=False)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results, val=val_results))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        results.add(**values)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()
def main_worker(args, ml_logger):
    """Train a model (optionally with ZeroBN pruning statistics) and log metrics.

    Sets up (optionally distributed) execution, the save directory, logging,
    the model, criterion, optimizer regime and ``Trainer``; optionally resumes
    model and optimizer state from ``args.resume``. When ``'zeroBN'`` is a key
    of the model configuration, a partial "hot start" training pass is run and
    the ``ZeroBN`` modules are configured from ``args`` before the main loop.
    The model is then trained from ``args.start_epoch`` to ``args.epochs``.
    Ranks with ``args.local_rank > 0`` in a distributed run skip checkpointing,
    result logging and plotting.

    Args:
        args: parsed argument namespace; several fields (``distributed``,
            ``local_rank``, ``world_size``, ``device_ids``, ``eval_batch_size``,
            ``start_epoch``) are updated in place.
        ml_logger: metric logger; passed to ``trainer.train`` and used to log
            the validation ``prec1`` after every epoch.
    """
    global best_prec1, dtype
    best_prec1 = 0
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)

    args.distributed = args.local_rank >= 0 or args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    # Only rank 0 (or a non-distributed run) prepares the output directory.
    if not (args.distributed and args.local_rank > 0):
        if not path.exists(args.save_path):
            makedirs(args.save_path)
        export_args_namespace(args, path.join(args.save_path, 'config.json'))

    # Compare strings by value: `is not ''` relies on interning and is not guaranteed.
    setup_logging(path.join(args.save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    results_path = path.join(args.save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", args.save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # Default to "no optimizer state" so the name is always bound, including
    # when args.resume is set but no checkpoint file is found.
    optim_state_dict = None
    if args.resume:
        checkpoint_file = args.resume
        if path.isdir(checkpoint_file):
            # A directory means a previous run: reload its results and best model.
            results.load(path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = path.join(
                checkpoint_file, 'model_best.pth.tar')
        if path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file, map_location="cpu")
            if args.start_epoch < 0:  # not explicitly set
                args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optim_state_dict = checkpoint.get('optim_state_dict', None)
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])

    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime, use_float_copy='half' in args.dtype)

    if optim_state_dict is not None:
        optimizer.load_state_dict(optim_state_dict)

    trainer = Trainer(
        model,
        criterion,
        optimizer,
        device_ids=args.device_ids,
        device=args.device,
        dtype=dtype,
        print_freq=args.print_freq,
        distributed=args.distributed,
        local_rank=args.local_rank,
        mixup=args.mixup,
        cutmix=args.cutmix,
        loss_scale=args.loss_scale,
        grad_clip=args.grad_clip,
        adapt_grad_norm=args.adapt_grad_norm,
    )

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={
                              'datasets_path': args.datasets_dir,
                              'name': args.dataset,
                              'split': 'val',
                              'augment': False,
                              'input_size': args.input_size,
                              'batch_size': args.eval_batch_size,
                              'shuffle': False,
                              'num_workers': args.workers,
                              'pin_memory': True,
                              'drop_last': False
                          })

    # Training Data loading code
    train_data_defaults = {
        'datasets_path': args.datasets_dir,
        'name': args.dataset,
        'split': 'train',
        'augment': True,
        'input_size': args.input_size,
        'batch_size': args.batch_size,
        'shuffle': True,
        'num_workers': args.workers,
        'pin_memory': True,
        'drop_last': True,
        'distributed': args.distributed,
        'duplicates': args.duplicates,
        'autoaugment': args.autoaugment,
        'cutout': {'holes': 1, 'length': 16} if args.cutout else None
    }

    if hasattr(model, 'sampled_data_regime'):
        # Each entry pairs a sampling probability with overrides of the defaults.
        sampled_data_regime = model.sampled_data_regime
        probs, regime_configs = zip(*sampled_data_regime)
        regimes = []
        for config in regime_configs:
            defaults = {**train_data_defaults}
            defaults.update(config)
            regimes.append(DataRegime(None, defaults=defaults))
        train_data = SampledDataRegime(regimes, probs)
    else:
        train_data = DataRegime(getattr(model, 'data_regime', None),
                                defaults=train_data_defaults)

    logging.info('optimization regime: %s', optim_regime)
    logging.info('data regime: %s', train_data)
    # A negative start_epoch means "not explicitly set"; clamp it before use.
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)

    if 'zeroBN' in model_config:
        # hot start: train for half of the loader length before configuring ZeroBN
        num_steps = int(len(train_data.get_loader()) * 0.5)
        trainer.train(train_data.get_loader(),
                      chunk_batch=args.chunk_batch, num_steps=num_steps)

        for m in model.modules():
            if isinstance(m, ZeroBN):
                m.max_sparsity = args.max_sparsity
                m.max_cos_sim = args.max_cos_sim
                if args.preserve_cosine:
                    # Layers are selected by substring match on the module's fullName.
                    if args.layers_cos_sim1 in m.fullName:
                        m.preserve_cosine = args.preserve_cosine
                        m.cos_sim = args.cos_sim1
                    if args.layers_cos_sim2 in m.fullName:
                        m.preserve_cosine = args.preserve_cosine
                        m.cos_sim = args.cos_sim2
                    if args.layers_cos_sim3 in m.fullName:
                        m.preserve_cosine = args.preserve_cosine
                        m.cos_sim = args.cos_sim3
                if args.min_cos_sim:
                    if args.layers_min_cos_sim1 in m.fullName:
                        m.min_cos_sim = args.min_cos_sim
                        m.cos_sim_min = args.cos_sim_min1
                    if args.layers_min_cos_sim2 in m.fullName:
                        m.min_cos_sim = args.min_cos_sim
                        m.cos_sim_min = args.cos_sim_min2

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        if 'zeroBN' in model_config:
            # NOTE(review): collectStat is called twice with identical
            # arguments -- kept as in the original; confirm this is intended.
            trainer.collectStat(train_data.get_loader(), num_steps=1,
                                prunRatio=args.stochasticPrunning,
                                cos_sim=args.cos_sim, cos_sim_max=args.cos_sim_max)
            trainer.collectStat(train_data.get_loader(), num_steps=1,
                                prunRatio=args.stochasticPrunning,
                                cos_sim=args.cos_sim, cos_sim_max=args.cos_sim_max)

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(), ml_logger,
                                      chunk_batch=args.chunk_batch)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())
        ml_logger.log_metric('Val Acc1', val_results['prec1'], step='auto')

        # Only rank 0 (or a non-distributed run) checkpoints and reports.
        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)

        if args.drop_optim_state:
            optim_state_dict = None
        else:
            optim_state_dict = optimizer.state_dict()

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'optim_state_dict': optim_state_dict,
                'best_prec1': best_prec1
            },
            is_best,
            path=args.save_path,
            save_all=args.save_all)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results, val=val_results))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        results.add(**values)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()
def main():
    """Train and validate the video compressive-sensing model.

    Parses the command line into the module-level ``args``, builds the model
    selected by ``args.arch`` (wrapped in ``DataParallel`` and moved to CUDA),
    configures the SGD optimizer and LR schedule, optionally resumes from
    ``args.resume``, and then runs the train/validate loop, saving a
    checkpoint and appending to the results log after every epoch.

    Side effects: creates ``save_path`` on disk, configures logging, seeds
    the Python / NumPy / torch RNGs, and rebinds the globals ``args`` and
    ``best_psnr``.
    """
    global args, best_psnr
    args = parser.parse_args()

    # massage args
    # NOTE: `block_opts` aliases `args.block_opts`, so the append below also
    # extends the list later handed to the datasets via `args.block_opts`.
    block_opts = args.block_opts
    block_opts.append(args.block_overlap)

    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # Compare by value: identity comparison against a str literal is
    # implementation-dependent and raises SyntaxWarning on Python >= 3.8.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log_%s.txt' % time_stamp))
    results_file = os.path.join(save_path, 'results.%s')
    results = ResultsLog(results_file % 'csv', results_file % 'html')

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # The encoder (measurement layer) is trained only for a positive LR.
    encoder_learn = args.encoder_lr > 0

    # create model
    model_kwargs = {}
    if args.pretrained_net is not None:
        logging.info("=> using pre-trained model '{}'".format(args.arch))
        model_kwargs['pretrained'] = args.pretrained_net
    else:
        logging.info("=> creating model '{}'".format(args.arch))
    model_kwargs.update(
        mask_path=args.mask_path,
        mean=args.mean,
        std=args.std,
        noise=args.noise,
        encoder_learn=encoder_learn,
        p=args.bernoulli_p,
        K=args.layers_k,
    )
    model = models.__dict__[args.arch](block_opts, **model_kwargs)
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    # define loss function (criterion) and optimizer
    mseloss = loss.EuclideanDistance(args.batch_size)

    # annealing schedule
    if encoder_learn:
        optimizer = torch.optim.SGD(
            [
                {'params': model.module.measurements.parameters(),
                 'lr': args.encoder_lr},
                {'params': model.module.reconstruction.parameters()},
            ],
            args.decoder_lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay)

        def lambda1(epoch):
            # Encoder LR factor: zero from epoch encoder_annual[2] onwards,
            # otherwise encoder_annual[0] raised to the number of
            # encoder_annual[1]-spaced milestones already passed.
            return 0.0 if epoch >= args.encoder_annual[2] else (
                args.encoder_annual[0] ** bisect_right(
                    range(args.encoder_annual[1],
                          args.encoder_annual[2],
                          args.encoder_annual[1]),
                    epoch))

        def lambda2(epoch):
            # Decoder LR factor: a single decay step at decoder_annual[1].
            return args.decoder_annual[0] ** bisect_right(
                [args.decoder_annual[1]], epoch)

        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=[lambda1, lambda2])
    else:
        optimizer = torch.optim.SGD(
            [{'params': model.module.reconstruction.parameters()}],
            args.decoder_lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[args.decoder_annual[1]],
            gamma=args.decoder_annual[0])

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logging.info("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_psnr = checkpoint['best_psnr']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logging.info("=> loaded checkpoint '{}' (epoch {})"
                         .format(args.resume, checkpoint['epoch']))
        else:
            logging.info("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    train_loader = torch.utils.data.DataLoader(
        datasets.videocs.VideoCS(args.data_train, args.block_opts,
                                 transforms.Compose([
                                     transforms.ToTensor(),
                                 ]), hdf5=args.hdf5),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.videocs.VideoCS(args.data_val, args.block_opts,
                                 transforms.Compose([
                                     transforms.ToTensor(),
                                 ]), hdf5=False),
        batch_size=1, shuffle=False,
        num_workers=0, pin_memory=True)

    # Save initial mask
    if encoder_learn:
        initial_weights = binarization(
            model.module.measurements.weight.clone())
        perc_1 = initial_weights.mean().cpu().data.numpy()[0]
        logging.info('Percentage of 1: {}'.format(perc_1))
        np.save(save_path + '/initial_mask.npy',
                model.module.measurements.weight.clone())
    else:
        # binarize weights
        model.module.measurements.binarization()
        perc_1 = model.module.measurements.weight.clone().mean().cpu().data.numpy()[0]
        logging.info('Percentage of 1: {}'.format(perc_1))

    # perform first validation
    validate(val_loader, model, encoder_learn)

    for epoch in range(args.start_epoch, args.epochs):
        # Annealing schedule enforcement
        scheduler.step()
        logging.info(scheduler.get_lr())

        if encoder_learn:
            save_binary_weights_before = binarization(
                model.module.measurements.weight.clone())

        # train for one epoch
        train_loss = train(train_loader, model, optimizer, epoch,
                           mseloss, encoder_learn, args.gradient_clipping)

        if encoder_learn:
            save_binary_weights_after = binarization(
                model.module.measurements.weight.clone())
            # Builtin int replaces the removed `np.int` alias (same type).
            diff = int(torch.abs(save_binary_weights_after -
                                 save_binary_weights_before).sum().cpu().data.numpy())
            perc_1 = save_binary_weights_after.mean().cpu().data.numpy()[0]
            logging.info(
                'Binary Weights Changed: {} - Percentage of 1: {}'.format(diff, perc_1))
        else:
            # Previously assigned to a misspelled `perc1`, so the log line
            # below kept reporting the stale pre-training value.
            perc_1 = model.module.measurements.weight.clone().mean().cpu().data.numpy()[0]
            logging.info('Percentage of 1: {}'.format(perc_1))

        # evaluate on validation set
        psnr = validate(val_loader, model, encoder_learn)

        # remember best psnr and save checkpoint
        # NOTE(review): `best_psnr` is only assigned here when resuming;
        # presumably it is initialised at module level - confirm.
        is_best = psnr > best_psnr
        best_psnr = max(psnr, best_psnr)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_psnr': best_psnr,
            'optimizer': optimizer.state_dict(),
        }, is_best, path=save_path)
        results_add(epoch, results, train_loss, psnr)

        if encoder_learn:
            model.module.measurements.restore()
def main():
    """Run parameter-server training (or evaluation) for the selected model.

    Parses the command line into the module-level ``args``, prepares the
    output directory and logging, builds the model, optionally loads a
    checkpoint (``--evaluate`` or ``--resume``), creates the parameter
    server and trainer, and runs the epoch loop with periodic checkpointing
    and TensorBoard logging.

    Returns:
        ``None`` when ``args.evaluate`` is set (validation results are only
        logged). Otherwise a tuple ``(errors, args)`` where ``errors`` is the
        error summary dict of the last epoch processed by this process, or
        ``None`` if this process never produced one (no epoch ran, or it is a
        distributed worker with ``local_rank > 0``).
    """
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # Value comparison: `is ''` relies on string interning and triggers a
    # SyntaxWarning on Python >= 3.8.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    else:
        print('***************************************\n'
              'Warning: PATH exists - override warning\n'
              '***************************************')

    args.distributed = args.local_rank >= 0 or args.world_size > 1
    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    if args.deterministic:
        logging.info('Deterministic Run Set')
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    if args.distributed:
        args.device_ids = [args.local_rank]
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        # NOTE(review): this re-enables cudnn.benchmark even when
        # --deterministic disabled it above - confirm this is intended.
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    set_global_seeds(args.seed)
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    shards = None
    x = None
    checkpoint = None
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        x = dict()
        # Drop the first 7 characters of every key (presumably the
        # 'module.' prefix added by DataParallel - TODO confirm).
        for name, val in checkpoint['server_state_dict'].items():
            x[name[7:]] = val
        model.load_state_dict(x)
        shards = checkpoint['server_weight_shards']
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file,
                                    map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch'] - 1
            best_prec1 = checkpoint['best_prec1']
            # model_dict = {'.'.join(k.split('.')[1:]): v for k, v in checkpoint['server_state_dict'].items()}
            # model.load_state_dict(model_dict)
            model.load_state_dict(checkpoint['server_state_dict'])
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
            shards = checkpoint['server_weight_shards']
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])

    cpu_store = args.dataset == 'imagenet' and args.workers_num > 32
    # A non-positive delay falls back to the 'ssgd' server.
    args.server = args.server if args.delay > 0 else 'ssgd'
    server = ParameterServer.get_server(args.server, args.delay,
                                        model=model,
                                        shards=shards,
                                        optimizer_regime=optim_regime,
                                        device_ids=args.device_ids,
                                        device=args.device,
                                        dtype=dtype,
                                        distributed=args.distributed,
                                        local_rank=args.local_rank,
                                        grad_clip=args.grad_clip,
                                        workers_num=args.workers_num,
                                        cpu_store=cpu_store)
    # Drop the references to the loaded checkpoint before training starts.
    del shards, x, checkpoint
    torch.cuda.empty_cache()

    trainer = Trainer(model, server, criterion,
                      device_ids=args.device_ids,
                      device=args.device,
                      dtype=dtype,
                      distributed=args.distributed,
                      local_rank=args.local_rank,
                      workers_number=args.workers_num,
                      grad_clip=args.grad_clip,
                      print_freq=args.print_freq,
                      schedule=args.schedule)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={
                              'datasets_path': args.datasets_dir,
                              'name': args.dataset,
                              'split': 'val',
                              'augment': False,
                              'input_size': args.input_size,
                              'batch_size': args.eval_batch_size,
                              'shuffle': False,
                              'num_workers': args.workers,
                              'pin_memory': True,
                              'drop_last': True
                          })

    # Training Data loading code
    train_data = DataRegime(getattr(model, 'data_regime', None),
                            defaults={
                                'datasets_path': args.datasets_dir,
                                'name': args.dataset,
                                'split': 'train',
                                'augment': args.augment,
                                'input_size': args.input_size,
                                'batch_size': args.batch_size,
                                'shuffle': True,
                                'num_workers': args.workers,
                                'pin_memory': True,
                                'drop_last': True,
                                'distributed': args.distributed,
                                'duplicates': args.duplicates,
                                'cutout': {
                                    'holes': 1,
                                    'length': 16
                                } if args.cutout else None
                            })

    if args.evaluate:
        trainer.forward_pass(train_data.get_loader(),
                             duplicates=args.duplicates)
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    logging.info('optimization regime: %s', optim_regime)
    trainer.training_steps = args.start_epoch * len(train_data)
    args.iterations_steps = trainer.training_steps

    with open(os.path.join(save_path, 'args.txt'), 'w') as file:
        file.write(dict_to_table(vars(args)))
    tb.init(path=save_path, title='Training Results', params=args,
            res_iterations=args.resolution)

    # Bound up-front so the final `return` cannot hit an unbound local when
    # the loop body never reaches the summary (no epochs, or rank > 0).
    errors = None
    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(),
                                      duplicates=args.duplicates)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())

        # Only rank 0 (or a non-distributed run) checkpoints and logs.
        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)

        if (epoch + 1) % args.save_freq == 0:
            tb.tboard.set_resume_step(epoch)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': args.model,
                    'server_state_dict': server._model.state_dict(),
                    'server_weight_shards': server._shards_weights,
                    'config': args.model_config,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                },
                is_best,
                path=save_path)

        errors = {
            'error1_train': 100 - train_results['prec1'],
            'error5_train': 100 - train_results['prec5'],
            'error1_val': 100 - val_results['prec1'],
            'error5_val': 100 - val_results['prec5'],
            'epochs': epoch
        }
        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Error@1 {errors[error1_train]:.3f} \t'
                     'Training Error@5 {errors[error5_train]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Error@1 {errors[error1_val]:.3f} \t'
                     'Validation Error@5 {errors[error5_val]:.3f} \t\n'.format(
                         epoch + 1,
                         train=train_results,
                         val=val_results,
                         errors=errors))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        tb.tboard.log_results(epoch, **values)
        tb.tboard.log_model(server, epoch)
        if args.delay > 0:
            tb.tboard.log_delay(trainer.delay_hist, epoch)

    tb.tboard.close()
    return errors, args
def main_worker(args):
    """Train (or evaluate) a model, optionally monitoring a layer's covariance.

    Builds the model named by ``args.model``, restores state from
    ``args.evaluate`` / ``args.resume`` when given, sets up optimizer,
    trainer and data regimes, optionally registers a forward hook on the
    child module selected by ``args.covmat`` that prints covariance
    statistics of its output, and runs the epoch loop with checkpointing,
    CSV/plot results and optional TensorBoard scalars.

    Args:
        args: parsed command-line namespace; several fields (``save``,
            ``distributed``, ``local_rank``, ``world_size``, ``device_ids``,
            ``eval_batch_size``, ``start_epoch``, ...) are updated in place.

    Returns:
        None. With ``args.dry`` the function returns after printing the
        model configuration and parameter count; with ``args.evaluate`` it
        returns after logging the validation results.
    """
    global best_prec1, dtype
    best_prec1 = 0
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'

    model_config = {'dataset': args.dataset, 'batch': args.batch_size}

    # Value comparison: `is not ''` depends on string interning and raises a
    # SyntaxWarning on Python >= 3.8.
    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    # autoname
    fname = auto_name(args, model_config)
    args.save = fname

    monitor = args.monitor

    print(fname)

    save_path = path.join(args.results_dir, args.save)

    args.distributed = args.local_rank >= 0 or args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    if not (args.distributed and args.local_rank > 0):
        if not args.dry:
            if not path.exists(save_path):
                makedirs(save_path)
            export_args_namespace(args, path.join(save_path, 'config.json'))

    if monitor > 0 and not args.dry:
        events_path = "runs/%s" % fname
        my_file = Path(events_path)
        if my_file.is_file():
            os.remove(events_path)
        writer = SummaryWriter(log_dir=events_path, comment=str(args))
        model_config['writer'] = writer
        model_config['monitor'] = monitor
    else:
        monitor = 0
        writer = None

    if args.dry:
        # Dry run: only instantiate the model and report its size.
        model = models.__dict__[args.model]
        model = model(**model_config)
        print("created model with configuration: %s" % model_config)
        num_parameters = sum(p.nelement() for p in model.parameters())
        print("number of parameters: %d" % num_parameters)
        return

    setup_logging(path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    results_path = path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    model = models.__dict__[args.model]
    model = model(**model_config)
    if args.sync_bn:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate, map_location="cpu")
        # Override configuration with checkpoint info
        args.model = checkpoint.get('model', args.model)
        args.model_config = checkpoint.get('config', args.model_config)
        # load checkpoint
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])

    # Default first, so a --resume path with no checkpoint file leaves the
    # name bound instead of raising NameError further down.
    optim_state_dict = None
    if args.resume:
        checkpoint_file = args.resume
        if path.isdir(checkpoint_file):
            results.load(path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = path.join(
                checkpoint_file, 'model_best.pth.tar')
        if path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file, map_location="cpu")
            if args.start_epoch < 0:  # not explicitly set
                args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optim_state_dict = checkpoint.get('optim_state_dict', None)
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{'epoch': 0,
                                              'optimizer': args.optimizer,
                                              'lr': args.lr,
                                              'momentum': args.momentum,
                                              'weight_decay': args.weight_decay}])

    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime, use_float_copy='half' in args.dtype)

    if optim_state_dict is not None:
        optimizer.load_state_dict(optim_state_dict)

    trainer = Trainer(model, criterion, optimizer,
                      device_ids=args.device_ids, device=args.device, dtype=dtype,
                      print_freq=args.print_freq, distributed=args.distributed,
                      local_rank=args.local_rank, mixup=args.mixup, cutmix=args.cutmix,
                      loss_scale=args.loss_scale, grad_clip=args.grad_clip,
                      adapt_grad_norm=args.adapt_grad_norm,
                      writer=writer, monitor=monitor)
    if args.tensorwatch:
        trainer.set_watcher(filename=path.abspath(path.join(save_path, 'tensorwatch.log')),
                            port=args.tensorwatch_port)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={'datasets_path': args.datasets_dir,
                                    'name': args.dataset,
                                    'split': 'val',
                                    'augment': False,
                                    'input_size': args.input_size,
                                    'batch_size': args.eval_batch_size,
                                    'shuffle': False,
                                    'num_workers': args.workers,
                                    'pin_memory': True,
                                    'drop_last': False})

    if args.evaluate:
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    # Training Data loading code
    train_data_defaults = {'datasets_path': args.datasets_dir,
                           'name': args.dataset,
                           'split': 'train',
                           'augment': True,
                           'input_size': args.input_size,
                           'batch_size': args.batch_size,
                           'shuffle': True,
                           'num_workers': args.workers,
                           'pin_memory': True,
                           'drop_last': True,
                           'distributed': args.distributed,
                           'duplicates': args.duplicates,
                           'autoaugment': args.autoaugment,
                           'cutout': {'holes': 1, 'length': 16} if args.cutout else None}

    if hasattr(model, 'sampled_data_regime'):
        sampled_data_regime = model.sampled_data_regime
        probs, regime_configs = zip(*sampled_data_regime)
        regimes = []
        for config in regime_configs:
            defaults = {**train_data_defaults}
            defaults.update(config)
            regimes.append(DataRegime(None, defaults=defaults))
        train_data = SampledDataRegime(regimes, probs)
    else:
        train_data = DataRegime(
            getattr(model, 'data_regime', None), defaults=train_data_defaults)

    logging.info('optimization regime: %s', optim_regime)
    logging.info('data regime: %s', train_data)
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)

    if args.covmat != "":
        # args.covmat selects a direct child of the model either by integer
        # position (negative values count from the end) or by name.
        try:
            int_covmat = int(args.covmat)
            if int_covmat < 0:
                total_layers = len([name for name, layer in model.named_children()])
                int_covmat = total_layers + int_covmat
            child_cnt = 0
        except ValueError:
            int_covmat = None

        def calc_covmat(x_, partitions=64):
            """Return variance / small-value statistics of the outer product of x_.

            The outer product x_ @ x_.T is evaluated block by block over
            ``partitions`` slices of the rows of ``x_``; diagonal entries
            (from the p1 == p2 blocks) and all remaining entries are
            gathered separately on the CPU.
            """
            L = x_.shape[0] // partitions

            non_diags = []
            diags = []
            for p1 in range(partitions):
                for p2 in range(partitions):
                    x = x_[p1 * L:(p1 + 1) * L]
                    y = x_[p2 * L:(p2 + 1) * L]
                    X = torch.matmul(x, y.transpose(0, 1))
                    if p1 == p2:
                        mask = torch.eye(X.shape[0], dtype=torch.bool)
                        non_diag = X[~mask].reshape(-1).cpu()
                        diag = X[mask].reshape(-1).cpu()
                        non_diags.append(non_diag)
                        diags.append(diag)
                    else:
                        non_diag = X.reshape(-1).cpu()
                        # Off-diagonal blocks contribute their own entries;
                        # the old code appended the stale `diag` here.
                        non_diags.append(non_diag)
            diags = torch.cat(diags)
            non_diags = torch.cat(non_diags)
            diag_var = diags.var()
            non_diag_var = non_diags.var()
            diags = diags - diags.mean()
            non_diags = non_diags - non_diags.mean()
            # Fraction of centred entries more than one std below the mean.
            diag_small_ratio = (diags < -diags.std()).to(dtype=torch.float).mean()
            non_diag_small_ratio = (non_diags < -non_diags.std()).to(dtype=torch.float).mean()
            return diag_var, non_diag_var, diag_small_ratio, non_diag_small_ratio

        global diag_var_mean
        global non_diag_var_mean
        global var_count
        var_count = 0
        diag_var_mean = 0
        non_diag_var_mean = 0

        def report_covmat_hook(module, input, output):
            """Forward hook: accumulate covariance stats and print every 10th call."""
            global diag_var_mean
            global non_diag_var_mean
            global var_count
            flatten_output = output.reshape([-1, 1]).detach()
            diag_var, non_diag_var, diag_small_ratio, non_diag_small_ratio = calc_covmat(flatten_output)
            diag_var_mean = diag_var_mean + diag_var
            non_diag_var_mean = non_diag_var_mean + non_diag_var
            var_count = var_count + 1
            if var_count % 10 == 1:
                print("diag_var = %.02f (%.02f), ratio: %.02f , non_diag_var = %0.2f (%.02f), ratio: %.02f" %
                      (diag_var, diag_var_mean / var_count, diag_small_ratio,
                       non_diag_var, non_diag_var_mean / var_count, non_diag_small_ratio))

        for name, layer in model.named_children():
            if int_covmat is None:
                condition = (name == args.covmat)
            else:
                condition = (child_cnt == int_covmat)
                child_cnt = child_cnt + 1
            if condition:
                layer.register_forward_hook(report_covmat_hook)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(),
                                      chunk_batch=args.chunk_batch)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())

        # Only rank 0 (or a non-distributed run) checkpoints and logs.
        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)

        if args.drop_optim_state:
            optim_state_dict = None
        else:
            optim_state_dict = optimizer.state_dict()

        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.model,
            'config': args.model_config,
            'state_dict': model.state_dict(),
            'optim_state_dict': optim_state_dict,
            'best_prec1': best_prec1
        }, is_best, path=save_path, save_all=args.save_all)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results, val=val_results))

        if writer is not None:
            writer.add_scalar('Train/Loss', train_results['loss'], epoch)
            writer.add_scalar('Train/Prec@1', train_results['prec1'], epoch)
            writer.add_scalar('Train/Prec@5', train_results['prec5'], epoch)
            writer.add_scalar('Val/Loss', val_results['loss'], epoch)
            writer.add_scalar('Val/Prec@1', val_results['prec1'], epoch)
            writer.add_scalar('Val/Prec@5', val_results['prec5'], epoch)
            # tmplr = optimizer.get_lr()
            # writer.add_scalar('HyperParameters/learning-rate', tmplr, epoch)

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        results.add(**values)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()

    logging.info(f'\nBest Validation Accuracy (top1): {best_prec1}')

    if writer:
        writer.close()
def main():
    """Train (or evaluate) a quantized model with pre-training statistics.

    Parses the command line into the module-level ``args``, builds the model
    from the quantization-related options, optionally loads a checkpoint
    (``--evaluate`` or ``--resume``), runs the activation/weight statistics
    phase on a single device, then trains for ``args.epochs`` epochs while
    checkpointing and writing CSV logs after each epoch.

    Returns:
        None. With ``args.evaluate`` the function returns after printing the
        validation top-1 value.
    """
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    global args, best_prec1
    best_prec1 = 0
    args = parser.parse_args()
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # Value comparison: `is ''` depends on string interning and raises a
    # SyntaxWarning on Python >= 3.8.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # The CLI exposes negative flags; derive the positive switches once.
    args.noise = not args.no_noise
    args.quant = not args.no_quantization
    args.act_quant = not args.no_act_quantization
    args.quant_edges = not args.no_quant_edges

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    if args.gpus is not None:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        device = 'cuda:' + str(args.gpus[0])
        cudnn.benchmark = True
    else:
        device = 'cpu'
    dtype = torch.float32

    args.step_setup = None

    model = models.__dict__[args.model]
    model_config = {
        'scale': args.scale,
        'input_size': args.input_size,
        'dataset': args.dataset,
        'bitwidth': args.bitwidth,
        'quantize': args.quant,
        'noise': args.noise,
        'step': args.step,
        'depth': args.depth,
        'act_bitwidth': args.act_bitwidth,
        'act_quant': args.act_quant,
        'quant_edges': args.quant_edges,
        'step_setup': args.step_setup,
        'quant_epoch_step': args.quant_epoch_step,
        'quant_start_stage': args.quant_start_stage,
        # NOTE(review): the data transforms below use
        # `not args.no_pre_process_normalize`; confirm the un-negated flag
        # is what the model expects here.
        'normalize': args.no_pre_process_normalize,
        'noise_mask': args.noise_mask
    }

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    # create model
    model = model(**model_config)
    logging.info("creating model %s", args.model)
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("number of parameters: ", params)
    logging.info("created model with configuration: %s", model_config)
    print(model)

    data = None
    checkpoint_epoch = 0
    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate, map_location=device)
        load_model(model, checkpoint)
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])
        print("loaded checkpoint {0} (epoch {1})".format(
            args.evaluate, checkpoint['epoch']))

    elif args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=device)
            if not args.start_from_zero:
                args.start_epoch = checkpoint['epoch'] - 1
            # NOTE(review): the stored best accuracy is read into
            # `best_test`, which is never used afterwards; `best_prec1`
            # keeps its initial 0 after a resume - confirm intent.
            best_test = checkpoint['best_prec1']
            checkpoint_epoch = checkpoint['epoch']

            load_model(model, checkpoint)

            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        elif os.path.isdir(args.resume):
            checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar')
            csv_path = os.path.join(args.resume, 'results.csv')
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path, map_location=device)
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_path, checkpoint['epoch']))
            # Re-load the rows logged so far so the CSV logger continues them.
            data = []
            with open(csv_path) as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    data.append(row)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.gpus is not None:
        # Statistics need to be calculated on a single GPU to be consistent
        # with data among multiple GPUs
        model = torch.nn.DataParallel(model, [args.gpus[0]])

    # Data loading code
    default_transform = {
        'train': get_transform(args.dataset,
                               input_size=args.input_size,
                               augment=True,
                               integer_values=args.quant_dataloader,
                               norm=not args.no_pre_process_normalize),
        'eval': get_transform(args.dataset,
                              input_size=args.input_size,
                              augment=False,
                              integer_values=args.quant_dataloader,
                              norm=not args.no_pre_process_normalize)
    }
    # NOTE(review): `model.module` is only available when the model was
    # wrapped in DataParallel above, i.e. when args.gpus is set.
    transform = getattr(model.module, 'input_transform', default_transform)

    val_data = get_dataset(args.dataset, 'val', transform['eval'],
                           datasets_path=args.datapath)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.val_batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    train_data = get_dataset(args.dataset, 'train', transform['train'],
                             datasets_path=args.datapath)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    statistics_train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.act_stats_batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.decay,
                                nesterov=True)
    model, criterion = model.to(device, dtype), criterion.to(device, dtype)
    if args.clr:
        scheduler = CyclicLR(optimizer,
                             base_lr=args.min_lr,
                             max_lr=args.max_lr,
                             step_size=args.epochs_per_step * len(train_loader),
                             mode=args.mode)
    else:
        scheduler = MultiStepLR(optimizer,
                                milestones=args.schedule,
                                gamma=args.gamma)

    csv_logger = CsvLogger(filepath=save_path, data=data)
    csv_logger.save_params(sys.argv, args)
    csv_logger_training_stats = os.path.join(save_path, 'training_stats.csv')

    # pre-training activation and parameters statistics calculation ####
    if check_if_need_to_collect_statistics(model):
        for layer in model.modules():
            if isinstance(layer, actquant.ActQuantBuffers):
                # Turn on pre-training activation statistics calculation
                layer.pre_training_statistics = True
        model.module.statistics_phase = True

        # Run validation on training set for statistics
        validate(statistics_train_loader,
                 model,
                 criterion,
                 device,
                 epoch=0,
                 num_of_batches=80,
                 stats_phase=True)
        model.module.quantize.get_act_max_value_from_pre_calc_stats(
            list(model.modules()))
        _ = model.module.quantize.set_weight_basis(list(model.modules()), None)

        for layer in model.modules():
            if isinstance(layer, actquant.ActQuantBuffers):
                # Turn off pre-training activation statistics calculation
                layer.pre_training_statistics = False
        model.module.statistics_phase = False
    else:
        # Maximal activation values still need to be derived from loaded stats
        model.module.quantize.assign_act_clamp_during_val(
            list(model.modules()), print_clamp_val=True)
        model.module.quantize.assign_weight_clamp_during_val(
            list(model.modules()), print_clamp_val=True)
        # model.module.quantize.get_act_max_value_from_pre_calc_stats(list(model.modules()))

    if args.gpus is not None:
        # Return to Multi-GPU after statistics calculations
        model = torch.nn.DataParallel(model.module, args.gpus)
    model, criterion = model.to(device, dtype), criterion.to(device, dtype)

    # pre-training activation statistics calculation ####

    if args.evaluate:
        val_loss, val_prec1, val_prec5 = validate(val_loader,
                                                  model,
                                                  criterion,
                                                  device,
                                                  epoch=0)
        print("val_prec1: ", val_prec1)
        return

    # fast forward to curr stage
    for i in range(args.quant_start_stage):
        model.module.switch_stage(0)

    for epoch in trange(args.start_epoch, args.epochs + 1):

        # CyclicLR is handed to train() below; other schedulers are stepped
        # here, once per epoch.
        if not isinstance(scheduler, CyclicLR):
            scheduler.step()

        #     scheduler.optimizer = optimizer
        train_loss, train_prec1, train_prec5 = train(
            train_loader,
            model,
            criterion,
            device,
            epoch,
            optimizer,
            scheduler,
            training_stats_logger=csv_logger_training_stats)

        for layer in model.modules():
            if isinstance(layer, actquant.ActQuantBuffers):
                layer.print_clamp()

        # evaluate on validation set
        val_loss, val_prec1, val_prec5 = validate(val_loader, model,
                                                  criterion, device, epoch)

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                # TODO this doesn't work for multi gpu - need to del
                'layers_b_dict': model.module.layers_b_dict
            },
            is_best,
            path=save_path)

        # New type of logging
        csv_logger.write({
            'epoch': epoch + 1,
            'val_error1': 1 - val_prec1,
            'val_error5': 1 - val_prec5,
            'val_loss': val_loss,
            'train_error1': 1 - train_prec1,
            'train_error5': 1 - train_prec5,
            'train_loss': train_loss
        })
        csv_logger.plot_progress(title=args.model + str(args.depth))
        csv_logger.write_text(
            'Epoch {}: Best accuracy is {:.2f}% top-1'.format(
                epoch + 1, best_prec1 * 100.))
def main_worker(args):
    """Train (or only evaluate) a model for one worker process.

    The function sets up the output directory and logging, optionally joins a
    distributed process group, builds the model named by ``args.model``,
    optionally loads a checkpoint (``args.evaluate`` / ``args.resume``), builds
    the optimizer regime, the ``Trainer`` and the data regimes, and then runs
    the epoch loop: train, optional batch-norm calibration, validate, save a
    checkpoint and log/plot the results.

    Args:
        args: parsed command-line namespace. It is mutated in place
            (``results_dir``, ``save``, ``distributed``, ``local_rank``,
            ``world_size``, ``device_ids``, ``model``, ``model_config``,
            ``start_epoch`` and ``eval_batch_size`` may be rewritten).

    Returns:
        None. When ``args.evaluate`` is set the function returns right after
        logging the validation results.
    """
    global best_prec1, dtype
    best_prec1 = 0
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # Compare by value: identity comparison against a str literal is unreliable.
    if args.save == '':
        args.save = time_stamp
    save_path = path.join(args.results_dir, args.save)

    args.distributed = args.local_rank >= 0 or args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    # Only the first worker (or a non-distributed run) writes the config.
    if not (args.distributed and args.local_rank > 0):
        if not path.exists(save_path):
            makedirs(save_path)
        export_args_namespace(args, path.join(save_path, 'config.json'))

    setup_logging(path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    results_path = path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    grad_stats_path = path.join(save_path, 'grad_stats')
    grad_stats = ResultsLog(grad_stats_path,
                            title='collect grad stats - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    # NOTE(review): the nesting of the two "smart_*" checks under
    # enable_scheduler was reconstructed from whitespace-collapsed source;
    # confirm against the original layout.
    if args.enable_scheduler:
        model_config['fp8_dynamic'] = True
        if args.smart_loss_scale_only:
            model_config['smart_loss_scale_only'] = True
        if args.smart_loss_scale_and_exp_bits:
            model_config['smart_loss_scale_and_exp_bits'] = True

    model = model(**model_config)

    quantize_modules_name = [
        n for n, m in model.named_modules() if isinstance(m, nn.Conv2d)
    ]

    fp8_scheduler = FP8TrainingScheduler(
        model,
        model_config,
        args,
        collect_stats_online=False,
        start_to_collect_stats_in_epoch=3,
        collect_stats_every_epochs=10,
        online_update=False,
        first_update_with_stats_from_epoch=4,
        start_online_update_in_epoch=3,
        update_every_epochs=1,
        update_loss_scale=True,
        update_exp_bit_width=args.smart_loss_scale_and_exp_bits,
        stats_path=
        "/data/moran/ConvNet_lowp_0/convNet.pytorch/results/2020-05-16_01-44-22/results.csv",  # ResNet18- cifar10
        # stats_path = "/data/moran/ConvNet_lowp_0/convNet.pytorch/results/2020-05-19_01-27-57/results.csv", # ResNet18- ImageNet
        quantize_modules_name=quantize_modules_name,
        enable_scheduler=False)

    if args.sync_bn:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate, map_location="cpu")
        # Override configuration with checkpoint info
        args.model = checkpoint.get('model', args.model)
        args.model_config = checkpoint.get('config', args.model_config)
        # load checkpoint
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)", args.evaluate,
                     checkpoint['epoch'])

    # Bind this up front so that a --resume path that does not exist (which
    # only logs an error below) cannot leave the name undefined.
    optim_state_dict = None
    if args.resume:
        checkpoint_file = args.resume
        if path.isdir(checkpoint_file):
            results.load(path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = path.join(checkpoint_file,
                                        'model_best.pth.tar')
        if path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file, map_location="cpu")
            if args.start_epoch < 0:  # not explicitly set
                args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optim_state_dict = checkpoint.get('optim_state_dict', None)
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])

    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime, use_float_copy='half' in args.dtype)

    if optim_state_dict is not None:
        optimizer.load_state_dict(optim_state_dict)

    trainer = Trainer(model,
                      criterion,
                      optimizer,
                      device_ids=args.device_ids,
                      device=args.device,
                      dtype=dtype,
                      print_freq=args.print_freq,
                      distributed=args.distributed,
                      local_rank=args.local_rank,
                      mixup=args.mixup,
                      cutmix=args.cutmix,
                      loss_scale=args.loss_scale,
                      grad_clip=args.grad_clip,
                      adapt_grad_norm=args.adapt_grad_norm,
                      enable_input_grad_statistics=True,
                      exp_bits=args.exp_bits,
                      fp_bits=args.fp_bits)
    if args.tensorwatch:
        trainer.set_watcher(filename=path.abspath(
            path.join(save_path, 'tensorwatch.log')),
                            port=args.tensorwatch_port)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={
                              'datasets_path': args.datasets_dir,
                              'name': args.dataset,
                              'split': 'val',
                              'augment': False,
                              'input_size': args.input_size,
                              'batch_size': args.eval_batch_size,
                              'shuffle': False,
                              'num_workers': args.workers,
                              'pin_memory': True,
                              'drop_last': False
                          })

    if args.evaluate:
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    # Training Data loading code
    train_data_defaults = {
        'datasets_path': args.datasets_dir,
        'name': args.dataset,
        'split': 'train',
        'augment': True,
        'input_size': args.input_size,
        'batch_size': args.batch_size,
        'shuffle': True,
        'num_workers': args.workers,
        'pin_memory': True,
        'drop_last': True,
        'distributed': args.distributed,
        'duplicates': args.duplicates,
        'autoaugment': args.autoaugment,
        'cutout': {
            'holes': 1,
            'length': 16
        } if args.cutout else None
    }

    if hasattr(model, 'sampled_data_regime'):
        sampled_data_regime = model.sampled_data_regime
        probs, regime_configs = zip(*sampled_data_regime)
        regimes = []
        for config in regime_configs:
            defaults = {**train_data_defaults}
            defaults.update(config)
            regimes.append(DataRegime(None, defaults=defaults))
        train_data = SampledDataRegime(regimes, probs)
    else:
        train_data = DataRegime(getattr(model, 'data_regime', None),
                                defaults=train_data_defaults)

    logging.info('optimization regime: %s', optim_regime)
    logging.info('data regime: %s', train_data)
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))
        fp8_scheduler.schedule_before_epoch(epoch)

        # train for one epoch
        train_results, meters_grad = trainer.train(
            train_data.get_loader(),
            chunk_batch=args.chunk_batch,
            scheduled_instructions=fp8_scheduler.scheduled_instructions)

        # evaluate on validation set
        if args.calibrate_bn:
            # Use a dedicated regime for batch-norm calibration. Rebinding
            # `train_data` here would replace the training regime configured
            # above for every following epoch.
            calibration_data = DataRegime(None,
                                          defaults={
                                              'datasets_path':
                                              args.datasets_dir,
                                              'name': args.dataset,
                                              'split': 'train',
                                              'augment': True,
                                              'input_size': args.input_size,
                                              'batch_size': args.batch_size,
                                              'shuffle': True,
                                              'num_workers': args.workers,
                                              'pin_memory': True,
                                              'drop_last': False
                                          })
            trainer.calibrate_bn(calibration_data.get_loader(), num_steps=200)

        val_results, _ = trainer.validate(val_data.get_loader())

        # Non-primary distributed workers skip checkpointing and logging.
        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)

        if args.drop_optim_state:
            optim_state_dict = None
        else:
            optim_state_dict = optimizer.state_dict()

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'optim_state_dict': optim_state_dict,
                'best_prec1': best_prec1
            },
            is_best,
            path=save_path,
            save_all=args.save_all)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'.format(
                         epoch + 1, train=train_results, val=val_results))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        values.update(
            {'grad mean ' + k: v['mean'].avg
             for k, v in meters_grad.items()})
        values.update(
            {'grad std ' + k: v['std'].avg
             for k, v in meters_grad.items()})
        results.add(**values)

        # stats was collected
        # NOTE(review): extent of this branch reconstructed from collapsed
        # source; confirm that update_stats belongs inside it.
        if fp8_scheduler.scheduled_instructions['collect_stat']:
            grad_stats_values = dict(epoch=epoch + 1)
            grad_stats_values.update({
                'grad mean ' + k: v['mean'].avg
                for k, v in meters_grad.items()
            })
            grad_stats_values.update({
                'grad std ' + k: v['std'].avg
                for k, v in meters_grad.items()
            })
            grad_stats.add(**grad_stats_values)
            fp8_scheduler.update_stats(grad_stats)

        results.plot(x='epoch',
                     y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss',
                     ylabel='loss')
        results.plot(x='epoch',
                     y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1',
                     ylabel='error %')
        results.plot(x='epoch',
                     y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5',
                     ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch',
                         y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm',
                         ylabel='value')
        results.save()
        grad_stats.save()
def main():
    """Train the video compressive-sensing model selected by ``args.arch``.

    Parses the command line, prepares the results directory and logging,
    seeds the RNGs, builds the model (wrapped in ``DataParallel`` on CUDA),
    the SGD optimizer and the learning-rate scheduler, optionally resumes from
    ``args.resume``, and then alternates training and validation for every
    epoch, saving a checkpoint and appending to the results log each time.

    When ``args.encoder_lr > 0`` the measurement (encoder) weights are trained
    together with the reconstruction network; otherwise only the
    reconstruction parameters are optimized and the measurement weights are
    binarized once up front.
    """
    global args, best_psnr
    args = parser.parse_args()

    # massage args
    # `block_opts` aliases args.block_opts on purpose: the overlap is appended
    # in place, so the datasets built below from args.block_opts see it too.
    block_opts = args.block_opts
    block_opts.append(args.block_overlap)

    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log_%s.txt' % time_stamp))
    results_file = os.path.join(save_path, 'results.%s')
    results = ResultsLog(results_file % 'csv', results_file % 'html')

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    encoder_learn = args.encoder_lr > 0

    # create model
    model_kwargs = dict(mask_path=args.mask_path,
                        mean=args.mean,
                        std=args.std,
                        noise=args.noise,
                        encoder_learn=encoder_learn,
                        p=args.bernoulli_p,
                        K=args.layers_k)
    if args.pretrained_net is not None:
        logging.info("=> using pre-trained model '{}'".format(args.arch))
        model_kwargs['pretrained'] = args.pretrained_net
    else:
        logging.info("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch](block_opts, **model_kwargs)
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    # define loss function (criterion) and optimizer
    mseloss = loss.EuclideanDistance(args.batch_size)

    # annual schedule
    if encoder_learn:
        optimizer = torch.optim.SGD(
            [{
                'params': model.module.measurements.parameters(),
                'lr': args.encoder_lr
            }, {
                'params': model.module.reconstruction.parameters()
            }],
            args.decoder_lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay)

        def lambda1(epoch):
            # Encoder LR factor: zero from encoder_annual[2] onwards, else
            # encoder_annual[0] raised to the number of milestones passed.
            return 0.0 if epoch >= args.encoder_annual[2] else (
                args.encoder_annual[0]**bisect_right(
                    range(args.encoder_annual[1], args.encoder_annual[2],
                          args.encoder_annual[1]), epoch))

        def lambda2(epoch):
            # Decoder LR factor: a single decay step at decoder_annual[1].
            return args.decoder_annual[0]**bisect_right(
                [args.decoder_annual[1]], epoch)

        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=[lambda1, lambda2])
    else:
        optimizer = torch.optim.SGD(
            [{
                'params': model.module.reconstruction.parameters()
            }],
            args.decoder_lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[args.decoder_annual[1]],
            gamma=args.decoder_annual[0])

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logging.info("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_psnr = checkpoint['best_psnr']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            logging.info("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    train_loader = torch.utils.data.DataLoader(datasets.videocs.VideoCS(
        args.data_train,
        args.block_opts,
        transforms.Compose([
            transforms.ToTensor(),
        ]),
        hdf5=args.hdf5),
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(datasets.videocs.VideoCS(
        args.data_val,
        args.block_opts,
        transforms.Compose([
            transforms.ToTensor(),
        ]),
        hdf5=False),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=True)

    # Save initial mask
    if encoder_learn:
        initial_weights = binarization(
            model.module.measurements.weight.clone())
        # .item() works for both 0-d and one-element tensors, unlike
        # indexing the NumPy view with [0].
        perc_1 = initial_weights.mean().cpu().item()
        logging.info('Percentage of 1: {}'.format(perc_1))
        # np.save needs a host array without autograd history.
        np.save(save_path + '/initial_mask.npy',
                model.module.measurements.weight.detach().cpu().numpy())
    else:
        # binarize weights
        model.module.measurements.binarization()
        perc_1 = model.module.measurements.weight.clone().mean().cpu().item()
        logging.info('Percentage of 1: {}'.format(perc_1))

    # perform first validation
    validate(val_loader, model, encoder_learn)

    for epoch in range(args.start_epoch, args.epochs):
        logging.info(scheduler.get_last_lr())

        if encoder_learn:
            save_binary_weights_before = binarization(
                model.module.measurements.weight.clone())

        # train for one epoch
        train_loss = train(train_loader, model, optimizer, epoch, mseloss,
                           encoder_learn, args.gradient_clipping)

        # Annual schedule enforcement
        scheduler.step()

        if encoder_learn:
            save_binary_weights_after = binarization(
                model.module.measurements.weight.clone())
            # Plain int(): the np.int alias no longer exists in NumPy >= 1.24.
            diff = int(
                torch.abs(save_binary_weights_after -
                          save_binary_weights_before).sum().item())
            perc_1 = save_binary_weights_after.mean().cpu().item()
            logging.info(
                'Binary Weights Changed: {} - Percentage of 1: {}'.format(
                    diff, perc_1))
        else:
            # Assign to perc_1 (not a misspelled perc1) so the value logged
            # below is the one just computed rather than a stale one.
            perc_1 = model.module.measurements.weight.clone().mean().cpu(
            ).item()
            logging.info('Percentage of 1: {}'.format(perc_1))

        # evaluate on validation set
        psnr = validate(val_loader, model, encoder_learn)

        # remember best psnr and save checkpoint
        is_best = psnr > best_psnr
        best_psnr = max(psnr, best_psnr)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_psnr': best_psnr,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            path=save_path)
        results_add(epoch, results, train_loss, psnr)

        if encoder_learn:
            model.module.measurements.restore()
def load_maybe_calibrate(checkpoint):
    """Load ``checkpoint`` into ``model``, calibrating a quantized model if needed.

    A plain ``model.load_state_dict(checkpoint)`` is tried first. If that
    fails and ``model_config`` has ``quantize`` set, the function either

    * loads a previously saved ``<model>-<depth>.measure`` file from
      ``save_path`` when one exists, or
    * loads the weights non-strictly (after absorbing batch norm through a
      temporary non-quantized model when ``absorb_bn`` is set), runs one
      forward pass over ``val_loader`` in measure mode, and saves the result
      as the ``.measure`` checkpoint.

    If ``quantize`` is not set, the original loading error is re-raised.

    The function reads ``model``, ``model_config``, ``model_builder``,
    ``args``, ``save_path``, ``val_loader``, ``criterion``, ``dtype`` and
    ``regime`` from the enclosing scope.

    Args:
        checkpoint: object passed to ``model.load_state_dict``.
    """
    try:
        model.load_state_dict(checkpoint)
    except Exception:
        # Exception rather than BaseException, so KeyboardInterrupt and
        # SystemExit are never routed into the calibration fallback.
        if not model_config.get('quantize'):
            raise

        measure_name = '{}-{}.measure'.format(args.model,
                                              model_config['depth'])
        measure_path = os.path.join(save_path, measure_name)
        if os.path.exists(measure_path):
            # Log the file that is actually read.
            logging.info("loading checkpoint '%s'", measure_path)
            checkpoint = torch.load(measure_path)
            if 'state_dict' in checkpoint:
                # Local only: no global declaration, so the module-level
                # best_prec1 is left untouched. Logged inside this branch so
                # a bare state dict cannot hit an unbound name.
                reference_prec1 = checkpoint['best_prec1']
                checkpoint = checkpoint['state_dict']
                logging.info(
                    f"Measured checkpoint loaded, reference score top1 {reference_prec1:.3f}"
                )
            model.load_state_dict(checkpoint)
        else:
            if model_config.get('absorb_bn'):
                from utils.absorb_bn import search_absorbe_bn
                logging.info('absorbing batch normalization')
                # Build a temporary float model with batch norm layers, load
                # the weights into it, fold BN, then restore the flags.
                model_config.update({'absorb_bn': False, 'quantize': False})
                model_bn = model_builder(**model_config)
                model_bn.load_state_dict(checkpoint)
                search_absorbe_bn(model_bn, verbose=True)
                model_config.update({'absorb_bn': True, 'quantize': True})
                checkpoint = model_bn.state_dict()
            model.load_state_dict(checkpoint, strict=False)
            logging.info("set model measure mode")
            # set_bn_is_train(model,False)
            set_measure_mode(model, True, logger=logging)
            logging.info("calibrating apprentice model to get quant params")
            model.to(args.device, dtype)
            with torch.no_grad():
                losses_avg, top1_avg, top5_avg = forward(val_loader,
                                                         model,
                                                         criterion,
                                                         0,
                                                         training=False,
                                                         optimizer=None)
            logging.info('Measured float resutls:\nLoss {loss:.4f}\t'
                         'Prec@1 {top1:.3f}\t'
                         'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                    top1=top1_avg,
                                                    top5=top5_avg))
            set_measure_mode(model, False, logger=logging)
            # logging.info("test quant model accuracy")
            # losses_avg, top1_avg, top5_avg = validate(val_loader, model, criterion, 0)
            # logging.info('Quantized results:\nLoss {loss:.4f}\t'
            #              'Prec@1 {top1:.3f}\t'
            #              'Prec@5 {top5:.3f}'.format(loss=losses_avg, top1=top1_avg, top5=top5_avg))
            save_checkpoint(
                {
                    'epoch': 0,
                    'model': args.model,
                    'config': args.model_config,
                    'state_dict': model.state_dict(),
                    'best_prec1': top1_avg,
                    'regime': regime
                },
                True,
                path=save_path,
                save_all=True,
                filename=measure_name)
def main():
    """Train or evaluate a (possibly quantized) model selected by ``args.model``.

    Parses the command line, prepares the results directory and logging,
    builds the model, the criterion and the train/validation loaders, then:

    * with ``--evaluate``: loads the checkpoint (calibrating a quantized
      model when a plain load fails), validates once and returns;
    * with ``--resume``: restores the checkpoint, falling back to a
      calibration pass for quantized models whose state dict does not load;
    * otherwise / afterwards: runs the epoch loop (train, validate,
      checkpoint, log and plot results).
    """
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    # Compare by value: identity comparison against a str literal is unreliable.
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '')
    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    logging.info("creating model %s", args.model)
    model_builder = models.__dict__[args.model]
    model_config = {
        'input_size': args.input_size,
        'dataset': args.dataset if args.dataset != 'imaginet' else 'imagenet'
    }

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model_builder(**model_config)
    model.to(args.device, dtype)

    # Data loading code
    default_transform = {
        'train':
        get_transform(args.dataset, input_size=args.input_size, augment=True),
        'eval':
        get_transform(args.dataset, input_size=args.input_size, augment=False)
    }
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    transform = getattr(model, 'input_transform', default_transform)
    regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])

    # define loss function (criterion) and optimizer
    criterion = getattr(model, 'criterion', nn.CrossEntropyLoss)()
    criterion.to(args.device, dtype)

    train_data = get_dataset(args.dataset, 'train', transform['train'])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               drop_last=True)

    val_data = get_dataset(args.dataset, 'val', transform['eval'])
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    def load_maybe_calibrate(checkpoint):
        """Load ``checkpoint`` into ``model``; calibrate a quantized model on failure."""
        try:
            model.load_state_dict(checkpoint)
        except Exception:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit are never routed into the calibration fallback.
            if not model_config.get('quantize'):
                raise

            measure_name = '{}-{}.measure'.format(args.model,
                                                  model_config['depth'])
            measure_path = os.path.join(save_path, measure_name)
            if os.path.exists(measure_path):
                # Log the file that is actually read.
                logging.info("loading checkpoint '%s'", measure_path)
                checkpoint = torch.load(measure_path)
                if 'state_dict' in checkpoint:
                    # Local to this helper: main()'s best_prec1 is not
                    # updated here. Logged inside this branch so a bare
                    # state dict cannot hit an unbound name.
                    reference_prec1 = checkpoint['best_prec1']
                    checkpoint = checkpoint['state_dict']
                    logging.info(
                        f"Measured checkpoint loaded, reference score top1 {reference_prec1:.3f}"
                    )
                model.load_state_dict(checkpoint)
            else:
                if model_config.get('absorb_bn'):
                    from utils.absorb_bn import search_absorbe_bn
                    logging.info('absorbing batch normalization')
                    model_config.update({
                        'absorb_bn': False,
                        'quantize': False
                    })
                    model_bn = model_builder(**model_config)
                    model_bn.load_state_dict(checkpoint)
                    search_absorbe_bn(model_bn, verbose=True)
                    model_config.update({'absorb_bn': True, 'quantize': True})
                    checkpoint = model_bn.state_dict()
                model.load_state_dict(checkpoint, strict=False)
                logging.info("set model measure mode")
                # set_bn_is_train(model,False)
                set_measure_mode(model, True, logger=logging)
                logging.info(
                    "calibrating apprentice model to get quant params")
                model.to(args.device, dtype)
                with torch.no_grad():
                    losses_avg, top1_avg, top5_avg = forward(val_loader,
                                                             model,
                                                             criterion,
                                                             0,
                                                             training=False,
                                                             optimizer=None)
                logging.info('Measured float resutls:\nLoss {loss:.4f}\t'
                             'Prec@1 {top1:.3f}\t'
                             'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                        top1=top1_avg,
                                                        top5=top5_avg))
                set_measure_mode(model, False, logger=logging)
                # logging.info("test quant model accuracy")
                # losses_avg, top1_avg, top5_avg = validate(val_loader, model, criterion, 0)
                # logging.info('Quantized results:\nLoss {loss:.4f}\t'
                #              'Prec@1 {top1:.3f}\t'
                #              'Prec@5 {top5:.3f}'.format(loss=losses_avg, top1=top1_avg, top5=top5_avg))
                save_checkpoint(
                    {
                        'epoch': 0,
                        'model': args.model,
                        'config': args.model_config,
                        'state_dict': model.state_dict(),
                        'best_prec1': top1_avg,
                        'regime': regime
                    },
                    True,
                    path=save_path,
                    save_all=True,
                    filename=measure_name)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        # model.load_state_dict(checkpoint['state_dict'])
        # logging.info("loaded checkpoint '%s' (epoch %s)",
        #              args.evaluate, checkpoint['epoch'])
        load_maybe_calibrate(checkpoint)
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file)
            if 'state_dict' in checkpoint:
                # NOTE(review): nesting reconstructed from collapsed source;
                # confirm best_prec1 is meant to be restored for epoch 0 too.
                if checkpoint['epoch'] > 0:
                    args.start_epoch = checkpoint['epoch'] - 1
                best_prec1 = checkpoint['best_prec1']
                checkpoint = checkpoint['state_dict']
            try:
                model.load_state_dict(checkpoint)
            except Exception:
                if not model_config.get('quantize'):
                    raise

                if model_config.get('absorb_bn'):
                    from utils.absorb_bn import search_absorbe_bn
                    logging.info('absorbing batch normalization')
                    model_config.update({
                        'absorb_bn': False,
                        'quantize': False
                    })
                    model_bn = model_builder(**model_config)
                    model_bn.load_state_dict(checkpoint)
                    search_absorbe_bn(model_bn, verbose=True)
                    model_config.update({'absorb_bn': True, 'quantize': True})
                    checkpoint = model_bn.state_dict()
                model.load_state_dict(checkpoint, strict=False)
                model.to(args.device, dtype)
                logging.info("set model measure mode")
                # set_bn_is_train(model,False)
                set_measure_mode(model, True, logger=logging)
                logging.info(
                    "calibrating apprentice model to get quant params")
                model.to(args.device, dtype)
                with torch.no_grad():
                    losses_avg, top1_avg, top5_avg = forward(val_loader,
                                                             model,
                                                             criterion,
                                                             0,
                                                             training=False,
                                                             optimizer=None)
                logging.info('Measured float resutls:\nLoss {loss:.4f}\t'
                             'Prec@1 {top1:.3f}\t'
                             'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                        top1=top1_avg,
                                                        top5=top5_avg))
                set_measure_mode(model, False, logger=logging)
                logging.info("test quant model accuracy")
                losses_avg, top1_avg, top5_avg = validate(
                    val_loader, model, criterion, 0)
                logging.info('Quantized results:\nLoss {loss:.4f}\t'
                             'Prec@1 {top1:.3f}\t'
                             'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                        top1=top1_avg,
                                                        top5=top5_avg))
                save_checkpoint(
                    {
                        'epoch': 0,
                        'model': args.model,
                        'config': args.model_config,
                        'state_dict': model.state_dict(),
                        'best_prec1': top1_avg,
                        'regime': regime
                    },
                    True,
                    path=save_path,
                    save_freq=5)
                # save_checkpoint(model.state_dict(), is_best=True, path=save_path, save_all=True)
                logging.info(
                    f'overwriting quantization method with {args.q_method}')
                set_global_quantization_method(model, args.q_method)

            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, args.start_epoch)
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    if args.evaluate:
        if model_config.get('quantize'):
            logging.info(
                f'overwriting quantization method with {args.q_method}')
            set_global_quantization_method(model, args.q_method)
        losses_avg, top1_avg, top5_avg = validate(val_loader, model,
                                                  criterion, 0)
        logging.info('Evaluation results:\nLoss {loss:.4f}\t'
                     'Prec@1 {top1:.3f}\t'
                     'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                top1=top1_avg,
                                                top5=top5_avg))
        return

    optimizer = OptimRegime(model, regime)
    logging.info('training regime: %s', regime)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_prec1, train_prec5 = train(train_loader, model,
                                                     criterion, epoch,
                                                     optimizer)

        # evaluate on validation set
        val_loss, val_prec1, val_prec5 = validate(val_loader, model,
                                                  criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'regime': regime
            },
            is_best,
            path=save_path)
        logging.info('\n Epoch: {0}\t'
                     'Training Loss {train_loss:.4f} \t'
                     'Training Prec@1 {train_prec1:.3f} \t'
                     'Training Prec@5 {train_prec5:.3f} \t'
                     'Validation Loss {val_loss:.4f} \t'
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \n'.format(
                         epoch + 1,
                         train_loss=train_loss,
                         val_loss=val_loss,
                         train_prec1=train_prec1,
                         val_prec1=val_prec1,
                         train_prec5=train_prec5,
                         val_prec5=val_prec5))

        results.add(epoch=epoch + 1,
                    train_loss=train_loss,
                    val_loss=val_loss,
                    train_error1=100 - train_prec1,
                    val_error1=100 - val_prec1,
                    train_error5=100 - train_prec5,
                    val_error5=100 - val_prec5)
        results.plot(x='epoch',
                     y=['train_loss', 'val_loss'],
                     legend=['training', 'validation'],
                     title='Loss',
                     ylabel='loss')
        results.plot(x='epoch',
                     y=['train_error1', 'val_error1'],
                     legend=['training', 'validation'],
                     title='Error@1',
                     ylabel='error %')
        results.plot(x='epoch',
                     y=['train_error5', 'val_error5'],
                     legend=['training', 'validation'],
                     title='Error@5',
                     ylabel='error %')
        results.save()