def main():
    set_seeds(args.seed)
    check_dirs_exist([args.save_dir])
    logger = Logger(args.log_path)
    device = get_device(args.dev_idx)
    if args.dataset not in dataset.__dict__:
        raise NameError("Unknown dataset: {}".format(args.dataset))
    if args.t_model not in models.__dict__:
        raise NameError("Unknown teacher model: {}".format(args.t_model))
    if args.s_model not in models.__dict__:
        raise NameError("Unknown student model: {}".format(args.s_model))

    # build data loaders and teacher/student models
    train_loader, eval_loader, num_classes = dataset.__dict__[args.dataset](
        args.batch_size)
    t_model = models.__dict__[args.t_model](num_classes=num_classes)
    s_model = models.__dict__[args.s_model](num_classes=num_classes)
    load_model(t_model, args.t_path, logger, device)
    load_model(s_model, args.s_path, logger, device)

    # only the student's parameters are optimized
    optimizer = optim.SGD(s_model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay,
                          nesterov=True)
    base_trainer_cfg = (args, s_model, train_loader, eval_loader, optimizer,
                        args.save_dir, device, logger)
    writer = SummaryWriter(log_dir=args.log_dir)  # for tensorboardX
    trainer = PrunedModelTrainer(t_model, writer, *base_trainer_cfg)
    logger.log('\n'.join(map(str, vars(args).items())))
    if args.evaluate:
        trainer.eval()
    else:
        trainer.train()
        trainer.eval()
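# --- Hedged usage sketch (not part of the original file) ---
# main() reads a module-level `args` object that this snippet never defines.
# The block below is a minimal argparse sketch showing one way those
# attributes could be supplied; every flag mirrors an attribute that main()
# actually uses, but the default values (dataset and model names, paths,
# hyperparameters) are illustrative assumptions, not the project's defaults.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train a pruned student model (usage sketch)')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--save-dir', dest='save_dir', default='./saves')
    parser.add_argument('--log-path', dest='log_path', default='./train.log')
    parser.add_argument('--log-dir', dest='log_dir', default='./runs')
    parser.add_argument('--dev-idx', dest='dev_idx', type=int, default=0)
    parser.add_argument('--dataset', default='cifar100')          # assumed name
    parser.add_argument('--t-model', dest='t_model', default='resnet56')  # assumed name
    parser.add_argument('--s-model', dest='s_model', default='resnet20')  # assumed name
    parser.add_argument('--t-path', dest='t_path', default='')
    parser.add_argument('--s-path', dest='s_path', default='')
    parser.add_argument('--batch-size', dest='batch_size', type=int, default=128)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight-decay', dest='weight_decay', type=float, default=5e-4)
    parser.add_argument('--evaluate', action='store_true')
    args = parser.parse_args()
    main()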
def ERF_generate(self, dataset_name="testing", dataset_config="", model=None,
                 extra_name=""):
    logger.info('ERF_generate on ({}) :'.format(dataset_name + extra_name))
    if model is None:
        # rebuild the best model, keeping the spatial feature map
        config = dict(self.config.model_config)
        config['stop_before_global_avg_pooling'] = True
        model = load_model(config, self.experiment)
        model.cuda()
        best_model_path = os.path.join(self.config.out_dir, "models",
                                       'model_best_state.pth')
        checkpoint = torch.load(best_model_path)
        model.load_state_dict(checkpoint['state_dict'])
    # testing mode
    model.eval()
    loader = self.data_loaders[dataset_name]
    counter = 0
    accum = None
    for step, (data, _, targets) in enumerate(loader):
        data = data.cuda()
        data.requires_grad = True
        outputs = model(data)
        # backpropagate a unit gradient from the centre of the output feature map
        grads = torch.zeros_like(outputs)
        grads[:, :, grads.size(2) // 2, grads.size(3) // 2] = 1
        outputs.backward(grads)
        # accumulate the mean absolute input gradient over batch and channels
        me = np.abs(data.grad.cpu().numpy()).mean(axis=0).mean(axis=0)
        if accum is None:
            accum = me
        else:
            accum += me
        counter += 1
    torch.save({"arr": accum, "counter": counter},
               os.path.join(self.config.out_dir, 'ERF_dict.pth'))
    ERF_plot(accum, savefile=os.path.join(self.config.out_dir, 'erf.png'))
    self.experiment.add_artifact(
        os.path.join(self.config.out_dir, 'erf.png'), "erf.png",
        {"dataset": dataset_name})
    return True
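# --- Hedged post-processing sketch (not part of the original file) ---
# ERF_generate() stores the accumulated centre-pixel gradient map in
# ERF_dict.pth. The helper below is a hypothetical stand-in (it is not the
# repo's ERF_plot) showing how that saved dictionary could be loaded,
# averaged over the accumulated batches, and rendered with matplotlib.
def plot_erf_from_dict(dict_path, out_path='erf_from_dict.png'):
    import matplotlib.pyplot as plt
    import torch

    saved = torch.load(dict_path)
    # mean absolute input gradient per batch; guard against an empty loader
    erf = saved['arr'] / max(saved['counter'], 1)
    plt.figure(figsize=(4, 4))
    plt.imshow(erf, cmap='viridis')
    plt.title('Effective receptive field (mean |dL/dx|)')
    plt.colorbar()
    plt.savefig(out_path, bbox_inches='tight')
    plt.close()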
def __init__(self, config, seed=42):
    global logger
    logger = shared_globals.logger
    config = AttrDefault(lambda: None, config)
    self.config = config
    self.datasets = {}
    self.data_loaders = {}
    self.use_swa = config.use_swa
    # self.run.info['epoch'] = 0

    # set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed + 1)
    random.seed(seed + 2)

    self.min_lr = self.config.optim_config["min_lr"]
    if self.min_lr is None:
        self.min_lr = 0.0
    print("min_lr:", self.min_lr)

    # make output dirs
    models_outputdir = os.path.join(config.out_dir, "models")
    if not os.path.exists(config.out_dir):
        os.makedirs(config.out_dir)
    if not os.path.exists(models_outputdir):
        os.makedirs(models_outputdir)
    # self.run.info['out_path'] = config.out_dir

    # init loggers
    self.init_loggers()
    self.dataset_manager = DatasetsManager(self.config['audiodataset'])

    # init TensorBoard
    if self.config.tensorboard:
        tensorboard_write_path = config.tensorboard_write_path
        if not tensorboard_write_path:
            tensorboard_write_path = self.config.out_dir.replace(
                "out", "runs", 1)
        shared_globals.console.info("tensorboard run path: " +
                                    tensorboard_write_path)
        shared_globals.console.info("To monitor this experiment use:\n " +
                                    shared_globals.bcolors.FAIL +
                                    "tensorboard --logdir " +
                                    tensorboard_write_path +
                                    shared_globals.bcolors.ENDC)
        # self.run.info['tensorboard_path'] = tensorboard_write_path
        self.writer = SummaryWriter(tensorboard_write_path)

    # build the model (and optional SWA copy); multi-GPU wrapping happens below
    self.bare_model = load_model(config.model_config)
    if self.use_swa:
        self.swa_model = load_model(config.model_config)
        if self.config.use_gpu:
            self.swa_model.cuda()
        self.swa_n = 0
        self.swa_c_epochs = config.swa_c_epochs
        self.swa_start = config.swa_start
    if self.config.use_gpu:
        self.bare_model.cuda()
    shared_globals.console.info(
        "Trainable model parameters {}, non-trainable {} ".format(
            count_parameters(self.bare_model),
            count_parameters(self.bare_model, False)))

    # DataParallel mode
    if not config.parallel_mode:
        self.model = self.bare_model
    elif config.parallel_mode == "distributed":
        torch.distributed.init_process_group(
            backend='nccl', world_size=1, rank=0,
            init_method='file://' + config.out_dir + "/shared_file")
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.bare_model)
    else:
        self.model = torch.nn.DataParallel(self.bare_model)
    # self.model.cuda()

    # optionally load a checkpoint
    if config.get('load_model'):
        load_model_path = config.get('load_model')
        load_model_path = os.path.expanduser(load_model_path)
        shared_globals.console.info("Loading model located at: " +
                                    load_model_path)
        checkpoint = torch.load(load_model_path)
        self.model.load_state_dict(checkpoint['state_dict'])
        if self.use_swa:
            swa_state_dict = checkpoint.get('swa_state_dict', None)
            self.swa_n = checkpoint.get('swa_n', 0)
            if (swa_state_dict is not None) and not self.config.swa_model_load_same:
                self.swa_model.load_state_dict(swa_state_dict)
            else:
                shared_globals.console.warning(
                    "No swa_state_dict loaded! Loading the regular state_dict "
                    "into the SWA model instead.")
                self.swa_model.load_state_dict(checkpoint['state_dict'])
                self.swa_n = 0

    shared_globals.logger.info(str(self.model))
    shared_globals.current_learning_rate = config.optim_config['base_lr']
    self.optimizer, self.scheduler = create_optimizer(
        self.model.parameters(), config.optim_config)
    print("optimizer:", self.optimizer)
    loss_criterion_args = dict(config.loss_criterion_args)
    self.criterion = get_criterion(
        config.loss_criterion)(**loss_criterion_args)

    # init state
    inf_value = -float("inf")
    if self.config["optim_config"].get("model_selection", {}).get(
            "select_min", False):
        inf_value = float("inf")
    self.state = {
        # 'config': self.config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'metrics': {},
        'best_metric_value': inf_value,
        'best_epoch': 0,
    }
    self.first_batch_done = False

    # init dataset loaders
    self.init_loaders()
    if config.get('load_model'):
        if not config.get("load_model_no_test_first"):
            testing_result = {}
            for name in self.config.datasets:
                dataset_config = AttrDefault(lambda: None,
                                             self.config.datasets[name])
                if dataset_config.testing:
                    testing_result[name] = self.test(0, name, dataset_config)
            # update the state with the new results
            self.update_state(testing_result, 0)
def __init__(self, config, seed=42, mixed_precision_training=False):
    global logger
    logger = shared_globals.logger
    config = AttrDefault(lambda: None, config)
    self.config = config
    self.datasets = {}
    self.data_loaders = {}
    self.use_swa = config.use_swa
    self.prune_mode = config.get("prune_mode")
    # self.run.info['epoch'] = 0

    # set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed + 1)
    random.seed(seed + 2)

    self.min_lr = self.config.optim_config["min_lr"]
    if self.min_lr is None:
        self.min_lr = 0.0
    print("min_lr:", self.min_lr)

    # make output dirs
    models_outputdir = os.path.join(config.out_dir, "models")
    if not os.path.exists(config.out_dir):
        os.makedirs(config.out_dir)
    if not os.path.exists(models_outputdir):
        os.makedirs(models_outputdir)
    # self.run.info['out_path'] = config.out_dir

    self.colab_mode = False
    self.mixed_precision_training = mixed_precision_training
    if mixed_precision_training:
        print("\n\nUsing mixed_precision_training\n\n ")
        self.scaler = torch.cuda.amp.GradScaler()

    # init loggers
    self.init_loggers()
    self.dataset_manager = DatasetsManager(self.config['audiodataset'])

    # init TensorBoard
    if self.config.tensorboard:
        tensorboard_write_path = config.tensorboard_write_path
        if not tensorboard_write_path:
            tensorboard_write_path = self.config.out_dir.replace(
                "out", "runs", 1)
        shared_globals.console.info("tensorboard run path: " +
                                    tensorboard_write_path)
        shared_globals.console.info("To monitor this experiment use:\n " +
                                    shared_globals.bcolors.FAIL +
                                    "tensorboard --logdir " +
                                    tensorboard_write_path +
                                    shared_globals.bcolors.ENDC)
        # self.run.info['tensorboard_path'] = tensorboard_write_path
        self.writer = SummaryWriter(tensorboard_write_path)

    # build the model (and optional SWA copy); multi-GPU wrapping happens below
    self.bare_model = load_model(config.model_config)
    print(self.bare_model)
    if self.use_swa:
        self.swa_model = load_model(config.model_config)
        if self.config.use_gpu:
            self.swa_model.cuda()
        self.swa_n = 0
        self.swa_c_epochs = config.swa_c_epochs
        self.swa_start = config.swa_start

    # print number of parameters
    print("Trainable model parameters {}, non-trainable {} ".format(
        count_parameters(self.bare_model),
        count_parameters(self.bare_model, False)))
    print("Trainable model parameters non-zero {} ".format(
        count_non_zero_params(self.bare_model)))

    # move to gpu
    if self.config.use_gpu:
        self.bare_model.cuda()

    if self.prune_mode:
        try:
            true_params = self.bare_model.get_num_true_params()
            prunable_params = self.bare_model.get_num_prunable_params()
            shared_globals.console.info(
                "True model parameters {}, Prunable params {} ".format(
                    true_params, prunable_params))
        except AttributeError:
            # NOTE: the bare `raise` re-raises immediately, so the fallback
            # below is currently unreachable.
            raise
            true_params = prunable_params = count_parameters(self.bare_model)
            shared_globals.console.info(
                "WARNING:\n\nmodel doesn't support true/prunable: True {}, Prunable params {} \n\n"
                .format(true_params, prunable_params))
        if self.config.prune_percentage == -1:  # -1 means auto
            # derive the prune percentage from the target parameter budget
            must_prune_params = true_params - self.config.prune_percentage_target_params
            self.real_prune_percentage = must_prune_params / prunable_params
            if self.real_prune_percentage >= 0.9999:
                raise RuntimeError("real_prune_percentage {} >= ~ 1.".format(
                    self.real_prune_percentage))
            if self.real_prune_percentage >= 0.9:
                print("\n\nWarning: very high real_prune_percentage\n\n",
                      self.real_prune_percentage)
            if self.real_prune_percentage < 0:
                # NOTE: the RuntimeError fires first, so the "set to 0.1"
                # fallback below is currently unreachable.
                raise RuntimeError("real_prune_percentage {} < 0.".format(
                    self.real_prune_percentage))
                print("\nWARNING: real_prune_percentage<0: ",
                      self.real_prune_percentage, " setting to 0.1\n")
                self.real_prune_percentage = 0.1
        else:
            self.real_prune_percentage = self.config.prune_percentage
        print("current pruning percentage=", self.real_prune_percentage)

    shared_globals.console.info(
        "\n\nTrainable model parameters {}, non-trainable {} \n\n".format(
            count_parameters(self.bare_model),
            count_parameters(self.bare_model, False)))

    # DataParallel mode
    if not config.parallel_mode:
        self.model = self.bare_model
    elif config.parallel_mode == "distributed":
        torch.distributed.init_process_group(
            backend='nccl', world_size=1, rank=0,
            init_method='file://' + config.out_dir + "/shared_file")
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.bare_model)
    else:
        self.model = torch.nn.DataParallel(self.bare_model)
    # self.model.cuda()

    # optionally load a checkpoint
    if config.get('load_model'):
        load_model_path = config.get('load_model')
        load_model_path = os.path.expanduser(load_model_path)
        shared_globals.console.info("Loading model located at: " +
                                    load_model_path)
        checkpoint = torch.load(load_model_path)
        self.model.load_state_dict(checkpoint['state_dict'])
        if self.use_swa:
            swa_state_dict = checkpoint.get('swa_state_dict', None)
            self.swa_n = checkpoint.get('swa_n', 0)
            if (swa_state_dict is not None) and not self.config.swa_model_load_same:
                self.swa_model.load_state_dict(swa_state_dict)
            else:
                shared_globals.console.warning(
                    "No swa_state_dict loaded! Loading the regular state_dict "
                    "into the SWA model instead.")
                self.swa_model.load_state_dict(checkpoint['state_dict'])
                self.swa_n = 0

    shared_globals.logger.info(str(self.model))
    shared_globals.current_learning_rate = config.optim_config['base_lr']
    self.optimizer, self.scheduler = create_optimizer(
        self.model.parameters(), config.optim_config)
    print("optimizer:", self.optimizer)
    loss_criterion_args = dict(config.loss_criterion_args)
    self.criterion = get_criterion(
        config.loss_criterion)(**loss_criterion_args)

    # init state
    inf_value = -float("inf")
    if self.config["optim_config"].get("model_selection", {}).get(
            "select_min", False):
        inf_value = float("inf")
    self.state = {
        # 'config': self.config,
        'state_dict': None,
        'optimizer': None,
        'epoch': 0,
        'metrics': {},
        'best_metric_value': inf_value,
        'best_epoch': 0,
    }
    self.first_batch_done = False

    # init dataset loaders
    self.init_loaders()
    if config.get('load_model'):
        if not config.get("load_model_no_test_first"):
            testing_result = {}
            for name in self.config.datasets:
                dataset_config = AttrDefault(lambda: None,
                                             self.config.datasets[name])
                if dataset_config.testing:
                    testing_result[name] = self.test(0, name, dataset_config)
            # update the state with the new results
            self.update_state(testing_result, 0)