def test_load_linear(self):
    """
    Restore only the linear layers from the checkpoint.

    Before the restore neither sub-network is expected to fully reproduce the
    saved outputs. After the restore the upper network output must match the
    saved output everywhere, while the lower network match count is expected
    to stay at its pre-restore value.
    """

    def matching_entries(output, expected):
        # Count the entries that ``isclose`` reports as matching (atol=1e-2).
        return output.isclose(expected, atol=1e-2).sum().item()

    # Initialize model with new random seed.
    set_random_seed(33)
    model = MNISTSparseCNN()
    model.eval()

    # No parameter tensor of the fresh model may fully equal the saved one.
    for fresh, saved in zip(model.parameters(), self.model.parameters()):
        identical = (fresh == saved).sum().item()
        self.assertNotEqual(identical, np.prod(fresh.shape))

    # Check output through the lower network.
    lower_out = lower_forward(model, self.in_1)
    self.assertEqual(
        matching_entries(lower_out, self.out_lower), 1337)  # some correct

    # Check output through the upper network.
    upper_out = upper_forward(model, self.in_2)
    self.assertEqual(
        matching_entries(upper_out, self.out_upper), 1)  # some correct

    # Restore the linear layers only.
    model = load_multi_state(model, restore_linear=self.checkpoint_path)
    model.eval()

    # Check output through the lower network.
    lower_out = lower_forward(model, self.in_1)
    self.assertEqual(
        matching_entries(lower_out, self.out_lower), 1337)  # some correct

    # Check output through the upper network.
    upper_out = upper_forward(model, self.in_2)
    self.assertEqual(
        matching_entries(upper_out, self.out_upper), 20)  # all correct
def setUp(self):
    """
    Build the reference model, record its outputs and save it as a checkpoint.

    Sets the following attributes:
    - model: seeded ``MNISTSparseCNN`` in eval mode whose "cnn"/"linear"
      weights are doubled so it differs from a freshly initialized model
    - in_1, in_2: random inputs for the full/lower and the upper network
    - out_full, out_lower, out_upper: reference outputs of ``model``
    - tempdir, results_dir: temporary directory holding the checkpoint
    - checkpoint_path: pickled dict whose "model" entry holds the serialized
      state dict
    """
    set_random_seed(20)
    self.model = MNISTSparseCNN()
    self.model.eval()

    # Make all params twice as large to differentiate it from an init-ed model.
    # The update is in-place on the parameters themselves; autograd rejects
    # in-place writes to leaf tensors that require grad, so disable it here.
    with torch.no_grad():
        for name, param in self.model.named_parameters():
            if ("cnn" in name or "linear" in name) and ("weight" in name):
                param.mul_(2)

    self.in_1 = torch.rand(2, 1, 28, 28)
    self.in_2 = torch.rand(2, 1024)
    self.out_full = full_forward(self.model, self.in_1)
    self.out_lower = lower_forward(self.model, self.in_1)
    self.out_upper = upper_forward(self.model, self.in_2)

    # Create temporary results directory.
    self.tempdir = tempfile.TemporaryDirectory()
    self.results_dir = Path(self.tempdir.name) / Path("results")
    self.results_dir.mkdir()

    # Save model state.
    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, self.model.state_dict(), compresslevel=-1)
        state["model"] = buffer.getvalue()

    self.checkpoint_path = self.results_dir / Path("mymodel")
    with open(self.checkpoint_path, "wb") as f:
        pickle.dump(state, f)
def __init__(self, config):
    """
    Set up logger, seed, data loaders, model, optimizer and LR scheduler.

    :param config: Dictionary read for the keys "name", "verbose", "seed",
                   "model_type", "model_params", "learning_rate",
                   "weight_decay", "loss_function", "lr_step_size",
                   "learning_rate_gamma", "batches_in_epoch",
                   "batches_in_first_epoch" and "test_batches_in_epoch".
                   It is also handed to ``_configure_dataloaders``.
    """
    super(MobileNetCIFAR10, self).__init__()
    self.logger = get_logger(config["name"], config["verbose"])
    self.logger.debug("Config: %s", config)

    # Seed the random generators before loaders and model are created
    set_random_seed(config["seed"])
    self._configure_dataloaders(config)

    # Instantiate the model class given in the config
    model_class = config["model_type"]
    model_kwargs = config["model_params"]
    self.model = model_class(**model_kwargs)
    self.logger.debug("Model: %s", self.model)

    has_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if has_cuda else "cpu")
    if has_cuda:
        self.model = self.model.cuda()

    # One parameter group per sub-module; the "deepwise" group overrides the
    # optimizer wide weight decay with zero
    param_groups = [
        {"params": self.model.conv.parameters()},
        {"params": self.model.deepwise.parameters(), "weight_decay": 0},
        {"params": self.model.classifier.parameters()},
    ]
    self.optimizer = torch.optim.RMSprop(
        param_groups,
        lr=config["learning_rate"],
        weight_decay=config["weight_decay"],
    )
    self.loss_function = config["loss_function"]
    self.lr_scheduler = torch.optim.lr_scheduler.StepLR(
        self.optimizer,
        step_size=config["lr_step_size"],
        gamma=config["learning_rate_gamma"],
    )

    # Wrap the model only when more than one GPU is visible
    if torch.cuda.device_count() > 1:
        self.model = torch.nn.DataParallel(self.model)

    self.batches_in_epoch = config["batches_in_epoch"]
    self.batches_in_first_epoch = config["batches_in_first_epoch"]
    self.test_batches_in_epoch = config["test_batches_in_epoch"]
    self.config = config
def run_ray_many(tune_config, exp_config, experiments, fix_seed=False):
    """
    Launch several experiments in parallel as ray remote tasks and wait for
    all of them to finish.

    :param tune_config: Tune configuration dict. Modified in place: its
                        "config" entry is set to ``exp_config`` and, when no
                        GPU is available, "resources_per_trial" is overridden
    :param exp_config: Base experiment configuration dict. When no GPU is
                       available its "device" entry is set to "cpu"
    :param experiments: Dict mapping an experiment name to the values passed
                        to ``new_experiment`` together with ``exp_config``
    :param fix_seed: When True, seed the random generators with 32
    """
    # update config
    tune_config["config"] = exp_config

    # override when running local for test
    if not torch.cuda.is_available():
        tune_config["config"]["device"] = "cpu"
        tune_config["resources_per_trial"] = {"cpu": 1}

    # MC code to fix for an unknown bug
    def serializer(obj):
        if obj.is_cuda:
            return obj.cpu().numpy()
        else:
            return obj.numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    for t in [
        torch.FloatTensor,
        torch.DoubleTensor,
        torch.HalfTensor,
        torch.ByteTensor,
        torch.CharTensor,
        torch.ShortTensor,
        torch.IntTensor,
        torch.LongTensor,
        torch.Tensor,
    ]:
        ray.register_custom_serializer(
            t, serializer=serializer, deserializer=deserializer
        )

    # fix seed
    if fix_seed:
        set_random_seed(32)

    # multiple experiments
    exp_configs = [
        (name, new_experiment(exp_config, c)) for name, c in experiments.items()
    ]

    # init ray
    ray.init()
    try:
        results = [
            run_experiment.remote(name, RayTrainable, c, tune_config)
            for name, c in exp_configs
        ]
        ray.get(results)
    finally:
        # Always shut ray down, even when an experiment fails, so the next
        # ray.init() in this process does not find a stale session.
        ray.shutdown()
def setUp(self):
    """
    Seed the random generators and build a small "fully_static_sparse_bert"
    masked language model with ``rezero_weights`` applied to it.
    """
    set_random_seed(42)
    self.device = torch.device("cuda")

    # Config for model with sparse encoder and sparse embedding layer.
    config_class = CONFIG_MAPPING["fully_static_sparse_bert"]
    config_kwargs = dict(
        num_attention_heads=2,
        num_hidden_layers=2,
        hidden_size=128,
        intermediate_size=512,
        max_position_embeddings=128,
        sparsity=0.75,
    )
    self.config = config_class(**config_kwargs)

    self.sparse_model = sparse_model = AutoModelForMaskedLM.from_config(
        self.config)
    sparse_model.resize_token_embeddings()
    sparse_model.apply(rezero_weights)
def run_ray(tune_config, exp_config, fix_seed=False):
    """
    Run a single experiment through ``tune.run`` using ``Trainable``.

    :param tune_config: Keyword arguments for ``tune.run``. Modified in place:
                        "config" is set to ``exp_config`` and, when no GPU is
                        available, "resources_per_trial" is overridden
    :param exp_config: Experiment configuration dict, also passed to
                       ``download_dataset``
    :param fix_seed: When True, seed the random generators with 32
    """
    tune_config["config"] = exp_config
    download_dataset(exp_config)

    # Fall back to a single CPU when running locally without a GPU
    if not torch.cuda.is_available():
        tune_config["config"]["device"] = "cpu"
        tune_config["resources_per_trial"] = {"cpu": 1}

    ray.init(load_code_from_local=True)

    # MC code to fix for an unknown bug: tensors are serialized as numpy
    # arrays and handed back unchanged on deserialization
    def serializer(obj):
        host_tensor = obj.cpu() if obj.is_cuda else obj
        return host_tensor.numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    tensor_types = (
        torch.FloatTensor,
        torch.DoubleTensor,
        torch.HalfTensor,
        torch.ByteTensor,
        torch.CharTensor,
        torch.ShortTensor,
        torch.IntTensor,
        torch.LongTensor,
        torch.Tensor,
    )
    for tensor_type in tensor_types:
        ray.register_custom_serializer(
            tensor_type, serializer=serializer, deserializer=deserializer)

    if fix_seed:
        set_random_seed(32)

    tune.run(Trainable, **tune_config)
def test_load_full(self):
    """
    Restore the full model from the checkpoint and check that parameters,
    buffers and the outputs of the full, lower and upper networks all match
    the saved model.
    """

    def equal_entries(left, right):
        # Number of element-wise equal entries of two tensors.
        return (left == right).sum().item()

    # Initialize model with new random seed.
    set_random_seed(33)
    model = MNISTSparseCNN()
    model.eval()

    # No parameter tensor of the fresh model may fully equal the saved one.
    for fresh, saved in zip(model.parameters(), self.model.parameters()):
        self.assertNotEqual(equal_entries(fresh, saved), np.prod(fresh.shape))

    # Restore full model.
    model = load_multi_state(model, restore_full_model=self.checkpoint_path)
    model.eval()

    # Every parameter must now equal the saved one entry by entry.
    for restored, saved in zip(model.parameters(), self.model.parameters()):
        self.assertEqual(
            equal_entries(restored, saved), np.prod(restored.shape))

    # Same for the buffers; half precision buffers are compared as float32.
    for restored, saved in zip(model.buffers(), self.model.buffers()):
        if restored.dtype == torch.float16:
            restored, saved = restored.float(), saved.float()
        self.assertEqual(
            equal_entries(restored, saved), np.prod(restored.shape))

    # Check output through the full network.
    full_out = full_forward(model, self.in_1)
    full_matches = full_out.isclose(
        self.out_full, atol=1e-2, rtol=0).sum().item()
    self.assertEqual(full_matches, 20)  # all correct

    # Check output through the lower network.
    lower_out = lower_forward(model, self.in_1)
    lower_matches = lower_out.isclose(self.out_lower, atol=1e-2).sum().item()
    self.assertEqual(lower_matches, 2048)  # all correct

    # Check output through the upper network.
    upper_out = upper_forward(model, self.in_2)
    upper_matches = upper_out.isclose(self.out_upper, atol=1e-2).sum().item()
    self.assertEqual(upper_matches, 20)  # all correct
def setup_experiment(self, config):
    """
    Create the logger, remember the log directory and seed the random
    generators.

    :param config: Dictionary containing the configuration parameters

        - local_dir: Results path
        - logdir: Directory generated by Ray Tune for this Trial
        - seed: the seed to be used for pytorch, python, and numpy
                (42 when absent)
        - checkpoint_at_init: boolean argument for whether to create a
                              checkpoint of the initialized model. this
                              differs from `checkpoint_at_start` for which the
                              checkpoint occurs after the first epoch of
                              training as opposed to before it
    """
    self._logger = self.create_logger(config)
    self.logdir = config.get("logdir")

    # Seed everything; the second positional argument is passed as False
    # NOTE(review): presumably a "deterministic mode" switch - confirm
    # against set_random_seed
    self.seed = config.get("seed", 42)
    set_random_seed(self.seed, False)
def setUp(self):
    """
    Build a seeded Linear + KWinners model and save its serialized state
    dict as a pickled checkpoint inside a temporary results directory.
    """
    set_random_seed(20)
    linear = torch.nn.Linear(8, 8)
    kwinners = KWinners(8, percent_on=0.1)
    self.model = torch.nn.Sequential(linear, kwinners)

    # Create temporary results directory.
    self.tempdir = tempfile.TemporaryDirectory()
    self.results_dir = Path(self.tempdir.name) / "results"
    self.results_dir.mkdir()

    # Serialize the state dict into memory first ...
    with io.BytesIO() as stream:
        serialize_state_dict(stream, self.model.state_dict(), compresslevel=-1)
        serialized_model = stream.getvalue()
    state = {"model": serialized_model}

    # ... then pickle the wrapping dict as the checkpoint file.
    self.checkpoint_path = self.results_dir / "mymodel"
    with open(self.checkpoint_path, "wb") as checkpoint_file:
        pickle.dump(state, checkpoint_file)
def main(args=CONFIG):
    """
    Entry point: seed the generators if requested, work out the distributed
    settings and start ``main_worker`` either directly or once per GPU.

    :param args: Namespace-like object. Reads ``seed``, ``deterministic_mode``,
                 ``dist_url``, ``world_size``, ``multiprocessing_distributed``
                 and ``gpu``; writes ``world_size`` and ``distributed``.
    """
    if args.seed is not None:
        set_random_seed(args.seed, args.deterministic_mode)

    # With the "env://" init method an unset world size comes from the
    # environment
    world_size_from_env = args.dist_url == "env://" and args.world_size == -1
    if world_size_from_env:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    gpu_count = torch.cuda.device_count()
    if not args.multiprocessing_distributed:
        # Simply call main_worker function
        main_worker(args.gpu, gpu_count, args)
        return

    # Since we have gpu_count processes per node, the total world_size
    # needs to be adjusted accordingly
    args.world_size = gpu_count * args.world_size
    # Use torch.multiprocessing.spawn to launch distributed processes: the
    # main_worker process function
    mp.spawn(main_worker, nprocs=gpu_count, args=(gpu_count, args))
def main(args):
    """
    Quantize the model of the named experiment and save it as TorchScript.

    :param args: Namespace with at least ``name``, ``output`` and ``backend``.
                 All of its attributes are merged into a copy of the
                 experiment configuration.
    """
    # Work on a private copy of the experiment configuration
    config = copy.deepcopy(CONFIGS[args.name])
    config.update(vars(args))
    config["device"] = "cuda" if torch.cuda.is_available() else "cpu"

    # Replace dynamic seed (i.e. 'tune.sample_from') with constant
    configured_seed = config.get("seed", 42)
    seed = configured_seed if isinstance(configured_seed, int) else 42
    config["seed"] = seed
    set_random_seed(seed)

    q_model = quantize(config)

    # Save quantized model
    file_name = f"{args.name}.{args.backend}.pt"
    output_file_name = os.path.join(args.output, file_name)
    print(
        f"Saving quantized model '{args.name}' weights to '{output_file_name}'"
    )
    scripted_model = torch.jit.script(q_model)
    torch.jit.save(scripted_model, output_file_name)
def setUp(self):
    """Seed the random generators with a fixed value before each test."""
    seed = 42
    set_random_seed(seed)
def __init__(self, config):
    """
    Called once at the beginning of each experiment.

    Creates the logger, seeds the random generators, configures the data
    loaders, builds the ``LeSparseNet`` model and creates the optimizer and
    learning rate scheduler.

    :param config: Dictionary with the experiment parameters. Required keys
                   are read with ``config[...]``; the optional ones are
                   "verbose" (2), "validation" (50000/60000),
                   "lr_scheduler_params" (None), "dropout" (0.0),
                   "activation_fct_before_max_pool" (False),
                   "consolidated_sparse_weights" (False),
                   "use_kwinner_local" (False) and "lr_scheduler" (None).
    """
    super(MNISTSparseExperiment, self).__init__()
    self.start_time = time.time()
    self.logger = get_logger(config["name"], config.get("verbose", 2))
    self.logger.debug("Config: %s", config)

    # Setup random seed
    seed = config["seed"]
    set_random_seed(seed)

    self.data_dir = config["data_dir"]
    self.batch_size = config["batch_size"]
    self.test_batch_size = config["test_batch_size"]
    self.first_epoch_batch_size = config["first_epoch_batch_size"]
    self.validation = config.get("validation", 50000.0 / 60000.0)
    self.learning_rate_factor = config["learning_rate_factor"]
    self.lr_scheduler_params = config.get("lr_scheduler_params", None)
    self.num_classes = 10

    self._configure_dataloaders()

    # Configure Model
    model = LeSparseNet(
        input_shape=(1, 28, 28),
        cnn_out_channels=config["cnn_out_channels"],
        cnn_activity_percent_on=config["cnn_percent_on"],
        cnn_weight_percent_on=config["cnn_weight_sparsity"],
        linear_n=config["linear_n"],
        linear_activity_percent_on=config["linear_percent_on"],
        linear_weight_percent_on=config["weight_sparsity"],
        boost_strength=config["boost_strength"],
        boost_strength_factor=config["boost_strength_factor"],
        use_batch_norm=config["use_batch_norm"],
        dropout=config.get("dropout", 0.0),
        num_classes=self.num_classes,
        k_inference_factor=config["k_inference_factor"],
        activation_fct_before_max_pool=config.get(
            "activation_fct_before_max_pool", False),
        consolidated_sparse_weights=config.get(
            "consolidated_sparse_weights", False),
        use_kwinners_local=config.get("use_kwinner_local", False),
    )

    if torch.cuda.is_available():
        self.device = torch.device("cuda")
        model = model.cuda()
    else:
        self.device = torch.device("cpu")

    if torch.cuda.device_count() > 1:
        # Logging uses %-style lazy formatting: the arguments need a
        # placeholder in the message, otherwise the record fails to format.
        self.logger.debug("Using %s GPUs", torch.cuda.device_count())
        model = torch.nn.DataParallel(model)

    self.model = model
    self.logger.debug("Model: %s", self.model)
    self.learning_rate = config["learning_rate"]
    self.momentum = config["momentum"]

    self.batches_in_epoch = config["batches_in_epoch"]
    self.batches_in_first_epoch = config["batches_in_first_epoch"]
    self.config = config

    self.optimizer = self._create_optimizer(name=config["optimizer"],
                                            model=self.model)
    self.lr_scheduler = self._create_learning_rate_scheduler(
        name=config.get("lr_scheduler", None), optimizer=self.optimizer)
def __init__(self, config):
    """
    Called once at the beginning of each experiment.

    Seeds the random generators, loads the datasets, builds the model
    selected by ``config["model_type"]`` ("le_sparse", "resnet9",
    "gsc_sparse_cnn" or "gsc_super_sparse_cnn"), moves it to the GPU when one
    is available and creates the optimizer and learning rate scheduler.

    :param config: Dictionary with the experiment parameters
    :raises RuntimeError: when the model type is not one of the above
    """
    self.start_time = time.time()
    self.logger = get_logger(config["name"], config.get("verbose", 2))
    self.logger.debug("Config: %s", config)

    # Seed before datasets and model are created
    set_random_seed(config["seed"])

    # Plain experiment settings
    self.data_dir = config["data_dir"]
    self.model_type = config["model_type"]
    self.num_classes = 12
    self.log_interval = config["log_interval"]
    self.batches_in_epoch = config["batches_in_epoch"]
    self.batch_size = config["batch_size"]
    self.background_noise_dir = config["background_noise_dir"]
    self.noise_values = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25,
                         0.3, 0.35, 0.4, 0.45, 0.5]

    self.load_datasets()

    # Build the requested network
    if self.model_type == "resnet9":
        network = resnet9(num_classes=self.num_classes, in_channels=1)
    elif self.model_type == "gsc_sparse_cnn":
        network = GSCSparseCNN()
    elif self.model_type == "gsc_super_sparse_cnn":
        network = GSCSuperSparseCNN()
    elif self.model_type == "le_sparse":
        network = LeSparseNet(
            input_shape=config.get("input_shape", (1, 32, 32)),
            cnn_out_channels=config["cnn_out_channels"],
            cnn_activity_percent_on=config["cnn_percent_on"],
            cnn_weight_percent_on=config["cnn_weight_sparsity"],
            linear_n=config["linear_n"],
            linear_activity_percent_on=config["linear_percent_on"],
            linear_weight_percent_on=config["weight_sparsity"],
            boost_strength=config["boost_strength"],
            boost_strength_factor=config["boost_strength_factor"],
            use_batch_norm=config["use_batch_norm"],
            dropout=config.get("dropout", 0.0),
            num_classes=self.num_classes,
            k_inference_factor=config["k_inference_factor"],
            activation_fct_before_max_pool=config.get(
                "activation_fct_before_max_pool", False),
            consolidated_sparse_weights=config.get(
                "consolidated_sparse_weights", False),
            use_kwinners_local=config.get("use_kwinner_local", False),
        )
    else:
        raise RuntimeError("Unknown model type: " + self.model_type)

    # Pick the device and move the network there
    self.use_cuda = torch.cuda.is_available()
    self.logger.debug("use_cuda %s", self.use_cuda)
    if not self.use_cuda:
        self.device = torch.device("cpu")
    else:
        self.device = torch.device("cuda")
        network = network.cuda()
    self.logger.debug("device %s", self.device)

    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        self.logger.debug("Using %s GPUs", torch.cuda.device_count())
        network = torch.nn.DataParallel(network)

    self.model = network
    self.logger.debug("Model: %s", self.model)
    self.logger.debug("Model non-zero params: %s",
                      count_nonzero_params(self.model))

    self.learning_rate = config["learning_rate"]
    self.optimizer = self.create_optimizer(config, self.model)
    self.lr_scheduler = self.create_learning_rate_scheduler(
        config, self.optimizer)
def __init__(self, config):
    """
    Called once at the beginning of each experiment.

    Seeds the random generators, loads the datasets and builds the model
    selected by ``config["model_type"]``:

    - "cnn": a ``nn.Sequential`` assembled here from sparse CNN layers, a
      flatten step, sparse linear layers, a linear output layer and a
      log-softmax
    - "resnet9", "gsc_sparse_cnn", "gsc_super_sparse_cnn": predefined models

    The model is moved to the GPU when one is available and wrapped in
    ``DataParallel`` when several are visible. Finally the optimizer and the
    learning rate scheduler are created.

    :param config: Dictionary with the experiment parameters. Optional keys
                   are "verbose" (2), "cnn_input_shape" ((1, 32, 32)) and
                   "dropout" (0.0); all other keys read here are required.
    :raises RuntimeError: when the model type is not one of the above
    """
    self.start_time = time.time()
    self.logger = get_logger(config["name"], config.get("verbose", 2))
    self.logger.debug("Config: %s", config)

    # Setup random seed
    seed = config["seed"]
    set_random_seed(seed)

    # Get our directories correct
    self.data_dir = config["data_dir"]

    # Configure Model
    self.model_type = config["model_type"]
    self.num_classes = 12
    self.log_interval = config["log_interval"]
    self.batches_in_epoch = config["batches_in_epoch"]
    self.batch_size = config["batch_size"]
    self.background_noise_dir = config["background_noise_dir"]
    self.noise_values = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
    # The per-layer settings below (cnn_out_channels, cnn_percent_on,
    # linear_n, linear_percent_on) are indexed by layer number further down
    cnn_input_shape = config.get("cnn_input_shape", (1, 32, 32))
    linear_n = config["linear_n"]
    linear_percent_on = config["linear_percent_on"]
    cnn_out_channels = config["cnn_out_channels"]
    cnn_percent_on = config["cnn_percent_on"]
    boost_strength = config["boost_strength"]
    weight_sparsity = config["weight_sparsity"]
    cnn_weight_sparsity = config["cnn_weight_sparsity"]
    boost_strength_factor = config["boost_strength_factor"]
    k_inference_factor = config["k_inference_factor"]
    use_batch_norm = config["use_batch_norm"]
    dropout = config.get("dropout", 0.0)

    self.load_datasets()

    model = nn.Sequential()

    if self.model_type == "cnn":
        # Add CNN Layers
        input_shape = cnn_input_shape
        cnn_layers = len(cnn_out_channels)
        if cnn_layers > 0:
            for i in range(cnn_layers):
                # Only the width is used for the shape arithmetic below, i.e.
                # the feature maps are treated as square
                in_channels, height, width = input_shape
                add_sparse_cnn_layer(
                    network=model,
                    suffix=i + 1,
                    in_channels=in_channels,
                    out_channels=cnn_out_channels[i],
                    use_batch_norm=use_batch_norm,
                    weight_sparsity=cnn_weight_sparsity,
                    percent_on=cnn_percent_on[i],
                    k_inference_factor=k_inference_factor,
                    boost_strength=boost_strength,
                    boost_strength_factor=boost_strength_factor,
                )

                # Feed this layer output into next layer input
                in_channels = cnn_out_channels[i]

                # Compute next layer input shape
                # NOTE(review): presumably a 5x5 convolution without padding
                # followed by a 2x2 max pool - confirm against
                # add_sparse_cnn_layer
                wout = (width - 5) + 1
                maxpool_width = wout // 2
                input_shape = (in_channels, maxpool_width, maxpool_width)

        # Flatten CNN output before passing to linear layer
        model.add_module("flatten", Flatten())

        # Add Linear layers
        input_size = np.prod(input_shape)
        for i in range(len(linear_n)):
            add_sparse_linear_layer(
                network=model,
                suffix=i + 1,
                input_size=input_size,
                linear_n=linear_n[i],
                dropout=dropout,
                use_batch_norm=use_batch_norm,
                weight_sparsity=weight_sparsity,
                percent_on=linear_percent_on[i],
                k_inference_factor=k_inference_factor,
                boost_strength=boost_strength,
                boost_strength_factor=boost_strength_factor,
            )
            # The next linear layer consumes this layer's units
            input_size = linear_n[i]

        # Output layer
        model.add_module(
            "output", nn.Linear(input_size, self.num_classes)
        )
        model.add_module("softmax", nn.LogSoftmax(dim=1))

    elif self.model_type == "resnet9":
        model = resnet9(
            num_classes=self.num_classes, in_channels=1
        )

    elif self.model_type == "gsc_sparse_cnn":
        model = GSCSparseCNN()

    elif self.model_type == "gsc_super_sparse_cnn":
        model = GSCSuperSparseCNN()

    else:
        raise RuntimeError("Unknown model type")

    # Move the model to the GPU when one is available
    self.use_cuda = torch.cuda.is_available()
    self.logger.debug("use_cuda %s", self.use_cuda)
    if self.use_cuda:
        self.device = torch.device("cuda")
        model = model.cuda()
    else:
        self.device = torch.device("cpu")
    self.logger.debug("device %s", self.device)
    if torch.cuda.device_count() > 1:
        self.logger.debug("Using %s GPUs", torch.cuda.device_count())
        model = torch.nn.DataParallel(model)

    self.model = model
    self.logger.debug("Model: %s", self.model)
    self.learning_rate = config["learning_rate"]
    self.optimizer = self.create_optimizer(config, self.model)
    self.lr_scheduler = self.create_learning_rate_scheduler(config, self.optimizer)
# Run experiments results = [] for exp in configs: config = configs[exp] config["name"] = exp # Make sure path and data_dir are relative to the project location, # handling both ~/nta and ../results style paths. path = config.get("path", ".") config["path"] = str(Path(path).expanduser().resolve()) data_dir = config.get("data_dir", "data") config["data_dir"] = str(Path(data_dir).expanduser().resolve()) # Run each experiment in parallel results.append(run_noise_test.remote(config)) # Wait until all experiments complete ray.get(results) ray.shutdown() if __name__ == "__main__": # Set a random random seed, and print it for reproducibility # This enables variability in the random seeds that Ray generates for # experiments with multiple repetitions. seed = int(time.time()) print("Global random seed set to", seed) set_random_seed(seed) cli()
def setup_experiment(self, config):
    """
    Configure the experiment for training

    :param config: Dictionary containing the configuration parameters

        - distributed: Whether or not to use Pytorch Distributed training
        - backend: Pytorch Distributed backend ("nccl", "gloo")
                   Default: nccl
        - dist_url: URL used to initialize the distributed process group.
                    Default: tcp://127.0.0.1:54321
        - world_size: Total number of processes participating
        - rank: Rank of the current process
        - data: Dataset path
        - train_dir: Dataset training data relative path
        - batch_size: Training batch size
        - val_dir: Dataset validation data relative path
        - val_batch_size: Validation batch size
        - workers: how many data loading processes to use
        - num_classes: Limit the dataset size to the given number of classes
        - model_class: Model class. Must inherit from "torch.nn.Module"
        - model_args: model class arguments passed to the constructor
        - init_batch_norm: Whether or not to Initialize running batch norm
                           mean to 0.
        - optimizer_class: Optimizer class.
                           Must inherit from "torch.optim.Optimizer"
        - optimizer_args: Optimizer class arguments passed to the
                          constructor
        - batch_norm_weight_decay: Whether or not to apply weight decay to
                                   batch norm modules parameters
        - bias_weight_decay: Whether or not to apply weight decay to
                             bias parameters
        - lr_scheduler_class: Learning rate scheduler class.
                              Must inherit from "_LRScheduler"
        - lr_scheduler_args: Learning rate scheduler class arguments
                             passed to the constructor
        - loss_function: Loss function. See "torch.nn.functional"
        - local_dir: Results path
        - logdir: Directory generated by Ray Tune for this Trial
        - epochs: Number of epochs to train
        - batches_in_epoch: Number of batches per epoch.
                            Useful for debugging
        - progress: Show progress during training
        - name: Experiment name. Used as logger name
        - log_level: Python Logging level
        - log_format: Python Logging format
        - seed: the seed to be used for pytorch, python, and numpy
        - mixed_precision: Whether or not to enable apex mixed precision
        - mixed_precision_args: apex mixed precision arguments.
                                See "amp.initialize"
        - sample_transform: Transform acting on the training samples. To be
                            used additively after default transform or
                            auto-augment.
        - target_transform: Transform acting on the training targets.
        - replicas_per_sample: Number of replicas to create per sample in
                               the batch. (each replica is transformed
                               independently) Used in maxup.
        - train_model_func: Optional user defined function to train the
                            model, expected to behave similarly to
                            `train_model` in terms of input parameters and
                            return values
        - evaluate_model_func: Optional user defined function to validate
                               the model expected to behave similarly to
                               `evaluate_model` in terms of input
                               parameters and return values
        - checkpoint_file: if not None, will start from this model. The
                           model must have the same model_args and
                           model_class as the current experiment.
        - checkpoint_at_init: boolean argument for whether to create a
                              checkpoint of the initialized model. this
                              differs from `checkpoint_at_start` for which
                              the checkpoint occurs after the first epoch of
                              training as opposed to before it
        - epochs_to_validate: list of epochs to run validate(). A -1 asks
                              to run validate before any training occurs.
                              Default: last three epochs.
        - launch_time: time the config was created (via time.time). Used to
                       report wall clock time until the first batch is done.
                       Default: time.time() in this setup_experiment().
    """
    # Configure logging related stuff
    log_format = config.get("log_format", logging.BASIC_FORMAT)
    log_level = getattr(logging, config.get("log_level", "INFO").upper())
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter(log_format))
    self.logger = logging.getLogger(config.get("name", type(self).__name__))
    self.logger.setLevel(log_level)
    self.logger.addHandler(console)
    self.progress = config.get("progress", False)
    self.launch_time = config.get("launch_time", time.time())
    self.logdir = config.get("logdir", None)

    # Configure seed
    self.seed = config.get("seed", self.seed)
    set_random_seed(self.seed, False)

    # Configure distribute pytorch
    self.distributed = config.get("distributed", False)
    self.rank = config.get("rank", 0)

    if self.rank == 0:
        self.logger.info(
            f"Execution order: {pformat(self.get_execution_order())}")

    if self.distributed:
        dist_url = config.get("dist_url", "tcp://127.0.0.1:54321")
        backend = config.get("backend", "nccl")
        world_size = config.get("world_size", 1)
        dist.init_process_group(
            backend=backend,
            init_method=dist_url,
            rank=self.rank,
            world_size=world_size,
        )
        # Only enable logs from first process
        self.logger.disabled = self.rank != 0
        self.progress = self.progress and self.rank == 0

    # Configure model
    self.model = self.create_model(config, self.device)
    if self.rank == 0:
        self.logger.debug(self.model)

    # Configure optimizer
    optimizer_class = config.get("optimizer_class", torch.optim.SGD)
    optimizer_args = config.get("optimizer_args", {})
    batch_norm_weight_decay = config.get("batch_norm_weight_decay", True)
    bias_weight_decay = config.get("bias_weight_decay", True)
    self.optimizer = create_optimizer(
        model=self.model,
        optimizer_class=optimizer_class,
        optimizer_args=optimizer_args,
        batch_norm_weight_decay=batch_norm_weight_decay,
        bias_weight_decay=bias_weight_decay,
    )

    # Validate mixed precision requirements
    self.mixed_precision = config.get("mixed_precision", False)
    if self.mixed_precision and amp is None:
        self.mixed_precision = False
        # Adjacent literals are concatenated verbatim, so each sentence
        # carries its own trailing separator to keep the URL readable.
        self.logger.error(
            "Mixed precision requires NVIDIA APEX. "
            "Please install apex from https://www.github.com/nvidia/apex. "
            "Disabling mixed precision training.")

    # Configure mixed precision training
    if self.mixed_precision:
        amp_args = config.get("mixed_precision_args", {})
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, **amp_args)
        self.logger.info("Using mixed precision")

    # Apply DistributedDataParallel after all other model mutations
    if self.distributed:
        self.model = DistributedDataParallel(self.model)
    else:
        self.model = DataParallel(self.model)

    self._loss_function = config.get("loss_function",
                                     torch.nn.functional.cross_entropy)

    # Configure epochs and validation schedule
    self.epochs = config.get("epochs", 1)
    self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
    self.epochs_to_validate = config.get(
        "epochs_to_validate", range(self.epochs - 3, self.epochs + 1))
    self.current_epoch = 0

    # Get initial batch size
    self.batch_size = config.get("batch_size", 1)

    # CUDA runtime does not support the fork start method.
    # See https://pytorch.org/docs/stable/notes/multiprocessing.html
    # The start method can only be set once per process; skip the call when
    # "spawn" is already in effect so repeated setups do not raise.
    if torch.cuda.is_available():
        if multiprocessing.get_start_method(allow_none=True) != "spawn":
            multiprocessing.set_start_method("spawn")

    # Configure data loaders
    self.train_loader = self.create_train_dataloader(config)
    self.val_loader = self.create_validation_dataloader(config)
    self.total_batches = len(self.train_loader)

    # Configure learning rate scheduler
    lr_scheduler_class = config.get("lr_scheduler_class", None)
    if lr_scheduler_class is not None:
        lr_scheduler_args = config.get("lr_scheduler_args", {})
        self.logger.info("LR Scheduler args:")
        self.logger.info(pformat(lr_scheduler_args))
        self.logger.info("steps_per_epoch=%s", self.total_batches)
        self.lr_scheduler = create_lr_scheduler(
            optimizer=self.optimizer,
            lr_scheduler_class=lr_scheduler_class,
            lr_scheduler_args=lr_scheduler_args,
            steps_per_epoch=self.total_batches)

    # Set train and validate methods.
    self.train_model = config.get("train_model_func", train_model)
    self.evaluate_model = config.get("evaluate_model_func", evaluate_model)
def __init__(self, config):
    """
    Set up logger, seed, CIFAR10 data loaders, the ``NoSoDenseNetCIFAR``
    model, the SGD optimizer and the learning rate scheduler.

    :param config: Dictionary with the experiment parameters. Optional keys
                   are "test_batches_in_epoch" (``sys.maxsize``),
                   "data_augmentation" (False), "block_config", "depth" and
                   "learning_scheduler_milestones"; all other keys read here
                   are required.
    """
    super(NotSoDenseExperiment, self).__init__()
    self.logger = get_logger(config["name"], config["verbose"])
    self.logger.debug("Config: %s", config)

    set_random_seed(config["seed"])

    self.batches_in_epoch = config["batches_in_epoch"]
    self.epochs = config["iterations"]
    self.batch_size = config["batch_size"]
    self.test_batch_size = config["test_batch_size"]
    self.test_batches_in_epoch = config.get("test_batches_in_epoch",
                                            sys.maxsize)
    data_dir = config["data_dir"]

    # Transforms shared by train and test data
    to_normalized_tensor = [
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ]
    # Augmentation is applied to the training data only, and only on request
    if config.get("data_augmentation", False):
        augmentation = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
        ]
    else:
        augmentation = []

    train_transform = transforms.Compose(augmentation + to_normalized_tensor)
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=train_transform)
    self.train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=self.batch_size, shuffle=True)

    test_transform = transforms.Compose(to_normalized_tensor)
    test_dataset = datasets.CIFAR10(
        root=data_dir, train=False, download=False, transform=test_transform)
    self.test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset, batch_size=self.test_batch_size, shuffle=True)

    self.model = NoSoDenseNetCIFAR(
        block_config=config.get("block_config"),
        depth=config.get("depth"),
        growth_rate=config["growth_rate"],
        reduction=config["reduction"],
        num_classes=config["num_classes"],
        bottleneck_size=config["bottleneck_size"],
        avg_pool_size=config["avg_pool_size"],
        dense_percent_on=config["dense_percent_on"],
        transition_percent_on=config["transition_percent_on"],
        classifier_percent_on=config["classifier_percent_on"],
        k_inference_factor=config["k_inference_factor"],
        boost_strength=config["boost_strength"],
        boost_strength_factor=config["boost_strength_factor"],
        duty_cycle_period=config["duty_cycle_period"],
    )
    self.logger.debug("Model: %s", self.model)

    if not torch.cuda.is_available():
        self.device = torch.device("cpu")
    else:
        self.device = torch.device("cuda")
        self.model = self.model.cuda()

    self.optimizer = torch.optim.SGD(
        self.model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        nesterov=config["nesterov"],
        weight_decay=config["weight_decay"],
    )
    self.loss_function = config["loss_function"]

    # Milestones, when given, select MultiStepLR; otherwise fall back to a
    # fixed step size
    scheduler_gamma = config["learning_scheduler_gamma"]
    if "learning_scheduler_milestones" in config:
        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
            self.optimizer,
            milestones=config["learning_scheduler_milestones"],
            gamma=scheduler_gamma)
    else:
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=config["learning_scheduler_step_size"],
            gamma=scheduler_gamma)
def run_ray(tune_config, exp_config, fix_seed=False):
    """
    Fill in the configuration defaults, initialize ray and run the experiment.

    Both dictionaries are modified in place.

    :param tune_config: Tune configuration dict. Must contain "name".
                        "config" is set to ``exp_config``; "stop" and
                        "local_dir" get defaults when absent
    :param exp_config: Experiment configuration dict. "local_dir", "name",
                       "data_dir" and "checkpoint_dir" are filled in. An
                       optional "experiment_type" selects an entry of
                       ``custom_experiments`` instead of ``base_experiment``
    :param fix_seed: When True, seed the random generators with 32
    :raises ValueError: when "experiment_type" is given but unknown
    """
    # update config
    tune_config["config"] = exp_config

    # override when running local for test
    if not torch.cuda.is_available():
        tune_config["config"]["device"] = "cpu"
        tune_config["resources_per_trial"] = {"cpu": 1}

    # move epochs to tune_config, to keep track
    if "stop" not in tune_config:
        if "epochs" in exp_config:
            tune_config["stop"] = {"training_iteration": exp_config["epochs"]}

    # expand path in dir
    if "local_dir" in tune_config:
        tune_config["local_dir"] = os.path.expanduser(tune_config["local_dir"])
    else:
        tune_config["local_dir"] = os.path.expanduser("~/nta/results")
    # saves a copy of local dir to exp config for LT experiments
    exp_config["local_dir"] = tune_config["local_dir"]

    if "data_dir" not in exp_config:
        exp_config["data_dir"] = os.path.expanduser("~/nta/datasets")
    download_dataset(exp_config)

    # set default checkpoint dir
    # temp: name and checkpoint dir in tune_config for backwards compatibility
    exp_config["name"] = tune_config["name"]
    # A directory given in tune_config wins, then one already present in
    # exp_config; only when neither exists is the default used.
    if "checkpoint_dir" in tune_config:
        checkpoint_dir = tune_config["checkpoint_dir"]
    else:
        checkpoint_dir = exp_config.get("checkpoint_dir", "~/nta/checkpoints")
    exp_config["checkpoint_dir"] = os.path.expanduser(checkpoint_dir)

    # init ray
    ray.init(load_code_from_local=True)

    # MC code to fix for an unknown bug
    def serializer(obj):
        if obj.is_cuda:
            return obj.cpu().numpy()
        else:
            return obj.numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    for t in [
        torch.FloatTensor,
        torch.DoubleTensor,
        torch.HalfTensor,
        torch.ByteTensor,
        torch.CharTensor,
        torch.ShortTensor,
        torch.IntTensor,
        torch.LongTensor,
        torch.Tensor,
    ]:
        ray.register_custom_serializer(
            t, serializer=serializer, deserializer=deserializer
        )

    # fix seed
    if fix_seed:
        set_random_seed(32)

    # allows different kind of experiments to run
    run_experiment = base_experiment
    if "experiment_type" in exp_config:
        if exp_config["experiment_type"] in custom_experiments:
            run_experiment = custom_experiments[exp_config["experiment_type"]]
        else:
            raise ValueError("Experiment type not available.")

    # run
    run_experiment(tune_config)
num_gpus = float(num_gpus / num_cpus) run_noise_test._num_gpus = num_gpus run_noise_test.num_cpus = 1 # Run experiments results = [] for exp in configs: config = configs[exp] config["name"] = exp # Make sure local directories are relative to the project location path = config.get("path", None) if path and not os.path.isabs(path): config["path"] = os.path.join(project_dir, path) data_dir = config.get("data_dir", "data") if not os.path.isabs(data_dir): config["data_dir"] = os.path.join(project_dir, data_dir) # Run each experiment in parallel results.append(run_noise_test.remote(config)) # Wait until all experiments complete ray.get(results) ray.shutdown() if __name__ == "__main__": set_random_seed(18) cli()
def setup_experiment(self, config):
    """
    Prepare logger, seed, distributed backend, model, optimizer, data
    loaders and learning rate scheduler from the given configuration.

    :param config: Dictionary containing the configuration parameters

        - distributed: Whether or not to use Pytorch Distributed training
        - backend: Pytorch Distributed backend ("nccl", "gloo")
        - world_size: Total number of processes participating
        - rank: Rank of the current process
        - data: Dataset path
        - train_dir: Dataset training data relative path
        - batch_size: Training batch size
        - val_dir: Dataset validation data relative path
        - val_batch_size: Validation batch size
        - workers: how many data loading processes to use
        - num_classes: Limit the dataset size to the given number of classes
        - model_class: Model class. Must inherit from "torch.nn.Module"
        - model_args: model class arguments passed to the constructor
        - init_batch_norm: Whether or not to Initialize running batch norm
                           mean to 0.
        - progressive_resize: Progressive resize schedule
                              dict(start_epoch: image_size)
        - dynamic_batch_size: dynamic batch size schedule.
                              dict(start_epoch: batch_size)
                              Works with progressive_resize and the
                              available GPU memory to fit as many images as
                              possible in each batch
        - optimizer_class: Optimizer class.
                           Must inherit from "torch.optim.Optimizer"
        - optimizer_args: Optimizer class arguments passed to the
                          constructor
        - batch_norm_weight_decay: Whether or not to apply weight decay to
                                   batch norm modules parameters
        - lr_scheduler_class: Learning rate scheduler class.
                              Must inherit from "_LRScheduler"
        - lr_scheduler_args: Learning rate scheduler class arguments
                             passed to the constructor
        - loss_function: Loss function. See "torch.nn.functional"
        - local_dir: Results path
        - epochs: Number of epochs to train
        - batches_in_epoch: Number of batches per epoch.
                            Useful for debugging
        - progress: Show progress during training
        - name: Experiment name. Used as logger name
        - log_level: Python Logging level
        - log_format: Python Logging format
        - seed: the seed to be used for pytorch, python, and numpy
    """
    # Logger writing to the console
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(config.get("log_format", logging.BASIC_FORMAT))
    )
    level_name = config.get("log_level", "INFO").upper()
    self.logger = logging.getLogger(config.get("name", type(self).__name__))
    self.logger.setLevel(getattr(logging, level_name))
    self.logger.addHandler(handler)
    self.progress = config.get("progress", False)

    # Random seed
    self.seed = config.get("seed", self.seed)
    set_random_seed(self.seed, False)

    # Pytorch distributed
    self.distributed = config.get("distributed", False)
    self.rank = config.get("rank", 0)
    if self.distributed:
        dist.init_process_group(
            backend=config.get("backend", "nccl"),
            init_method=config.get("dist_url", "tcp://127.0.0.1:54321"),
            rank=self.rank,
            world_size=config.get("world_size", 1),
        )

    # Model
    self.model = _create_model(
        model_class=config["model_class"],
        model_args=config.get("model_args", {}),
        init_batch_norm=config.get("init_batch_norm", False),
        distributed=self.distributed,
        device=self.device,
    )
    if self.rank == 0:
        self.logger.debug(self.model)
        total_params, nonzero_params = count_nonzero_params(self.model)
        self.logger.debug("Params total/nnz %s / %s",
                          total_params, nonzero_params)

    # Optimizer and loss
    self.optimizer = _create_optimizer(
        model=self.model,
        optimizer_class=config.get("optimizer_class", torch.optim.SGD),
        optimizer_args=config.get("optimizer_args", {}),
        batch_norm_weight_decay=config.get("batch_norm_weight_decay", True),
    )
    self.loss_function = config.get(
        "loss_function", torch.nn.functional.cross_entropy
    )

    # Settings shared by the data loaders
    self.epochs = config.get("epochs", 1)
    self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
    num_workers = config.get("workers", 0)
    dataset_root = config["data"]
    class_limit = config.get("num_classes", 1000)

    # Initial batch size, possibly replaced by the dynamic schedule below
    self.batch_size = config.get("batch_size", 1)

    batch_schedule = config.get("dynamic_batch_size", None)
    if batch_schedule is not None:
        # Schedule keys may arrive as strings; normalize them to int epochs
        self.dynamic_batch_size = {}
        for start_epoch, size in batch_schedule.items():
            self.dynamic_batch_size[int(start_epoch)] = size

        # The earliest milestone defines the initial batch size
        schedule_epochs = sorted(self.dynamic_batch_size)
        first_epoch = schedule_epochs[0]
        self.batch_size = self.dynamic_batch_size[first_epoch]

        # Scale LR proportionally to initial batch size for each milestone
        # See https://arxiv.org/pdf/1706.02677.pdf
        lr_scale = {first_epoch: 1.0}
        for start_epoch in schedule_epochs[1:]:
            lr_scale[start_epoch] = (
                self.dynamic_batch_size[start_epoch] / self.batch_size
            )

        # Chained scaled LR scheduler, called after the main scheduler
        self.scaled_lr_scheduler = ScaledLR(
            optimizer=self.optimizer,
            lr_scale=lr_scale,
        )

    # Training data loader
    self.train_loader = _create_train_dataloader(
        data_dir=dataset_root,
        train_dir=config.get("train_dir", "train"),
        batch_size=self.batch_size,
        workers=num_workers,
        distributed=self.distributed,
        progressive_resize=config.get("progressive_resize", None),
        num_classes=class_limit,
    )
    self.total_batches = len(self.train_loader)

    # Total steps required by the OneCycleLR
    if self.dynamic_batch_size is None:
        self.total_steps = self.total_batches * self.epochs
    else:
        dataset_size = len(self.train_loader.dataset)
        schedule_epochs = sorted(self.dynamic_batch_size)

        # Walk the schedule, adding the steps of every constant-batch-size
        # span. "-(-a // b)" is integer ceiling division.
        accumulated = 0
        span_start = 0
        span_steps = -(-dataset_size // self.batch_size)
        for start_epoch in schedule_epochs[1:]:
            accumulated += span_steps * (start_epoch - span_start)
            span_steps = -(-dataset_size
                           // self.dynamic_batch_size[start_epoch])
            span_start = start_epoch

        # Epochs remaining after the last milestone
        self.total_steps = accumulated + span_steps * (self.epochs - span_start)

    # Validation data loader
    self.val_loader = _create_validation_dataloader(
        data_dir=dataset_root,
        val_dir=config.get("val_dir", "val"),
        batch_size=config.get("val_batch_size", self.batch_size),
        workers=num_workers,
        num_classes=class_limit,
    )

    # Learning rate scheduler, created only when a class was configured
    scheduler_class = config.get("lr_scheduler_class", None)
    if scheduler_class is None:
        return

    scheduler_args = config.get("lr_scheduler_args", {})
    if self.rank == 0:
        self.logger.debug("LR Scheduler args:")
        self.logger.debug(pformat(scheduler_args))
    self.lr_scheduler = _create_lr_scheduler(
        optimizer=self.optimizer,
        lr_scheduler_class=scheduler_class,
        lr_scheduler_args=scheduler_args,
        total_steps=self.total_steps,
        steps_per_epoch=self.total_batches,
    )
def __init__(self, config):
    """
    Called once at the beginning of each experiment.

    Builds the sparse CNN model as a ``nn.Sequential`` (CNN layers, flatten,
    linear layers, output layer and log softmax), moves it to the GPU when
    one is available and creates the optimizer and learning rate scheduler.

    :param config: Dictionary containing the configuration parameters

        - name: Experiment name. Used as logger name
        - verbose: Passed to ``get_logger``. Default: 2
        - seed: Random seed passed to ``set_random_seed``
        - data_dir: Dataset path
        - batch_size: Training batch size
        - test_batch_size: Test batch size
        - first_epoch_batch_size: Batch size used in the first epoch
        - validation: Default: 50000.0 / 60000.0
        - learning_rate: Learning rate
        - learning_rate_factor: Learning rate factor
        - lr_scheduler: Name of the learning rate scheduler. Default: None
        - lr_scheduler_params: Default: None
        - momentum: Momentum
        - optimizer: Name of the optimizer
        - batches_in_epoch: Number of batches per epoch
        - batches_in_first_epoch: Number of batches in the first epoch
        - cnn_input_shape: (channels, height, width). Default: (1, 28, 28)
        - cnn_out_channels: Output channels, one entry per CNN layer
        - cnn_percent_on: One entry per CNN layer
        - cnn_weight_sparsity: Weight sparsity of the CNN layers
        - linear_n: Number of units, one entry per linear layer
        - linear_percent_on: One entry per linear layer
        - weight_sparsity: Weight sparsity of the linear layers
        - boost_strength, boost_strength_factor, k_inference_factor:
          Passed to every sparse layer
        - use_batch_norm: Whether the CNN layers use batch norm
        - dropout: Dropout of the linear layers. Default: 0.0
    """
    super(MNISTSparseExperiment, self).__init__()
    self.start_time = time.time()
    self.logger = get_logger(config["name"], config.get("verbose", 2))
    self.logger.debug("Config: %s", config)

    # Setup random seed
    seed = config["seed"]
    set_random_seed(seed)

    self.data_dir = config["data_dir"]
    self.batch_size = config["batch_size"]
    self.test_batch_size = config["test_batch_size"]
    self.first_epoch_batch_size = config["first_epoch_batch_size"]
    self.validation = config.get("validation", 50000.0 / 60000.0)
    self.learning_rate_factor = config["learning_rate_factor"]
    self.lr_scheduler_params = config.get("lr_scheduler_params", None)

    self._configure_dataloaders()

    # Configure Model
    cnn_input_shape = config.get("cnn_input_shape", (1, 28, 28))
    linear_n = config["linear_n"]
    linear_percent_on = config["linear_percent_on"]
    cnn_out_channels = config["cnn_out_channels"]
    cnn_percent_on = config["cnn_percent_on"]
    boost_strength = config["boost_strength"]
    weight_sparsity = config["weight_sparsity"]
    cnn_weight_sparsity = config["cnn_weight_sparsity"]
    boost_strength_factor = config["boost_strength_factor"]
    k_inference_factor = config["k_inference_factor"]
    use_batch_norm = config["use_batch_norm"]
    dropout = config.get("dropout", 0.0)

    model = nn.Sequential()

    # Add CNN Layers
    input_shape = cnn_input_shape
    for i, out_channels in enumerate(cnn_out_channels):
        # Only the width is used below: the next layer input is taken to be
        # square
        in_channels, _, width = input_shape
        add_sparse_cnn_layer(
            network=model,
            suffix=i + 1,
            in_channels=in_channels,
            out_channels=out_channels,
            use_batch_norm=use_batch_norm,
            weight_sparsity=cnn_weight_sparsity,
            percent_on=cnn_percent_on[i],
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
        )

        # Compute next layer input shape, feeding this layer output into
        # the next layer input.
        # NOTE(review): presumably a 5x5 kernel with stride 1 and no padding
        # followed by a 2x2 max pool; confirm against add_sparse_cnn_layer
        wout = (width - 5) + 1
        maxpool_width = wout // 2
        input_shape = (out_channels, maxpool_width, maxpool_width)

    # Flatten CNN output before passing to linear layer
    model.add_module("flatten", Flatten())

    # Add Linear layers
    input_size = np.prod(input_shape)
    for i, units in enumerate(linear_n):
        add_sparse_linear_layer(
            network=model,
            suffix=i + 1,
            input_size=input_size,
            linear_n=units,
            dropout=dropout,
            use_batch_norm=False,
            weight_sparsity=weight_sparsity,
            percent_on=linear_percent_on[i],
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
        )
        input_size = units

    # Output layer
    model.add_module("output", nn.Linear(input_size, 10))
    model.add_module("softmax", nn.LogSoftmax(dim=1))

    if torch.cuda.is_available():
        self.device = torch.device("cuda")
        model = model.cuda()
    else:
        self.device = torch.device("cpu")

    if torch.cuda.device_count() > 1:
        # The values are passed as lazy %-style arguments, so the message
        # needs a placeholder for each of them
        self.logger.debug("Using %s GPUs", torch.cuda.device_count())
        model = torch.nn.DataParallel(model)

    self.model = model
    self.logger.debug("Model: %s", self.model)

    # The optimizer and scheduler factories are called last, once every
    # attribute above has been assigned
    self.learning_rate = config["learning_rate"]
    self.momentum = config["momentum"]
    self.batches_in_epoch = config["batches_in_epoch"]
    self.batches_in_first_epoch = config["batches_in_first_epoch"]
    self.config = config

    self.optimizer = self._create_optimizer(name=config["optimizer"],
                                            model=self.model)
    self.lr_scheduler = self._create_learning_rate_scheduler(
        name=config.get("lr_scheduler", None), optimizer=self.optimizer)
def setup_experiment(self, config):
    """
    Configure the experiment for training

    :param config: Dictionary containing the configuration parameters

        - distributed: Whether or not to use Pytorch Distributed training
        - backend: Pytorch Distributed backend ("nccl", "gloo")
                   Default: nccl
        - world_size: Total number of processes participating
        - rank: Rank of the current process
        - data: Dataset path
        - train_dir: Dataset training data relative path
        - batch_size: Training batch size
        - val_dir: Dataset validation data relative path
        - val_batch_size: Validation batch size
        - workers: how many data loading processes to use
        - train_loader_drop_last: Whether to skip last batch if it is
                                  smaller than the batch size
        - num_classes: Limit the dataset size to the given number of classes
        - model_class: Model class. Must inherit from "torch.nn.Module"
        - model_args: model class arguments passed to the constructor
        - init_batch_norm: Whether or not to Initialize running batch norm
                           mean to 0.
        - optimizer_class: Optimizer class.
                           Must inherit from "torch.optim.Optimizer"
        - optimizer_args: Optimizer class arguments passed to the
                          constructor
        - batch_norm_weight_decay: Whether or not to apply weight decay to
                                   batch norm modules parameters
                                   See https://arxiv.org/abs/1807.11205
        - bias_weight_decay: Whether or not to apply weight decay to
                             bias parameters
        - lr_scheduler_class: Learning rate scheduler class.
                              Must inherit from "_LRScheduler"
        - lr_scheduler_args: Learning rate scheduler class arguments
                             passed to the constructor
        - lr_scheduler_step_every_batch: Whether to step the lr-scheduler
                                         after every batch
                                         (e.g. for OneCycleLR)
        - loss_function: Loss function. See "torch.nn.functional"
        - local_dir: Results path
        - logdir: Directory generated by Ray Tune for this Trial
        - epochs: Number of epochs to train
        - batches_in_epoch: Number of batches per epoch.
                            Useful for debugging
        - log_timestep_freq: Configures mixins and subclasses that log every
                             timestep to only log every nth timestep (in
                             addition to the final timestep of each epoch).
                             Set to 0 to log only at the end of each epoch.
        - progress: Show progress during training
        - name: Experiment name. Used as logger name
        - log_level: Python Logging level
        - log_format: Python Logging format
        - seed: the seed to be used for pytorch, python, and numpy
        - mixed_precision: Whether or not to enable apex mixed precision
        - mixed_precision_args: apex mixed precision arguments.
                                See "amp.initialize"
        - sample_transform: Transform acting on the training samples. To be
                            used additively after default transform or
                            auto-augment.
        - target_transform: Transform acting on the training targets.
        - replicas_per_sample: Number of replicas to create per sample in
                               the batch. (each replica is transformed
                               independently) Used in maxup.
        - train_model_func: Optional user defined function to train the
                            model, expected to behave similarly to
                            `train_model` in terms of input parameters and
                            return values
        - evaluate_model_func: Optional user defined function to validate
                               the model expected to behave similarly to
                               `evaluate_model` in terms of input parameters
                               and return values
        - checkpoint_file: if not None, will start from this model. The
                           model must have the same model_args and
                           model_class as the current experiment.
        - load_checkpoint_args: args to be passed to
                                `load_state_from_checkpoint`
        - checkpoint_at_init: boolean argument for whether to create a
                              checkpoint of the initialized model. this
                              differs from `checkpoint_at_start` for which
                              the checkpoint occurs after the first epoch of
                              training as opposed to before it
        - epochs_to_validate: list of epochs to run validate(). A -1 asks
                              to run validate before any training occurs.
                              Default: last three epochs.
        - extra_validations_per_epoch: number of additional validations to
                                       perform mid-epoch. Additional
                                       validations are distributed evenly
                                       across training batches.
        - launch_time: time the config was created (via time.time). Used to
                       report wall clock time until the first batch is done.
                       Default: time.time() in this setup_experiment().
    """
    # Configure logging related stuff
    log_format = config.get("log_format", logging.BASIC_FORMAT)
    log_level = getattr(logging, config.get("log_level", "INFO").upper())
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter(log_format))
    self.logger = logging.getLogger(config.get("name", type(self).__name__))
    self.logger.setLevel(log_level)
    self.logger.addHandler(console)
    self.progress = config.get("progress", False)
    self.launch_time = config.get("launch_time", time.time())
    self.logdir = config.get("logdir", None)

    # Configure seed
    self.seed = config.get("seed", self.seed)
    set_random_seed(self.seed, False)

    # Configure distribute pytorch
    self.distributed = config.get("distributed", False)
    self.rank = config.get("rank", 0)
    if self.rank == 0:
        self.logger.info(
            f"Execution order: {pformat(self.get_execution_order())}")

    if self.distributed:
        dist_url = config.get("dist_url", "tcp://127.0.0.1:54321")
        backend = config.get("backend", "nccl")
        world_size = config.get("world_size", 1)
        dist.init_process_group(
            backend=backend,
            init_method=dist_url,
            rank=self.rank,
            world_size=world_size,
        )
        # Only enable logs from first process
        self.logger.disabled = self.rank != 0
        self.progress = self.progress and self.rank == 0

    # Configure model
    self.device = config.get("device", self.device)
    self.model = self.create_model(config, self.device)
    self.transform_model()

    if self.rank == 0:
        self.logger.debug(self.model)

    # Configure optimizer. Parameters are split in two groups so weight
    # decay can be switched off for the ones that should not decay.
    group_decay, group_no_decay = [], []
    for module in self.model.modules():
        # recurse=False: every parameter is visited exactly once, together
        # with the module that directly owns it
        for name, param in module.named_parameters(recurse=False):
            if self.should_decay_parameter(module, name, param, config):
                group_decay.append(param)
            else:
                group_no_decay.append(param)

    optimizer_class = config.get("optimizer_class", torch.optim.SGD)
    optimizer_args = config.get("optimizer_args", {})
    self.optimizer = optimizer_class([dict(params=group_decay),
                                      dict(params=group_no_decay,
                                           weight_decay=0.)],
                                     **optimizer_args)

    # Validate mixed precision requirements
    self.mixed_precision = config.get("mixed_precision", False)
    if self.mixed_precision and amp is None:
        self.mixed_precision = False
        # Adjacent literals are concatenated verbatim, so every piece but
        # the last must end with a space. The URL is kept at the very end
        # of the message to keep it easy to copy.
        self.logger.error(
            "Mixed precision requires NVIDIA APEX. "
            "Disabling mixed precision training. "
            "Please install apex from https://www.github.com/nvidia/apex")

    # Configure mixed precision training
    if self.mixed_precision:
        amp_args = config.get("mixed_precision_args", {})
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, **amp_args)
        self.logger.info("Using mixed precision")

    # Apply DistributedDataParallel after all other model mutations
    if self.distributed:
        self.model = DistributedDataParallel(self.model)
    else:
        self.model = DataParallel(self.model)

    self._loss_function = config.get(
        "loss_function", torch.nn.functional.cross_entropy
    )

    self.num_classes = config.get("num_classes", 1000)
    self.epochs = config.get("epochs", 1)
    self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
    self.current_epoch = 0

    # Get initial batch size
    self.batch_size = config.get("batch_size", 1)

    # CUDA runtime does not support the fork start method.
    # See https://pytorch.org/docs/stable/notes/multiprocessing.html
    multiprocessing.set_start_method("spawn", force=True)

    # Configure data loaders
    self.train_loader = self.create_train_dataloader(config)
    self.val_loader = self.create_validation_dataloader(config)
    self.total_batches = len(self.train_loader)

    self.epochs_to_validate = config.get("epochs_to_validate",
                                         range(self.epochs - 3,
                                               self.epochs + 1))

    # Spread the validation points evenly over the batches of one epoch.
    # The last entry is the end of the epoch itself, which is not an
    # "additional" validation and is therefore dropped below.
    extra_validations = config.get("extra_validations_per_epoch", 0)
    batches_to_validate = np.linspace(
        min(self.total_batches, self.batches_in_epoch),
        0,
        1 + extra_validations,
        endpoint=False
    )[::-1].round().astype("int").tolist()
    self.additional_batches_to_validate = batches_to_validate[:-1]
    if extra_validations > 0:
        self.logger.info(
            f"Extra validations per epoch: {extra_validations}, "
            f"batch indices: {self.additional_batches_to_validate}")

    # Used for logging. Conceptually, it is a version number for the model's
    # parameters. By default, this is the elapsed number of batches that the
    # model has been trained on. Experiments may also increment this on
    # other events like model prunings. When validation is performed after a
    # training batch, the validation results are assigned to the next
    # timestep after that training batch, since it was performed on the
    # subsequent version of the parameters.
    self.current_timestep = 0
    self.log_timestep_freq = config.get("log_timestep_freq", 1)

    # A list of [(timestep, result), ...] for the current epoch.
    self.extra_val_results = []

    # Configure learning rate scheduler
    self.lr_scheduler = self.create_lr_scheduler(
        config, self.optimizer, self.total_batches)
    if self.lr_scheduler is not None:
        lr_scheduler_class = self.lr_scheduler.__class__.__name__
        lr_scheduler_args = config.get("lr_scheduler_args", {})
        self.logger.info("LR Scheduler class: " + lr_scheduler_class)
        self.logger.info("LR Scheduler args:")
        self.logger.info(pformat(lr_scheduler_args))
        self.logger.info("steps_per_epoch=%s", self.total_batches)

    # These scheduler types are always stepped after every batch, whatever
    # the configuration says
    self.step_lr_every_batch = config.get("lr_scheduler_step_every_batch",
                                          False)
    if isinstance(self.lr_scheduler, (OneCycleLR, ComposedLRScheduler)):
        self.step_lr_every_batch = True

    # Set train and validate methods.
    self.train_model = config.get("train_model_func", train_model)
    self.evaluate_model = config.get("evaluate_model_func", evaluate_model)
import ray import torch from nupic.research.archive.dynamic_sparse.common.ray_custom_loggers import ( DEFAULT_LOGGERS, ) from nupic.research.archive.dynamic_sparse.common.utils import ( Trainable, new_experiment, run_experiment, ) from nupic.research.frameworks.pytorch.model_utils import set_random_seed # Set seed for `random`, `numpy`, and `pytorch`. set_random_seed(32) def serializer(obj): if obj.is_cuda: return obj.cpu().numpy() else: return obj.numpy() def deserializer(serialized_obj): return serialized_obj # experiment configurations base_exp_config = dict(
def setup_experiment(self, config):
    """
    Configure the experiment for training

    :param config: Dictionary containing the configuration parameters

        - distributed: Whether or not to use Pytorch Distributed training
        - backend: Pytorch Distributed backend ("nccl", "gloo")
        - world_size: Total number of processes participating
        - rank: Rank of the current process
        - data: Dataset path
        - train_dir: Dataset training data relative path
        - batch_size: Training batch size
        - val_dir: Dataset validation data relative path
        - val_batch_size: Validation batch size
        - workers: how many data loading processes to use
        - num_classes: Limit the dataset size to the given number of classes
        - use_auto_augment: Passed to the training data loader.
                            Default: False
        - model_class: Model class. Must inherit from "torch.nn.Module"
        - model_args: model class arguments passed to the constructor
        - init_batch_norm: Whether or not to Initialize running batch norm
                           mean to 0.
        - optimizer_class: Optimizer class.
                           Must inherit from "torch.optim.Optimizer"
        - optimizer_args: Optimizer class arguments passed to the
                          constructor
        - batch_norm_weight_decay: Whether or not to apply weight decay to
                                   batch norm modules parameters
        - lr_scheduler_class: Learning rate scheduler class.
                              Must inherit from "_LRScheduler"
        - lr_scheduler_args: Learning rate scheduler class arguments
                             passed to the constructor
        - loss_function: Loss function. See "torch.nn.functional"
        - local_dir: Results path
        - epochs: Number of epochs to train
        - batches_in_epoch: Number of batches per epoch.
                            Useful for debugging
        - progress: Show progress during training
        - profile: Whether or not to enable torch.autograd.profiler.profile
                   during training
        - name: Experiment name. Used as logger name
        - log_level: Python Logging level
        - log_format: Python Logging format
        - seed: the seed to be used for pytorch, python, and numpy
        - mixed_precision: Whether or not to enable apex mixed precision
        - mixed_precision_args: apex mixed precision arguments.
                                See "amp.initialize"
        - init_hooks: list of hooks (functions) to call on the model
                      just following its initialization
        - post_epoch_hooks: list of hooks (functions) to call on the model
                            following each epoch of training
        - checkpoint_file: if not None, will start from this model. The
                           model must have the same model_args and
                           model_class as the current experiment.
        - validate_after_epoch: will only run validate after this epoch.
                                Default: epochs - 3
    """
    # Configure logger
    log_format = config.get("log_format", logging.BASIC_FORMAT)
    log_level = getattr(logging, config.get("log_level", "INFO").upper())
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter(log_format))
    self.logger = logging.getLogger(config.get("name", type(self).__name__))
    self.logger.setLevel(log_level)
    self.logger.addHandler(console)
    self.progress = config.get("progress", False)

    # Configure seed
    self.seed = config.get("seed", self.seed)
    set_random_seed(self.seed, False)

    # Configure distribute pytorch
    self.distributed = config.get("distributed", False)
    self.rank = config.get("rank", 0)
    if self.distributed:
        dist_url = config.get("dist_url", "tcp://127.0.0.1:54321")
        backend = config.get("backend", "nccl")
        world_size = config.get("world_size", 1)
        dist.init_process_group(
            backend=backend,
            init_method=dist_url,
            rank=self.rank,
            world_size=world_size,
        )
        # Only enable logs from first process
        self.logger.disabled = self.rank != 0
        self.progress = self.progress and self.rank == 0

    # Configure model
    model_class = config["model_class"]
    model_args = config.get("model_args", {})
    init_batch_norm = config.get("init_batch_norm", False)
    init_hooks = config.get("init_hooks", None)
    self.model = create_model(model_class=model_class,
                              model_args=model_args,
                              init_batch_norm=init_batch_norm,
                              device=self.device,
                              init_hooks=init_hooks,
                              checkpoint_file=config.get(
                                  "checkpoint_file", None))
    if self.rank == 0:
        self.logger.debug(self.model)
        params_sparse, nonzero_params_sparse2 = count_nonzero_params(
            self.model)
        self.logger.debug("Params total/nnz %s / %s = %s ", params_sparse,
                          nonzero_params_sparse2,
                          float(nonzero_params_sparse2) / params_sparse)

    # Configure optimizer
    optimizer_class = config.get("optimizer_class", torch.optim.SGD)
    optimizer_args = config.get("optimizer_args", {})
    batch_norm_weight_decay = config.get("batch_norm_weight_decay", True)
    self.optimizer = create_optimizer(
        model=self.model,
        optimizer_class=optimizer_class,
        optimizer_args=optimizer_args,
        batch_norm_weight_decay=batch_norm_weight_decay,
    )

    # Validate mixed precision requirements
    self.mixed_precision = config.get("mixed_precision", False)
    if self.mixed_precision and amp is None:
        self.mixed_precision = False
        # Adjacent literals are concatenated verbatim, so every piece but
        # the last must end with a space. The URL is kept at the very end
        # of the message to keep it easy to copy.
        self.logger.error(
            "Mixed precision requires NVIDIA APEX. "
            "Disabling mixed precision training. "
            "Please install apex from https://www.github.com/nvidia/apex")

    # Configure mixed precision training
    if self.mixed_precision:
        amp_args = config.get("mixed_precision_args", {})
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, **amp_args)
        self.logger.info("Using mixed precision")

    # Apply DistributedDataParallel after all other model mutations
    if self.distributed:
        self.model = DistributedDataParallel(self.model)
    else:
        self.model = DataParallel(self.model)

    self.loss_function = config.get("loss_function",
                                    torch.nn.functional.cross_entropy)

    # Configure data loaders
    self.epochs = config.get("epochs", 1)
    self.batches_in_epoch = config.get("batches_in_epoch", sys.maxsize)
    self.validate_after_epoch = config.get("validate_after_epoch",
                                           self.epochs - 3)
    workers = config.get("workers", 0)
    data_dir = config["data"]
    train_dir = config.get("train_dir", "train")
    num_classes = config.get("num_classes", 1000)

    # Get initial batch size
    self.batch_size = config.get("batch_size", 1)

    # CUDA runtime does not support the fork start method.
    # See https://pytorch.org/docs/stable/notes/multiprocessing.html
    # force=True: without it the call raises RuntimeError whenever the
    # start method was already fixed in this process, e.g. when a second
    # experiment is set up.
    if torch.cuda.is_available():
        multiprocessing.set_start_method("spawn", force=True)

    # Configure Training data loader
    self.train_loader = create_train_dataloader(
        data_dir=data_dir,
        train_dir=train_dir,
        batch_size=self.batch_size,
        workers=workers,
        distributed=self.distributed,
        num_classes=num_classes,
        use_auto_augment=config.get("use_auto_augment", False),
    )
    self.total_batches = len(self.train_loader)

    # Configure Validation data loader
    val_dir = config.get("val_dir", "val")
    val_batch_size = config.get("val_batch_size", self.batch_size)
    self.val_loader = create_validation_dataloader(
        data_dir=data_dir,
        val_dir=val_dir,
        batch_size=val_batch_size,
        workers=workers,
        num_classes=num_classes,
    )

    # Configure learning rate scheduler
    lr_scheduler_class = config.get("lr_scheduler_class", None)
    if lr_scheduler_class is not None:
        lr_scheduler_args = config.get("lr_scheduler_args", {})
        self.logger.debug("LR Scheduler args:")
        self.logger.debug(pformat(lr_scheduler_args))
        self.lr_scheduler = create_lr_scheduler(
            optimizer=self.optimizer,
            lr_scheduler_class=lr_scheduler_class,
            lr_scheduler_args=lr_scheduler_args,
            steps_per_epoch=self.total_batches)

    # Only profile from rank 0
    self.profile = config.get("profile", False) and self.rank == 0

    # Register post-epoch hooks. To be used as
    # `self.model.apply(post_epoch_hook)`
    self.post_epoch_hooks = config.get("post_epoch_hooks", [])