def test_createall(self):
    """Instantiate every model declared in every model config file, skipping WIP models."""
    for config_path in self.model_type_files:
        # The task name is the parent directory of the config file.
        task = config_path.split("/")[-2]
        cfg = OmegaConf.load(config_path)
        cfg = OmegaConf.merge(cfg, self.data_config)
        cfg.update("data.task", task)
        for model_name in cfg.models.keys():
            with self.subTest(model_name):
                if model_name in ["MinkUNet_WIP"]:
                    continue
                cfg.update("model_name", model_name)
                instantiate_model(cfg, MockDatasetGeometric(6))
def validate(self, dataset_config):
    """A checkpoint is considered as valid if it can recreate the model from a dataset config only.

    Merges ``dataset_config`` entries into ``self.data_config`` (mutating it),
    then attempts to rebuild the model from the stored run config.

    Returns:
        bool: True when model instantiation succeeds, False otherwise.
    """
    if dataset_config is not None:
        for k, v in dataset_config.items():
            self.data_config[k] = v
    try:
        instantiate_model(self.run_config, self.data_config)
    # Narrowed from a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit; instantiation failures are Exceptions.
    except Exception:
        return False
    return True
def test_accumulated_gradient(self):
    """Check gradient accumulation: the optimizer steps only every 3rd epoch and
    gradient buffers are zeroed right after each step."""
    params = load_model_config("segmentation", "pointnet2", "pointnet2ms")
    config_training = OmegaConf.load(os.path.join(DIR, "test_config/training_config.yaml"))
    dataset = MockDatasetGeometric(5)
    model = instantiate_model(params, dataset)
    model.instantiate_optimizers(config_training)
    model.set_input(dataset[0], "cpu")
    # With the accumulation setting in training_config.yaml the optimizer is
    # expected to step on epochs 2, 5 and 8 (0-indexed) — presumably an
    # accumulation factor of 3; verify against the yaml fixture.
    expected_make_optimizer_step = [False, False, True, False, False, True, False, False, True, False]
    # Gradients should be present while accumulating and absent (zeroed) on the
    # epoch immediately after a step (and before the first backward).
    expected_contains_grads = [False, True, True, False, True, True, False, True, True, False]
    make_optimizer_steps = []
    contains_grads = []
    for epoch in range(10):
        model.forward()
        make_optimizer_step = model._manage_optimizer_zero_grad()  # Accumulate gradient if option is up
        make_optimizer_steps.append(make_optimizer_step)
        # Inspect one representative parameter's gradient buffer.
        grad_ = model._modules["lin1"].weight.grad
        if grad_ is not None:
            contains_grads.append((grad_.sum() != 0).item())
        else:
            contains_grads.append(False)
        model.backward()  # calculate gradients
        if make_optimizer_step:
            model._optimizer.step()  # update parameters
    self.assertEqual(contains_grads, expected_contains_grads)
    self.assertEqual(make_optimizer_steps, expected_make_optimizer_step)
def test_runall(self):
    """Run forward + backward for every model config of every task, using a mock
    dataset matched to the model's conv_type; models known to fail are skipped."""

    def is_known_to_fail(model_name):
        # Substring match (case-insensitive) against a denylist of models whose
        # forward pass is known to be broken in this environment.
        forward_failing = ["MinkUNet_WIP", "pointcnn", "RSConv_4LD", "RSConv_2LD", "randlanet"]
        for failing in forward_failing:
            if failing.lower() in model_name.lower():
                return True
        return False

    def get_dataset(conv_type, task):
        # Build a mock dataset whose layout matches the task (pair datasets for
        # registration) and the model's convolution type.
        features = 2
        if task == "registration":
            if conv_type.lower() == "dense":
                return PairMockDataset(features, num_points=2048)
            if conv_type.lower() == "sparse":
                # Sparse models need quantized coordinates plus XYZ features.
                tr = Compose([XYZFeature(True, True, True), GridSampling(size=0.01, quantize_coords=True, mode="last")])
                return PairMockDatasetGeometric(features, transform=tr, num_points=1024)
            return PairMockDatasetGeometric(features)
        else:
            if conv_type.lower() == "dense":
                return MockDataset(features, num_points=2048)
            if conv_type.lower() == "sparse":
                return MockDatasetGeometric(
                    features,
                    transform=GridSampling(size=0.01, quantize_coords=True, mode="last"),
                    num_points=1024,
                )
            return MockDatasetGeometric(features)

    for type_file in self.model_type_files:
        # The task name is the parent directory of the config file.
        associated_task = type_file.split("/")[-2]
        models_config = OmegaConf.load(type_file)
        models_config = OmegaConf.merge(models_config, self.data_config)
        models_config.update("data.task", associated_task)
        for model_name in models_config.models.keys():
            with self.subTest(model_name):
                if not is_known_to_fail(model_name):
                    models_config.update("model_name", model_name)
                    dataset = get_dataset(models_config.models[model_name].conv_type, associated_task)
                    model = instantiate_model(models_config, dataset)
                    model.set_input(dataset[0], device)
                    try:
                        model.forward()
                        model.backward()
                    except Exception as e:
                        # Dump the failing model before re-raising for easier debugging.
                        print("Model failing:")
                        print(model)
                        raise e
def test_pointnet2ms(self):
    """Smoke-test the multi-scale pointnet2 segmentation model: build, forward, backward."""
    model_config = load_model_config("segmentation", "pointnet2", "pointnet2ms")
    mock_data = MockDatasetGeometric(5)
    net = instantiate_model(model_config, mock_data)
    net.set_input(mock_data[0], device)
    net.forward()
    net.backward()
def main(cfg): OmegaConf.set_struct(cfg, False) # Get device device = torch.device("cuda" if (torch.cuda.is_available() and cfg.training.cuda) else "cpu") log.info("DEVICE : {}".format(device)) # Enable CUDNN BACKEND torch.backends.cudnn.enabled = cfg.training.enable_cudnn # Checkpoint checkpoint = ModelCheckpoint(cfg.training.checkpoint_dir, cfg.model_name, cfg.training.weight_name, strict=True) # Setup the dataset config # Generic config dataset = instantiate_dataset(cfg.data) if not checkpoint.is_empty: model = checkpoint.create_model(dataset, weight_name=cfg.training.weight_name) else: log.info("No Checkpoint for this model") model = instantiate_model(copy.deepcopy(cfg), dataset) model.set_pretrained_weights() log.info(model) log.info("Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad)) log.info(dataset) model.eval() if cfg.enable_dropout: model.enable_dropout_in_eval() model = model.to(device) run(model, dataset, device, cfg)
def test_model_ckpt_using_pointnet2ms(self,):
    """Save a checkpoint for pointnet2ms, reload it, and verify the optimizer and
    both schedulers are restored identically."""
    # Create a checkpt
    self.run_path = os.path.join(DIR, "checkpt")
    if not os.path.exists(self.run_path):
        os.makedirs(self.run_path)
    model_checkpoint = ModelCheckpoint(self.run_path, self.model_name, "test", run_config=self.config, resume=False)
    dataset = MockDatasetGeometric(5)
    model = instantiate_model(self.config, dataset)
    model.set_input(dataset[0], "cpu")
    model.instantiate_optimizers(self.config)
    # Fake tracker output so the checkpoint saves a "best acc" model.
    mock_metrics = {"current_metrics": {"acc": 12}, "stage": "test", "epoch": 10}
    model_checkpoint.save_best_models_under_current_metrics(model, mock_metrics)

    # Load checkpoint and initialize model
    model_checkpoint = ModelCheckpoint(self.run_path, self.model_name, "test", self.config, resume=True)
    model2 = model_checkpoint.create_model(dataset, weight_name="acc")

    self.assertEqual(str(model.optimizer.__class__.__name__), str(model2.optimizer.__class__.__name__))
    self.assertEqual(model.optimizer.defaults, model2.optimizer.defaults)
    self.assertEqual(model.schedulers["lr_scheduler"].state_dict(), model2.schedulers["lr_scheduler"].state_dict())
    self.assertEqual(model.schedulers["bn_scheduler"].state_dict(), model2.schedulers["bn_scheduler"].state_dict())

    # Clean up the .pt files the checkpoint machinery leaves behind.
    remove(os.path.join(ROOT, "{}.pt".format(self.model_name)))
    remove(os.path.join(DIR, "{}.pt".format(self.model_name)))
def test_runall(self):
    """Instantiate and run forward + backward for every model config of every task,
    then check that (almost) all parameters receive a non-zero gradient."""

    def is_known_to_fail(model_name):
        # Substring match (case-insensitive) against a denylist of models whose
        # forward pass is known to be broken; Minkowski models are added when
        # the MinkowskiEngine dependency is unavailable.
        forward_failing = [
            "MinkUNet_WIP",
            "pointcnn",
            "RSConv_4LD",
            "RSConv_2LD",
            "randlanet",
            "ResUNet32",
            "Res16UNet34",
        ]
        if not HAS_MINKOWSKI:
            forward_failing += ["Res16", "MinkUNet", "ResUNetBN2B"]
        for failing in forward_failing:
            if failing.lower() in model_name.lower():
                return True
        return False

    for type_file in self.model_type_files:
        # The task name is the parent directory of the config file
        # (os.path.normpath keeps this portable across path separators).
        associated_task = os.path.normpath(type_file).split(os.path.sep)[-2]
        models_config = OmegaConf.load(type_file)
        models_config = OmegaConf.merge(models_config, self.data_config)
        models_config.update("data.task", associated_task)
        models_config.update("data.grid_size", 0.05)
        for model_name in models_config.models.keys():
            with self.subTest(model_name):
                if not is_known_to_fail(model_name):
                    models_config.update("model_name", model_name)
                    dataset = get_dataset(models_config.models[model_name].conv_type, associated_task)
                    try:
                        model = instantiate_model(models_config, dataset)
                    except Exception as e:
                        # Surface the full config that failed to instantiate.
                        print(e)
                        raise Exception(models_config)
                    model.set_input(dataset[0], device)
                    try:
                        model.forward()
                        model.backward()
                    except Exception as e:
                        print("Forward or backward failing")
                        raise e
                    try:
                        # Some models legitimately have zero-grad parameters.
                        if has_zero_grad(model_name):
                            ratio = 1
                        else:
                            ratio = test_hasgrad(model)
                        if ratio < 1:
                            print(
                                "Model %s.%s.%s has %i%% of parameters with 0 gradient"
                                % (associated_task, type_file.split("/")[-1][:-5], model_name, 100 * ratio)
                            )
                    except Exception as e:
                        print("Model with zero gradient %s: %s" % (type_file, model_name))
                        raise e
def test_one_model(self):
    """Debug helper: exercise a single hand-picked model end to end."""
    # Use this test to test any model when debugging
    cfg = load_model_config("object_detection", "votenet2", "VoteNetRSConvSmall")
    mock_dataset = get_dataset("dense", "object_detection")
    net = instantiate_model(cfg, mock_dataset)
    net.set_input(mock_dataset[0], device)
    net.forward()
    net.backward()
def create_model(self, dataset, weight_name=Checkpoint._LATEST):
    """Rebuild the model from the stored run config and load the requested weights.

    Raises:
        ValueError: when the checkpoint holds no data.
    """
    if self.is_empty:
        raise ValueError("Checkpoint is empty")
    # deepcopy so model construction cannot mutate the stored config.
    config_copy = copy.deepcopy(self._checkpoint.run_config)
    model = instantiate_model(config_copy, dataset)
    self._initialize_model(model, weight_name)
    return model
def re_instantiate_model(self, dataset):
    """Gather used_properties from the dataset into the saved data config, then
    recreate the model directly from the checkpoint's run configuration."""
    properties = dataset.used_properties
    if properties is not None:
        for prop_key, prop_value in properties.items():
            self.data_config[prop_key] = prop_value
    return instantiate_model(self.run_config, self.data_config)
def test_largekpconv(self):
    """Smoke-test the KPConvPaper segmentation model with category input enabled."""
    cfg = load_model_config("segmentation", "kpconv", "KPConvPaper")
    cfg.update("data.use_category", True)
    cfg.update("data.first_subsampling", 0.02)
    mock_data = MockDatasetGeometric(5)
    net = instantiate_model(cfg, mock_data)
    net.set_input(mock_data[0], device)
    net.forward()
    net.backward()
def test_one_model(self):
    """Debug helper: exercise one hand-picked model end to end."""
    # Use this test to test any model when debugging
    cfg = load_model_config("segmentation", "kpconv", "KPConvPaper")
    cfg.update("data.first_subsampling", 0.05)
    cfg.update("data.use_category", False)
    mock_dataset = get_dataset("partial_dense", "segmentation")
    net = instantiate_model(cfg, mock_dataset)
    print(net)
    net.set_input(mock_dataset[0], device)
    net.forward()
    net.backward()
def test_kpconvpretransform(self):
    """Check that applying the model's pre-transform strategies to a dataset
    leaves point positions identical to the untouched dataset."""
    cfg = load_model_config("segmentation", "kpconv", "SimpleKPConv")
    plain_dataset = MockDatasetGeometric(5)
    net = instantiate_model(cfg, plain_dataset)
    net.eval()
    strategized_dataset = MockDatasetGeometric(5)
    strategized_dataset.set_strategies(net)
    net.set_input(plain_dataset[0], device)
    net.forward()
    net.get_output()
    torch.testing.assert_allclose(strategized_dataset[0].pos, plain_dataset[0].pos)
def test_largekpconv(self):
    """Run KPConvPaper forward/backward and report parameters left with zero gradient."""
    cfg = load_model_config("segmentation", "kpconv", "KPConvPaper")
    cfg.update("data.use_category", True)
    cfg.update("data.first_subsampling", 0.02)
    mock_data = MockDatasetGeometric(5)
    net = instantiate_model(cfg, mock_data)
    net.set_input(mock_data[0], device)
    net.forward()
    net.backward()
    grad_ratio = test_hasgrad(net)
    if grad_ratio < 1:
        print("Model segmentation.kpconv.KPConvPaper has %i%% of parameters with 0 gradient" % (100 * grad_ratio))
def create_model(self, dataset, weight_name=Checkpoint._LATEST):
    """Rebuild the model from the stored run config, re-apply any extra model
    properties carried by the checkpoint, and load the requested weights.

    Raises:
        ValueError: when the checkpoint holds no data.
    """
    if self.is_empty:
        raise ValueError("Checkpoint is empty")
    # deepcopy so model construction cannot mutate the stored config.
    config_copy = copy.deepcopy(self._checkpoint.run_config)
    model = instantiate_model(config_copy, dataset)
    # Transfer any stashed model properties onto the fresh model, then remove
    # them from the checkpoint so they are only applied once.
    if hasattr(self._checkpoint, "model_props"):
        for prop_name, prop_value in self._checkpoint.model_props.items():
            setattr(model, prop_name, prop_value)
        delattr(self._checkpoint, "model_props")
    self._initialize_model(model, weight_name)
    return model
def test_pointnet2ms(self):
    """Run pointnet2_largemsg forward/backward and report parameters left with zero gradient."""
    cfg = load_model_config("segmentation", "pointnet2", "pointnet2_largemsg")
    cfg.update("data.use_category", True)
    mock_data = MockDataset(5, num_points=2048)
    net = instantiate_model(cfg, mock_data)
    net.set_input(mock_data[0], device)
    net.forward()
    net.backward()
    grad_ratio = test_hasgrad(net)
    if grad_ratio < 1:
        print(
            "Model segmentation.pointnet2.pointnet2_largemsgs has %i%% of parameters with 0 gradient"
            % (100 * grad_ratio)
        )
def test_siamese_minkowski(self):
    """Smoke-test the MinkUNet_Fragment registration model on a paired sparse dataset."""
    cfg = load_model_config("registration", "minkowski", "MinkUNet_Fragment")
    # Sparse models need quantized coordinates plus XYZ features.
    pair_transform = Compose([
        XYZFeature(True, True, True),
        GridSampling(size=0.01, quantize_coords=True, mode="last"),
    ])
    pair_dataset = PairMockDatasetGeometric(5, transform=pair_transform, num_points=1024, is_pair_ind=True)
    net = instantiate_model(cfg, pair_dataset)
    sample = pair_dataset[0]
    net.set_input(sample, device)
    net.forward()
    net.backward()
def test_siamese_minkowski(self):
    """Run MinkUNet_Fragment forward/backward on a paired sparse dataset and
    report parameters left with zero gradient."""
    cfg = load_model_config("registration", "minkowski", "MinkUNet_Fragment")
    # Sparse models need quantized coordinates plus XYZ features.
    pair_transform = Compose([
        XYZFeature(True, True, True),
        GridSampling3D(size=0.01, quantize_coords=True, mode="last"),
    ])
    pair_dataset = PairMockDatasetGeometric(5, transform=pair_transform, num_points=1024, is_pair_ind=True)
    net = instantiate_model(cfg, pair_dataset)
    sample = pair_dataset[0]
    net.set_input(sample, device)
    net.forward()
    net.backward()
    grad_ratio = test_hasgrad(net)
    if grad_ratio < 1:
        print(
            "Model registration.minkowski.MinkUNet_Fragment has %i%% of parameters with 0 gradient"
            % (100 * grad_ratio)
        )
def main(cfg): OmegaConf.set_struct( cfg, False) # This allows getattr and hasattr methods to function correctly if cfg.pretty_print: print(cfg.pretty()) set_debugging_vars_to_global(cfg.debugging) # Get device device = torch.device("cuda" if ( torch.cuda.is_available() and cfg.training.cuda) else "cpu") log.info("DEVICE : {}".format(device)) # Enable CUDNN BACKEND torch.backends.cudnn.enabled = cfg.training.enable_cudnn dataset = instantiate_dataset(cfg.data) model = instantiate_model(cfg, dataset) log.info(model) log.info( "Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad)) # Set dataloaders dataset.create_dataloaders( model, cfg.training.batch_size, cfg.training.shuffle, cfg.training.num_workers, cfg.training.precompute_multi_scale, ) log.info(dataset) # Run training / evaluation model = model.to(device) measurement_name = "{}_{}".format(cfg.model_name, dataset.__class__.__name__) run(cfg, model, dataset, device, measurement_name)
def _initialize_trainer(self):
    """Build everything the trainer needs: device, checkpoint, dataset, model,
    optimizers, dataloaders, tracker and (optionally) wandb/visualizer."""
    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = self.enable_cudnn

    if not self.has_training:
        # Legacy configs without a `training` section: treat the whole config
        # as the training config.
        self._cfg.training = self._cfg
        resume = bool(self._cfg.checkpoint_dir)
    else:
        resume = bool(self._cfg.training.checkpoint_dir)

    # Get device (cfg.training.cuda is a device index; negative means CPU)
    if self._cfg.training.cuda > -1 and torch.cuda.is_available():
        device = "cuda"
        torch.cuda.set_device(self._cfg.training.cuda)
    else:
        device = "cpu"
    self._device = torch.device(device)
    log.info("DEVICE : {}".format(self._device))

    # Profiling
    if self.profiling:
        # Set the num_workers as torch.utils.bottleneck doesn't work well with it
        self._cfg.training.num_workers = 0

    # Start Wandb if public
    if self.wandb_log:
        Wandb.launch(self._cfg, self._cfg.wandb.public and self.wandb_log)

    # Checkpoint
    self._checkpoint: ModelCheckpoint = ModelCheckpoint(
        self._cfg.training.checkpoint_dir,
        self._cfg.model_name,
        self._cfg.training.weight_name,
        run_config=self._cfg,
        resume=resume,
    )

    # Create model and datasets: restore both from the checkpoint when one
    # exists, otherwise build them fresh from the config.
    if not self._checkpoint.is_empty:
        self._dataset: BaseDataset = instantiate_dataset(self._checkpoint.data_config)
        self._model: BaseModel = self._checkpoint.create_model(
            self._dataset, weight_name=self._cfg.training.weight_name
        )
    else:
        self._dataset: BaseDataset = instantiate_dataset(self._cfg.data)
        # deepcopy so model construction cannot mutate the shared config.
        self._model: BaseModel = instantiate_model(copy.deepcopy(self._cfg), self._dataset)
        self._model.instantiate_optimizers(self._cfg, "cuda" in device)
        self._model.set_pretrained_weights()
        if not self._checkpoint.validate(self._dataset.used_properties):
            log.warning(
                "The model will not be able to be used from pretrained weights without the corresponding dataset. Current properties are {}".format(
                    self._dataset.used_properties
                )
            )
        self._checkpoint.dataset_properties = self._dataset.used_properties

    log.info(self._model)
    self._model.log_optimizers()
    log.info(
        "Model size = %i",
        sum(param.numel() for param in self._model.parameters() if param.requires_grad),
    )

    # Set dataloaders (the model is passed so loaders can match its collate needs)
    self._dataset.create_dataloaders(
        self._model,
        self._cfg.training.batch_size,
        self._cfg.training.shuffle,
        self._cfg.training.num_workers,
        self.precompute_multi_scale,
    )
    log.info(self._dataset)

    # Verify attributes in dataset
    self._model.verify_data(self._dataset.train_dataset[0])

    # Choose selection stage
    selection_stage = getattr(self._cfg, "selection_stage", "")
    self._checkpoint.selection_stage = self._dataset.resolve_saving_stage(selection_stage)
    self._tracker: BaseTracker = self._dataset.get_tracker(self.wandb_log, self.tensorboard_log)

    if self.wandb_log:
        Wandb.launch(self._cfg, not self._cfg.wandb.public and self.wandb_log)

    # Run training / evaluation
    self._model = self._model.to(self._device)
    if self.has_visualization:
        self._visualizer = Visualizer(
            self._cfg.visualization, self._dataset.num_batches, self._dataset.batch_size, os.getcwd()
        )
def test_runall(self):
    """Instantiate and run forward + backward for every model config of every task,
    handling both OmegaConf 1.x and 2.x APIs, and check that (almost) all
    parameters receive a non-zero gradient."""

    def is_known_to_fail(model_name):
        # Substring match (case-insensitive) against a denylist of models whose
        # forward pass is known to be broken; Minkowski models are added when
        # the MinkowskiEngine dependency is unavailable.
        forward_failing = [
            "path_pretrained",
            "MinkUNet_WIP",
            "pointcnn",
            "RSConv_4LD",
            "RSConv_2LD",
            "randlanet",
            "PVCNN",
            "ResUNet32",
        ]
        if not HAS_MINKOWSKI:
            forward_failing += ["Res16", "MinkUNet", "ResUNetBN2B", "ResUNet32", "Res16UNet34"]
        # All MS_SVCONV variants (grid size x head count x shared/unshared).
        for cm in [2, 4, 6]:
            for h in [1, 2, 3, 4]:
                for s in ["", "_unshared"]:
                    forward_failing += ["MS_SVCONV_B{}cm_X2_{}head{}".format(cm, h, s)]
        for failing in forward_failing:
            if failing.lower() in model_name.lower():
                return True
        return False

    def is_torch_sparse_backend(model_name):
        # Models configured with the torchsparse backend; they get switched to
        # the minkowski backend below so forward can run.
        torchsparse_backend = [
            "ResUNet32",
            "Res16UNet34",
        ]
        for cm in [2, 4, 6]:
            for h in [1, 2, 3, 4]:
                for s in ["", "_unshared"]:
                    torchsparse_backend += ["MS_SVCONV_B{}cm_X2_{}head{}".format(cm, h, s)]
        for backend in torchsparse_backend:
            if backend.lower() in model_name.lower():
                return True
        return False

    for type_file in self.model_type_files:
        # The task name is the parent directory of the config file.
        associated_task = os.path.normpath(type_file).split(os.path.sep)[-2]
        # models_config = OmegaConf.load(type_file)
        # Wrap the raw file under a top-level "models" key.
        models_config = OmegaConf.create({"models": OmegaConf.load(type_file)})
        models_config = OmegaConf.merge(models_config, self.data_config)
        # Update to OmegaConf 2.0
        if omegaconf.__version__ == '1.4.1':
            models_config.update("data.task", associated_task)
            models_config.update("data.grid_size", 0.05)
        else:
            OmegaConf.update(models_config, "data.task", associated_task, merge=True)
            OmegaConf.update(models_config, "data.grid_size", 0.05, merge=True)
        models = models_config.get("models")
        models_keys = models.keys() if models is not None else []
        for model_name in models_keys:
            if model_name == 'defaults':
                # workaround for recursive defaults
                continue
            with self.subTest(model_name):
                if not is_known_to_fail(model_name):
                    if omegaconf.__version__ == '1.4.1':
                        models_config.update("model_name", model_name)
                    else:
                        OmegaConf.update(models_config, "model_name", model_name, merge=True)
                    # modify the backend in minkowski to have the forward
                    if is_torch_sparse_backend(model_name):
                        models_config.models[model_name].backend = "minkowski"
                    dataset = get_dataset(models_config.models[model_name].conv_type, associated_task)
                    try:
                        model = instantiate_model(models_config, dataset)
                    except Exception as e:
                        # Surface the full config that failed to instantiate.
                        print(e)
                        raise Exception(models_config)
                    model.set_input(dataset[0], device)
                    try:
                        model.forward()
                        model.backward()
                    except Exception as e:
                        print("Forward or backward failing")
                        raise e
                    try:
                        # Some models legitimately have zero-grad parameters.
                        if has_zero_grad(model_name):
                            ratio = 1
                        else:
                            ratio = test_hasgrad(model)
                        if ratio < 1:
                            print(
                                "Model %s.%s.%s has %i%% of parameters with 0 gradient"
                                % (associated_task, type_file.split("/")[-1][:-5], model_name, 100 * ratio)
                            )
                    except Exception as e:
                        print("Model with zero gradient %s: %s" % (type_file, model_name))
                        raise e
def main(cfg):
    """Hydra training entry point: build checkpoint, dataset, model, dataloaders,
    tracker and visualizer, then hand off to ``run``."""
    OmegaConf.set_struct(cfg, False)  # This allows getattr and hasattr methods to function correctly
    if cfg.pretty_print:
        print(cfg.pretty())

    # Get device
    device = torch.device("cuda" if (torch.cuda.is_available() and cfg.training.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.training.enable_cudnn

    # Profiling
    profiling = getattr(cfg.debugging, "profiling", False)
    if profiling:
        # Set the num_workers as torch.utils.bottleneck doesn't work well with it
        cfg.training.num_workers = 0

    # Start Wandb if public
    launch_wandb(cfg, cfg.wandb.public and cfg.wandb.log)

    # Checkpoint
    checkpoint = ModelCheckpoint(
        cfg.training.checkpoint_dir,
        cfg.model_name,
        cfg.training.weight_name,
        run_config=cfg,
        resume=bool(cfg.training.checkpoint_dir),
    )

    # Create model and datasets: restore both from the checkpoint when one
    # exists, otherwise build them fresh from the config.
    if not checkpoint.is_empty:
        dataset = instantiate_dataset(checkpoint.data_config)
        model = checkpoint.create_model(dataset, weight_name=cfg.training.weight_name)
    else:
        dataset = instantiate_dataset(cfg.data)
        model = instantiate_model(cfg, dataset)
        model.instantiate_optimizers(cfg)
    log.info(model)
    model.log_optimizers()
    log.info("Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad))

    # Set dataloaders (the model is passed so loaders can match its collate needs)
    dataset.create_dataloaders(
        model,
        cfg.training.batch_size,
        cfg.training.shuffle,
        cfg.training.num_workers,
        cfg.training.precompute_multi_scale,
    )
    log.info(dataset)

    # Choose selection stage
    selection_stage = getattr(cfg, "selection_stage", "")
    checkpoint.selection_stage = dataset.resolve_saving_stage(selection_stage)
    tracker: BaseTracker = dataset.get_tracker(model, dataset, cfg.wandb.log, cfg.tensorboard.log)

    launch_wandb(cfg, not cfg.wandb.public and cfg.wandb.log)

    # Run training / evaluation
    model = model.to(device)
    visualizer = Visualizer(cfg.visualization, dataset.num_batches, dataset.batch_size, os.getcwd())
    run(cfg, model, dataset, device, tracker, checkpoint, visualizer)

    # https://github.com/facebookresearch/hydra/issues/440
    hydra._internal.hydra.GlobalHydra.get_state().clear()
    return 0