def main(local_rank, c10d_backend, rdzv_init_url, max_world_size, classy_args):
    """
    Build a task from the config file and train it with an ElasticTrainer.

    Args:
        local_rank: Passed through to the ElasticTrainer as ``local_rank``.
        c10d_backend: Distributed backend; must be ``Backend.NCCL`` or
            ``Backend.GLOO``.
        rdzv_init_url: Passed to the coordinator as ``init_method``.
        max_world_size: Passed to the coordinator as ``max_num_trainers``.
        classy_args: Argument namespace. The attributes read here are
            ``video_backend``, ``config_file``, ``checkpoint_folder``,
            ``pretrained_checkpoint_folder``, ``log_freq``,
            ``checkpoint_period``, ``profiler``, ``device`` and
            ``num_workers``. It is not modified by this function.

    Raises:
        AssertionError: If a pretrained checkpoint is found but the task is
            not a FineTuningTask, or if ``c10d_backend`` is not supported.
    """
    torch.manual_seed(0)
    set_video_backend(classy_args.video_backend)

    # Loads config, sets up task
    config = load_json(classy_args.config_file)
    task = build_task(config)

    # Load checkpoint, if available
    checkpoint = load_checkpoint(classy_args.checkpoint_folder)
    task.set_checkpoint(checkpoint)

    pretrained_checkpoint = load_checkpoint(classy_args.pretrained_checkpoint_folder)
    if pretrained_checkpoint is not None:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(pretrained_checkpoint)

    hooks = [
        LossLrMeterLoggingHook(classy_args.log_freq),
        ModelComplexityHook(),
        TimeMetricsHook(),
    ]
    if classy_args.checkpoint_folder != "":
        # vars() returns the namespace's own __dict__; copy it so that adding
        # the "config" entry does not attach a new attribute to classy_args.
        args_dict = dict(vars(classy_args))
        args_dict["config"] = config
        hooks.append(
            CheckpointHook(
                classy_args.checkpoint_folder,
                args_dict,
                checkpoint_period=classy_args.checkpoint_period,
            )
        )
    if classy_args.profiler:
        hooks.append(ProfilerHook())

    task.set_hooks(hooks)

    assert c10d_backend in (
        Backend.NCCL,
        Backend.GLOO,
    ), f"Unsupported c10d backend: {c10d_backend}"
    if c10d_backend == Backend.NCCL:
        # needed to enable NCCL error handling
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

    coordinator = CoordinatorP2P(
        c10d_backend=c10d_backend,
        init_method=rdzv_init_url,
        max_num_trainers=max_world_size,
        # NOTE(review): the unit of this timeout is not visible here - confirm
        # against CoordinatorP2P.
        process_group_timeout=60000,
    )
    trainer = ElasticTrainer(
        use_gpu=classy_args.device == "gpu",
        num_dataloader_workers=classy_args.num_workers,
        local_rank=local_rank,
        elastic_coordinator=coordinator,
        input_args={},
    )
    trainer.train(task)
def test_save_and_load_checkpoint(self):
    """
    Save checkpoints and read them back, addressing them both by folder and
    by full file path, for the default file name and for a custom one.
    """
    default_state = {str(idx): idx * 2 for idx in range(1000)}

    # No file name given: the checkpoint goes to the default checkpoint file.
    save_checkpoint(self.base_dir, default_state)

    # The same data must come back whether we point at the folder (default
    # file is used) or at the full path of the default file.
    default_path = f"{self.base_dir}/{CHECKPOINT_FILE}"
    for location in (self.base_dir, default_path):
        self.assertDictEqual(default_state, load_checkpoint(location))

    # A different payload, saved under an explicitly chosen file name.
    custom_name = "my_checkpoint.torch"
    custom_state = {str(idx): idx * 3 for idx in range(1000)}
    save_checkpoint(self.base_dir, custom_state, checkpoint_file=custom_name)

    # The custom file is loaded through its full path.
    custom_path = f"{self.base_dir}/{custom_name}"
    self.assertDictEqual(custom_state, load_checkpoint(custom_path))
def test_state_checkpointing(self) -> None:
    """
    Check that CheckpointHook saves the state without errors, but only at the
    end of a phase whose type is in phase_types, and only when the checkpoint
    directory exists.
    """
    task = build_task(get_test_task_config())
    task.prepare()

    hook_locals = {}
    ckpt_dir = self.base_dir + "/checkpoint_end_test/"
    hook_args = {"foo": "bar"}
    hook = CheckpointHook(ckpt_dir, hook_args, phase_types=["train"])

    # The checkpoint directory has not been created yet, so both hook entry
    # points are expected to fail.
    with self.assertRaises(FileNotFoundError):
        hook.on_start(task)
    with self.assertRaises(AssertionError):
        hook.on_phase_end(task, hook_locals)

    # Loading a checkpoint that does not exist yields None.
    self.assertIsNone(load_checkpoint(ckpt_dir))

    # Once the directory exists, on_start must run cleanly.
    os.mkdir(ckpt_dir)
    hook.on_start(task)

    # With train switched off the phase type is "test", which is not in
    # phase_types, so no checkpoint may be written.
    task.train = False
    hook.on_phase_end(task, hook_locals)
    self.assertIsNone(load_checkpoint(ckpt_dir))

    # A train phase must produce a checkpoint.
    task.train = True
    hook.on_phase_end(task, hook_locals)
    saved = load_checkpoint(ckpt_dir)
    self.assertIsNotNone(saved)
    for expected_key in ["input_args", "classy_state_dict"]:
        self.assertIn(expected_key, saved)

    # Only input_args is compared here; equality of classy_state_dict is
    # covered by a separate test.
    self.assertDictEqual(saved["input_args"], hook_args)
def test_checkpoint_period(self) -> None:
    """
    Check that no checkpoint is written until checkpoint_period phases of a
    type listed in phase_types have ended.
    """
    task = build_task(get_test_task_config())
    task.prepare()

    hook_locals = {}
    ckpt_dir = self.base_dir + "/checkpoint_end_test/"
    period = 10

    for phase_types in [["train"], ["train", "test"]]:
        hook = CheckpointHook(
            ckpt_dir,
            {},
            phase_types=phase_types,
            checkpoint_period=period,
        )

        # Fresh checkpoint directory for this configuration.
        os.mkdir(ckpt_dir)
        hook.on_start(task)

        # Run phases until period - 1 of them counted towards the period;
        # none of them may produce a checkpoint.
        phases_run = 0
        eligible_phases = 0
        while eligible_phases < period - 1:
            # The train flag is on for even counts and off for odd counts.
            task.train = phases_run % 2 == 0
            hook.on_phase_end(task, hook_locals)
            self.assertIsNone(load_checkpoint(ckpt_dir))

            if task.phase_type in phase_types:
                eligible_phases += 1
            phases_run += 1

        # One more train phase (always in phase_types) completes the period,
        # so a checkpoint must now exist.
        task.train = True
        hook.on_phase_end(task, hook_locals)
        self.assertIsNotNone(load_checkpoint(ckpt_dir))

        # Remove the directory so the next configuration can recreate it.
        shutil.rmtree(ckpt_dir)
def main(args, config):
    """
    Build the task described by ``config``, restore any checkpoints, attach
    hooks and train with the trainer selected by ``args.distributed_backend``.

    Attributes read from ``args``: ``image_backend``, ``video_backend``,
    ``checkpoint_load_path``, ``pretrained_checkpoint_path``, ``device``,
    ``distributed_backend``, ``num_workers`` and ``checkpoint_folder``.
    """
    # Process-wide settings.
    torch.manual_seed(0)
    set_image_backend(args.image_backend)
    set_video_backend(args.video_backend)

    task = build_task(config)

    # Resume from a checkpoint when one is available.
    resume_state = load_checkpoint(args.checkpoint_load_path)
    task.set_checkpoint(resume_state)

    # A checkpoint containing a pre-trained model is how fine-tuning of
    # existing models is implemented.
    pretrained_state = load_checkpoint(args.pretrained_checkpoint_path)
    if pretrained_state is not None:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(pretrained_state)

    # Hooks provide tensorboard logging, checkpointing and so on.
    task.set_hooks(configure_hooks(args, config))

    if args.device is None:
        use_gpu = None
    else:
        use_gpu = args.device == "gpu"
        assert torch.cuda.is_available() or not use_gpu, "CUDA is unavailable"

    # LocalTrainer covers a single node; DistributedTrainer sets training up
    # to use PyTorch's DistributedDataParallel.
    trainers_by_backend = {"none": LocalTrainer, "ddp": DistributedTrainer}
    trainer_class = trainers_by_backend[args.distributed_backend]
    trainer = trainer_class(
        use_gpu=use_gpu, num_dataloader_workers=args.num_workers
    )

    start_message = (
        f"Starting training on rank {get_rank()} worker. "
        f"World size is {get_world_size()}"
    )
    logging.info(start_message)

    # Training is complete once this call returns.
    trainer.train(task)

    output_folder = Path(args.checkpoint_folder).resolve()
    logging.info("Training successful!")
    logging.info(
        f'Results of this training run are available at: "{output_folder}"'
    )
def test_final_train_checkpoint(self):
    """Check that a train-phase checkpoint with a where of 1.0 can be loaded."""
    config = get_fast_test_task_config()

    first_task = build_task(config)
    hooks = [CheckpointHook(self.base_dir, {}, phase_types=["train"])]
    # set_hooks' return value is used as the task, as in a fluent call chain.
    task = first_task.set_hooks(hooks)
    task_2 = build_task(config)

    use_gpu = torch.cuda.is_available()
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(task)

    # The checkpoint left behind by the last train phase.
    final_checkpoint = load_checkpoint(self.base_dir)

    # Reading `where` must raise, which signals that where is >= 1.0.
    with self.assertRaises(Exception):
        task.where

    # Seed the second task with the first task's final train checkpoint.
    task_2.set_checkpoint(final_checkpoint)
    task_2.prepare(use_gpu=use_gpu)

    # Training the restored task must succeed.
    trainer.train(task_2)
def test_checkpointing(self):
    """
    Write a checkpoint through CheckpointHook and check that a new task can
    be trained from it on every available device.
    """
    ckpt_dir = self.base_dir + "/checkpoint/"
    os.mkdir(ckpt_dir)

    config = get_fast_test_task_config()
    cuda_available = torch.cuda.is_available()

    source_task = build_task(config)
    source_task.prepare(use_gpu=cuda_available)

    # Trigger the hook's end-of-phase callback directly to write a checkpoint.
    hook = CheckpointHook(ckpt_dir, {}, phase_types=["train"])
    hook.on_phase_end(source_task)

    # A set collapses to {False} when CUDA is unavailable, so each device
    # setting is exercised exactly once.
    for use_gpu in {False, cuda_available}:
        checkpoint = load_checkpoint(ckpt_dir)

        restored_task = build_task(config)
        restored_task.set_checkpoint(checkpoint)
        restored_task.prepare(use_gpu=use_gpu)

        # Training from the checkpoint must succeed.
        LocalTrainer(use_gpu=use_gpu).train(restored_task)
def test_from_checkpoint(self):
    """
    Check that ClassyModel.from_checkpoint rebuilds the model stored in a
    checkpoint, with and without a head, including its (zeroed) parameters.
    """
    config = get_test_task_config()
    for use_head in [True, False]:
        config["model"] = self.get_model_config(use_head)
        task = build_task(config)
        task.prepare()

        # One checkpoint folder per variant, named after use_head.
        checkpoint_folder = f"{self.base_dir}/{use_head}/"
        input_args = {"config": config}

        # Simulate training by setting the model parameters to zero
        for param in task.model.parameters():
            param.data.zero_()

        checkpoint_hook = CheckpointHook(
            checkpoint_folder, input_args, phase_types=["train"]
        )

        # Create checkpoint dir, save checkpoint
        os.mkdir(checkpoint_folder)
        checkpoint_hook.on_start(task)

        task.train = True
        checkpoint_hook.on_phase_end(task)

        # Model should be checkpointed. load and compare
        checkpoint = load_checkpoint(checkpoint_folder)

        model = ClassyModel.from_checkpoint(checkpoint)
        # assertIsInstance reports the actual type when the check fails.
        self.assertIsInstance(model, MyTestModel)

        # All parameters must be zero
        for param in model.parameters():
            self.assertTrue(torch.all(param.data == 0))
def build_retrieval_model(cfg):
    """
    Build the model and, when the configured params file exists, initialize
    it from those weights (loaded onto the CUDA device). Without a params
    file the model keeps its random initialization.

    Returns the built model.
    """
    logging.info("Building model....")
    model = build_model(cfg.MODEL, cfg.OPTIMIZER)

    if not g_pathmgr.exists(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        # Only a warning when no weights file is provided: benchmarking a
        # randomly initialized model is a supported use case.
        logging.warning("Model is randomly initialized....")
    else:
        init_weights_path = cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE
        logging.info(f"Initializing model from: {init_weights_path}")
        weights = load_checkpoint(init_weights_path, device=torch.device("cuda"))

        # Optional settings controlling how the weights are mapped onto the
        # model; each falls back to a default when absent from the config.
        weights_init = cfg.MODEL.WEIGHTS_INIT
        init_options = {
            "skip_layers": weights_init.get("SKIP_LAYERS", []),
            "replace_prefix": weights_init.get("REMOVE_PREFIX", None),
            "append_prefix": weights_init.get("APPEND_PREFIX", None),
            "state_dict_key_name": weights_init.get("STATE_DICT_KEY_NAME", None),
        }
        init_model_from_consolidated_weights(cfg, model, weights, **init_options)

    logging.info(f"Model is:\n {model}")
    return model
def load_and_broadcast_checkpoint(
    cls, checkpoint_folder: str, checkpoint_path: str, device
):
    """
    Load the checkpoint found at checkpoint_path. If it turns out to be a
    shard aggregator (an index of sharded checkpoints), follow the
    indirection and load this rank's shard from checkpoint_folder instead.
    """
    # NOTE(review): this call is expected to resolve to a module-level helper
    # with the same name as this method, not to the method itself - confirm.
    checkpoint = load_and_broadcast_checkpoint(checkpoint_path, device)
    if not cls._is_shard_aggregator_checkpoint(checkpoint):
        return checkpoint

    # The aggregator's "shards" entry is indexed by the global rank.
    _, global_rank = get_machine_local_and_dist_rank()
    shard_file = checkpoint["shards"][global_rank]
    return load_checkpoint(os.path.join(checkpoint_folder, shard_file), device)
def from_config(cls, config: Dict[str, Any]) -> "FineTuningTask":
    """Instantiates a FineTuningTask from a configuration.

    Args:
        config: A configuration for a FineTuningTask. The keys read here are
            "pretrained_checkpoint", "reset_heads" and "freeze_trunk"; the
            rest is handled by the parent class' from_config.

    Returns:
        A FineTuningTask instance.
    """
    task = super().from_config(config)

    # The pretrained checkpoint is optional; it is only applied when loading
    # actually produced one.
    checkpoint_location = config.get("pretrained_checkpoint")
    pretrained = load_checkpoint(checkpoint_location)
    if pretrained is not None:
        task.set_pretrained_checkpoint(pretrained)

    # Both flags default to False when missing from the config.
    reset_heads = config.get("reset_heads", False)
    task.set_reset_heads(reset_heads)
    freeze_trunk = config.get("freeze_trunk", False)
    task.set_freeze_trunk(freeze_trunk)

    return task
def test_ema_hook(self):
    """
    Run a short linear-eval integration test with the EMA model hook enabled
    and check that the EMA model is saved in the checkpoint and that the EMA
    meters are logged to the metrics file.
    """
    cfg = compose_hydra_configuration(
        [
            "config=test/integration_test/quick_eval_in1k_linear.yaml",
            "config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
            "config.DATA.TRAIN.LABEL_SOURCES=[synthetic]",
            "config.DATA.TEST.DATA_SOURCES=[synthetic]",
            "config.DATA.TEST.LABEL_SOURCES=[synthetic]",
            "config.DATA.TRAIN.DATA_LIMIT=40",
            "config.OPTIMIZER.num_epochs=2",
            "config.HOOKS.EMA_MODEL.SAVE_EMA_MODEL=True",
            "config.HOOKS.EMA_MODEL.ENABLE_EMA_METERS=True",
            "config.HOOKS.EMA_MODEL.EMA_DEVICE=gpu",
        ],
    )
    _, config = convert_to_attrdict(cfg)

    with in_temporary_directory() as checkpoint_folder:
        # Run a quick_eval_in1k_linear.
        integration_logs = run_integration_test(config)
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Test that the ema model is saved in the checkpoint.
        # .keys() is kept on purpose: on failure assertIn prints the
        # container, and the key view is far more readable than the full
        # state dict.
        checkpoint = load_checkpoint(checkpoint_path)
        self.assertIn(
            "ema_model",
            checkpoint["classy_state_dict"].keys(),
            msg="ema_model has not been saved to the checkpoint folder.",
        )

        # Test that train_accuracy_list_meter_ema have been logged to metrics.json.
        metrics = integration_logs.get_accuracies(from_metrics_file=True)
        self.assertIn(
            "train_accuracy_list_meter_ema",
            metrics[1],
            msg="train_accuracy_list_meter_ema is not logged to the metrics.json file.",
        )
        self.assertEqual(
            len(metrics),
            8,
            "the metrics.json output does not have the appropriate number of entries.",
        )