Example #1
def main(local_rank, c10d_backend, rdzv_init_url, max_world_size, classy_args):
    torch.manual_seed(0)
    set_video_backend(classy_args.video_backend)

    # Loads config, sets up task
    config = load_json(classy_args.config_file)

    task = build_task(config)

    # Load checkpoint, if available
    checkpoint = load_checkpoint(classy_args.checkpoint_folder)
    task.set_checkpoint(checkpoint)

    pretrained_checkpoint = load_checkpoint(classy_args.pretrained_checkpoint_folder)
    if pretrained_checkpoint is not None:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(pretrained_checkpoint)

    hooks = [
        LossLrMeterLoggingHook(classy_args.log_freq),
        ModelComplexityHook(),
        TimeMetricsHook(),
    ]

    if classy_args.checkpoint_folder != "":
        args_dict = vars(classy_args)
        args_dict["config"] = config
        hooks.append(
            CheckpointHook(
                classy_args.checkpoint_folder,
                args_dict,
                checkpoint_period=classy_args.checkpoint_period,
            )
        )
    if classy_args.profiler:
        hooks.append(ProfilerHook())

    task.set_hooks(hooks)

    assert c10d_backend == Backend.NCCL or c10d_backend == Backend.GLOO
    if c10d_backend == torch.distributed.Backend.NCCL:
        # needed to enable NCCL error handling
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

    coordinator = CoordinatorP2P(
        c10d_backend=c10d_backend,
        init_method=rdzv_init_url,
        max_num_trainers=max_world_size,
        process_group_timeout=60000,
    )
    trainer = ElasticTrainer(
        use_gpu=classy_args.device == "gpu",
        num_dataloader_workers=classy_args.num_workers,
        local_rank=local_rank,
        elastic_coordinator=coordinator,
        input_args={},
    )
    trainer.train(task)
Example #2
    def test_save_and_load_checkpoint(self):
        checkpoint_dict = {str(i): i * 2 for i in range(1000)}

        # save to the default checkpoint file
        save_checkpoint(self.base_dir, checkpoint_dict)

        # load the checkpoint by using the default file
        loaded_checkpoint = load_checkpoint(self.base_dir)
        self.assertDictEqual(checkpoint_dict, loaded_checkpoint)

        # load the checkpoint by passing the full path
        checkpoint_path = f"{self.base_dir}/{CHECKPOINT_FILE}"
        loaded_checkpoint = load_checkpoint(checkpoint_path)
        self.assertDictEqual(checkpoint_dict, loaded_checkpoint)

        # create a new checkpoint dict
        filename = "my_checkpoint.torch"
        checkpoint_dict = {str(i): i * 3 for i in range(1000)}

        # save the checkpoint to a different file
        save_checkpoint(self.base_dir,
                        checkpoint_dict,
                        checkpoint_file=filename)

        # load the checkpoint by passing the full path
        checkpoint_path = f"{self.base_dir}/{filename}"
        loaded_checkpoint = load_checkpoint(checkpoint_path)
        self.assertDictEqual(checkpoint_dict, loaded_checkpoint)
Example #3
    def test_state_checkpointing(self) -> None:
        """
        Test that the state gets checkpointed without any errors, but only on the
        right phase_type and only if the checkpoint directory exists.
        """
        config = get_test_task_config()
        task = build_task(config)
        task.prepare()

        local_variables = {}
        checkpoint_folder = self.base_dir + "/checkpoint_end_test/"
        input_args = {"foo": "bar"}

        # create a checkpoint hook
        checkpoint_hook = CheckpointHook(checkpoint_folder,
                                         input_args,
                                         phase_types=["train"])

        # checkpoint directory doesn't exist
        # call the on start function
        with self.assertRaises(FileNotFoundError):
            checkpoint_hook.on_start(task)
        # call the on end phase function
        with self.assertRaises(AssertionError):
            checkpoint_hook.on_phase_end(task, local_variables)
        # try loading a non-existent checkpoint
        checkpoint = load_checkpoint(checkpoint_folder)
        self.assertIsNone(checkpoint)

        # create checkpoint dir, verify on_start hook runs
        os.mkdir(checkpoint_folder)
        checkpoint_hook.on_start(task)

        # Phase_type is test, expect no checkpoint
        task.train = False
        # call the on end phase function
        checkpoint_hook.on_phase_end(task, local_variables)
        checkpoint = load_checkpoint(checkpoint_folder)
        self.assertIsNone(checkpoint)

        task.train = True
        # call the on end phase function
        checkpoint_hook.on_phase_end(task, local_variables)
        # model should be checkpointed. load and compare
        checkpoint = load_checkpoint(checkpoint_folder)
        self.assertIsNotNone(checkpoint)
        for key in ["input_args", "classy_state_dict"]:
            self.assertIn(key, checkpoint)
        # not testing for equality of classy_state_dict; that is covered in
        # a separate test
        self.assertDictEqual(checkpoint["input_args"], input_args)
Example #4
    def test_checkpoint_period(self) -> None:
        """
        Test that the checkpoint_period works as expected.
        """
        config = get_test_task_config()
        task = build_task(config)
        task.prepare()

        local_variables = {}
        checkpoint_folder = self.base_dir + "/checkpoint_end_test/"
        checkpoint_period = 10

        for phase_types in [["train"], ["train", "test"]]:
            # create a checkpoint hook
            checkpoint_hook = CheckpointHook(
                checkpoint_folder,
                {},
                phase_types=phase_types,
                checkpoint_period=checkpoint_period,
            )

            # create checkpoint dir
            os.mkdir(checkpoint_folder)

            # call the on start function
            checkpoint_hook.on_start(task)

            # shouldn't create any checkpoints until there are checkpoint_period
            # phases which are in phase_types
            count = 0
            valid_phase_count = 0
            while valid_phase_count < checkpoint_period - 1:
                task.train = count % 2 == 0
                # call the on end phase function
                checkpoint_hook.on_phase_end(task, local_variables)
                checkpoint = load_checkpoint(checkpoint_folder)
                self.assertIsNone(checkpoint)
                valid_phase_count += 1 if task.phase_type in phase_types else 0
                count += 1

            # create a phase which is in phase_types
            task.train = True
            # call the on end phase function
            checkpoint_hook.on_phase_end(task, local_variables)
            # model should be checkpointed. load and compare
            checkpoint = load_checkpoint(checkpoint_folder)
            self.assertIsNotNone(checkpoint)
            # delete the checkpoint dir
            shutil.rmtree(checkpoint_folder)
Example #5
def main(args, config):
    # Global flags
    torch.manual_seed(0)
    set_image_backend(args.image_backend)
    set_video_backend(args.video_backend)

    task = build_task(config)

    # Load checkpoint, if available.
    checkpoint = load_checkpoint(args.checkpoint_load_path)
    task.set_checkpoint(checkpoint)

    # Load a checkpoint containing a pre-trained model. This is how we
    # implement fine-tuning of existing models.
    pretrained_checkpoint = load_checkpoint(args.pretrained_checkpoint_path)
    if pretrained_checkpoint is not None:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(pretrained_checkpoint)

    # Configure hooks to do tensorboard logging, checkpoints and so on
    task.set_hooks(configure_hooks(args, config))

    use_gpu = None
    if args.device is not None:
        use_gpu = args.device == "gpu"
        assert torch.cuda.is_available() or not use_gpu, "CUDA is unavailable"

    # LocalTrainer is used for a single node. DistributedTrainer will set up
    # training to use PyTorch's DistributedDataParallel.
    trainer_class = {
        "none": LocalTrainer,
        "ddp": DistributedTrainer
    }[args.distributed_backend]

    trainer = trainer_class(use_gpu=use_gpu,
                            num_dataloader_workers=args.num_workers)

    logging.info(f"Starting training on rank {get_rank()} worker. "
                 f"World size is {get_world_size()}")
    # That's it! When this call returns, training is done.
    trainer.train(task)

    output_folder = Path(args.checkpoint_folder).resolve()
    logging.info("Training successful!")
    logging.info(
        f'Results of this training run are available at: "{output_folder}"')
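
A plausible way to drive this entry point from the command line is sketched below. The flag names simply mirror the attributes read from args in main() above; they are assumptions for illustration, not the actual classy_train.py argument parser.

import argparse

# Hypothetical argument parser matching the attributes accessed in main();
# flag names, choices and defaults are illustrative assumptions.
parser = argparse.ArgumentParser()
parser.add_argument("--image_backend", default="PIL")
parser.add_argument("--video_backend", default="pyav")
parser.add_argument("--checkpoint_load_path", default="")
parser.add_argument("--pretrained_checkpoint_path", default="")
parser.add_argument("--checkpoint_folder", default="output")
parser.add_argument("--device", choices=["cpu", "gpu"], default=None)
parser.add_argument("--distributed_backend", choices=["none", "ddp"], default="none")
parser.add_argument("--num_workers", type=int, default=4)

# args = parser.parse_args()
# config = ...  # loaded elsewhere, e.g. with load_json as in Example #1
# main(args, config)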
Example #6
    def test_final_train_checkpoint(self):
        """Test that a train phase checkpoint with a where of 1.0 can be loaded"""

        config = get_fast_test_task_config()
        task = build_task(config).set_hooks(
            [CheckpointHook(self.base_dir, {}, phase_types=["train"])])
        task_2 = build_task(config)

        use_gpu = torch.cuda.is_available()

        trainer = LocalTrainer(use_gpu=use_gpu)
        trainer.train(task)

        # load the final train checkpoint
        checkpoint = load_checkpoint(self.base_dir)

        # make sure that accessing task.where raises an exception, which means
        # that where is >= 1.0
        with self.assertRaises(Exception):
            task.where

        # set task_2's state as task's final train checkpoint
        task_2.set_checkpoint(checkpoint)
        task_2.prepare(use_gpu=use_gpu)

        # we should be able to train the task
        trainer.train(task_2)
Example #7
    def test_checkpointing(self):
        # make checkpoint directory
        checkpoint_folder = self.base_dir + "/checkpoint/"
        os.mkdir(checkpoint_folder)

        config = get_fast_test_task_config()
        cuda_available = torch.cuda.is_available()
        task = build_task(config)

        task.prepare(use_gpu=cuda_available)

        # create a checkpoint hook
        checkpoint_hook = CheckpointHook(checkpoint_folder, {},
                                         phase_types=["train"])

        # call the on end phase function
        checkpoint_hook.on_phase_end(task)

        # we should be able to train a task using the checkpoint on all available
        # devices
        for use_gpu in {False, cuda_available}:
            # load the checkpoint
            checkpoint = load_checkpoint(checkpoint_folder)

            # create a new task
            task = build_task(config)

            # set the checkpoint
            task.set_checkpoint(checkpoint)

            task.prepare(use_gpu=use_gpu)

            # we should be able to run the trainer using the checkpoint
            trainer = LocalTrainer(use_gpu=use_gpu)
            trainer.train(task)
Example #8
    def test_from_checkpoint(self):
        config = get_test_task_config()
        for use_head in [True, False]:
            config["model"] = self.get_model_config(use_head)
            task = build_task(config)
            task.prepare()

            checkpoint_folder = f"{self.base_dir}/{use_head}/"
            input_args = {"config": config}

            # Simulate training by setting the model parameters to zero
            for param in task.model.parameters():
                param.data.zero_()

            checkpoint_hook = CheckpointHook(
                checkpoint_folder, input_args, phase_types=["train"]
            )

            # Create checkpoint dir, save checkpoint
            os.mkdir(checkpoint_folder)
            checkpoint_hook.on_start(task)

            task.train = True
            checkpoint_hook.on_phase_end(task)

            # Model should be checkpointed. load and compare
            checkpoint = load_checkpoint(checkpoint_folder)

            model = ClassyModel.from_checkpoint(checkpoint)
            self.assertTrue(isinstance(model, MyTestModel))

            # All parameters must be zero
            for param in model.parameters():
                self.assertTrue(torch.all(param.data == 0))
Example #9
def build_retrieval_model(cfg):
    """
    Builds the model on a single GPU and initializes it from the given weights.
    """
    logging.info("Building model....")
    model = build_model(cfg.MODEL, cfg.OPTIMIZER)
    if g_pathmgr.exists(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        init_weights_path = cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE
        logging.info(f"Initializing model from: {init_weights_path}")
        weights = load_checkpoint(init_weights_path,
                                  device=torch.device("cuda"))
        skip_layers = cfg.MODEL.WEIGHTS_INIT.get("SKIP_LAYERS", [])
        replace_prefix = cfg.MODEL.WEIGHTS_INIT.get("REMOVE_PREFIX", None)
        append_prefix = cfg.MODEL.WEIGHTS_INIT.get("APPEND_PREFIX", None)
        state_dict_key_name = cfg.MODEL.WEIGHTS_INIT.get(
            "STATE_DICT_KEY_NAME", None)

        init_model_from_consolidated_weights(
            cfg,
            model,
            weights,
            state_dict_key_name=state_dict_key_name,
            skip_layers=skip_layers,
            replace_prefix=replace_prefix,
            append_prefix=append_prefix,
        )
    else:
        # We only log a warning if no weights file is provided, since we also
        # want to benchmark randomly initialized models and hence support that.
        logging.warning("Model is randomly initialized....")
    logging.info(f"Model is:\n {model}")
    return model
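
For orientation, the MODEL.WEIGHTS_INIT options consumed above could look like the sketch below; the key names come directly from the cfg lookups in build_retrieval_model, while the values are purely illustrative placeholders.

# Hypothetical sketch of the MODEL.WEIGHTS_INIT section read above; key names
# mirror the cfg accesses in build_retrieval_model, values are placeholders.
weights_init_cfg = {
    "PARAMS_FILE": "/path/to/consolidated_weights.torch",  # checkpoint to initialize from
    "STATE_DICT_KEY_NAME": "classy_state_dict",  # key under which the state dict is stored
    "SKIP_LAYERS": ["heads"],  # layers to leave randomly initialized
    "REMOVE_PREFIX": "",  # prefix to strip from checkpoint parameter names
    "APPEND_PREFIX": "trunk.",  # prefix to prepend to checkpoint parameter names
}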
Example #10
    @classmethod
    def load_and_broadcast_checkpoint(
        cls, checkpoint_folder: str, checkpoint_path: str, device
    ):
        """
        Load the checkpoint at the provided path, dealing with the potential
        indirection introduced by sharded checkpoints.
        """
        checkpoint = load_and_broadcast_checkpoint(checkpoint_path, device)
        if cls._is_shard_aggregator_checkpoint(checkpoint):
            _, global_rank = get_machine_local_and_dist_rank()
            shard_name = checkpoint["shards"][global_rank]
            shard_path = os.path.join(checkpoint_folder, shard_name)
            checkpoint = load_checkpoint(shard_path, device)
        return checkpoint
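
The "shards" lookup above implies a small aggregator checkpoint that maps each global rank to its shard file. A minimal sketch of that layout follows; only the "shards" key is taken from the code, every other field is an illustrative assumption rather than the actual VISSL format.

import os

# Hypothetical shard-aggregator checkpoint layout, inferred from the
# checkpoint["shards"][global_rank] lookup above.
aggregator_checkpoint = {
    "type": "shard_aggregator",  # assumed marker; the real detection lives in _is_shard_aggregator_checkpoint
    "shards": {
        0: "checkpoint_shard_rank0.torch",
        1: "checkpoint_shard_rank1.torch",
    },
}

# Each rank then resolves and loads only its own shard file:
global_rank = 0
shard_name = aggregator_checkpoint["shards"][global_rank]
shard_path = os.path.join("/path/to/checkpoint_folder", shard_name)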
Example #11
    def from_config(cls, config: Dict[str, Any]) -> "FineTuningTask":
        """Instantiates a FineTuningTask from a configuration.

        Args:
            config: A configuration for a FineTuningTask.
                See :func:`__init__` for parameters expected in the config.

        Returns:
            A FineTuningTask instance.
        """
        task = super().from_config(config)

        pretrained_checkpoint = load_checkpoint(config.get("pretrained_checkpoint"))

        if pretrained_checkpoint is not None:
            task.set_pretrained_checkpoint(pretrained_checkpoint)

        task.set_reset_heads(config.get("reset_heads", False))
        task.set_freeze_trunk(config.get("freeze_trunk", False))
        return task
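
A minimal, hypothetical config for this from_config path might look like the sketch below; the fine-tuning keys mirror the config.get(...) calls above, the base-task keys are omitted, and the checkpoint path is only a placeholder.

# Hypothetical fine-tuning portion of a task config; the keys mirror the
# config.get(...) lookups in from_config.
fine_tuning_config = {
    "pretrained_checkpoint": "/path/to/pretrained/checkpoint/folder",
    "reset_heads": True,  # re-initialize the model heads for the new task
    "freeze_trunk": False,  # keep the trunk weights trainable
}
# task = FineTuningTask.from_config({**base_task_config, **fine_tuning_config})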
Example #12
    def test_ema_hook(self):
        cfg = compose_hydra_configuration(
            [
                "config=test/integration_test/quick_eval_in1k_linear.yaml",
                "config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
                "config.DATA.TRAIN.LABEL_SOURCES=[synthetic]",
                "config.DATA.TEST.DATA_SOURCES=[synthetic]",
                "config.DATA.TEST.LABEL_SOURCES=[synthetic]",
                "config.DATA.TRAIN.DATA_LIMIT=40",
                "config.OPTIMIZER.num_epochs=2",
                "config.HOOKS.EMA_MODEL.SAVE_EMA_MODEL=True",
                "config.HOOKS.EMA_MODEL.ENABLE_EMA_METERS=True",
                "config.HOOKS.EMA_MODEL.EMA_DEVICE=gpu",
            ],
        )
        _, config = convert_to_attrdict(cfg)

        with in_temporary_directory() as checkpoint_folder:
            # Run a quick_eval_in1k_linear.
            integration_logs = run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

            # Test that the ema model is saved in the checkpoint.
            checkpoint = load_checkpoint(checkpoint_path)
            self.assertTrue(
                "ema_model" in checkpoint["classy_state_dict"].keys(),
                msg="ema_model has not been saved to the checkpoint folder.",
            )

            # Test that train_accuracy_list_meter_ema have been logged to metrics.json.
            metrics = integration_logs.get_accuracies(from_metrics_file=True)
            self.assertTrue(
                "train_accuracy_list_meter_ema" in metrics[1],
                msg="train_accuracy_list_meter_ema is not logged to the metrics.json file.",
            )

            self.assertEqual(
                len(metrics),
                8,
                "the metrics.json output does not have the appropriate number of entries.",
            )