Example 1
    def test_fsdp_integration_with_linear_eval(self):
        with in_temporary_directory() as pretrain_dir:

            # Start pre-training
            config = self._create_pretraining_config(
                with_fsdp=True,
                with_activation_checkpointing=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            run_integration_test(config)

            # Consolidate the weights
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")

            # Load the checkpoint and perform a linear evaluation on it
            losses = self.run_linear_eval(
                checkpoint_path=os.path.join(pretrain_dir,
                                             "checkpoint_conso.torch"),
                with_fsdp=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            self.assertEqual(8, len(losses))
            print(losses)
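
These tests rely on in_temporary_directory to isolate the files each run produces. Below is a minimal sketch of such a helper, assuming only the behavior the tests depend on (the yielded path is also the working directory, so relative names like "checkpoint.torch" resolve inside it); the actual VISSL utility may differ in its details.

import contextlib
import os
import tempfile


@contextlib.contextmanager
def in_temporary_directory():
    # Create a temporary directory, make it the working directory, and
    # restore the previous one on exit; files saved with relative paths
    # (e.g. "checkpoint.torch") land inside the temporary directory.
    old_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        try:
            yield temp_dir
        finally:
            os.chdir(old_cwd)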
Example 2

    def test_fsdp_integration_with_linear_eval(self):
        with in_temporary_directory() as pretrain_dir:

            # Start pre-training
            config = self._create_pretraining_config(
                with_fsdp=True,
                with_activation_checkpointing=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            run_integration_test(config)

            # Consolidate the weights (3 different ways)
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                "checkpoint.torch", "checkpoint_sliced.torch")
            CheckpointFormatConverter.consolidated_to_sliced_checkpoint(
                "checkpoint_conso.torch", "checkpoint_sliced_2.torch")

            # Load the sharded checkpoint and perform a linear evaluation on it
            ref_losses = self.run_linear_eval(
                checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
                with_fsdp=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            self.assertEqual(8, len(ref_losses))

            # Then check that the results are the same for the other kinds of
            # checkpoints once consolidation has taken place
            for checkpoint_name in [
                    "checkpoint_conso.torch",
                    "checkpoint_sliced.torch",
                    "checkpoint_sliced_2.torch",
            ]:
                losses = self.run_linear_eval(
                    checkpoint_path=os.path.join(pretrain_dir,
                                                 checkpoint_name),
                    with_fsdp=True,
                    with_mixed_precision=False,
                    auto_wrap_threshold=0,
                )
                self.assertEqual(8, len(losses))
                self.assertAlmostEqual(
                    losses[0],
                    ref_losses[0],
                    places=4,
                    msg=f"Failed for {checkpoint_name}",
                )
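
Example 2 asserts that all three conversion routes yield checkpoints that evaluate identically. As a standalone illustration (not a VISSL API), one could compare two consolidated checkpoints directly; the key layout below follows the CheckpointWriter content shown in Example 5 and may not apply to sliced checkpoints, which package their tensors separately.

import torch


def trunk_weights_allclose(path_a: str, path_b: str, atol: float = 1e-6) -> bool:
    # Load both consolidated checkpoints on CPU and compare the trunk tensors.
    state_a = torch.load(path_a, map_location="cpu")
    state_b = torch.load(path_b, map_location="cpu")
    trunk_a = state_a["classy_state_dict"]["base_model"]["model"]["trunk"]
    trunk_b = state_b["classy_state_dict"]["base_model"]["model"]["trunk"]
    if set(trunk_a) != set(trunk_b):
        return False
    return all(torch.allclose(trunk_a[k], trunk_b[k], atol=atol) for k in trunk_a)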
Example 3
def convert_checkpoint(input_path: str, output_path: str, output_type: str):
    assert g_pathmgr.exists(
        input_path), f"Checkpoint input path: {input_path} not found."

    # Make the output directory if it doesn't exist.
    makedir(os.path.split(output_path)[0])

    setup_logging(__name__)
    if output_type == CheckpointType.consolidated.name:
        CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
            input_path, output_path)
    elif output_type == CheckpointType.sliced.name:
        CheckpointFormatConverter.to_sliced_checkpoint(input_path, output_path)
    shutdown_logging()
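
Below is a hypothetical command-line wrapper around convert_checkpoint, shown only to illustrate the expected arguments; the real VISSL entry point and its flags may differ. The choices mirror the CheckpointType members checked above.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert a sharded FSDP checkpoint to another format")
    parser.add_argument("--input_path", required=True)
    parser.add_argument("--output_path", required=True)
    parser.add_argument("--output_type",
                        choices=["consolidated", "sliced"],
                        default="sliced")
    args = parser.parse_args()
    convert_checkpoint(args.input_path, args.output_path, args.output_type)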
Example 4
    def test_benchmarking_from_a_consolidated_checkpoint_2(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in FSDP mode and convert to a consolidated checkpoint
            config = self._create_pretraining_config(with_fsdp=True)
            run_integration_test(config)
            sharded_checkpoint_path = os.path.join(checkpoint_folder,
                                                   "checkpoint.torch")
            checkpoint_path = os.path.join(checkpoint_folder,
                                           "checkpoint_consolidated.torch")
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                sharded_checkpoint_path, checkpoint_path)

            # Now, run both DDP and FSDP linear evaluation and compare the traces
            ddp_losses, ddp_accuracies = self.run_benchmarking(checkpoint_path,
                                                               with_fsdp=False)
            fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                checkpoint_path, with_fsdp=True)
            self.assertEqual(ddp_losses, fsdp_losses)
            self.assertEqual(ddp_accuracies, fsdp_accuracies)
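
A consolidated checkpoint can typically be read back with plain torch.load, which is what makes it usable by both DDP and FSDP evaluations. A small inspection sketch follows; the key layout assumes the CheckpointWriter content shown in Example 5 and may differ for other checkpoints.

import torch

checkpoint = torch.load("checkpoint_consolidated.torch", map_location="cpu")
trunk_state = checkpoint["classy_state_dict"]["base_model"]["model"]["trunk"]
for name, tensor in trunk_state.items():
    print(name, tuple(tensor.shape))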
Example 5
    def test_benchmarking_with_checkpoint_resharding(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in FSDP mode and save a sharded checkpoint
            config = self._create_pretraining_config(with_fsdp=True)
            run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder,
                                           "checkpoint.torch")

            # List the files inside the current working directory
            # to later test what files have been created
            files_before_conversion = set(os.listdir(checkpoint_folder))

            # Transform the sharded checkpoint to a consolidated checkpoint
            eval_checkpoint_path_1 = os.path.join(checkpoint_folder,
                                                  "checkpoint_eval_1.torch")
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                input_checkpoint_path=checkpoint_path,
                output_checkpoint_path=eval_checkpoint_path_1,
            )

            # Transform the sharded checkpoint to a sliced checkpoint
            eval_checkpoint_path_2 = os.path.join(checkpoint_folder,
                                                  "checkpoint_eval_2.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                input_checkpoint_path=checkpoint_path,
                output_checkpoint_path=eval_checkpoint_path_2,
            )

            # Verify the content of the directory after checkpoint conversion
            files_after_conversion = set(os.listdir(checkpoint_folder))
            new_files = files_after_conversion - files_before_conversion
            expected_new_files = {
                "checkpoint_eval_1.torch",
                "checkpoint_eval_2.torch",
                "checkpoint_eval_2_layers",
            }
            self.assertEqual(
                new_files,
                expected_new_files,
                "checkpoint 2 slices should be packaged in a directory",
            )

            # Run a benchmark in FSDP mode and record the losses and accuracies
            eval_losses, eval_accuracies = self.run_benchmarking(
                checkpoint_path, with_fsdp=True)
            self.assertGreater(len(eval_losses), 0)
            self.assertEqual(4, len(eval_accuracies))

            # Check that these losses and accuracies are the same with the
            # consolidated and sliced checkpoints
            for eval_checkpoint in [
                    eval_checkpoint_path_1, eval_checkpoint_path_2
            ]:
                fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                    eval_checkpoint, with_fsdp=True)
                self.assertEqual(fsdp_losses, eval_losses)
                self.assertEqual(fsdp_accuracies, eval_accuracies)

            # Check that, unlike the sharded checkpoint, the consolidated and
            # sliced checkpoints can be used with a different number of GPUs
            for eval_checkpoint in [
                    eval_checkpoint_path_1, eval_checkpoint_path_2
            ]:
                fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                    eval_checkpoint, with_fsdp=True, num_gpu=1)
                self.assertGreater(len(fsdp_losses), 0)
                self.assertEqual(len(fsdp_accuracies), 4)

    # Distributed worker: runs a short FSDP training loop, saves a sharded
    # checkpoint, converts it, and verifies that every format loads back
    # into an equivalent model.
    def _worker(gpu_id: int, sync_file: str, world_size: int):
        torch.manual_seed(0)
        os.environ["RANK"] = str(gpu_id)
        init_distributed_on_file(world_size=world_size,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)
        torch.backends.cudnn.deterministic = True

        config = TestCheckpointConversion._create_fsdp_model_config(
            with_fsdp=True)
        model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        model = fsdp_wrapper(model, **config.MODEL.FSDP_CONFIG)
        optimizer = optim.SGD(model.parameters(), lr=1e-4)

        # Fake inputs
        num_iterations = 5
        batch_size = 3
        torch.manual_seed(gpu_id)
        fake_inputs = torch.randn(size=(num_iterations, batch_size, 3, 96, 96))
        fake_targets = torch.randn(size=(num_iterations, batch_size))

        # Fake training loop
        criterion = nn.MSELoss()
        for iteration in range(num_iterations):
            fake_input = fake_inputs[iteration].cuda(gpu_id)
            fake_target = fake_targets[iteration].cuda(gpu_id)
            output1, output2 = model(fake_input)[0]
            loss = criterion(output1.sum(axis=-1), fake_target) + criterion(
                output2.sum(axis=-1), fake_target)
            if gpu_id == 0:
                print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Save a sharded checkpoint: one file per shard (rank)
        checkpoint_writer = CheckpointWriter(
            checkpoint_folder=".",
            is_final_train_phase=True,
            mode="iteration",
            mode_num=0,
            backend="disk",
        )
        content = {
            "classy_state_dict": {
                "base_model": {
                    "model": {
                        "trunk": model.trunk.local_state_dict()
                    },
                    "meta": {
                        "trunk": model.trunk.local_metadata_dict()
                    },
                }
            }
        }
        checkpoint_writer.save_sharded_checkpoint(content,
                                                  shard_rank=gpu_id,
                                                  world_size=world_size)
        dist.barrier()
        print(os.listdir("."))

        # Convert the checkpoint to consolidated and sliced checkpoints
        if gpu_id == 0:
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                "checkpoint.torch", "checkpoint_sliced.torch")
        dist.barrier()
        print(os.listdir("."))

        # Now create models initialized from the saved checkpoints and compare them
        fake_test_input = torch.randn(size=(1, 3, 96, 96)).cuda(gpu_id)

        shard_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint.torch", device=torch.device("cpu"))
        shard_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        shard_model = fsdp_wrapper(shard_model, **config.MODEL.FSDP_CONFIG)
        shard_model.init_model_from_weights_params_file(config, shard_cp)

        conso_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint_conso.torch", device=torch.device("cpu"))
        conso_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        conso_model = fsdp_wrapper(conso_model, **config.MODEL.FSDP_CONFIG)
        conso_model.init_model_from_weights_params_file(config, conso_cp)

        slice_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint_sliced.torch", device=torch.device("cpu"))
        slice_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        slice_model = fsdp_wrapper(slice_model, **config.MODEL.FSDP_CONFIG)
        slice_model.init_model_from_weights_params_file(config, slice_cp)

        # Verify that the models are equivalent
        if gpu_id == 0:
            slice_state_dict = slice_model.local_state_dict()
            conso_state_dict = conso_model.local_state_dict()
            assert set(slice_state_dict.keys()) == set(conso_state_dict.keys())
            for k in slice_state_dict.keys():
                slice_val = slice_state_dict[k]
                conso_val = conso_state_dict[k]
                assert torch.allclose(
                    slice_val, conso_val
                ), f"Difference for key {k}: {slice_val} VS {conso_val}"
        dist.barrier()

        with torch.no_grad():
            ref_out = model.trunk(fake_test_input)[0]
            shard_out = shard_model.trunk(fake_test_input)[0]
            conso_out = conso_model.trunk(fake_test_input)[0]
            slice_out = slice_model.trunk(fake_test_input)[0]
            assert torch.allclose(
                ref_out, shard_out), f"{ref_out.sum()} vs {shard_out.sum()}"
            assert torch.allclose(
                ref_out, conso_out), f"{ref_out.sum()} vs {conso_out.sum()}"
            assert torch.allclose(
                ref_out, slice_out), f"{ref_out.sum()} vs {slice_out.sum()}"
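
_worker is written to run once per GPU. A minimal launcher sketch is shown below, assuming the file-based rendezvous that init_distributed_on_file expects; the actual test harness may rely on VISSL's own spawning helpers instead.

import tempfile

import torch.multiprocessing as mp


def run_checkpoint_conversion_test(world_size: int = 2):
    # mp.spawn passes the process index as the first positional argument,
    # which _worker interprets as gpu_id.
    with tempfile.NamedTemporaryFile() as sync_file:
        mp.spawn(
            TestCheckpointConversion._worker,
            args=(sync_file.name, world_size),
            nprocs=world_size,
        )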