Example #1
    def test_fsdp_integration_with_linear_eval(self):
        with in_temporary_directory() as pretrain_dir:

            # Start pre-training
            config = self._create_pretraining_config(
                with_fsdp=True,
                with_activation_checkpointing=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            run_integration_test(config)

            # Consolidate the weights
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")

            # Load the checkpoint and perform a linear evaluation on it
            losses = self.run_linear_eval(
                checkpoint_path=os.path.join(pretrain_dir,
                                             "checkpoint_conso.torch"),
                with_fsdp=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            self.assertEqual(8, len(losses))
            print(losses)
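These tests lean heavily on a few shared helpers. The in_temporary_directory context manager, used in every example, isolates checkpoints and logs in a scratch directory. Its real implementation lives in the VISSL test utilities and is not shown here; a minimal sketch, assuming it only needs to create a temp directory, chdir into it, and restore the previous working directory on exit (the examples rely on both the yielded path and the chdir), could look like this:

import contextlib
import os
import tempfile

@contextlib.contextmanager
def in_temporary_directory():
    # Assumption: yields the temp dir path and chdirs into it, so that
    # relative paths like "checkpoint.torch" resolve inside the sandbox.
    previous_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        try:
            yield temp_dir
        finally:
            os.chdir(previous_cwd)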
Example #2
    def run_cluster_assignment(self, with_fsdp: bool):
        with in_temporary_directory() as pretrain_dir:

            # Pre-train a SwAV model in order to get some weights
            pretrain_config = self._create_pretraining_config(with_fsdp=with_fsdp)
            run_integration_test(pretrain_config)

            # Extract the cluster assignments of each sample
            with in_temporary_directory() as extract_dir:
                extract_config = self._create_extract_cluster_config(
                    with_fsdp=with_fsdp,
                    checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
                )
                run_integration_test(extract_config, engine_name="extract_cluster")
                self.assertIn("cluster_assignments.torch", os.listdir(extract_dir))
                shutil.move(
                    src=os.path.join(extract_dir, "cluster_assignments.torch"),
                    dst=os.path.join(pretrain_dir, "cluster_assignments.torch"),
                )

            # Load the cluster assignments and check their structure
            assignments = ClusterAssignmentLoader.load_cluster_assigment(
                "cluster_assignments.torch"
            )
            self.assertEqual(40, len(assignments.cluster_assignments["TRAIN"]))
            self.assertEqual(20, len(assignments.cluster_assignments["TEST"]))
Example #3
    def run_benchmarking_preemption_test(
        self,
        checkpoint_path: str,
        with_fsdp: bool,
        with_eval_mlp: bool,
        num_gpu: int = 2,
    ):
        with in_temporary_directory() as temp_dir:
            config = self._create_benchmark_config(
                checkpoint_path,
                with_fsdp=with_fsdp,
                with_eval_mlp=with_eval_mlp,
                num_gpu=num_gpu,
            )
            config.CHECKPOINT.DIR = temp_dir
            results = run_integration_test(config)
            initial_losses = results.get_losses()

            results.clean_final_checkpoint()
            results.clean_logs()

            results = run_integration_test(config)
            restart_losses = results.get_losses()

            print("INITIAL:", initial_losses)
            print("RESTART:", restart_losses)

            self.assertEqual(initial_losses[5:], restart_losses)
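The preemption pattern above (run, delete the final checkpoint and logs, rerun, compare the tail of the losses) relies on the result object's clean_final_checkpoint and clean_logs helpers, whose implementation is not shown. A plausible sketch, assuming they mirror the manual deletions done by hand in the DINO preemption tests of Examples #18 and #21:

import glob
import os

# Hypothetical stand-ins for the test-utility methods, not the real ones:
def clean_final_checkpoint(checkpoint_dir: str) -> None:
    # Removing the final checkpoint makes the restarted run look preempted
    # rather than finished, so it resumes training instead of exiting early.
    for path in glob.glob(
            os.path.join(checkpoint_dir, "model_final_checkpoint_*.torch")):
        os.remove(path)

def clean_logs(checkpoint_dir: str) -> None:
    # Remove log.txt so the restarted run writes a fresh log.
    log_path = os.path.join(checkpoint_dir, "log.txt")
    if os.path.exists(log_path):
        os.remove(log_path)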
Example #4
    def test_fine_tuning_end_to_end_fsdp(self):
        with in_temporary_directory() as pretrain_dir:
            # Run a pre-training to have some weights to begin with
            pretrain_config = self._create_pretraining_config(
                with_fsdp=True, fsdp_flatten_parameters=True)
            run_integration_test(pretrain_config)
            sharded_checkpoint_path = os.path.join(pretrain_dir,
                                                   "checkpoint.torch")
            sliced_checkpoint_path = os.path.join(pretrain_dir, "sliced.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                input_checkpoint_path=sharded_checkpoint_path,
                output_checkpoint_path=sliced_checkpoint_path,
            )

            # Create a separate directory in which to run the fine-tuning
            with in_temporary_directory():
                finetune_config = self._create_finetuning_config(
                    sliced_checkpoint_path,
                    construct_single_param_group_only=False,
                    regularize_bias=False,
                    with_fsdp=True,
                    fsdp_flatten_parameters=False,
                )
                result = run_integration_test(finetune_config)
                accuracies = result.get_accuracies(from_metrics_file=True)
                self.assertEqual(4, len(accuracies))
Example #5
    def test_augly_transforms(self):
        cfg = compose_hydra_configuration([
            "config=test/cpu_test/test_cpu_resnet_simclr.yaml",
            "+config/test/transforms=augly_transforms_example",
        ], )
        _, config = convert_to_attrdict(cfg)

        with in_temporary_directory() as _:
            # Test that the training runs with an augly transformation.
            run_integration_test(config)
Example #6
 def test_legacy_profiler(self):
     with in_temporary_directory() as output_dir:
         config = self._create_config(force_legacy_profiler=True)
         run_integration_test(config)
         files = set(os.listdir(output_dir))
         print(files)
         self.assertIn("cuda_time_rank0.txt", files)
         self.assertIn("cuda_memory_usage_rank0.txt", files)
         self.assertIn("cpu_time_rank0.txt", files)
         self.assertIn("profiler_chrome_trace_rank0.json", files)
Example #7
    def test_benchmarking_from_sharded_checkpoint(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in FSDP mode and save a sharded checkpoint
            config = self._create_pretraining_config(with_fsdp=True)
            run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder,
                                           "checkpoint.torch")

            # Verify that FSDP can load the checkpoint and run a benchmark on it
            fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                checkpoint_path, with_fsdp=True)
            self.assertGreaterEqual(len(fsdp_losses), 0)
            self.assertEqual(4, len(fsdp_accuracies))
Example #8
    def run_cluster_assignment(self, with_fsdp: bool):
        with in_temporary_directory() as pretrain_dir:

            # Pre-train a SwAV model in order to get some weights
            pretrain_config = self._create_pretraining_config(
                with_fsdp=with_fsdp)
            run_integration_test(pretrain_config)

            # Extract the cluster assignments of each sample
            with in_temporary_directory() as extract_dir:
                extract_config = self._create_extract_cluster_config(
                    with_fsdp=with_fsdp,
                    checkpoint_path=os.path.join(pretrain_dir,
                                                 "checkpoint.torch"),
                )
                extract_config.EXTRACT_FEATURES.CHUNK_THRESHOLD = 10
                run_integration_test(extract_config,
                                     engine_name="extract_cluster")
                extraction_outputs = os.listdir(extract_dir)

                # Check that the cluster assignments are computed in both
                # compact format and dataset disk_filelist format
                self.assertIn("cluster_assignments.torch", extraction_outputs)
                self.assertIn("train_images.npy", extraction_outputs)
                self.assertIn("train_labels.npy", extraction_outputs)
                self.assertIn("test_images.npy", extraction_outputs)
                self.assertIn("test_labels.npy", extraction_outputs)

                # Check that the soft assignments (on prototypes) are exported
                for rank in range(2):
                    for chunk in range(2):
                        file_name = f"rank{rank}_chunk{chunk}_train_heads_protos.npy"
                        self.assertIn(file_name, extraction_outputs)
                        self.assertEqual(np.load(file_name).shape[1], 3000)
                    file_name = f"rank{rank}_chunk0_test_heads_protos.npy"
                    self.assertIn(file_name, extraction_outputs)
                    self.assertEqual(np.load(file_name).shape[1], 3000)

                # Copy the cluster assignments
                shutil.move(
                    src=os.path.join(extract_dir, "cluster_assignments.torch"),
                    dst=os.path.join(pretrain_dir,
                                     "cluster_assignments.torch"),
                )

            # Load the cluster assignments and check their structure
            assignments = ClusterAssignmentLoader.load_cluster_assigment(
                "cluster_assignments.torch")
            self.assertEqual(40, len(assignments.cluster_assignments["TRAIN"]))
            self.assertEqual(20, len(assignments.cluster_assignments["TEST"]))
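Beyond the length checks above, the loaded assignments can be inspected further. A hypothetical snippet, assuming each split in cluster_assignments maps a sample index to its hard cluster id:

from collections import Counter

# Hypothetical inspection: count how many samples land in each cluster.
train_assignments = assignments.cluster_assignments["TRAIN"]
cluster_sizes = Counter(train_assignments.values())
print(cluster_sizes.most_common(5))  # the 5 most populated clusters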
Example #9
    def test_fsdp_integration_with_linear_eval(self):
        with in_temporary_directory() as pretrain_dir:

            # Start pre-training
            config = self._create_pretraining_config(
                with_fsdp=True,
                with_activation_checkpointing=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            run_integration_test(config)

            # Consolidate the weights (3 different ways)
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                "checkpoint.torch", "checkpoint_sliced.torch")
            CheckpointFormatConverter.consolidated_to_sliced_checkpoint(
                "checkpoint_conso.torch", "checkpoint_sliced_2.torch")

            # Load the sharded checkpoint and perform a linear evaluation on it
            ref_losses = self.run_linear_eval(
                checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
                with_fsdp=True,
                with_mixed_precision=False,
                auto_wrap_threshold=0,
            )
            self.assertEqual(8, len(ref_losses))

            # Then check that the results are the same for the other kind of
            # checkpoints after consolidation has taken place
            for checkpoint_name in [
                    "checkpoint_conso.torch",
                    "checkpoint_sliced.torch",
                    "checkpoint_sliced_2.torch",
            ]:
                losses = self.run_linear_eval(
                    checkpoint_path=os.path.join(pretrain_dir,
                                                 checkpoint_name),
                    with_fsdp=True,
                    with_mixed_precision=False,
                    auto_wrap_threshold=0,
                )
                self.assertEqual(8, len(losses))
                self.assertAlmostEqual(
                    losses[0],
                    ref_losses[0],
                    places=4,
                    msg=f"Failed for {checkpoint_name}",
                )
Example #10
    def test_benchmarking_from_sharded_checkpoint_with_preemption(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in FSDP mode and save a sharded checkpoint
            config = self._create_pretraining_config(with_fsdp=True)
            run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

            # Verify that FSDP can load the checkpoint and run a benchmark on it
            # and that it can restart from a preemption of the benchmark
            self.run_benchmarking_preemption_test(
                checkpoint_path, with_fsdp=True, with_eval_mlp=True
            )
            self.run_benchmarking_preemption_test(
                checkpoint_path, with_fsdp=True, with_eval_mlp=False
            )
Example #11
    def run_preemption_test(self, config: AttrDict, compare_losses: bool = True):
        initial_result = run_integration_test(config)
        initial_iters, initial_losses = initial_result.get_losses_with_iterations()

        initial_result.clean_final_checkpoint()
        initial_result.clean_logs()

        restart_result = run_integration_test(config)
        restart_iters, restart_losses = restart_result.get_losses_with_iterations()

        print("INITIAL:", initial_iters, initial_losses)
        print("RESTART:", restart_iters, restart_losses)
        self.assertEqual(initial_iters[-len(restart_iters) :], restart_iters)
        if compare_losses:
            self.assertEqual(initial_losses[-len(restart_losses) :], restart_losses)
Example #12
    def test_benchmarking_from_a_consolidated_checkpoint(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in DDP mode and save a consolidated checkpoint
            config = self._create_pretraining_config(with_fsdp=False)
            run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder,
                                           "checkpoint.torch")

            # Now, run both DDP and FSDP linear evaluation and compare the traces
            ddp_losses, ddp_accuracies = self.run_benchmarking(checkpoint_path,
                                                               with_fsdp=False)
            fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                checkpoint_path, with_fsdp=True)
            self.assertEqual(ddp_losses, fsdp_losses)
            self.assertEqual(ddp_accuracies, fsdp_accuracies)
Example #13
    def test_extract_cluster_assignment_ddp(self):
        with in_temporary_directory() as pretrain_dir:

            # Run a pre-training to have some weights to begin with
            pretrain_config = self._create_pretraining_config()
            run_integration_test(pretrain_config)

            # Create a directory to contain the extracted features
            with in_temporary_directory() as extract_dir:

                # Run the extract engine in a separate directory to check that
                # it is correctly able to output the features in another dir
                with in_temporary_directory():
                    extract_config = self._create_extract_features_config(
                        checkpoint_path=os.path.join(pretrain_dir,
                                                     "checkpoint.torch"))
                    extract_config.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                    run_integration_test(extract_config,
                                         engine_name="extract_features")

                # Check the content of the directory containing the extracted features
                folder_content = os.listdir(extract_dir)
                print(folder_content)
                for rank in [0, 1]:
                    for chunk in range(5):
                        for file in [
                                f"rank{rank}_chunk{chunk}_train_heads_features.npy",
                                f"rank{rank}_chunk{chunk}_train_heads_inds.npy",
                                f"rank{rank}_chunk{chunk}_train_heads_targets.npy",
                        ]:
                            self.assertIn(file, folder_content)

                # Verify that we can merge the features back (train split)
                train_feat = merge_features(extract_dir, "train", "heads")
                print(train_feat)
                self.assertEqual(train_feat["features"].shape,
                                 torch.Size([40, 128]))
                self.assertEqual(train_feat["targets"].shape,
                                 torch.Size([40, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([40]))

                # Verify that we can merge the features back (test split)
                test_feat = merge_features(extract_dir, "test", "heads")
                self.assertEqual(test_feat["features"].shape,
                                 torch.Size([20, 128]))
                self.assertEqual(test_feat["targets"].shape,
                                 torch.Size([20, 1]))
                self.assertEqual(test_feat["inds"].shape, torch.Size([20]))
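merge_features stitches the per-rank, per-chunk .npy shards back into single tensors. A simplified sketch of that merge, assuming only the rank{r}_chunk{c}_{split}_{layer}_{features,inds,targets}.npy naming asserted above (the real loader also handles duplicated indices produced by the distributed sampler), might be:

import glob
import os

import numpy as np
import torch

def merge_features_sketch(extract_dir: str, split: str, layer: str) -> dict:
    # Collect every shard written by every rank and chunk for this split/layer.
    pattern = os.path.join(
        extract_dir, f"rank*_chunk*_{split}_{layer}_features.npy")
    features, inds, targets = [], [], []
    for feature_file in sorted(glob.glob(pattern)):
        prefix = feature_file[: -len("features.npy")]
        features.append(np.load(feature_file))
        inds.append(np.load(prefix + "inds.npy"))
        targets.append(np.load(prefix + "targets.npy"))
    inds = np.concatenate(inds)
    order = np.argsort(inds)  # restore the original dataset order
    return {
        "features": torch.from_numpy(np.concatenate(features)[order]),
        "targets": torch.from_numpy(np.concatenate(targets)[order]),
        "inds": torch.from_numpy(inds[order]),
    }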
Example #14
 def run_config(self, config, with_memory: bool = False):
     with in_temporary_directory():
         result = run_integration_test(config)
         losses = result.get_losses()
         if with_memory:
             return losses, result.get_peak_memory()
         return losses
Example #15
    def test_fine_tuning_end_to_end(self):
        with in_temporary_directory() as pretrain_dir:
            # Run a pre-training to have some weights to begin with
            pretrain_config = self._create_pretraining_config()
            run_integration_test(pretrain_config)
            checkpoint_path = os.path.join(pretrain_dir, "checkpoint.torch")

            # Create a separate directory in which to run the fine-tuning
            with in_temporary_directory():
                finetune_config = self._create_finetuning_config(
                    checkpoint_path,
                    construct_single_param_group_only=False,
                    regularize_bias=False,
                )
                result = run_integration_test(finetune_config)
                accuracies = result.get_accuracies(from_metrics_file=True)
                self.assertEqual(4, len(accuracies))
Example #16
 def run_benchmarking(self, checkpoint_path: str, with_fsdp: bool, num_gpu: int = 2):
     with in_temporary_directory() as temp_dir:
         config = self._create_benchmark_config(
             checkpoint_path, with_fsdp=with_fsdp, num_gpu=num_gpu
         )
         config.CHECKPOINT.DIR = temp_dir
         results = run_integration_test(config)
         return results.get_losses(), results.get_accuracies(from_metrics_file=True)
Example #17
 def test_regnet_10b_swav_pretraining(self):
     with in_temporary_directory():
         config = self._create_10B_pretrain_config(
             num_gpus=8, num_steps=2, batch_size=4
         )
         results = run_integration_test(config)
         losses = results.get_losses()
         print(losses)
         self.assertEqual(len(losses), 2)
Example #18
    def test_prehemption_during_training(self):
        with in_temporary_directory() as temp_dir:
            config = self._create_dino_pretraining_config(
                with_mixed_precision=False, gpu_count=2)
            result = run_integration_test(config)
            losses_before = result.get_losses()

            temp_dir_content = os.listdir(temp_dir)
            self.assertIn("model_final_checkpoint_phase3.torch",
                          temp_dir_content)
            os.remove("model_final_checkpoint_phase3.torch")
            os.remove("checkpoint.torch")
            os.remove("log.txt")

            result = run_integration_test(config)
            losses_after = result.get_losses()
            print(losses_before)
            print(losses_after)
            self.assertAlmostEqual(losses_after[-1],
                                   losses_before[-1],
                                   places=5)
Example #19
 def test_regnet_10b_evaluation(self):
     with in_temporary_directory():
         cp_path = "/checkpoint/qduval/vissl/seer/regnet10B_sliced/model_iteration124500_sliced.torch"
         config = self._create_10B_evaluation_config(
             num_gpus=8,
             num_steps=2,
             batch_size=4,
             path_to_sliced_checkpoint=cp_path)
         results = run_integration_test(config)
         losses = results.get_losses()
         print(losses)
         self.assertGreater(len(losses), 0)
Example #20
 def run_pretraining(
     self,
     with_fsdp: bool,
     with_activation_checkpointing: bool,
     with_mixed_precision: bool,
 ):
     with in_temporary_directory():
         config = self._create_pretraining_config(
             with_fsdp=with_fsdp,
             with_activation_checkpointing=with_activation_checkpointing,
             with_mixed_precision=with_mixed_precision,
         )
         result = run_integration_test(config)
         return result.get_losses()
Example #21
    def test_dino_xcit_prehemption(self):
        with in_temporary_directory() as temp_dir:
            config = self._create_dino_pretraining_config(
                with_mixed_precision=False, gpu_count=2
            )

            # For deterministic computing
            config.MODEL.TRUNK.XCIT.DROP_PATH_RATE = 0.0

            result = run_integration_test(config)
            losses_before = result.get_losses()

            temp_dir_content = os.listdir(temp_dir)
            self.assertIn("model_final_checkpoint_phase3.torch", temp_dir_content)
            os.remove("model_final_checkpoint_phase3.torch")
            os.remove("checkpoint.torch")
            os.remove("log.txt")

            result = run_integration_test(config)
            losses_after = result.get_losses()
            print(losses_before)
            print(losses_after)
            self.assertAlmostEqual(losses_after[-1], losses_before[-1], places=4)
Example #22
    def test_pretraining_and_evaluation(self):
        with in_temporary_directory() as pretrain_dir:
            config = self._create_dino_pretraining_config(
                with_mixed_precision=True, gpu_count=2, num_epochs=1)
            result = run_integration_test(config)
            ddp_losses = result.get_losses()
            self.assertGreater(len(ddp_losses), 0)

            eval_config = self._create_dino_linear_eval_config(
                checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
                gpu_count=2,
            )
            eval_losses = self.run_config(eval_config)
            print(eval_losses)
Example #23
 def run_linear_eval(
     self,
     checkpoint_path: str,
     with_fsdp: bool,
     with_mixed_precision: bool,
     auto_wrap_threshold: int = 0,
 ):
     with in_temporary_directory():
         config = self._create_linear_evaluation_config(
             with_fsdp=with_fsdp,
             with_mixed_precision=with_mixed_precision,
             auto_wrap_threshold=auto_wrap_threshold,
         )
         config.MODEL.WEIGHTS_INIT.PARAMS_FILE = checkpoint_path
         result = run_integration_test(config)
         return result.get_losses()
Example #24
 def run_pretraining(
     self,
     with_fsdp: bool,
     with_activation_checkpointing: bool,
     with_mixed_precision: bool,
     auto_wrap_threshold: int = 0,
     force_sync_all_gather: bool = False,
 ):
     with in_temporary_directory():
         config = self._create_pretraining_config(
             with_fsdp=with_fsdp,
             with_activation_checkpointing=with_activation_checkpointing,
             with_mixed_precision=with_mixed_precision,
             auto_wrap_threshold=auto_wrap_threshold,
             force_sync_all_gather=force_sync_all_gather,
         )
         result = run_integration_test(config)
         return result.get_losses()
Example #25
    def test_ema_hook(self):
        cfg = compose_hydra_configuration(
            [
                "config=test/integration_test/quick_eval_in1k_linear.yaml",
                "config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
                "config.DATA.TRAIN.LABEL_SOURCES=[synthetic]",
                "config.DATA.TEST.DATA_SOURCES=[synthetic]",
                "config.DATA.TEST.LABEL_SOURCES=[synthetic]",
                "config.DATA.TRAIN.DATA_LIMIT=40",
                "config.OPTIMIZER.num_epochs=2",
                "config.HOOKS.EMA_MODEL.SAVE_EMA_MODEL=True",
                "config.HOOKS.EMA_MODEL.ENABLE_EMA_METERS=True",
                "config.HOOKS.EMA_MODEL.EMA_DEVICE=gpu",
            ],
        )
        _, config = convert_to_attrdict(cfg)

        with in_temporary_directory() as checkpoint_folder:
            # Run a quick_eval_in1k_linear.
            integration_logs = run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

            # Test that the ema model is saved in the checkpoint.
            checkpoint = load_checkpoint(checkpoint_path)
            self.assertTrue(
                "ema_model" in checkpoint["classy_state_dict"].keys(),
                msg="ema_model has not been saved to the checkpoint folder.",
            )

            # Test that train_accuracy_list_meter_ema has been logged to metrics.json.
            metrics = integration_logs.get_accuracies(from_metrics_file=True)
            self.assertTrue(
                "train_accuracy_list_meter_ema" in metrics[1],
                msg="train_accuracy_list_meter_ema is not logged to the metrics.json file.",
            )

            self.assertEqual(
                len(metrics),
                8,
                "the metrics.json output does not have the appropriate number of entries.",
            )
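As a follow-up, the EMA weights verified above can be pulled back out of the checkpoint. A hypothetical snippet, assuming the "ema_model" entry is itself a state dict:

# Hypothetical: extract and inspect the EMA weights saved by the hook.
checkpoint = load_checkpoint(checkpoint_path)
ema_state = checkpoint["classy_state_dict"]["ema_model"]
print(sorted(ema_state.keys())[:5])  # peek at a few parameter names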
Example #26
 def run_benchmarking(self, checkpoint_path: str, with_fsdp: bool):
     with in_temporary_directory():
         config = self._create_benchmark_config(checkpoint_path,
                                                with_fsdp=with_fsdp)
         results = run_integration_test(config)
         return results.get_losses(), results.get_accuracies()
Example #27
    def test_extract_cluster_assignment_ddp(self):
        with in_temporary_directory() as pretrain_dir:

            # Run a pre-training to have some weights to begin with
            pretrain_config = self._create_pretraining_config()
            run_integration_test(pretrain_config)

            # Create a directory to contain the extracted features
            with in_temporary_directory() as extract_dir:

                # Run the extract engine in a separate directory to check that
                # it is correctly able to output the features in another dir
                with in_temporary_directory():
                    extract_config = self._create_extract_features_config_head(
                        checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch")
                    )
                    extract_config.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                    run_integration_test(extract_config, engine_name="extract_features")

                # Check the content of the directory containing the extracted features
                folder_content = os.listdir(extract_dir)
                print(folder_content)
                for rank in [0, 1]:
                    for chunk in range(5):
                        for file in [
                            f"rank{rank}_chunk{chunk}_train_heads_features.npy",
                            f"rank{rank}_chunk{chunk}_train_heads_inds.npy",
                            f"rank{rank}_chunk{chunk}_train_heads_targets.npy",
                        ]:
                            self.assertIn(file, folder_content)

                # Verify that we can merge the features back (train split)
                train_feat = merge_features(extract_dir, "train", "heads")
                print(train_feat)
                self.assertEqual(train_feat["features"].shape, torch.Size([40, 128]))
                self.assertEqual(train_feat["targets"].shape, torch.Size([40, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([40]))

                # Verify that we can merge the features back (test split)
                test_feat = merge_features(extract_dir, "test", "heads")
                self.assertEqual(test_feat["features"].shape, torch.Size([20, 128]))
                self.assertEqual(test_feat["targets"].shape, torch.Size([20, 1]))
                self.assertEqual(test_feat["inds"].shape, torch.Size([20]))

                # Run the extract engine this time for the features of the trunk
                with in_temporary_directory():
                    extract_config = self._create_extract_features_config_trunk(
                        checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch")
                    )
                    extract_config.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                    run_integration_test(extract_config, engine_name="extract_features")

                # Verify that we can merge the features back without flattening them
                train_feat = merge_features(extract_dir, "train", "res5")
                self.assertEqual(
                    train_feat["features"].shape, torch.Size([40, 2048, 2, 2])
                )
                self.assertEqual(train_feat["targets"].shape, torch.Size([40, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([40]))

                # Verify that we can merge the features back without flattening them (second approach)
                train_feat = ExtractedFeaturesLoader.load_features(
                    extract_dir, "train", "res5"
                )
                self.assertEqual(
                    train_feat["features"].shape, torch.Size([40, 2048, 2, 2])
                )

                # Verify that we can merge the features back but flattened
                train_feat = ExtractedFeaturesLoader.load_features(
                    extract_dir, "train", "res5", flatten_features=True
                )
                self.assertEqual(
                    train_feat["features"].shape, torch.Size([40, 2048 * 2 * 2])
                )
                self.assertEqual(train_feat["targets"].shape, torch.Size([40, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([40]))

                # Verify that we can sample the features (unflattened)
                train_feat = ExtractedFeaturesLoader.sample_features(
                    input_dir=extract_dir,
                    split="train",
                    layer="res5",
                    num_samples=10,
                    seed=0,
                )
                self.assertEqual(
                    train_feat["features"].shape, torch.Size([10, 2048, 2, 2])
                )
                self.assertEqual(train_feat["targets"].shape, torch.Size([10, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([10]))

                # Verify that we can sample the features (flattened)
                train_feat = ExtractedFeaturesLoader.sample_features(
                    input_dir=extract_dir,
                    split="train",
                    layer="res5",
                    num_samples=10,
                    seed=0,
                    flatten_features=True,
                )
                self.assertEqual(
                    train_feat["features"].shape, torch.Size([10, 2048 * 2 * 2])
                )
                self.assertEqual(train_feat["targets"].shape, torch.Size([10, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([10]))
Example #28
 def run_config(self, config):
     with in_temporary_directory():
         result = run_integration_test(config)
         return result.get_losses()
Example #29
    def test_benchmarking_with_checkpoint_resharding(self):
        with in_temporary_directory() as checkpoint_folder:
            # Run a pre-training in FSDP mode and save a sharded checkpoint
            config = self._create_pretraining_config(with_fsdp=True)
            run_integration_test(config)
            checkpoint_path = os.path.join(checkpoint_folder,
                                           "checkpoint.torch")

            # List the files inside the current working directory
            # to later test what files have been created
            files_before_conversion = set(os.listdir(checkpoint_folder))

            # Transform the sharded checkpoint to a consolidated checkpoint
            eval_checkpoint_path_1 = os.path.join(checkpoint_folder,
                                                  "checkpoint_eval_1.torch")
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                input_checkpoint_path=checkpoint_path,
                output_checkpoint_path=eval_checkpoint_path_1,
            )

            # Transform the sharded checkpoint to a sliced checkpoint
            eval_checkpoint_path_2 = os.path.join(checkpoint_folder,
                                                  "checkpoint_eval_2.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                input_checkpoint_path=checkpoint_path,
                output_checkpoint_path=eval_checkpoint_path_2,
            )

            # Verify the content of the directory after checkpoint conversion
            files_after_conversion = set(os.listdir(checkpoint_folder))
            new_files = files_after_conversion - files_before_conversion
            expected_new_files = {
                "checkpoint_eval_1.torch",
                "checkpoint_eval_2.torch",
                "checkpoint_eval_2_layers",
            }
            self.assertEqual(
                new_files,
                expected_new_files,
                "checkpoint 2 slices should be packaged in a directory",
            )

            # Run a benchmark in FSDP mode and record the losses and accuracies
            eval_losses, eval_accuracies = self.run_benchmarking(
                checkpoint_path, with_fsdp=True)
            self.assertGreater(len(eval_losses), 0)
            self.assertEqual(4, len(eval_accuracies))

            # Check that these losses and accuracies are the same with the
            # consolidated and sliced checkpoints
            for eval_checkpoint in [
                    eval_checkpoint_path_1, eval_checkpoint_path_2
            ]:
                fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                    eval_checkpoint, with_fsdp=True)
                self.assertEqual(fsdp_losses, eval_losses)
                self.assertEqual(fsdp_accuracies, eval_accuracies)

            # Check that the consolidated and sliced checkpoints, contrary to
            # the sharded checkpoint, can be used with a different number of GPUs
            for eval_checkpoint in [
                    eval_checkpoint_path_1, eval_checkpoint_path_2
            ]:
                fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                    eval_checkpoint, with_fsdp=True, num_gpu=1)
                self.assertGreater(len(fsdp_losses), 0)
                self.assertEqual(len(fsdp_accuracies), 4)
Example #30
    def test_knn_fsdp(self):
        with in_temporary_directory() as pretrain_dir:

            # Run a pre-training to have some weights to begin with
            pretrain_config = self._create_pretraining_config(with_fsdp=True)
            results = run_integration_test(pretrain_config)
            losses = results.get_losses()
            print(losses)

            # Convert checkpoint to sliced checkpoint for easy loading
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                "checkpoint.torch", "checkpoint_sliced.torch"
            )
            checkpoint_path = os.path.join(pretrain_dir, "checkpoint_sliced.torch")

            # Create a directory to contain the extracted features
            with in_temporary_directory() as extract_dir:

                # Extract head features
                extract_config_head = self._create_extract_features_config_head(
                    checkpoint_path=checkpoint_path, with_fsdp=True
                )
                extract_config_head.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                run_integration_test(
                    extract_config_head, engine_name="extract_features"
                )

                # Extract trunk features
                extract_config_trunk = self._create_extract_features_config_trunk(
                    checkpoint_path=checkpoint_path, with_fsdp=True
                )
                extract_config_trunk.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                run_integration_test(
                    extract_config_trunk, engine_name="extract_features"
                )

                # Verify that we can merge the heads features back
                train_feat = ExtractedFeaturesLoader.load_features(
                    extract_dir, "train", "heads", flatten_features=True
                )
                self.assertEqual(train_feat["features"].shape, torch.Size([200, 128]))
                self.assertEqual(train_feat["targets"].shape, torch.Size([200, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([200]))

                # Verify that we can merge the trunk features back
                train_feat = ExtractedFeaturesLoader.load_features(
                    extract_dir, "train", "res5", flatten_features=True
                )
                self.assertEqual(
                    train_feat["features"].shape, torch.Size([200, 3024 * 2 * 2])
                )
                self.assertEqual(train_feat["targets"].shape, torch.Size([200, 1]))
                self.assertEqual(train_feat["inds"].shape, torch.Size([200]))

                # Run KNN on the res5 layer
                extract_config_trunk.NEAREST_NEIGHBOR.FEATURES.PATH = extract_dir
                top_1_ref, top_5_ref, total_ref = run_knn_at_layer(
                    extract_config_trunk, layer_name="res5"
                )
                top_1_opt, top_5_opt, total_opt = run_knn_at_layer_low_memory(
                    extract_config_trunk, layer_name="res5"
                )
                self.assertEqual(total_ref, total_opt)
                # TODO - investigate: both KNN implementations have a bit of randomness
                #  in their accuracies, so the asserts are inequalities.
                self.assertLessEqual(top_1_ref, 30.0)
                self.assertLessEqual(top_1_opt, 30.0)
                self.assertGreaterEqual(top_1_ref, 29.0)
                self.assertGreaterEqual(top_1_opt, 29.0)
                # self.assertEqual(top_1_ref, top_1_opt)
                # self.assertEqual(top_5_ref, top_5_opt)

                # Run KNN on the head layer
                extract_config_head.NEAREST_NEIGHBOR.FEATURES.PATH = extract_dir
                top_1_ref, top_5_ref, total_ref = run_knn_at_layer(
                    extract_config_head, layer_name="heads"
                )
                top_1_opt, top_5_opt, total_opt = run_knn_at_layer_low_memory(
                    extract_config_head, layer_name="heads"
                )
                self.assertEqual(total_ref, total_opt)
                # TODO - investigate: both KNN implementations have a bit of randomness
                #  in their accuracies, so the asserts are inequalities.
                self.assertLessEqual(top_1_ref, 35.0)
                self.assertLessEqual(top_1_opt, 35.0)
                self.assertGreaterEqual(top_1_ref, 33.0)
                self.assertGreaterEqual(top_1_opt, 33.0)