def test_fsdp_integration_with_linear_eval(self):
    """Pre-train with FSDP, consolidate the checkpoint and run a linear eval.

    The test expects the linear evaluation started from the consolidated
    checkpoint to report exactly 8 losses.
    """
    # Options shared by the pre-training and the linear evaluation
    shared_options = {
        "with_fsdp": True,
        "with_mixed_precision": False,
        "auto_wrap_threshold": 0,
    }
    with in_temporary_directory() as pretrain_dir:
        pretrain_config = self._create_pretraining_config(
            with_activation_checkpointing=True, **shared_options
        )
        run_integration_test(pretrain_config)

        # Both paths are relative: they are resolved against the current
        # working directory
        CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
            "checkpoint.torch", "checkpoint_conso.torch"
        )

        # Linear evaluation on top of the consolidated weights
        consolidated_path = os.path.join(pretrain_dir, "checkpoint_conso.torch")
        eval_losses = self.run_linear_eval(
            checkpoint_path=consolidated_path, **shared_options
        )
        self.assertEqual(8, len(eval_losses))
        print(eval_losses)
def run_cluster_assignment(self, with_fsdp: bool):
    """Pre-train, extract cluster assignments and validate their sizes.

    Args:
        with_fsdp: forwarded to both the pre-training and the
            cluster-extraction configuration builders.
    """
    assignment_file = "cluster_assignments.torch"
    with in_temporary_directory() as pretrain_dir:
        # A pre-training run provides the weights used for the extraction
        run_integration_test(self._create_pretraining_config(with_fsdp=with_fsdp))
        weights_path = os.path.join(pretrain_dir, "checkpoint.torch")

        # Run the cluster extraction from a separate temporary directory
        with in_temporary_directory() as extract_dir:
            extract_config = self._create_extract_cluster_config(
                with_fsdp=with_fsdp, checkpoint_path=weights_path
            )
            run_integration_test(extract_config, engine_name="extract_cluster")
            self.assertIn(assignment_file, os.listdir(extract_dir))

            # Move the assignments next to the pre-training outputs
            shutil.move(
                src=os.path.join(extract_dir, assignment_file),
                dst=os.path.join(pretrain_dir, assignment_file),
            )

        # Relative path: resolved against the current working directory
        loaded = ClusterAssignmentLoader.load_cluster_assigment(assignment_file)
        per_split = loaded.cluster_assignments
        self.assertEqual(40, len(per_split["TRAIN"]))
        self.assertEqual(20, len(per_split["TEST"]))
def run_benchmarking_preemption_test(
    self,
    checkpoint_path: str,
    with_fsdp: bool,
    with_eval_mlp: bool,
    num_gpu: int = 2,
):
    """Run a benchmark twice and compare the losses of the two runs.

    After the first run, its final checkpoint and logs are cleaned and the
    same configuration is run again. The losses of the second run must equal
    the losses of the first run from index 5 onwards.

    Args:
        checkpoint_path: forwarded to ``_create_benchmark_config``.
        with_fsdp: forwarded to ``_create_benchmark_config``.
        with_eval_mlp: forwarded to ``_create_benchmark_config``.
        num_gpu: forwarded to ``_create_benchmark_config``.
    """
    with in_temporary_directory() as temp_dir:
        config = self._create_benchmark_config(
            checkpoint_path,
            with_fsdp=with_fsdp,
            with_eval_mlp=with_eval_mlp,
            num_gpu=num_gpu,
        )
        config.CHECKPOINT.DIR = temp_dir

        # First run, followed by removal of its final checkpoint and logs
        first_run = run_integration_test(config)
        initial_losses = first_run.get_losses()
        first_run.clean_final_checkpoint()
        first_run.clean_logs()

        # Second run with the very same configuration
        second_run = run_integration_test(config)
        restart_losses = second_run.get_losses()

        print("INITIAL:", initial_losses)
        print("RESTART:", restart_losses)
        self.assertEqual(initial_losses[5:], restart_losses)
def test_fine_tuning_end_to_end_fsdp(self):
    """Pre-train with FSDP, slice the checkpoint and fine-tune from it.

    The fine-tuning run must report 4 accuracy entries in its metrics file.
    """
    with in_temporary_directory() as pretrain_dir:
        # A pre-training run provides the initial weights
        run_integration_test(
            self._create_pretraining_config(
                with_fsdp=True, fsdp_flatten_parameters=True
            )
        )

        # Convert the sharded checkpoint to its sliced form
        sharded_path = os.path.join(pretrain_dir, "checkpoint.torch")
        sliced_path = os.path.join(pretrain_dir, "sliced.torch")
        CheckpointFormatConverter.sharded_to_sliced_checkpoint(
            input_checkpoint_path=sharded_path,
            output_checkpoint_path=sliced_path,
        )

        # Fine-tune from a separate temporary directory
        with in_temporary_directory():
            finetune_config = self._create_finetuning_config(
                sliced_path,
                construct_single_param_group_only=False,
                regularize_bias=False,
                with_fsdp=True,
                fsdp_flatten_parameters=False,
            )
            finetune_result = run_integration_test(finetune_config)
            accuracies = finetune_result.get_accuracies(from_metrics_file=True)
            self.assertEqual(4, len(accuracies))
def test_augly_transforms(self):
    """Check that a training configured with an augly transformation runs."""
    overrides = [
        "config=test/cpu_test/test_cpu_resnet_simclr.yaml",
        "+config/test/transforms=augly_transforms_example",
    ]
    _, config = convert_to_attrdict(compose_hydra_configuration(overrides))

    # The temporary directory path itself is not needed by the test
    with in_temporary_directory():
        run_integration_test(config)
def test_legacy_profiler(self):
    """Run with ``force_legacy_profiler=True`` and check the output files.

    Four rank-0 profiler files are expected in the output directory.
    """
    expected_outputs = (
        "cuda_time_rank0.txt",
        "cuda_memory_usage_rank0.txt",
        "cpu_time_rank0.txt",
        "profiler_chrome_trace_rank0.json",
    )
    with in_temporary_directory() as output_dir:
        run_integration_test(self._create_config(force_legacy_profiler=True))
        produced = set(os.listdir(output_dir))
        print(produced)
        for expected in expected_outputs:
            self.assertIn(expected, produced)
def test_benchmarking_from_sharded_checkpoint(self):
    """Pre-train with FSDP and benchmark from the resulting checkpoint.

    The benchmark must report at least one loss and exactly 4 accuracy
    entries.
    """
    with in_temporary_directory() as checkpoint_folder:
        # Run a pre-training in FSDP mode and save a sharded checkpoint
        config = self._create_pretraining_config(with_fsdp=True)
        run_integration_test(config)
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Verify that FSDP can load the checkpoint and run a benchmark on it
        fsdp_losses, fsdp_accuracies = self.run_benchmarking(
            checkpoint_path, with_fsdp=True
        )
        # A length is never negative: require at least one loss so that
        # this assertion can actually fail
        self.assertGreater(len(fsdp_losses), 0)
        self.assertEqual(4, len(fsdp_accuracies))
def run_cluster_assignment(self, with_fsdp: bool):
    """Pre-train, extract cluster assignments and validate all the outputs.

    Besides the compact ``cluster_assignments.torch`` file, the extraction
    is expected to export disk-filelist ``.npy`` files and, for each rank,
    prototype score files whose second dimension is 3000.

    Args:
        with_fsdp: forwarded to both the pre-training and the
            cluster-extraction configuration builders.
    """
    assignment_file = "cluster_assignments.torch"
    filelist_outputs = (
        "train_images.npy",
        "train_labels.npy",
        "test_images.npy",
        "test_labels.npy",
    )
    with in_temporary_directory() as pretrain_dir:
        # A pre-training run provides the weights used for the extraction
        run_integration_test(self._create_pretraining_config(with_fsdp=with_fsdp))

        with in_temporary_directory() as extract_dir:
            extract_config = self._create_extract_cluster_config(
                with_fsdp=with_fsdp,
                checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
            )
            extract_config.EXTRACT_FEATURES.CHUNK_THRESHOLD = 10
            run_integration_test(extract_config, engine_name="extract_cluster")
            extraction_outputs = os.listdir(extract_dir)

            # Compact format first, then the dataset disk_filelist format
            for expected in (assignment_file, *filelist_outputs):
                self.assertIn(expected, extraction_outputs)

            # Soft assignments (on prototypes): two train chunks and one
            # test chunk are checked for each of the two ranks
            for rank in range(2):
                proto_files = [
                    f"rank{rank}_chunk{chunk}_train_heads_protos.npy"
                    for chunk in range(2)
                ]
                proto_files.append(f"rank{rank}_chunk0_test_heads_protos.npy")
                for proto_file in proto_files:
                    self.assertIn(proto_file, extraction_outputs)
                    # Relative path: loaded from the current working directory
                    self.assertEqual(np.load(proto_file).shape[1], 3000)

            # Move the assignments next to the pre-training outputs
            shutil.move(
                src=os.path.join(extract_dir, assignment_file),
                dst=os.path.join(pretrain_dir, assignment_file),
            )

        # Relative path: resolved against the current working directory
        loaded = ClusterAssignmentLoader.load_cluster_assigment(assignment_file)
        per_split = loaded.cluster_assignments
        self.assertEqual(40, len(per_split["TRAIN"]))
        self.assertEqual(20, len(per_split["TEST"]))
def test_fsdp_integration_with_linear_eval(self):
    """Check that linear eval agrees across several checkpoint formats.

    After an FSDP pre-training, the sharded checkpoint is converted in three
    ways (consolidated, sliced, and sliced from the consolidated one). The
    first loss of a linear evaluation on each converted checkpoint must match
    the first loss obtained from the sharded checkpoint to 4 decimal places.
    """
    eval_options = {
        "with_fsdp": True,
        "with_mixed_precision": False,
        "auto_wrap_threshold": 0,
    }
    converted_checkpoints = [
        "checkpoint_conso.torch",
        "checkpoint_sliced.torch",
        "checkpoint_sliced_2.torch",
    ]
    with in_temporary_directory() as pretrain_dir:
        # Pre-training
        pretrain_config = self._create_pretraining_config(
            with_activation_checkpointing=True, **eval_options
        )
        run_integration_test(pretrain_config)

        # Three conversions, all using paths relative to the working directory
        CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
            "checkpoint.torch", "checkpoint_conso.torch"
        )
        CheckpointFormatConverter.sharded_to_sliced_checkpoint(
            "checkpoint.torch", "checkpoint_sliced.torch"
        )
        CheckpointFormatConverter.consolidated_to_sliced_checkpoint(
            "checkpoint_conso.torch", "checkpoint_sliced_2.torch"
        )

        # Reference: linear evaluation on the sharded checkpoint
        ref_losses = self.run_linear_eval(
            checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch"),
            **eval_options,
        )
        self.assertEqual(8, len(ref_losses))

        # The converted checkpoints must lead to the same first loss
        for checkpoint_name in converted_checkpoints:
            losses = self.run_linear_eval(
                checkpoint_path=os.path.join(pretrain_dir, checkpoint_name),
                **eval_options,
            )
            self.assertEqual(8, len(losses))
            self.assertAlmostEqual(
                losses[0],
                ref_losses[0],
                places=4,
                msg=f"Failed for {checkpoint_name}",
            )
def test_benchmarking_from_sharded_checkpoint_with_preemption(self):
    """Pre-train with FSDP, then run the benchmark preemption check twice.

    The preemption check is run once with ``with_eval_mlp=True`` and once
    with ``with_eval_mlp=False``, both from the same checkpoint.
    """
    with in_temporary_directory() as checkpoint_folder:
        # FSDP pre-training to obtain a checkpoint
        run_integration_test(self._create_pretraining_config(with_fsdp=True))
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Benchmark from the checkpoint and verify the restart behaviour
        for use_eval_mlp in (True, False):
            self.run_benchmarking_preemption_test(
                checkpoint_path, with_fsdp=True, with_eval_mlp=use_eval_mlp
            )
def run_preemption_test(self, config: AttrDict, compare_losses: bool = True):
    """Run ``config`` twice and compare the tail of the first run to the second.

    Between the two runs, the final checkpoint and the logs of the first run
    are cleaned. The iterations of the second run must equal the last
    iterations of the first run; losses are compared the same way unless
    ``compare_losses`` is false.

    Args:
        config: configuration given to ``run_integration_test`` for both runs.
        compare_losses: whether to also compare the loss values.
    """
    first_run = run_integration_test(config)
    initial_iters, initial_losses = first_run.get_losses_with_iterations()
    first_run.clean_final_checkpoint()
    first_run.clean_logs()

    second_run = run_integration_test(config)
    restart_iters, restart_losses = second_run.get_losses_with_iterations()

    print("INITIAL:", initial_iters, initial_losses)
    print("RESTART:", restart_iters, restart_losses)

    iters_tail = initial_iters[-len(restart_iters):]
    self.assertEqual(iters_tail, restart_iters)
    if not compare_losses:
        return
    losses_tail = initial_losses[-len(restart_losses):]
    self.assertEqual(losses_tail, restart_losses)
def test_benchmarking_from_a_consolidated_checkpoint(self):
    """Benchmark a non-FSDP checkpoint with and without FSDP and compare.

    Losses and accuracies of both benchmark runs must be equal.
    """
    with in_temporary_directory() as checkpoint_folder:
        # Pre-training without FSDP
        run_integration_test(self._create_pretraining_config(with_fsdp=False))
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Same benchmark, first without FSDP and then with FSDP
        ddp_outputs = self.run_benchmarking(checkpoint_path, with_fsdp=False)
        fsdp_outputs = self.run_benchmarking(checkpoint_path, with_fsdp=True)

        ddp_losses, ddp_accuracies = ddp_outputs
        fsdp_losses, fsdp_accuracies = fsdp_outputs
        self.assertEqual(ddp_losses, fsdp_losses)
        self.assertEqual(ddp_accuracies, fsdp_accuracies)
def test_extract_cluster_assignment_ddp(self):
    """Extract head features to a separate directory and merge them back.

    Checks the per-rank, per-chunk files written to the output directory,
    then the shapes of the merged features for the train and test splits.
    """

    def assert_shapes(feat, count):
        # Shapes expected for `count` samples of the "heads" layer
        self.assertEqual(feat["features"].shape, torch.Size([count, 128]))
        self.assertEqual(feat["targets"].shape, torch.Size([count, 1]))
        self.assertEqual(feat["inds"].shape, torch.Size([count]))

    with in_temporary_directory() as pretrain_dir:
        # A pre-training run provides the weights
        run_integration_test(self._create_pretraining_config())

        # Directory that will contain the extracted features
        with in_temporary_directory() as extract_dir:
            # The extraction runs from yet another directory, with its
            # output directory pointing to `extract_dir`
            with in_temporary_directory():
                extract_config = self._create_extract_features_config(
                    checkpoint_path=os.path.join(pretrain_dir, "checkpoint.torch")
                )
                extract_config.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                run_integration_test(extract_config, engine_name="extract_features")

            # Files expected for each rank and chunk of the train split
            folder_content = os.listdir(extract_dir)
            print(folder_content)
            expected_files = [
                f"rank{rank}_chunk{chunk}_train_heads_{kind}.npy"
                for rank in [0, 1]
                for chunk in range(5)
                for kind in ("features", "inds", "targets")
            ]
            for expected in expected_files:
                self.assertIn(expected, folder_content)

            # Merging the train split
            train_feat = merge_features(extract_dir, "train", "heads")
            print(train_feat)
            assert_shapes(train_feat, 40)

            # Merging the test split
            test_feat = merge_features(extract_dir, "test", "heads")
            assert_shapes(test_feat, 20)
def run_config(self, config, with_memory: bool = False):
    """Run ``config`` in a temporary directory and return its losses.

    Args:
        config: configuration given to ``run_integration_test``.
        with_memory: when true, also return the peak memory of the run.

    Returns:
        The losses, or a ``(losses, peak_memory)`` tuple if ``with_memory``.
    """
    with in_temporary_directory():
        outcome = run_integration_test(config)
        losses = outcome.get_losses()
        if not with_memory:
            return losses
        return losses, outcome.get_peak_memory()
def test_fine_tuning_end_to_end(self):
    """Pre-train, then fine-tune from the resulting checkpoint.

    The fine-tuning run must report 4 accuracy entries in its metrics file.
    """
    with in_temporary_directory() as pretrain_dir:
        # A pre-training run provides the initial weights
        run_integration_test(self._create_pretraining_config())
        weights_path = os.path.join(pretrain_dir, "checkpoint.torch")

        # Fine-tune from a separate temporary directory
        with in_temporary_directory():
            finetune_config = self._create_finetuning_config(
                weights_path,
                construct_single_param_group_only=False,
                regularize_bias=False,
            )
            finetune_result = run_integration_test(finetune_config)
            accuracies = finetune_result.get_accuracies(from_metrics_file=True)
            self.assertEqual(4, len(accuracies))
def run_benchmarking(self, checkpoint_path: str, with_fsdp: bool, num_gpu: int = 2):
    """Run a benchmark in a temporary directory.

    Args:
        checkpoint_path: forwarded to ``_create_benchmark_config``.
        with_fsdp: forwarded to ``_create_benchmark_config``.
        num_gpu: forwarded to ``_create_benchmark_config``.

    Returns:
        A ``(losses, accuracies)`` tuple, the accuracies being read from the
        metrics file.
    """
    with in_temporary_directory() as temp_dir:
        benchmark_config = self._create_benchmark_config(
            checkpoint_path, with_fsdp=with_fsdp, num_gpu=num_gpu
        )
        benchmark_config.CHECKPOINT.DIR = temp_dir
        outcome = run_integration_test(benchmark_config)
        losses = outcome.get_losses()
        accuracies = outcome.get_accuracies(from_metrics_file=True)
        return losses, accuracies
def test_regnet_10b_swav_pretraining(self):
    """Run the 10B pre-training config for 2 steps and expect 2 losses."""
    with in_temporary_directory():
        pretrain_config = self._create_10B_pretrain_config(
            num_gpus=8, num_steps=2, batch_size=4
        )
        losses = run_integration_test(pretrain_config).get_losses()
        print(losses)
        self.assertEqual(len(losses), 2)
def test_prehemption_during_training(self):
    """Re-run a pre-training after removing its final outputs.

    After the first run, the final phase checkpoint, ``checkpoint.torch`` and
    ``log.txt`` are deleted and the same configuration is run again. The last
    loss of both runs must match to 5 decimal places.
    """
    final_checkpoint = "model_final_checkpoint_phase3.torch"
    with in_temporary_directory() as temp_dir:
        config = self._create_dino_pretraining_config(
            with_mixed_precision=False, gpu_count=2
        )
        losses_before = run_integration_test(config).get_losses()

        # The first run must have produced the final phase checkpoint
        self.assertIn(final_checkpoint, os.listdir(temp_dir))

        # Relative paths: removed from the current working directory
        for file_name in (final_checkpoint, "checkpoint.torch", "log.txt"):
            os.remove(file_name)

        losses_after = run_integration_test(config).get_losses()
        print(losses_before)
        print(losses_after)
        self.assertAlmostEqual(losses_after[-1], losses_before[-1], places=5)
def test_regnet_10b_evaluation(self):
    """Run the 10B evaluation config on a sliced checkpoint.

    The checkpoint location is a hard-coded absolute path. The run must
    report at least one loss.
    """
    cp_path = "/checkpoint/qduval/vissl/seer/regnet10B_sliced/model_iteration124500_sliced.torch"
    with in_temporary_directory():
        eval_config = self._create_10B_evaluation_config(
            num_gpus=8,
            num_steps=2,
            batch_size=4,
            path_to_sliced_checkpoint=cp_path,
        )
        losses = run_integration_test(eval_config).get_losses()
        print(losses)
        self.assertGreater(len(losses), 0)
def run_pretraining(
    self,
    with_fsdp: bool,
    with_activation_checkpointing: bool,
    with_mixed_precision: bool,
):
    """Run a pre-training in a temporary directory and return its losses.

    All the arguments are forwarded to ``_create_pretraining_config``.
    """
    config_options = {
        "with_fsdp": with_fsdp,
        "with_activation_checkpointing": with_activation_checkpointing,
        "with_mixed_precision": with_mixed_precision,
    }
    with in_temporary_directory():
        pretrain_config = self._create_pretraining_config(**config_options)
        return run_integration_test(pretrain_config).get_losses()
def test_dino_xcit_prehemption(self):
    """Re-run a pre-training after removing its final outputs.

    After the first run, the final phase checkpoint, ``checkpoint.torch`` and
    ``log.txt`` are deleted and the same configuration is run again. The last
    loss of both runs must match to 4 decimal places.
    """
    final_checkpoint = "model_final_checkpoint_phase3.torch"
    with in_temporary_directory() as temp_dir:
        config = self._create_dino_pretraining_config(
            with_mixed_precision=False, gpu_count=2
        )
        # For deterministic computing
        config.MODEL.TRUNK.XCIT.DROP_PATH_RATE = 0.0

        losses_before = run_integration_test(config).get_losses()

        # The first run must have produced the final phase checkpoint
        self.assertIn(final_checkpoint, os.listdir(temp_dir))

        # Relative paths: removed from the current working directory
        for file_name in (final_checkpoint, "checkpoint.torch", "log.txt"):
            os.remove(file_name)

        losses_after = run_integration_test(config).get_losses()
        print(losses_before)
        print(losses_after)
        self.assertAlmostEqual(losses_after[-1], losses_before[-1], places=4)
def test_pretraining_and_evaluation(self):
    """Pre-train for one epoch, then run a linear eval from the checkpoint.

    Only the pre-training losses are asserted on (at least one is expected);
    the evaluation losses are printed.
    """
    with in_temporary_directory() as pretrain_dir:
        pretrain_config = self._create_dino_pretraining_config(
            with_mixed_precision=True, gpu_count=2, num_epochs=1
        )
        ddp_losses = run_integration_test(pretrain_config).get_losses()
        self.assertGreater(len(ddp_losses), 0)

        # Linear evaluation starting from the pre-training checkpoint
        weights_path = os.path.join(pretrain_dir, "checkpoint.torch")
        eval_config = self._create_dino_linear_eval_config(
            checkpoint_path=weights_path, gpu_count=2
        )
        print(self.run_config(eval_config))
def run_linear_eval(
    self,
    checkpoint_path: str,
    with_fsdp: bool,
    with_mixed_precision: bool,
    auto_wrap_threshold: int = 0,
):
    """Run a linear evaluation in a temporary directory and return its losses.

    Args:
        checkpoint_path: stored in ``MODEL.WEIGHTS_INIT.PARAMS_FILE``.
        with_fsdp: forwarded to ``_create_linear_evaluation_config``.
        with_mixed_precision: forwarded to ``_create_linear_evaluation_config``.
        auto_wrap_threshold: forwarded to ``_create_linear_evaluation_config``.
    """
    config_options = {
        "with_fsdp": with_fsdp,
        "with_mixed_precision": with_mixed_precision,
        "auto_wrap_threshold": auto_wrap_threshold,
    }
    with in_temporary_directory():
        eval_config = self._create_linear_evaluation_config(**config_options)
        eval_config.MODEL.WEIGHTS_INIT.PARAMS_FILE = checkpoint_path
        return run_integration_test(eval_config).get_losses()
def run_pretraining(
    self,
    with_fsdp: bool,
    with_activation_checkpointing: bool,
    with_mixed_precision: bool,
    auto_wrap_threshold: int = 0,
    force_sync_all_gather: bool = False,
):
    """Run a pre-training in a temporary directory and return its losses.

    All the arguments are forwarded to ``_create_pretraining_config``.
    """
    config_options = {
        "with_fsdp": with_fsdp,
        "with_activation_checkpointing": with_activation_checkpointing,
        "with_mixed_precision": with_mixed_precision,
        "auto_wrap_threshold": auto_wrap_threshold,
        "force_sync_all_gather": force_sync_all_gather,
    }
    with in_temporary_directory():
        pretrain_config = self._create_pretraining_config(**config_options)
        return run_integration_test(pretrain_config).get_losses()
def test_ema_hook(self):
    """Check that the EMA model is checkpointed and its meters are logged.

    Runs the quick linear evaluation config on synthetic data with the EMA
    hook options enabled, then verifies that:

    - the checkpoint's ``classy_state_dict`` contains an ``ema_model`` entry,
    - the metrics file holds 8 entries,
    - the entry at index 1 contains ``train_accuracy_list_meter_ema``.
    """
    cfg = compose_hydra_configuration(
        [
            "config=test/integration_test/quick_eval_in1k_linear.yaml",
            "config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
            "config.DATA.TRAIN.LABEL_SOURCES=[synthetic]",
            "config.DATA.TEST.DATA_SOURCES=[synthetic]",
            "config.DATA.TEST.LABEL_SOURCES=[synthetic]",
            "config.DATA.TRAIN.DATA_LIMIT=40",
            "config.OPTIMIZER.num_epochs=2",
            "config.HOOKS.EMA_MODEL.SAVE_EMA_MODEL=True",
            "config.HOOKS.EMA_MODEL.ENABLE_EMA_METERS=True",
            "config.HOOKS.EMA_MODEL.EMA_DEVICE=gpu",
        ],
    )
    _, config = convert_to_attrdict(cfg)

    with in_temporary_directory() as checkpoint_folder:
        # Run a quick_eval_in1k_linear.
        integration_logs = run_integration_test(config)
        checkpoint_path = os.path.join(checkpoint_folder, "checkpoint.torch")

        # Test that the ema model is saved in the checkpoint.
        # assertIn reports the container content on failure, unlike
        # assertTrue on a pre-computed boolean.
        checkpoint = load_checkpoint(checkpoint_path)
        self.assertIn(
            "ema_model",
            checkpoint["classy_state_dict"],
            msg="ema_model has not been saved to the checkpoint folder.",
        )

        # Check the number of entries first, so that a metrics file that is
        # too short fails with this message instead of an IndexError below.
        metrics = integration_logs.get_accuracies(from_metrics_file=True)
        self.assertEqual(
            len(metrics),
            8,
            "the metrics.json output does not have the appropriate number of entries.",
        )

        # Test that train_accuracy_list_meter_ema have been logged to metrics.json.
        self.assertIn(
            "train_accuracy_list_meter_ema",
            metrics[1],
            msg="train_accuracy_list_meter_ema is not logged to the metrics.json file.",
        )
def run_benchmarking(self, checkpoint_path: str, with_fsdp: bool):
    """Run a benchmark in a temporary directory.

    Args:
        checkpoint_path: forwarded to ``_create_benchmark_config``.
        with_fsdp: forwarded to ``_create_benchmark_config``.

    Returns:
        A ``(losses, accuracies)`` tuple.
    """
    with in_temporary_directory():
        benchmark_config = self._create_benchmark_config(
            checkpoint_path, with_fsdp=with_fsdp
        )
        outcome = run_integration_test(benchmark_config)
        losses = outcome.get_losses()
        accuracies = outcome.get_accuracies()
        return losses, accuracies
def test_extract_cluster_assignment_ddp(self):
    """Extract head and trunk features, then reload them in several ways.

    Head features are checked through the files written to the output
    directory and through ``merge_features``. Trunk ("res5") features are
    checked through ``merge_features`` and through the loading and sampling
    functions of ``ExtractedFeaturesLoader``, flattened and unflattened.
    """

    def assert_shapes(feat, feature_shape, with_targets_and_inds=True):
        # First dimension of `feature_shape` is the expected sample count
        count = feature_shape[0]
        self.assertEqual(feat["features"].shape, torch.Size(feature_shape))
        if with_targets_and_inds:
            self.assertEqual(feat["targets"].shape, torch.Size([count, 1]))
            self.assertEqual(feat["inds"].shape, torch.Size([count]))

    with in_temporary_directory() as pretrain_dir:
        # A pre-training run provides the weights
        run_integration_test(self._create_pretraining_config())
        weights_path = os.path.join(pretrain_dir, "checkpoint.torch")

        # Directory that will contain the extracted features
        with in_temporary_directory() as extract_dir:
            # Head features: the extraction runs from yet another directory,
            # with its output directory pointing to `extract_dir`
            with in_temporary_directory():
                extract_config = self._create_extract_features_config_head(
                    checkpoint_path=weights_path
                )
                extract_config.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                run_integration_test(extract_config, engine_name="extract_features")

            # Files expected for each rank and chunk of the train split
            folder_content = os.listdir(extract_dir)
            print(folder_content)
            expected_files = [
                f"rank{rank}_chunk{chunk}_train_heads_{kind}.npy"
                for rank in [0, 1]
                for chunk in range(5)
                for kind in ("features", "inds", "targets")
            ]
            for expected in expected_files:
                self.assertIn(expected, folder_content)

            # Merging the head features (train split)
            train_feat = merge_features(extract_dir, "train", "heads")
            print(train_feat)
            assert_shapes(train_feat, [40, 128])

            # Merging the head features (test split)
            test_feat = merge_features(extract_dir, "test", "heads")
            assert_shapes(test_feat, [20, 128])

            # Trunk features, extracted to the same output directory
            with in_temporary_directory():
                extract_config = self._create_extract_features_config_trunk(
                    checkpoint_path=weights_path
                )
                extract_config.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
                run_integration_test(extract_config, engine_name="extract_features")

            # Merging without flattening
            train_feat = merge_features(extract_dir, "train", "res5")
            assert_shapes(train_feat, [40, 2048, 2, 2])

            # Loading without flattening (only the features are checked)
            train_feat = ExtractedFeaturesLoader.load_features(
                extract_dir, "train", "res5"
            )
            assert_shapes(
                train_feat, [40, 2048, 2, 2], with_targets_and_inds=False
            )

            # Loading with flattening
            train_feat = ExtractedFeaturesLoader.load_features(
                extract_dir, "train", "res5", flatten_features=True
            )
            assert_shapes(train_feat, [40, 2048 * 2 * 2])

            # Sampling 10 features, unflattened
            train_feat = ExtractedFeaturesLoader.sample_features(
                input_dir=extract_dir,
                split="train",
                layer="res5",
                num_samples=10,
                seed=0,
            )
            assert_shapes(train_feat, [10, 2048, 2, 2])

            # Sampling 10 features, flattened
            train_feat = ExtractedFeaturesLoader.sample_features(
                input_dir=extract_dir,
                split="train",
                layer="res5",
                num_samples=10,
                seed=0,
                flatten_features=True,
            )
            assert_shapes(train_feat, [10, 2048 * 2 * 2])
def run_config(self, config):
    """Run ``config`` in a temporary directory and return its losses."""
    with in_temporary_directory():
        outcome = run_integration_test(config)
        losses = outcome.get_losses()
        return losses
def test_benchmarking_with_checkpoint_resharding(self):
    """Convert a sharded checkpoint and benchmark from every format.

    Checks the files created by the conversions, that benchmarks from the
    converted checkpoints give the same losses and accuracies as the
    original checkpoint, and that the converted checkpoints can be
    benchmarked with ``num_gpu=1``.
    """
    with in_temporary_directory() as checkpoint_folder:
        # FSDP pre-training to obtain a checkpoint
        run_integration_test(self._create_pretraining_config(with_fsdp=True))

        sharded_path = os.path.join(checkpoint_folder, "checkpoint.torch")
        consolidated_path = os.path.join(checkpoint_folder, "checkpoint_eval_1.torch")
        sliced_path = os.path.join(checkpoint_folder, "checkpoint_eval_2.torch")
        converted_paths = [consolidated_path, sliced_path]

        # Snapshot of the directory before any conversion takes place
        files_before_conversion = set(os.listdir(checkpoint_folder))

        # Sharded -> consolidated
        CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
            input_checkpoint_path=sharded_path,
            output_checkpoint_path=consolidated_path,
        )

        # Sharded -> sliced
        CheckpointFormatConverter.sharded_to_sliced_checkpoint(
            input_checkpoint_path=sharded_path,
            output_checkpoint_path=sliced_path,
        )

        # Only these three entries must have been added to the directory
        created = set(os.listdir(checkpoint_folder)) - files_before_conversion
        self.assertEqual(
            created,
            {
                "checkpoint_eval_1.torch",
                "checkpoint_eval_2.torch",
                "checkpoint_eval_2_layers",
            },
            "checkpoint 2 slices should be packaged in a directory",
        )

        # Reference benchmark, from the original checkpoint
        eval_losses, eval_accuracies = self.run_benchmarking(
            sharded_path, with_fsdp=True
        )
        self.assertGreater(len(eval_losses), 0)
        self.assertEqual(4, len(eval_accuracies))

        # Converted checkpoints must reproduce the reference results
        for converted_path in converted_paths:
            fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                converted_path, with_fsdp=True
            )
            self.assertEqual(fsdp_losses, eval_losses)
            self.assertEqual(fsdp_accuracies, eval_accuracies)

        # Converted checkpoints must also work with a different GPU count
        for converted_path in converted_paths:
            fsdp_losses, fsdp_accuracies = self.run_benchmarking(
                converted_path, with_fsdp=True, num_gpu=1
            )
            self.assertGreater(len(fsdp_losses), 0)
            self.assertEqual(len(fsdp_accuracies), 4)
def test_knn_fsdp(self):
    """Extract FSDP features and compare the two KNN implementations.

    After an FSDP pre-training and a conversion to a sliced checkpoint, head
    and trunk features are extracted and their merged shapes verified. Both
    ``run_knn_at_layer`` and ``run_knn_at_layer_low_memory`` are then run on
    the "res5" and "heads" layers: their totals must be equal and their
    top-1 accuracies must fall in a fixed interval.
    """

    def assert_shapes(feat, feature_dim):
        # 200 samples are expected in the train split
        self.assertEqual(feat["features"].shape, torch.Size([200, feature_dim]))
        self.assertEqual(feat["targets"].shape, torch.Size([200, 1]))
        self.assertEqual(feat["inds"].shape, torch.Size([200]))

    with in_temporary_directory() as pretrain_dir:
        # A pre-training run provides the weights
        pretrain_config = self._create_pretraining_config(with_fsdp=True)
        pretrain_losses = run_integration_test(pretrain_config).get_losses()
        print(pretrain_losses)

        # Sliced checkpoint for easy loading (paths relative to the
        # current working directory)
        CheckpointFormatConverter.sharded_to_sliced_checkpoint(
            "checkpoint.torch", "checkpoint_sliced.torch"
        )
        checkpoint_path = os.path.join(pretrain_dir, "checkpoint_sliced.torch")

        # Directory that will contain the extracted features
        with in_temporary_directory() as extract_dir:
            # Head features
            extract_config_head = self._create_extract_features_config_head(
                checkpoint_path=checkpoint_path, with_fsdp=True
            )
            extract_config_head.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
            run_integration_test(extract_config_head, engine_name="extract_features")

            # Trunk features
            extract_config_trunk = self._create_extract_features_config_trunk(
                checkpoint_path=checkpoint_path, with_fsdp=True
            )
            extract_config_trunk.EXTRACT_FEATURES.OUTPUT_DIR = extract_dir
            run_integration_test(extract_config_trunk, engine_name="extract_features")

            # Merged head features
            head_feat = ExtractedFeaturesLoader.load_features(
                extract_dir, "train", "heads", flatten_features=True
            )
            assert_shapes(head_feat, 128)

            # Merged trunk features
            trunk_feat = ExtractedFeaturesLoader.load_features(
                extract_dir, "train", "res5", flatten_features=True
            )
            assert_shapes(trunk_feat, 3024 * 2 * 2)

            # KNN on the res5 layer
            extract_config_trunk.NEAREST_NEIGHBOR.FEATURES.PATH = extract_dir
            top_1_ref, _top_5_ref, total_ref = run_knn_at_layer(
                extract_config_trunk, layer_name="res5"
            )
            top_1_opt, _top_5_opt, total_opt = run_knn_at_layer_low_memory(
                extract_config_trunk, layer_name="res5"
            )
            self.assertEqual(total_ref, total_opt)
            # TODO - investigate: both KNN implementation have a bit of
            # randomness in their accuracies, so the asserts are inequalities
            # instead of equalities between the top-1 and top-5 values.
            for top_1 in (top_1_ref, top_1_opt):
                self.assertLessEqual(top_1, 30.0)
            for top_1 in (top_1_ref, top_1_opt):
                self.assertGreaterEqual(top_1, 29.0)

            # KNN on the head layer
            extract_config_head.NEAREST_NEIGHBOR.FEATURES.PATH = extract_dir
            top_1_ref, _top_5_ref, total_ref = run_knn_at_layer(
                extract_config_head, layer_name="heads"
            )
            top_1_opt, _top_5_opt, total_opt = run_knn_at_layer_low_memory(
                extract_config_head, layer_name="heads"
            )
            self.assertEqual(total_ref, total_opt)
            # TODO - investigate: same randomness as above, hence inequalities
            for top_1 in (top_1_ref, top_1_opt):
                self.assertLessEqual(top_1, 35.0)
            for top_1 in (top_1_ref, top_1_opt):
                self.assertGreaterEqual(top_1, 33.0)