def main(args: Namespace, cfg: AttrDict):
    setup_logging(__name__, output_dir=get_checkpoint_folder(cfg))

    # Extract the features if feature extraction is enabled
    if cfg.CLUSTERFIT.FEATURES.EXTRACT:

        # We cannot have automatic extraction with more than 1 node or otherwise
        # we would have to run this script on several nodes and thus have several
        # parallel clusterings of the features. The automatic extraction is only
        # there as a shortcut when running on a single node
        assert (
            cfg.DISTRIBUTED.NUM_NODES == 1
        ), "Automatic extraction can only work with 1 node"

        # Make sure to dump the features at the desired path
        cfg.CHECKPOINT.DIR = cfg.CLUSTERFIT.FEATURES.PATH
        cfg.CHECKPOINT.APPEND_DISTR_RUN_ID = False

        # Run the extraction of features
        set_env_vars(local_rank=0, node_id=0, cfg=cfg)
        logging.info("Setting seed....")
        set_seeds(cfg, args.node_id)
        launch_distributed(
            cfg,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

    # Otherwise, setup the path manager ourselves (in the feature extraction
    # branch above, set_env_vars already takes care of this)
    else:
        setup_path_manager()

    cluster_features(cfg)
    shutdown_logging()
def main(args: Namespace, cfg: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the cfg
    print_cfg(cfg)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    output_dir = get_checkpoint_folder(cfg)

    assert cfg.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON, (
        "Feature eval mode is not ON. Can't run train_svm. "
        "Set config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON=True "
        "in your config or from command line."
    )
    extract_low_shot_features(args, cfg, output_dir)

    # Get the names of the features that we extracted. If the user doesn't
    # specify the features to evaluate, we get the full model output and freeze
    # both head and trunk as a precaution.
    layers = get_trunk_output_feature_names(cfg.MODEL)
    if len(layers) == 0:
        layers = ["heads"]

    # train a low-shot SVM for each layer
    output = {}
    for layer in layers:
        results = train_svm_low_shot(cfg, output_dir, layer)
        output[layer] = results
    logging.info(f"Results: {output}")

    # close the logging streams including the filehandlers
    shutdown_logging()
def train_sample_places_low_shot(
    low_shot_trainer: SVMLowShotTrainer,
    k_values: List[int],
    sample_inds: List[int],
    sample_num: int,
    output_dir: str,
    layername: str,
    cfg: AttrDict,
):
    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    for low_shot_kvalue in k_values:
        checkpoint_dir = f"{output_dir}/sample{sample_num}_k{low_shot_kvalue}"
        train_data = merge_features(checkpoint_dir, "train", layername)
        train_features = train_data["features"]
        train_targets = train_data["targets"]

        checkpoint_dir = f"{output_dir}/sample{sample_inds[0]}_k{k_values[0]}"
        test_data = merge_features(checkpoint_dir, "test", layername)
        test_features = test_data["features"]
        test_targets = test_data["targets"]

        low_shot_trainer.train(train_features, train_targets, sample_num, low_shot_kvalue)
        low_shot_trainer.test(test_features, test_targets, sample_num, low_shot_kvalue)
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the configuration used
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # Extract the features if no path to already extracted features is provided
    if not config.NEAREST_NEIGHBOR.FEATURES.PATH:
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.NEAREST_NEIGHBOR.FEATURES.PATH = get_checkpoint_folder(config)

    # Run KNN on all the extracted features
    run_knn_at_all_layers(config)

    # close the logging streams including the filehandlers
    shutdown_logging()
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the configuration used
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # extract the features
    launch_distributed(
        config,
        args.node_id,
        engine_name="extract_features",
        hook_generator=default_hook_generator,
    )

    # Get the names of the features that we are extracting. If the user doesn't
    # specify the features to evaluate, we get the full model output and freeze
    # both head and trunk as a precaution.
    feat_names = get_trunk_output_feature_names(config.MODEL)
    if len(feat_names) == 0:
        feat_names = ["heads"]

    for layer in feat_names:
        top1, top5 = nearest_neighbor_test(config, layer_name=layer)
        logging.info(f"layer: {layer} Top1: {top1}, Top5: {top5}")

    # close the logging streams including the filehandlers
    shutdown_logging()
def extract_clusters(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the cluster assignment extraction workflow on one node
    """
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Build the SSL trainer to set up distributed training and then
    # extract the cluster assignments for all entries in the dataset
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    cluster_assignments = trainer.extract_clusters()

    # Save the cluster assignments in the output folder
    if dist_rank == 0:
        ClusterAssignmentLoader.save_cluster_assignment(
            output_dir=get_checkpoint_folder(cfg),
            assignments=ClusterAssignment(
                config=cfg, cluster_assignments=cluster_assignments
            ),
        )

    # close the logging streams including the file handlers
    logging.info("All Done!")
    shutdown_logging()
def setup_pathmanager():
    """
    Setup PathManager. A bit hacky -- we use the #set_env_vars method to setup
    pathmanager and as such we need to create a dummy config, and dummy values
    for local_rank and node_id.
    """
    with initialize_config_module(config_module="vissl.config"):
        cfg = compose(
            "defaults",
            overrides=["config=test/integration_test/quick_swav"],
        )

    config = AttrDict(cfg).config
    set_env_vars(local_rank=0, node_id=0, cfg=config)
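A hedged note on why a helper like this is handy: any standalone script that reads or writes files through iopath's global path manager needs the handlers registered first. The minimal sketch below is not from the original source; the `/tmp` path is an illustrative placeholder and assumes `setup_pathmanager` is importable in the current scope.

# Hypothetical usage sketch (assumptions: iopath installed, setup_pathmanager in scope)
from iopath.common.file_io import g_pathmgr

setup_pathmanager()
# any path access now goes through the handlers registered by set_env_vars
assert g_pathmgr.exists("/tmp")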
def main(args: Namespace, config: AttrDict):
    # setup the logging
    setup_logging(__name__)

    # print the config
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    instance_retrieval_test(args, config)

    # close the logging streams including the filehandlers
    shutdown_logging()
def main(args: Namespace, config: AttrDict):
    config = validate_and_infer_config(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # setup the logging
    checkpoint_folder = get_checkpoint_folder(config)
    setup_logging(__name__, output_dir=checkpoint_folder)

    # print the config
    print_cfg(config)

    instance_retrieval_test(args, config)

    # close the logging streams including the filehandlers
    shutdown_logging()
def train_svm(cfg: AttrDict, output_dir: str, layername: str): # setup the environment variables set_env_vars(local_rank=0, node_id=0, cfg=cfg) # train the svm logging.info(f"Training SVM for layer: {layername}") trainer = SVMTrainer(cfg["SVM"], layer=layername, output_dir=output_dir) train_data = merge_features(output_dir, "train", layername) train_features, train_targets = train_data["features"], train_data["targets"] trainer.train(train_features, train_targets) # test the svm test_data = merge_features(output_dir, "test", layername) test_features, test_targets = test_data["features"], test_data["targets"] trainer.test(test_features, test_targets) logging.info("All Done!")
def main(args: Namespace, config: AttrDict, node_id=0):
    config = validate_and_infer_config(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=node_id, cfg=config)

    # setup the logging
    checkpoint_folder = get_checkpoint_folder(config)
    setup_logging(__name__, output_dir=checkpoint_folder, rank=os.environ["RANK"])

    if (
        config.IMG_RETRIEVAL.USE_FEATURE_EXTRACTION_ENGINE
        and not config.IMG_RETRIEVAL.FEATURE_EXTRACTION_DIR
    ):
        # extract the train/database features.
        config = adapt_train_database_extract_config(config, checkpoint_folder)
        logging.info("Beginning extract features for database set.")
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

        # extract the query features.
        config = adapt_query_extract_config(config, checkpoint_folder)
        logging.info("Beginning extract features for query set.")
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

    # print the config
    print_cfg(config)

    instance_retrieval_test(args, config)
    logging.info(f"Performance time breakdown:\n{PERF_STATS.report_str()}")

    # close the logging streams including the filehandlers
    shutdown_logging()
def _test_synch_bn_pytorch_worker(
    gpu_id: int, world_size: int, group_size: int, sync_file: str
):
    torch.cuda.set_device(gpu_id)
    init_distributed_on_file(
        world_size=world_size, gpu_id=gpu_id, sync_file=sync_file
    )

    config = AttrDict(
        {
            "MODEL": {
                "SYNC_BN_CONFIG": {
                    "SYNC_BN_TYPE": "pytorch",
                    "GROUP_SIZE": group_size,
                }
            },
            "DISTRIBUTED": {
                "NUM_PROC_PER_NODE": world_size,
                "NUM_NODES": 1,
                "NCCL_DEBUG": False,
                "NCCL_SOCKET_NTHREADS": 4,
            },
        }
    )
    set_env_vars(local_rank=gpu_id, node_id=0, cfg=config)

    channels = 8
    model = nn.Sequential(
        nn.BatchNorm2d(num_features=channels),
        nn.AdaptiveAvgPool2d(output_size=(1, 1)),
    )
    model = convert_sync_bn(config, model).cuda(gpu_id)
    model = DistributedDataParallel(model, device_ids=[gpu_id])
    x = torch.full(size=(5, channels, 4, 4), fill_value=float(gpu_id))
    model(x)

    running_mean = model.module[0].running_mean.cpu()
    print(gpu_id, running_mean)
    if group_size == 1:
        if gpu_id == 0:
            assert torch.allclose(running_mean, torch.full(size=(8,), fill_value=0.0))
        elif gpu_id == 1:
            assert torch.allclose(running_mean, torch.full(size=(8,), fill_value=0.1))
    else:
        if gpu_id in {0, 1}:
            assert torch.allclose(running_mean, torch.full(size=(8,), fill_value=0.05))
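A minimal sketch (not from the original source) of how a per-GPU worker like `_test_synch_bn_pytorch_worker` is typically launched: `torch.multiprocessing.spawn` passes the process index as the worker's first argument (here used as `gpu_id`), and a temporary file serves as the rendezvous `sync_file`. The function name `run_synch_bn_test`, the default `world_size=2` (which assumes 2 GPUs), and `group_size=1` are illustrative assumptions.

import tempfile

import torch


def run_synch_bn_test(world_size: int = 2, group_size: int = 1):
    # spawn one process per GPU; spawn provides the process index (gpu_id)
    # as the first argument to the worker, the rest come from `args`
    with tempfile.NamedTemporaryFile(delete=False) as sync_file:
        torch.multiprocessing.spawn(
            _test_synch_bn_pytorch_worker,
            args=(world_size, group_size, sync_file.name),
            nprocs=world_size,
        )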
def extract_features_and_run_knn(node_id: int, config: AttrDict):
    setup_logging(__name__)
    print_cfg(config)
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # Extract the features if no path to already extracted features is provided
    if not config.NEAREST_NEIGHBOR.FEATURES.PATH:
        launch_distributed(
            config,
            node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.NEAREST_NEIGHBOR.FEATURES.PATH = get_checkpoint_folder(config)

    # Run KNN on all the extracted features
    run_knn_at_all_layers(config)

    # close the logging streams including the file handlers
    shutdown_logging()
def train_svm(cfg: AttrDict, output_dir: str, layername: str):
    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)
    features_dir = cfg.SVM_FEATURES_PATH

    # train the svm
    logging.info(f"Training SVM for layer: {layername}")
    trainer = SVMTrainer(cfg["SVM"], layer=layername, output_dir=output_dir)
    train_data = ExtractedFeaturesLoader.load_features(
        features_dir, "train", layername, flatten_features=True
    )
    trainer.train(train_data["features"], train_data["targets"])

    # test the svm
    test_data = ExtractedFeaturesLoader.load_features(
        features_dir, "test", layername, flatten_features=True
    )
    trainer.test(test_data["features"], test_data["targets"])
    logging.info("All Done!")
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the configuration used
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # extract the label predictions on the test set
    launch_distributed(
        config,
        args.node_id,
        engine_name="extract_label_predictions",
        hook_generator=default_hook_generator,
    )

    geolocalization_test(config)

    # close the logging streams including the filehandlers
    shutdown_logging()
def main(args: Namespace, cfg: AttrDict):
    # setup logging
    setup_logging(__name__)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, args.node_id)

    # extract the features. We enable the feature extraction as well.
    launch_distributed(
        cfg,
        args.node_id,
        engine_name="extract_features",
        hook_generator=default_hook_generator,
    )

    # cluster the extracted features
    cluster_features_and_label(args, cfg)

    # close the logging streams including the filehandlers
    shutdown_logging()
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across the gpus of the current node according
    to the cfg.

    If more than 1 node is needed for training, this function should be called on
    each of the different nodes, each time with a unique node_id in the range
    [0..N-1] where N is the total number of nodes taking part in training.

    Alternatively, you can use SLURM or any cluster management system to run this
    function for you.

    Configures the node_id and dist_run_id, and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """
    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    _copy_to_local(cfg)

    # given the checkpoint folder, we check that there's not already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to load from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    try:
        if world_size > 1:
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error(f"Wrapping up, caught exception: {e}")
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()
def extract_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters
            etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
            how the gpus are going to rendezvous. This requires specifying the
            communication method: file, tcp and the unique rendezvous run_id that
            is specific to 1 run. We recommend:
                1) for 1 node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_folder (str): directory where the logs and the extracted
            features of this machine are written
        local_rank (int): id of the current device on the machine. If using gpus,
            local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split in features.keys():
        logging.info(f"============== Split: {split} =======================")
        for layer_name, layer_features in features[split].items():
            out_feat_file = os.path.join(
                checkpoint_folder, f"rank{dist_rank}_{split}_{layer_name}_features.npy"
            )
            out_target_file = os.path.join(
                checkpoint_folder, f"rank{dist_rank}_{split}_{layer_name}_targets.npy"
            )
            out_inds_file = os.path.join(
                checkpoint_folder, f"rank{dist_rank}_{split}_{layer_name}_inds.npy"
            )
            feat_shape = layer_features["features"].shape
            logging.info(
                f"Saving extracted features of {layer_name} with shape {feat_shape} to: {out_feat_file}"
            )
            save_file(layer_features["features"], out_feat_file)
            logging.info(
                f"Saving extracted targets of {layer_name} to: {out_target_file}"
            )
            save_file(layer_features["targets"], out_target_file)
            logging.info(
                f"Saving extracted indices of {layer_name} to: {out_inds_file}"
            )
            save_file(layer_features["inds"], out_inds_file)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
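Once the per-rank `.npy` files above are written, downstream evaluation typically reads them back per split and layer. A hedged sketch of that step, reusing the `ExtractedFeaturesLoader.load_features` signature shown in the SVM example elsewhere in this section; the `"res5"` layer name is an illustrative placeholder, and the exact on-disk merging behaviour is assumed rather than taken from the original code.

# Hypothetical follow-up: load the features saved by extract_main for one layer.
train_data = ExtractedFeaturesLoader.load_features(
    checkpoint_folder, "train", "res5", flatten_features=True
)
train_features = train_data["features"]  # per-sample feature matrix
train_targets = train_data["targets"]    # labels aligned with the features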
def extract_label_predictions_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the label prediction workflow per machine. Runs the model
    in eval mode only, to extract the label predicted per class.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters
            etc settings relevant for the feature extraction.
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
            how the gpus are going to rendezvous. This requires specifying the
            communication method: file, tcp and the unique rendezvous run_id that
            is specific to 1 run. We recommend:
                1) for 1 node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
            local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    # setup the multiprocessing to be forkserver. See https://fb.quip.com/CphdAGUaM5Wf
    logging.info(f"Setting multiprocessing method: {cfg.MULTI_PROCESSING_METHOD}")
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    # print the environment info for the current node
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Identify the hooks to run for the extract label engine
    # TODO - we need to plug this better with the engine registry
    #   - we either need to use the global hooks registry
    #   - or we need to create specific hook registry by engine
    hooks = extract_label_hook_generator(cfg)

    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)
    trainer.extract(
        output_folder=cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder,
        extract_features=False,
        extract_predictions=True,
    )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def train_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_path: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
    hook_generator: Callable[[Any], List[ClassyHook]] = default_hook_generator,
):
    """
    Sets up and executes the training workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters
            etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
            how the gpus are going to rendezvous. This requires specifying the
            communication method: file, tcp and the unique rendezvous run_id that
            is specific to 1 run. We recommend:
                1) for 1 node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_path (str): if the training is being resumed from a checkpoint,
            path to the checkpoint. The tools/run_distributed_engines.py automatically
            looks for the checkpoint in the checkpoint directory.
        checkpoint_folder (str): what directory to use for checkpointing. The
            tools/run_distributed_engines.py creates the directory based on user
            input in the yaml config file.
        local_rank (int): id of the current device on the machine. If using gpus,
            local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
        hook_generator (Callable): The utility function that prepares all the hooks
            that will be used in training based on user selection. Some basic hooks
            are used by default.
    """
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # get the hooks - these hooks are executed per replica
    hooks = hook_generator(cfg)

    # build the SSL trainer. The trainer first prepares a "task" object which
    # acts as a container for various things needed in a training: datasets,
    # dataloader, optimizers, losses, hooks, etc. The "task" also holds information
    # about both phases (train, test). The trainer then sets up distributed
    # training.
    trainer = SelfSupervisionTrainer(
        cfg, dist_run_id, checkpoint_path, checkpoint_folder, hooks
    )
    trainer.train()
    logging.info("All Done!")

    # close the logging streams including the filehandlers
    shutdown_logging()
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus, according to the cfg

    Args:
        cfg -- VISSL yaml configuration
        node_id -- node_id for this node
        engine_name -- what engine to run: train or extract_features
        hook_generator -- Callback to generate all the ClassyVision hooks for this engine
    """
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    copy_to_local(cfg)

    # given the checkpoint folder, we check that there's not already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to load from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    try:
        if world_size > 1:
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error(f"Wrapping up, caught exception: {e}")
        if isinstance(e, RuntimeError):
            raise e
    finally:
        cleanup_local_dir(cfg)

    logging.info("All Done!")
def extract_main(
    cfg: AttrDict, dist_run_id: str, local_rank: int = 0, node_id: int = 0
):
    """
    Sets up and executes the feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters
            etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
            how the gpus are going to rendezvous. This requires specifying the
            communication method: file, tcp and the unique rendezvous run_id that
            is specific to 1 run. We recommend:
                1) for 1 node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
            local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """
    # setup logging
    setup_logging(__name__)

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg)

    # print the training settings and system settings
    local_rank, _ = get_machine_local_and_dist_rank()
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    output_dir = get_checkpoint_folder(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()
    for split in features.keys():
        logging.info(f"============== Split: {split} =======================")
        layers = features[split].keys()
        for layer in layers:
            out_feat_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_features.npy"
            )
            out_target_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_targets.npy"
            )
            out_inds_file = f"{output_dir}/rank{local_rank}_{split}_{layer}_inds.npy"
            logging.info(
                "Saving extracted features: {} {} to: {}".format(
                    layer, features[split][layer]["features"].shape, out_feat_file
                )
            )
            save_file(features[split][layer]["features"], out_feat_file)
            logging.info(
                "Saving extracted targets: {} to: {}".format(
                    features[split][layer]["targets"].shape, out_target_file
                )
            )
            save_file(features[split][layer]["targets"], out_target_file)
            logging.info(
                "Saving extracted indices: {} to: {}".format(
                    features[split][layer]["inds"].shape, out_inds_file
                )
            )
            save_file(features[split][layer]["inds"], out_inds_file)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def extract_features_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters
            etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
            how the gpus are going to rendezvous. This requires specifying the
            communication method: file, tcp and the unique rendezvous run_id that
            is specific to 1 run. We recommend:
                1) for 1 node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_folder (str): what directory to use for checkpointing. This folder
            will be used to output the extracted features as well in case
            config.EXTRACT_FEATURES.OUTPUT_DIR is not set
        local_rank (int): id of the current device on the machine. If using gpus,
            local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Identify the hooks to run for the extract features engine
    # TODO - we need to plug this better with the engine registry
    #   - we either need to use the global hooks registry
    #   - or we need to create specific hook registry by engine
    hooks = extract_features_hook_generator(cfg)

    # Run the feature extraction
    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)
    output_dir = cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder
    trainer.extract(
        output_folder=output_dir,
        extract_features=True,
        extract_predictions=False,
    )

    # TODO (prigoyal): merge this function with _extract_features
    if dist_rank == 0 and cfg.EXTRACT_FEATURES.MAP_FEATURES_TO_IMG_NAME:
        # Get the names of the features that we extracted. If the user doesn't
        # specify the features to evaluate, we get the full model output and freeze
        # both head and trunk as a precaution.
        layers = get_trunk_output_feature_names(cfg.MODEL)
        if len(layers) == 0:
            layers = ["heads"]

        available_splits = [item.lower() for item in trainer.task.available_splits]
        for split in available_splits:
            image_paths = trainer.task.datasets[split].get_image_paths()[0]
            for layer in layers:
                ExtractedFeaturesLoader.map_features_to_img_filepath(
                    image_paths=image_paths,
                    input_dir=output_dir,
                    split=split,
                    layer=layer,
                )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across the gpus of the current node according
    to the cfg.

    If more than 1 node is needed for training, this function should be called on
    each of the different nodes, each time with a unique node_id in the range
    [0..N-1] where N is the total number of nodes taking part in training.

    Alternatively, you can use SLURM or any cluster management system to run this
    function for you.

    Configures the node_id and dist_run_id, and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """
    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)

    # If using gpus, we check that the user has requested no more gpus than are
    # available on the user's system.
    if cfg.MACHINE.DEVICE == "gpu":
        assert cfg.DISTRIBUTED.NUM_PROC_PER_NODE <= torch.cuda.device_count(), (
            f"User system doesn't have requested {cfg.DISTRIBUTED.NUM_PROC_PER_NODE} gpus "
            f"available. Number of gpus found on user system={torch.cuda.device_count()}. "
            "Please set the DISTRIBUTED.NUM_PROC_PER_NODE properly."
        )

    # set the environment variables including local rank, node id etc.
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)

    # given the checkpoint folder, we check that there isn't already a final
    # checkpoint, unless the user has explicitly chosen to ignore an existing
    # final checkpoint.
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to resume from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and g_pathmgr.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    # assert that if the user set the PARAMS_FILE, it must exist and be valid.
    # we only use the PARAMS_FILE init if the checkpoint doesn't exist for the
    # given training. This ensures that if the same training resumes, then it
    # resumes from the checkpoint and not the weight init
    if checkpoint_path is None and cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]:
        params_file = cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]
        error_message = f"Specified PARAMS_FILE does NOT exist: {params_file}"
        assert g_pathmgr.exists(params_file), error_message

    # copy the data to local if the user wants. This can speed up dataloading.
    _copy_to_local(cfg)

    try:
        torch.multiprocessing.spawn(
            _distributed_worker,
            nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
            args=(
                cfg,
                node_id,
                dist_run_id,
                engine_name,
                checkpoint_path,
                checkpoint_folder,
                hook_generator,
            ),
            daemon=False,
        )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error(f"Wrapping up, caught exception: {e}")
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()
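A hedged end-to-end sketch of driving `launch_distributed` from a single-node script: the config is composed the same way `setup_pathmanager` does above and then handed to the launcher. The hydra import path shown is a 1.0-style assumption, and the config name and `engine_name="train"` are illustrative choices, not taken from the original source.

# Illustrative single-node driver (not from the original source)
from hydra.experimental import compose, initialize_config_module  # assumes hydra 1.0-style API

with initialize_config_module(config_module="vissl.config"):
    hydra_cfg = compose(
        "defaults",
        overrides=["config=test/integration_test/quick_swav"],
    )
cfg = AttrDict(hydra_cfg).config

# run the "train" engine on node 0 with the default hooks
launch_distributed(
    cfg,
    node_id=0,
    engine_name="train",
    hook_generator=default_hook_generator,
)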