def extract_clusters(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes model visualisation extraction workflow on one node
    """
    # Prepare the distributed environment variables for this process.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # Logs are written to the checkpoint folder, tagged with the distributed rank.
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # Only the first process on each node dumps the environment snapshot.
    if local_rank == 0:
        env_snapshot = os.environ.copy()
        print_system_env_info(env_snapshot)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device here so all downstream `torch.cuda.current_device()`
    # calls resolve to the correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # Print the training settings and system settings once per node.
    if local_rank == 0:
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Build the SSL trainer (sets up distributed training) and then compute
    # the cluster assignment for every entry in the dataset.
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    computed_assignments = trainer.extract_clusters()

    # A single rank persists the assignments in the output folder.
    if dist_rank == 0:
        ClusterAssignmentLoader.save_cluster_assignment(
            output_dir=get_checkpoint_folder(cfg),
            assignments=ClusterAssignment(
                config=cfg, cluster_assignments=computed_assignments
            ),
        )

    # close the logging streams including the file handlers
    logging.info("All Done!")
    shutdown_logging()
def get_loader(
    dataset: GenericSSLDataset,
    dataset_config: dict,
    num_dataloader_workers: int,
    pin_memory: bool,
    multi_processing_method: str,
    device: torch.device,
    sampler_seed=0,
    get_sampler=get_sampler,
    worker_init_fn=set_dataloader_seeds,
):
    """
    Build the dataloader for the given dataset and data split.

    Args:
        dataset (GenericSSLDataset): the dataset object for which the
            dataloader is constructed
        dataset_config (dict): configuration of the dataset. should be
            DATA.TRAIN or DATA.TEST settings
        num_dataloader_workers (int): number of workers per gpu (or cpu) training
        pin_memory (bool): whether to pin memory or not
        multi_processing_method (str): method to use. options: forkserver | fork | spawn
        sampler_seed (int): seed for the sampler. Should be identical per process
        device (torch.device): training on cuda or cpu
        get_sampler (get_sampler): function that is used to get the sampler
        worker_init_fn (None): any function that should be executed during
            initialization of dataloader workers

    Returns:
        Instance of Pytorch DataLoader. The dataloader is wrapped with
        DataloaderAsyncGPUWrapper or DataloaderSyncGPUWrapper depending
        on whether user wants to copy data to gpu async or not.
    """
    # The pytorch DataLoader requires the multiprocessing start method to be set.
    setup_multiprocessing_method(multi_processing_method)

    # No need to pass rank/replica counts: the sampler configures them in
    # its own __init__.
    sampler = get_sampler(dataset, dataset_config, sampler_seed)
    collator = get_collator(
        dataset_config["COLLATE_FUNCTION"], dataset_config["COLLATE_FUNCTION_PARAMS"]
    )

    # Use a deterministic worker init when the debugging sampler is requested.
    if dataset_config["USE_DEBUGGING_SAMPLER"]:
        worker_init_fn = debugging_worker_init_fn

    loader = DataLoader(
        dataset=dataset,
        num_workers=num_dataloader_workers,
        pin_memory=pin_memory,
        shuffle=False,
        batch_size=dataset_config["BATCHSIZE_PER_REPLICA"],
        collate_fn=collator,
        sampler=sampler,
        drop_last=dataset_config["DROP_LAST"],
        worker_init_fn=worker_init_fn,
    )

    # On a non-CUDA device there is nothing to wrap.
    if device.type != "cuda":
        logging.warning("Selecting a CPU device")
        return loader

    # On CUDA, wrap the loader so samples end up on device; the async wrapper
    # also overlaps the copy with the previous batch's computation.
    if dataset.cfg["DATA"]["ENABLE_ASYNC_GPU_COPY"]:
        logging.info("Wrapping the dataloader to async device copies")
        loader = DataloaderAsyncGPUWrapper(loader)
    else:
        logging.info("Wrapping the dataloader to synchronous device copies")
        loader = DataloaderSyncGPUWrapper(loader)
    return loader
def extract_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
            meters etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to
            specify how the gpus are going to rendezvous. This requires
            specifying the communication method: file, tcp and the unique
            rendezvous run_id that is specific to 1 run. We recommend:
                1) for 1-node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_folder (str): directory where the logs and the extracted
            feature/target/index files are written
        local_rank (int): id of the current device on the machine. If using
            gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for
            multi-gpu
    """
    # Environment + logging setup for this process.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # Dump the environment info once per node.
    if local_rank == 0:
        print_system_env_info(os.environ.copy())

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device so downstream `torch.cuda.current_device()` calls
    # return the correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # Print the training settings and system settings once per node.
    if local_rank == 0:
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    # Persist the features/targets/indices of each extracted layer, per split,
    # with one file set per distributed rank.
    for split, split_features in features.items():
        logging.info(f"============== Split: {split} =======================")
        for layer_name, layer_features in split_features.items():
            file_prefix = os.path.join(
                checkpoint_folder, f"rank{dist_rank}_{split}_{layer_name}"
            )
            out_feat_file = f"{file_prefix}_features.npy"
            out_target_file = f"{file_prefix}_targets.npy"
            out_inds_file = f"{file_prefix}_inds.npy"

            feat_shape = layer_features["features"].shape
            logging.info(
                f"Saving extracted features of {layer_name} with shape {feat_shape} to: {out_feat_file}"
            )
            save_file(layer_features["features"], out_feat_file)
            logging.info(
                f"Saving extracted targets of {layer_name} to: {out_target_file}"
            )
            save_file(layer_features["targets"], out_target_file)
            logging.info(
                f"Saving extracted indices of {layer_name} to: {out_inds_file}"
            )
            save_file(layer_features["inds"], out_inds_file)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def extract_label_predictions_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the label predictions workflow per machine. Runs the
    model in eval mode only, to extract the label predicted per class.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
            meters etc settings relevant for the feature extraction.
        dist_run_id (str): For multi-gpu training with PyTorch, we have to
            specify how the gpus are going to rendezvous. This requires
            specifying the communication method: file, tcp and the unique
            rendezvous run_id that is specific to 1 run. We recommend:
                1) for 1-node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_folder (str): directory used for logging; also the default
            output folder when config.EXTRACT_FEATURES.OUTPUT_DIR is not set
        local_rank (int): id of the current device on the machine. If using
            gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for
            multi-gpu
    """
    # Environment + logging setup for this process.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    logging.info(f"Setting multiprocessing method: {cfg.MULTI_PROCESSING_METHOD}")
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device so downstream `torch.cuda.current_device()` calls
    # return the correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # Print the environment, training and system settings once per node.
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    if local_rank == 0:
        print_system_env_info(os.environ.copy())
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Hooks for the extract-label engine.
    # TODO - we need to plug this better with the engine registry
    #   - we either need to use the global hooks registry
    #   - or we need to create specific hook registry by engine
    hooks = extract_label_hook_generator(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)

    # Run prediction extraction only (no feature extraction).
    trainer.extract(
        output_folder=cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder,
        extract_features=False,
        extract_predictions=True,
    )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def extract_main(cfg: AttrDict, dist_run_id: str, local_rank: int = 0, node_id: int = 0):
    """
    Sets up and executes the feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
            meters etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to
            specify how the gpus are going to rendezvous. This requires
            specifying the communication method: file, tcp and the unique
            rendezvous run_id that is specific to 1 run. We recommend:
                1) for 1-node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using
            gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for
            multi-gpu
    """
    # setup logging
    setup_logging(__name__)

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    # NOTE(review): sibling entry points in this file call
    # set_seeds(cfg, dist_rank); confirm which signature this helper has.
    set_seeds(cfg)

    # print the training settings and system settings.
    # BUGFIX: also capture the *distributed* rank — the output files below were
    # previously named with the per-node local rank, so on multi-node runs,
    # ranks on different machines would overwrite each other's files when
    # writing to a shared output folder. The sibling extraction entry points
    # name files with the distributed rank.
    local_rank, dist_rank = get_machine_local_and_dist_rank()
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    output_dir = get_checkpoint_folder(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    # Persist the features/targets/indices of each extracted layer, per split.
    for split, split_features in features.items():
        logging.info(f"============== Split: {split} =======================")
        for layer, layer_data in split_features.items():
            out_feat_file = f"{output_dir}/rank{dist_rank}_{split}_{layer}_features.npy"
            out_target_file = f"{output_dir}/rank{dist_rank}_{split}_{layer}_targets.npy"
            out_inds_file = f"{output_dir}/rank{dist_rank}_{split}_{layer}_inds.npy"
            logging.info("Saving extracted features: {} {} to: {}".format(
                layer, layer_data["features"].shape, out_feat_file))
            save_file(layer_data["features"], out_feat_file)
            logging.info("Saving extracted targets: {} to: {}".format(
                layer_data["targets"].shape, out_target_file))
            save_file(layer_data["targets"], out_target_file)
            logging.info("Saving extracted indices: {} to: {}".format(
                layer_data["inds"].shape, out_inds_file))
            save_file(layer_data["inds"], out_inds_file)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def build_dataloader(
    dataset: GenericSSLDataset,
    dataset_config: dict,
    num_dataloader_workers: int,
    pin_memory: bool,
    multi_processing_method: str,
    device: torch.device,
    sampler_seed=0,
    get_sampler=get_sampler,
    worker_init_fn=set_dataloader_seeds,
    **kwargs,
):
    """
    Build the dataloader for the given dataset and data split.

    Args:
        dataset (GenericSSLDataset): the dataset object for which the
            dataloader is constructed
        dataset_config (dict): configuration of the dataset. should be
            DATA.TRAIN or DATA.TEST settings
        num_dataloader_workers (int): number of workers per gpu (or cpu) training
        pin_memory (bool): whether to pin memory or not
        multi_processing_method (str): method to use. options: forkserver | fork | spawn
        sampler_seed (int): seed for the sampler. Should be identical per process
        device (torch.device): training on cuda or cpu
        get_sampler (get_sampler): function that is used to get the sampler
        worker_init_fn (None): any function that should be executed during
            initialization of dataloader workers

    Returns:
        Instance of Pytorch DataLoader. The dataloader is wrapped with
        DataloaderAsyncGPUWrapper or DataloaderSyncGPUWrapper depending
        on whether user wants to copy data to gpu async or not.
    """
    # The pytorch DataLoader requires the multiprocessing start method to be set.
    setup_multiprocessing_method(multi_processing_method)

    # No need to pass rank/replica counts: the sampler configures them in
    # its own __init__.
    sampler = get_sampler(dataset, dataset_config, sampler_seed)
    collator = get_collator(
        dataset_config["COLLATE_FUNCTION"], dataset_config["COLLATE_FUNCTION_PARAMS"]
    )

    # Use a deterministic worker init when the debugging sampler is requested.
    if dataset_config["USE_DEBUGGING_SAMPLER"]:
        worker_init_fn = debugging_worker_init_fn

    # Load the dataset labels up front: otherwise every dataloader worker would
    # load the files separately, hurting performance / hitting data-source quota.
    dataset.load_labels()

    loader = DataLoader(
        dataset=dataset,
        num_workers=num_dataloader_workers,
        pin_memory=pin_memory,
        shuffle=False,
        batch_size=dataset_config["BATCHSIZE_PER_REPLICA"],
        collate_fn=collator,
        sampler=sampler,
        drop_last=dataset_config["DROP_LAST"],
        worker_init_fn=worker_init_fn,
    )

    # Wrap for (a)synchronous copies to the target device.
    enable_async_gpu_copy = dataset.cfg["DATA"]["ENABLE_ASYNC_GPU_COPY"]
    return wrap_dataloader(loader, enable_async_gpu_copy, device)
def train_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_path: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
    hook_generator: Callable[[Any], List[ClassyHook]] = default_hook_generator,
):
    """
    Sets up and executes the training workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
            meters etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to
            specify how the gpus are going to rendezvous. This requires
            specifying the communication method: file, tcp and the unique
            rendezvous run_id that is specific to 1 run. We recommend:
                1) for 1-node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_path (str): if the training is being resumed from a
            checkpoint, path to the checkpoint. The
            tools/run_distributed_engines.py automatically looks for the
            checkpoint in the checkpoint directory.
        checkpoint_folder (str): what directory to use for checkpointing. The
            tools/run_distributed_engines.py creates the directory based on
            user input in the yaml config file.
        local_rank (int): id of the current device on the machine. If using
            gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for
            multi-gpu
        hook_generator (Callable): the utility function that prepares all the
            hooks that will be used in training based on user selection. Some
            basic hooks are used by default.
    """
    # Environment + logging setup for this process.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # Dump the environment info once per node.
    if local_rank == 0:
        print_system_env_info(os.environ.copy())

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device so downstream `torch.cuda.current_device()` calls
    # return the correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # Print the training settings and system settings once per node.
    if local_rank == 0:
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Hooks are executed per replica.
    hooks = hook_generator(cfg)

    # The trainer first prepares a "task" object — a container for everything a
    # training run needs (datasets, dataloader, optimizers, losses, hooks, and
    # train/test phase information) — and then sets up distributed training.
    trainer = SelfSupervisionTrainer(
        cfg, dist_run_id, checkpoint_path, checkpoint_folder, hooks
    )
    trainer.train()

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
def extract_features_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss,
            meters etc settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to
            specify how the gpus are going to rendezvous. This requires
            specifying the communication method: file, tcp and the unique
            rendezvous run_id that is specific to 1 run. We recommend:
                1) for 1-node: use init_method=tcp and run_id=auto
                2) for multi-node, use init_method=tcp and specify
                   run_id={master_node}:{port}
        checkpoint_folder (str): what directory to use for checkpointing. This
            folder will be used to output the extracted features as well in
            case config.EXTRACT_FEATURES.OUTPUT_DIR is not set
        local_rank (int): id of the current device on the machine. If using
            gpus, local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for
            multi-gpu
    """
    # Environment + logging setup for this process.
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")

    # Dump the environment info once per node.
    if local_rank == 0:
        print_system_env_info(os.environ.copy())

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # Pin the CUDA device so downstream `torch.cuda.current_device()` calls
    # return the correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # Print the training settings and system settings once per node.
    if local_rank == 0:
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Hooks for the extraction engine.
    # TODO - we need to plug this better with the engine registry
    #   - we either need to use the global hooks registry
    #   - or we need to create specific hook registry by engine
    hooks = extract_features_hook_generator(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)

    # Run feature extraction only (no prediction extraction).
    output_dir = cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder
    trainer.extract(
        output_folder=output_dir,
        extract_features=True,
        extract_predictions=False,
    )

    # TODO (prigoyal): merge this function with _extract_features
    if dist_rank == 0 and cfg.EXTRACT_FEATURES.MAP_FEATURES_TO_IMG_NAME:
        # Names of the features we extracted. If the user doesn't specify the
        # features to evaluate, we get the full model output and freeze
        # head/trunk both as caution.
        layers = get_trunk_output_feature_names(cfg.MODEL) or ["heads"]
        for split in (item.lower() for item in trainer.task.available_splits):
            image_paths = trainer.task.datasets[split].get_image_paths()[0]
            for layer in layers:
                ExtractedFeaturesLoader.map_features_to_img_filepath(
                    image_paths=image_paths,
                    input_dir=output_dir,
                    split=split,
                    layer=layer,
                )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()