def _configure_session(
    env: det.EnvContext,
    hvd_config: horovod.HorovodContext,
    session_config: tf.compat.v1.ConfigProto,
) -> Optional[tf.compat.v1.Session]:
    """Bind this process to its GPU and, in graph mode, install a session.

    Eager mode: when at least one physical GPU is listed, only the GPU at
    index ``hvd.local_rank()`` (index 0 without horovod) is made visible and
    memory growth is enabled on it. No session is created.

    Graph mode: ``session_config`` is mutated in place (memory growth on,
    and with horovod the visible device list is set to the local rank), a
    session on the default graph is created with it and registered as the
    Keras backend session.

    Args:
        env: Not read by this function.
        hvd_config: Its ``use`` flag selects the horovod code paths.
        session_config: Session configuration; modified in graph mode only.

    Returns:
        The newly created session in graph mode, ``None`` in eager mode.
    """
    if tf.executing_eagerly():
        physical_gpus = tf.config.experimental.list_physical_devices("GPU")
        if physical_gpus:
            rank = hvd.local_rank() if hvd_config.use else 0
            chosen_gpu = physical_gpus[rank]
            tf.config.set_visible_devices(chosen_gpu, "GPU")
            tf.config.experimental.set_memory_growth(chosen_gpu, True)
        return None

    session_config.gpu_options.allow_growth = True
    if hvd_config.use:
        # One horovod process is launched per GPU, so every process has to
        # claim a distinct device.
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    default_graph = tf.compat.v1.get_default_graph()
    graph_session = tf.compat.v1.Session(graph=default_graph, config=session_config)
    tf.compat.v1.keras.backend.set_session(graph_session)
    return graph_session
def _init_paths(self) -> None: """ Create a unique model directory for each training process. If a load path is provided, copy the checkpoint into the model directory of each training process. This model directory will be used to initialize an Estimator. We also update the paths in the CheckpointState metadata file to the new directory location. """ # Add suffix so that horovod processes don't overwrite each other. suffix = str(0) if not self.hvd_config.use else str(hvd.local_rank()) if self.load_path is None: self.estimator_dir = pathlib.Path(tempfile.mkdtemp(suffix=suffix)) logging.debug(f"Estimator directory set to {self.estimator_dir}.") return for callback in self.train_hooks: if isinstance(callback, estimator.RunHook): callback.on_checkpoint_load(str(self.load_path)) self.estimator_dir = pathlib.Path(tempfile.mkdtemp(suffix=suffix)) if self.estimator_dir.exists(): shutil.rmtree(str(self.estimator_dir)) logging.debug( f"Copying from {self.load_path} to {self.estimator_dir}.") shutil.copytree(str(self.load_path), str(self.estimator_dir)) # Calibrate the CheckpointState metadata file to the new location. estimator._update_checkpoint_path_in_state_file(self.estimator_dir) logging.debug(f"Load path set to {self.estimator_dir}.")
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """Prepare the process before the trial runs.

    Steps, in order:
      1. With horovod enabled, require the tensorflow horovod type and
         initialize it; optionally export ``CUDA_VISIBLE_DEVICES`` and
         ``NCCL_P2P_DISABLE`` when the experiment's ``data`` section sets
         ``set_cuda_visible_devices``.
      2. Seed the random number generators. With dataflow input the horovod
         rank is added to the trial seed; otherwise every process uses the
         trial seed unchanged.
      3. On TensorFlow 2.0.0 or newer, disable v2 behavior.

    Args:
        env: Supplies ``experiment_config`` and ``trial_seed``.
        hvd_config: Its ``use`` flag tells whether horovod is in use.
    """
    if hvd_config.use:
        # Initialize the correct horovod.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

        # This option is available for when TF ignores
        # `gpu_options.visible_device_list`.
        # TODO (DET-3762): Remove this once it's no longer necessary.
        data_section = env.experiment_config.get("data", {})
        if data_section.get("set_cuda_visible_devices", False):
            logging.info(
                "Setting `CUDA_VISIBLE_DEVICES` environment variables "
                "and disabling NCCL_P2P_DISABLE"
            )
            os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
            os.environ["NCCL_P2P_DISABLE"] = "1"

    # Pick the random seed for this process.
    if env.experiment_config.input_from_dataflow():
        logging.debug("Using tensorpack dataflows as input.")
        rank_offset = hvd.rank() if hvd_config.use else 0
        seed = env.trial_seed + rank_offset
    else:
        # Identical seeds on all training processes: when using horovod,
        # each worker will receive a unique shard of the dataset.
        seed = env.trial_seed
    EstimatorTrialController.set_random_seed(seed)

    running_tf2 = version.parse(tf.__version__) >= version.parse("2.0.0")
    if running_tf2:
        tf.compat.v1.disable_v2_behavior()
def __init__(self, *args: List[Any], **kwargs: Dict[str, Any]) -> None:
    """Initialize the controller and the coordination between training processes.

    After delegating to the parent initializer this records the per-slot
    batch size and the scheduling unit, determines whether this process is
    the chief (horovod rank 0, or always when horovod is not used), lowers
    the root logger verbosity on non-chief horovod workers, and, with
    horovod, sets up direct communication between training processes.
    """
    super().__init__(*args, **kwargs)  # type: ignore

    self.batch_size = self.context.get_per_slot_batch_size()
    self.scheduling_unit = self.env.experiment_config.scheduling_unit()

    logging.debug("Starting LoopTrialController initialization.")

    if not self.hvd_config.use:
        # A lone process is its own chief.
        self.is_chief = True
        local_rank = 0
    else:
        self.is_chief = hvd.rank() == 0
        local_rank = hvd.local_rank()

    if self.hvd_config.use and not self.is_chief:
        # Non-chief workers log at WARNING unless debug mode is enabled.
        if self.env.experiment_config.debug_enabled():
            worker_level = logging.DEBUG
        else:
            worker_level = logging.WARNING
        logging.getLogger().setLevel(worker_level)

    logging.debug(
        f"Training coordination initialized on local rank {local_rank}, "
        f"using hvd: {self.hvd_config.use}."
    )

    # Communication channel used directly between training processes; both
    # ends start out unset and are only initialized when horovod is in use.
    self.train_process_comm_chief = None  # type: Optional[ipc.ZMQBroadcastServer]
    self.train_process_comm_worker = None  # type: Optional[ipc.ZMQBroadcastClient]
    if self.hvd_config.use:
        self._initialize_train_process_comm()
def _init_session_config(
    session_config: tf.compat.v1.ConfigProto,
    env: det.EnvContext,
    hvd_config: horovod.HorovodContext,
) -> tf.compat.v1.ConfigProto:
    """Return a session config with memory growth on and, under horovod, a pinned GPU.

    Args:
        session_config: Config to adjust in place. ``None`` is accepted, in
            which case a fresh ``ConfigProto`` is created.
        env: Not read by this function.
        hvd_config: Its ``use`` flag tells whether horovod is in use.

    Returns:
        The (possibly newly created) config. GPU memory growth is always
        enabled. With horovod, ``gpu_options.visible_device_list`` is set to
        the local rank; on TensorFlow 2.5.0 or newer, if any physical GPU is
        listed, the GPU at index local rank is additionally made the only
        visible device through ``tf.config`` with memory growth enabled.

    Raises:
        IndexError: If horovod is in use on TensorFlow >= 2.5.0 and the
            local rank is not a valid index into the non-empty GPU list.
    """
    if session_config is None:
        session_config = tf.compat.v1.ConfigProto()
    session_config.gpu_options.allow_growth = True

    if not hvd_config.use:
        return session_config

    # Horovod is guaranteed to be in use from here on, so the local rank is
    # read unconditionally (the former `else 0` fallback was unreachable).
    if version.parse(tf.__version__) >= version.parse("2.5.0"):
        gpus = tf.config.experimental.list_physical_devices("GPU")
        if gpus:
            gpu = gpus[hvd.local_rank()]
            tf.config.experimental.set_visible_devices(gpu, "GPU")
            tf.config.experimental.set_memory_growth(gpu, True)

    # NOTE(review): `hvd` and `horovod.hvd` presumably refer to the same
    # module; confirm before unifying the two spellings.
    session_config.gpu_options.visible_device_list = str(horovod.hvd.local_rank())
    return session_config
def _init_session_config(
    cls: Type["EstimatorTrialController"],
    session_config: tf.compat.v1.ConfigProto,
    env: det.EnvContext,
    use_horovod: bool = False,
) -> tf.compat.v1.ConfigProto:
    """Return a session config with memory growth on and, under horovod, a pinned GPU.

    Args:
        session_config: Config to adjust in place. ``None`` is accepted, in
            which case a fresh ``ConfigProto`` is created.
        env: Not read by this function.
        use_horovod: Whether horovod is in use for this process.

    Returns:
        The (possibly newly created) config. GPU memory growth is always
        enabled. With horovod, ``gpu_options.visible_device_list`` is set to
        the local rank; on TensorFlow 2.5.0 or newer, if any physical GPU is
        listed, the GPU at index local rank is additionally made the only
        visible device through ``tf.config`` with memory growth enabled.

    Raises:
        IndexError: If horovod is in use on TensorFlow >= 2.5.0 and the
            local rank is not a valid index into the non-empty GPU list.
    """
    if session_config is None:
        session_config = tf.compat.v1.ConfigProto()
    session_config.gpu_options.allow_growth = True

    if not use_horovod:
        return session_config

    # Horovod is guaranteed to be in use from here on, so the local rank is
    # read unconditionally (the former `else 0` fallback was unreachable).
    if version.parse(tf.__version__) >= version.parse("2.5.0"):
        gpus = tf.config.experimental.list_physical_devices("GPU")
        if gpus:
            gpu = gpus[hvd.local_rank()]
            tf.config.experimental.set_visible_devices(gpu, "GPU")
            tf.config.experimental.set_memory_growth(gpu, True)

    # NOTE(review): `hvd` and `horovod.hvd` presumably refer to the same
    # module; confirm before unifying the two spellings.
    session_config.gpu_options.visible_device_list = str(horovod.hvd.local_rank())
    return session_config
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """Prepare the process before any user code is imported.

    With horovod enabled, the tensorflow horovod type is required and
    initialized, and ``CUDA_VISIBLE_DEVICES`` / ``NCCL_P2P_DISABLE`` are
    exported when the experiment's ``data`` section sets
    ``set_cuda_visible_devices``. Afterwards the trial seed is applied, v2
    behavior is disabled on TensorFlow 2.0.0 or newer, and the default
    TensorFlow session is set.

    Args:
        env: Supplies ``experiment_config`` and ``trial_seed``.
        hvd_config: Its ``use`` flag tells whether horovod is in use.
    """
    if hvd_config.use:
        # Initialize the correct horovod.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

        # This option is available for when TF ignores
        # `gpu_options.visible_device_list`.
        # TODO (DET-3762): Remove this once it's no longer necessary.
        data_options = env.experiment_config.get("data", {})
        pin_via_environment = data_options.get("set_cuda_visible_devices", False)
        if pin_via_environment:
            logging.info(
                "Setting `CUDA_VISIBLE_DEVICES` environment variables "
                "and disabling NCCL_P2P_DISABLE"
            )
            os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
            os.environ["NCCL_P2P_DISABLE"] = "1"

    # Every training process gets the same seed: when using horovod, each
    # worker will receive a unique shard of the dataset.
    EstimatorTrialController.set_random_seed(env.trial_seed)

    installed_tf = version.parse(tf.__version__)
    if installed_tf >= version.parse("2.0.0"):
        tf.compat.v1.disable_v2_behavior()

    # The default session has to exist before any user code is imported. If
    # it isn't set and users call TF code that detects GPUs, it would map the
    # processes to all of the GPUs.
    EstimatorTrialController._set_default_tensorflow_session(
        env=env,
        hvd_config=hvd_config,
    )
def _init_device(self) -> None:
    """Choose the torch device for this process and store it on ``self.device``.

    ``self.n_gpus`` is set to the number of entries in
    ``self.env.container_gpus``. With horovod, at least one GPU is required
    and the device built from the horovod local rank also becomes the
    current CUDA device. Without horovod, the first CUDA device is used when
    any GPU is present, otherwise the CPU.
    """
    self.n_gpus = len(self.env.container_gpus)

    if not self.hvd_config.use:
        self.device = (
            torch.device("cuda", 0) if self.n_gpus > 0 else torch.device("cpu")
        )
    else:
        check.gt(self.n_gpus, 0)
        # A horovod process is launched per GPU, and each of them has to
        # bind to a GPU of its own.
        self.device = torch.device(hvd.local_rank())
        torch.cuda.set_device(self.device)

    check.is_not_none(self.device)
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """Prepare the process before any user code is imported.

    With horovod enabled, the tensorflow horovod type is required and
    initialized, and ``CUDA_VISIBLE_DEVICES`` / ``NCCL_P2P_DISABLE`` are
    exported when the experiment's ``data`` section sets
    ``set_cuda_visible_devices``. Afterwards the trial seed is applied, v2
    behavior is disabled on TensorFlow 2.0.0 or newer, the default
    TensorFlow session is set, and a monkey patch is applied to
    ``_NewCheckpointListenerForEvaluate._evaluate``.

    Args:
        env: Supplies ``experiment_config`` and ``trial_seed``.
        hvd_config: Its ``use`` flag tells whether horovod is in use.
    """
    if hvd_config.use:
        # Initialize the correct horovod.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

        # This option is available for when TF ignores
        # `gpu_options.visible_device_list`.
        # TODO (DET-3762): Remove this once it's no longer necessary.
        data_settings = env.experiment_config.get("data", {})
        if data_settings.get("set_cuda_visible_devices", False):
            logging.info(
                "Setting `CUDA_VISIBLE_DEVICES` environment variables "
                "and disabling NCCL_P2P_DISABLE"
            )
            os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
            os.environ["NCCL_P2P_DISABLE"] = "1"

    # Identical random seeds on all training processes: when using horovod,
    # each worker will receive a unique shard of the dataset.
    EstimatorTrialController.set_random_seed(env.trial_seed)

    minimum_v2 = version.parse("2.0.0")
    if version.parse(tf.__version__) >= minimum_v2:
        tf.compat.v1.disable_v2_behavior()

    # The default session has to exist before any user code is imported. If
    # it isn't set and users call TF code that detects GPUs, it would map the
    # processes to all of the GPUs. This default session does not have any
    # effect within the Estimator itself.
    EstimatorTrialController._set_default_tensorflow_session(
        env=env,
        hvd_config=hvd_config,
        session_config=None,
    )

    logging.debug("Applying tf.estimator patches.")

    @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate, "_evaluate")
    def patch_estimator_eval_on_checkpoint(original, *args, **kwargs):  # type: ignore
        # With a single worker and multiple devices,
        # `tf.estimator.train_and_evaluate` attempts to execute `eval_spec`
        # even if `input_fn` or `steps` is None, which causes an error when
        # evaluating the model function. The patch skips the internal
        # function that ultimately runs the evaluation and only logs the
        # call that was skipped.
        logging.info("Skipping %s(*%s, **%s)", original.__name__, args, kwargs)
def _configure_storage(self) -> None:
    """Create the data-layer storage backend and store it on ``self._storage``.

    The backend is chosen from the experiment's data layer type: shared
    file system, S3 or GCS. With horovod, a TensorFlow session config that
    restricts the visible devices to the horovod local rank is handed to
    the storage object; otherwise ``None`` is passed.

    Raises:
        AssertionError: If the configured data layer type is not one of the
            supported ``StorageTypes``.
    """
    session_config = None  # type: Optional[tf.compat.v1.ConfigProto]
    if self._hvd_config.use:
        # For multi-GPU training, we map processes to individual GPUs. TF
        # requires that for each instantiation of `tf.Session`, the process
        # is mapped to the same GPU.
        session_config = tf.compat.v1.ConfigProto()
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Secure websockets are used whenever the master is served over TLS.
    if self._env.use_tls:
        scheme = "wss"
    else:
        scheme = "ws"
    rw_coordinator_url = (
        f"{scheme}://{self._env.master_addr}:{self._env.master_port}/ws/data-layer/"
    )

    data_layer_type = self._env.experiment_config.get_data_layer_type()

    if data_layer_type == StorageTypes.SHARED_FS.value:
        layer_cfg = self._env.experiment_config["data_layer"]
        cache_root = init_container_storage_path(
            configured_storage_path=layer_cfg.get("container_storage_path")
        )
        lfs_config = storage.LFSConfigurations(storage_dir_path=str(cache_root))
        self._storage = storage.LFSStorage(lfs_config, tensorflow_config=session_config)

    elif data_layer_type == StorageTypes.S3.value:
        layer_cfg = self._env.experiment_config["data_layer"]
        cache_root = init_container_storage_path(
            configured_storage_path=layer_cfg.get("local_cache_container_path")
        )
        s3_config = storage.S3Configurations(
            bucket=layer_cfg["bucket"],
            bucket_directory_path=layer_cfg["bucket_directory_path"],
            url=rw_coordinator_url,
            local_cache_dir=str(cache_root),
            access_key=layer_cfg.get("access_key"),
            secret_key=layer_cfg.get("secret_key"),
            endpoint_url=layer_cfg.get("endpoint_url"),
            coordinator_cert_file=self._env.master_cert_file,
            coordinator_cert_name=self._env.master_cert_name,
        )
        self._storage = storage.S3Storage(s3_config, tensorflow_config=session_config)

    elif data_layer_type == StorageTypes.GCS.value:
        layer_cfg = self._env.experiment_config["data_layer"]
        cache_root = init_container_storage_path(
            configured_storage_path=layer_cfg.get("local_cache_container_path")
        )
        gcs_config = storage.GCSConfigurations(
            bucket=layer_cfg["bucket"],
            bucket_directory_path=layer_cfg["bucket_directory_path"],
            url=rw_coordinator_url,
            local_cache_dir=str(cache_root),
            coordinator_cert_file=self._env.master_cert_file,
            coordinator_cert_name=self._env.master_cert_name,
        )
        self._storage = storage.GCSStorage(gcs_config, tensorflow_config=session_config)

    else:
        raise AssertionError(
            "Please select a supported data_layer type. Supported types include: "
            f"{[i.value for i in StorageTypes]}"
        )