def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """Prepare the current process before an Estimator trial starts.

    Initializes horovod when distributed training is enabled, optionally pins
    the process to a single GPU through environment variables, seeds the
    random number generators and, on TensorFlow >= 2.0.0, disables v2
    behavior.

    Args:
        env: Environment context providing the experiment config and trial seed.
        hvd_config: Horovod settings; ``use`` tells whether horovod is enabled.
    """
    distributed = hvd_config.use

    if distributed:
        # Make sure the TensorFlow flavor of horovod is the one in use.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

        # This option is available for when TF ignores
        # `gpu_options.visible_device_list`.
        # TODO (DET-3762): Remove this once it's no longer necessary.
        # NOTE(review): kept under the horovod branch because
        # `hvd.local_rank()` needs an initialized horovod -- confirm nesting.
        data_config = env.experiment_config.get("data", {})
        if data_config.get("set_cuda_visible_devices", False):
            logging.info(
                "Setting `CUDA_VISIBLE_DEVICES` environment variables "
                "and disabling NCCL_P2P_DISABLE"
            )
            os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
            os.environ["NCCL_P2P_DISABLE"] = "1"

    # Pick the random seed for this process.
    if env.experiment_config.input_from_dataflow():
        logging.debug("Using tensorpack dataflows as input.")
        # With dataflows every process gets its own seed, offset by its rank.
        process_rank = hvd.rank() if distributed else 0
        seed = env.trial_seed + process_rank
    else:
        # Identical seeds on all training processes: when using horovod, each
        # worker receives a unique shard of the dataset.
        seed = env.trial_seed
    EstimatorTrialController.set_random_seed(seed)

    running_tf2 = version.parse(tf.__version__) >= version.parse("2.0.0")
    if running_tf2:
        tf.compat.v1.disable_v2_behavior()
def wrap_dataset(self, dataset: Any, shard_dataset: bool = True) -> Any:
    """
    This should be used to wrap ``tf.data.Dataset`` objects immediately after
    they have been created. Users should use the output of this wrapper as the
    new instance of their dataset. If users create multiple datasets (e.g., one
    for training and one for testing), users should wrap each dataset
    independently. E.g., If users instantiate their training dataset within
    ``build_train_spec()``, they should call ``dataset = wrap_dataset(dataset)``
    prior to passing it into ``tf.estimator.TrainSpec``.

    Args:
        dataset: tf.data.Dataset
        shard_dataset:
            When performing multi-slot (distributed) training, this controls
            whether the dataset is sharded so that each training process (one
            per slot) sees unique data. If set to False, users must manually
            configure each process to use unique data.

    Returns:
        ``dataset`` itself when not training, when horovod is not in use, when
        input comes from a dataflow, or when ``shard_dataset`` is False;
        otherwise the result of ``dataset.shard(hvd.size(), hvd.rank())``.
    """
    if not self.env.training:
        return dataset

    hvd.require_horovod_type("tensorflow", "EstimatorContext.wrap_dataset was called.")

    self.dataset_initialized = True

    if not self.hvd_config.use or self.input_from_dataflow or not shard_dataset:
        # Only report a skipped shard when horovod is actually enabled. Testing
        # the context object itself (instead of its ``use`` flag) would log the
        # message for non-distributed runs as well.
        if self.hvd_config.use and not shard_dataset:
            logging.info("Dataset sharding skipped.")
        return dataset

    dataset = dataset.shard(hvd.size(), hvd.rank())
    logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
    return dataset
def wrap_dataset(self, dataset: Any, shard_dataset: bool = True) -> Any:
    """
    This should be used to wrap ``tf.data.Dataset`` objects immediately after
    they have been created. Users should use the output of this wrapper as the
    new instance of their dataset. If users create multiple datasets (e.g., one
    for training and one for validation), users should wrap each dataset
    independently.

    Args:
        dataset: tf.data.Dataset
        shard_dataset:
            When performing multi-slot (distributed) training, this controls
            whether the dataset is sharded so that each training process (one
            per slot) sees unique data. If set to False, users must manually
            configure each process to use unique data.

    Returns:
        ``dataset`` itself when training is not managed, when horovod is not in
        use, when ``dataset`` is not a ``tf.data.Dataset``, or when
        ``shard_dataset`` is False; otherwise the result of
        ``dataset.shard(hvd.size(), hvd.rank())``.
    """
    if not self.env.managed_training:
        return dataset

    self.dataset_initialized = True

    if not self.hvd_config.use or not isinstance(dataset, tf.data.Dataset) or not shard_dataset:
        # Only report a skipped shard when horovod is actually enabled. Testing
        # the context object itself (instead of its ``use`` flag) would log the
        # message for non-distributed runs as well.
        if self.hvd_config.use and not shard_dataset:
            logging.info("Dataset sharding skipped.")
        return dataset

    hvd.require_horovod_type("tensorflow.keras", "TFKerasContext.wrap_dataset was called.")
    dataset = dataset.shard(hvd.size(), hvd.rank())
    logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
    return dataset
def __init__(self, *args: List[Any], **kwargs: Dict[str, Any]) -> None:
    """Set up batch size, scheduling unit, chief status and worker communication.

    All arguments are forwarded unchanged to the parent initializer. When
    horovod is in use, the process with horovod rank 0 becomes the chief,
    non-chief processes have their root logger level adjusted, and direct
    communication between training processes is initialized.
    """
    super().__init__(*args, **kwargs)  # type: ignore

    self.batch_size = self.context.get_per_slot_batch_size()
    self.scheduling_unit = self.env.experiment_config.scheduling_unit()

    logging.debug("Starting LoopTrialController initialization.")

    # Without horovod there is a single process, which is the chief.
    uses_horovod = self.hvd_config.use
    self.is_chief = (hvd.rank() == 0) if uses_horovod else True
    training_process_rank = hvd.local_rank() if uses_horovod else 0

    # Non-chief workers only log at DEBUG when the experiment enables debugging;
    # otherwise their root logger is raised to WARNING.
    if uses_horovod and not self.is_chief:
        if self.env.experiment_config.debug_enabled():
            log_level = logging.DEBUG
        else:
            log_level = logging.WARNING
        logging.getLogger().setLevel(log_level)

    logging.debug(
        f"Training coordination initialized on local rank {training_process_rank}, "
        f"using hvd: {self.hvd_config.use}."
    )

    # Channels for direct communication between training processes; they stay
    # None unless horovod is in use.
    self.train_process_comm_chief = None  # type: Optional[ipc.ZMQBroadcastServer]
    self.train_process_comm_worker = None  # type: Optional[ipc.ZMQBroadcastClient]
    if uses_horovod:
        self._initialize_train_process_comm()
def __init__(self, *args: List[Any], **kwargs: Dict[str, Any]) -> None:
    """Set up batch size, scheduling unit and chief status for the controller.

    All arguments are forwarded unchanged to the parent initializer. When
    horovod is in use, the process with horovod rank 0 becomes the chief and
    non-chief processes have their root logger level adjusted.
    """
    super().__init__(*args, **kwargs)  # type: ignore

    self.batch_size = self.context.get_per_slot_batch_size()
    self.scheduling_unit = self.env.experiment_config.scheduling_unit()

    logging.debug("Starting LoopTrialController initialization.")

    # Without horovod there is a single process: rank 0, and the chief.
    uses_horovod = self.hvd_config.use
    self.is_chief = (hvd.rank() == 0) if uses_horovod else True
    rank = hvd.rank() if uses_horovod else 0

    # Non-chief workers only log at DEBUG when the experiment enables debugging;
    # otherwise their root logger is raised to WARNING.
    if uses_horovod and not self.is_chief:
        if self.env.experiment_config.debug_enabled():
            log_level = logging.DEBUG
        else:
            log_level = logging.WARNING
        logging.getLogger().setLevel(log_level)

    logging.debug(
        f"TrialController initialized on rank {rank}, using hvd: {self.hvd_config.use}."
    )
def _set_data_loaders(self) -> None:
    """Build the training loader and, where applicable, the validation loader.

    The training loader repeats and skips the batches of the steps preceding
    ``first_step()``. The validation loader is sharded across all replicas
    when ``_evaluate_batch_defined()`` is true; otherwise only the chief
    builds one, unsharded. In every other case no validation loader is set.
    """
    steps_before_first = self.env.first_step() - 1
    skip_batches = steps_before_first * self.batches_per_step

    # Shard coordinates of this process; a single replica without horovod.
    if self.hvd_config.use:
        nreplicas = hvd.size()
        rank = hvd.rank()
    else:
        nreplicas = 1
        rank = 0

    training_dataset = self.trial.build_training_data_loader()
    self.training_loader = training_dataset.get_data_loader(
        repeat=True, skip=skip_batches, num_replicas=nreplicas, rank=rank
    )

    validation_dataset = self.trial.build_validation_data_loader()
    if self._evaluate_batch_defined():
        shard_args = {"num_replicas": nreplicas, "rank": rank}
    elif self.is_chief:
        # Only the chief builds a loader here, over the whole dataset.
        shard_args = {"num_replicas": 1, "rank": 0}
    else:
        return

    self.validation_loader = validation_dataset.get_data_loader(
        repeat=False, skip=0, **shard_args
    )
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """Prepare the current process before an Estimator trial starts.

    Initializes horovod when distributed training is enabled, seeds the random
    number generators and, on TensorFlow >= 2.0.0, disables v2 behavior.

    Args:
        env: Environment context providing the experiment config and trial seed.
        hvd_config: Horovod settings; ``use`` tells whether horovod is enabled.
    """
    distributed = hvd_config.use

    if distributed:
        # Make sure the TensorFlow flavor of horovod is the one in use.
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

    # Pick the random seed for this process.
    if env.experiment_config.input_from_dataflow():
        logging.debug("Using tensorpack dataflows as input.")
        # With dataflows every process gets its own seed, offset by its rank.
        process_rank = hvd.rank() if distributed else 0
        seed = env.trial_seed + process_rank
    else:
        # Identical seeds on all training processes: when using horovod, each
        # worker receives a unique shard of the dataset.
        seed = env.trial_seed
    EstimatorTrialController.set_random_seed(seed)

    running_tf2 = version.parse(tf.__version__) >= version.parse("2.0.0")
    if running_tf2:
        tf.compat.v1.disable_v2_behavior()
def _set_data_loaders(self) -> None:
    """Build the training loader and, where applicable, the validation loader.

    The training loader repeats and skips the batches already processed
    according to the initial workload; its length is stored on the context as
    ``_epoch_len``. The validation loader is sharded across all replicas when
    ``_evaluate_batch_defined()`` is true; otherwise only the chief builds
    one, unsharded. In every other case no validation loader is set.
    """
    skip_batches = self.env.initial_workload.total_batches_processed

    # Shard coordinates of this process; a single replica without horovod.
    if self.hvd_config.use:
        nreplicas = hvd.size()
        rank = hvd.rank()
    else:
        nreplicas = 1
        rank = 0

    training_dataset = self.trial.build_training_data_loader()
    self.training_loader = training_dataset.get_data_loader(
        repeat=True, skip=skip_batches, num_replicas=nreplicas, rank=rank
    )
    self.context._epoch_len = len(self.training_loader)

    validation_dataset = self.trial.build_validation_data_loader()
    if self._evaluate_batch_defined():
        val_replicas, val_rank = nreplicas, rank
    elif self.is_chief:
        # Only the chief builds a loader here, over the whole dataset.
        val_replicas, val_rank = 1, 0
    else:
        return

    self.validation_loader = validation_dataset.get_data_loader(
        repeat=False, skip=0, num_replicas=val_replicas, rank=val_rank
    )
def wrap_dataset(self, dataset: Any) -> Any:
    """
    Wrap a ``tf.data.Dataset`` right after it has been created and use the
    returned object in place of the original. Each dataset (e.g., one for
    training and one for testing) must be wrapped independently. For example,
    a training dataset built inside ``build_train_spec()`` should go through
    ``dataset = wrap_dataset(dataset)`` before being handed to
    ``tf.estimator.TrainSpec``.

    The dataset is sharded by horovod rank only when horovod is in use and the
    input does not come from a dataflow; otherwise it is returned unchanged.
    """
    hvd.require_horovod_type("tensorflow", "EstimatorContext.wrap_dataset was called.")

    self.dataset_initialized = True

    if self.hvd_config.use and not self.input_from_dataflow:
        # Give this process the slice of the data matching its rank.
        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")

    return dataset
def _set_data_loaders(self) -> None:
    """Build the training loader and, where applicable, the validation loader.

    Two styles of user-provided data are accepted. A ``pytorch.DataLoader``
    (old API) is converted through ``get_data_loader``; anything else (new
    API) is used as-is. The training loader's length is stored on the context
    as ``_epoch_len``.

    The validation loader is sharded across all replicas when
    ``_evaluate_batch_defined()`` is true; otherwise only the chief sets one,
    unsharded. In every other case no validation loader is set.
    """
    skip_batches = self.env.initial_workload.total_batches_processed

    # Shard coordinates of this process; a single replica without horovod.
    if self.hvd_config.use:
        nreplicas = hvd.size()
        rank = hvd.rank()
    else:
        nreplicas = 1
        rank = 0

    # TODO: the number of ways a user could get this wrong is alarming. Right
    # now we don't have any validation, but we should add some. Maybe
    # deprecate the old way? Or mark the new way as "advanced"?
    train_data = self.trial.build_training_data_loader()
    if isinstance(train_data, pytorch.DataLoader):
        # Old API: a user-provided det.pytorch.DataLoader.
        self.training_loader = train_data.get_data_loader(
            repeat=True, skip=skip_batches, num_replicas=nreplicas, rank=rank
        )
    else:
        # New API: assume the user called context.make_training_batch_sampler.
        self.training_loader = train_data
    self.context._epoch_len = len(self.training_loader)

    validation_data = self.trial.build_validation_data_loader()
    if self._evaluate_batch_defined():
        val_replicas, val_rank = nreplicas, rank
    elif self.is_chief:
        # Only the chief sets a loader here, over the whole dataset.
        val_replicas, val_rank = 1, 0
    else:
        return

    if isinstance(validation_data, pytorch.DataLoader):
        # Old API: a user-provided det.pytorch.DataLoader.
        self.validation_loader = validation_data.get_data_loader(
            repeat=False, skip=0, num_replicas=val_replicas, rank=val_rank
        )
    else:
        # New API: the object is used untouched. On the chief-only path this
        # is a hazard if the user built it with
        # context.make_validation_batch_sampler.
        self.validation_loader = validation_data
def wrap_dataset(self, dataset: Any) -> Any:
    """
    Wrap a ``tf.data.Dataset`` right after it has been created and use the
    returned object in place of the original. Each dataset (e.g., one for
    training and one for testing) must be wrapped independently.

    The dataset is sharded by horovod rank only when horovod is in use and
    ``dataset`` is a ``tf.data.Dataset``; otherwise it is returned unchanged.

    Args:
        dataset: tf.data.Dataset
    """
    self.dataset_initialized = True

    if self.hvd_config.use and isinstance(dataset, tf.data.Dataset):
        hvd.require_horovod_type("tensorflow.keras", "TFKerasContext.wrap_dataset was called.")
        # Give this process the slice of the data matching its rank.
        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")

    return dataset
def _init_shard(self) -> None:
    """Store this process's horovod rank and size as its shard coordinates.

    Leaves ``_shard_rank`` and ``_num_shards`` untouched when horovod is not
    in use.
    """
    if self._hvd_config.use:
        self._shard_rank = hvd.rank()
        self._num_shards = hvd.size()