def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """Prepare the process for running a TFKerasTrial.

    Initializes horovod (when distributed training is enabled), resets the TF
    graph, seeds the RNGs, and — for the Native API only — configures the TF
    session before any user code runs.
    """
    # Horovod must be initialized first so later calls can query rank info.
    if hvd_config.use:
        hvd.require_horovod_type("tensorflow.keras", "TFKerasTrial is in use.")
        hvd.init()

    # Start with a clean graph.
    tf.compat.v1.reset_default_graph()

    TFKerasTrialController._set_random_seeds(env.trial_seed)

    # For the Native API we must configure the Session before running user code.
    if env.experiment_config.native_enabled():
        TFKerasTrialController._configure_session(
            env, hvd_config, tf.compat.v1.ConfigProto(allow_soft_placement=True)
        )
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """Prepare the process for running an EstimatorTrial.

    Initializes horovod (when enabled), optionally pins CUDA devices, seeds the
    RNGs, disables TF2 behavior, sets the default TF session, and monkey-patches
    tf.estimator's checkpoint-evaluation listener — all before user code is
    imported.
    """
    # Initialize the correct horovod.
    if hvd_config.use:
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

    # This option is available for when TF ignores `gpu_options.visible_device_list`.
    # TODO (DET-3762): Remove this once it's no longer necessary.
    if env.experiment_config.get("data", {}).get("set_cuda_visible_devices", False):
        # NOTE(review): the log wording looks off — the code *sets*
        # NCCL_P2P_DISABLE=1 (i.e. disables P2P), it does not disable the
        # variable. The message is runtime text, so it is left unchanged here.
        logging.info(
            "Setting `CUDA_VISIBLE_DEVICES` environment variables "
            "and disabling NCCL_P2P_DISABLE"
        )
        os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
        os.environ["NCCL_P2P_DISABLE"] = "1"

    # Initialize random seeds.
    # Set identical random seeds on all training processes.
    # When using horovod, each worker will receive a unique
    # shard of the dataset.
    EstimatorTrialController.set_random_seed(env.trial_seed)

    if version.parse(tf.__version__) >= version.parse("2.0.0"):
        tf.compat.v1.disable_v2_behavior()

    # Set the default session before importing any user code. If the default session isn't
    # set and users call TF code that detects GPUs, it would map the processes to all of
    # the GPUs. We set the default session before importing any user code to prevent
    # this problem. This default session does not have any effect within the
    # Estimator itself.
    EstimatorTrialController._set_default_tensorflow_session(
        env=env, hvd_config=hvd_config, session_config=None
    )

    logging.debug("Applying tf.estimator patches.")

    @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate, "_evaluate")
    def patch_estimator_eval_on_checkpoint(original, *args, **kwargs):  # type: ignore
        # With a single worker and multiple devices,
        # `tf.estimator.train_and_evaluate` attempts to execute `eval_spec` even if
        # `input_fn` or `steps` is None, which causes an error when evaluating the
        # model function. Apply a monkey-patch to skip the internal function that
        # ultimately runs the evaluation.
        logging.info("Skipping %s(*%s, **%s)", original.__name__, args, kwargs)
def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
    """Prepare the process for running an EstimatorTrial.

    Initializes horovod (when enabled), seeds the RNGs — per-worker when
    tensorpack dataflows feed the input, identical across workers otherwise —
    and disables TF2 behavior on TF >= 2.0.
    """
    # Horovod must come up before anything queries rank information.
    if hvd_config.use:
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

    # Seed the RNGs. Dataflow input requires a distinct seed per process;
    # otherwise every process gets the same seed and horovod shards the
    # dataset so each worker still sees unique data.
    if env.experiment_config.input_from_dataflow():
        logging.debug("Using tensorpack dataflows as input.")
        rank_offset = hvd.rank() if hvd_config.use else 0
        EstimatorTrialController.set_random_seed(env.trial_seed + rank_offset)
    else:
        EstimatorTrialController.set_random_seed(env.trial_seed)

    if version.parse(tf.__version__) >= version.parse("2.0.0"):
        tf.compat.v1.disable_v2_behavior()
def pre_execute_hook(
    cls: Type["EstimatorTrialController"],
    env: det.EnvContext,
    distributed_backend: det._DistributedBackend,
) -> None:
    """Prepare the process for running an EstimatorTrial.

    Initializes horovod (when the distributed backend uses it), seeds the RNGs,
    disables TF2 behavior, sets the default TF session, and monkey-patches
    tf.estimator's checkpoint-evaluation listener — all before user code is
    imported.
    """
    # Initialize the correct horovod.
    if distributed_backend.use_horovod():
        hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
        hvd.init()

    # Initialize random seeds.
    # Set identical random seeds on all training processes.
    # When using horovod, each worker will receive a unique
    # shard of the dataset.
    cls.set_random_seed(env.trial_seed)

    if version.parse(tf.__version__) >= version.parse("2.0.0"):
        tf.compat.v1.disable_v2_behavior()

    # Set the default session before importing any user code. If the default session isn't
    # set and users call TF code that detects GPUs, it would map the processes to all of
    # the GPUs. We set the default session before importing any user code to prevent
    # this problem. This default session does not have any effect within the
    # Estimator itself.
    cls._set_default_tensorflow_session(
        env=env, session_config=None, use_horovod=distributed_backend.use_horovod())

    logging.debug("Applying tf.estimator patches.")

    @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate, "_evaluate")
    def patch_estimator_eval_on_checkpoint(original, *args, **kwargs):  # type: ignore
        # With a single worker and multiple devices,
        # `tf.estimator.train_and_evaluate` attempts to execute `eval_spec` even if
        # `input_fn` or `steps` is None, which causes an error when evaluating the
        # model function. Apply a monkey-patch to skip the internal function that
        # ultimately runs the evaluation.
        logging.info("Skipping %s(*%s, **%s)", original.__name__, args, kwargs)