Code example #1
0
    def pre_execute_hook(env: det.EnvContext,
                         hvd_config: horovod.HorovodContext) -> None:
        """Prepare the process before any user code is imported or run.

        Brings up horovod (when enabled), resets TF graph state, seeds the
        RNGs, and — for the Native API — configures the TF session up front.
        """
        # Horovod must come up before anything else touches it.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow.keras",
                                     "TFKerasTrial is in use.")
            hvd.init()

        # Drop any graph state left over from earlier work in this process.
        tf.compat.v1.reset_default_graph()

        TFKerasTrialController._set_random_seeds(env.trial_seed)

        # Native API: the Session has to be configured before user code runs.
        if env.experiment_config.native_enabled():
            config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
            TFKerasTrialController._configure_session(env, hvd_config, config)
Code example #2
0
    def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
        """Set up the process environment before importing any user code.

        Initializes horovod (when enabled), pins CUDA device visibility if
        requested, seeds the RNGs, forces TF1 semantics under TF2, installs a
        default TF session, and patches ``tf.estimator`` internals.
        """
        # Bring up the proper horovod flavor before anything else.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

            # Escape hatch for TF builds that ignore `gpu_options.visible_device_list`.
            # TODO (DET-3762): Remove this once it's no longer necessary.
            pin_devices = env.experiment_config.get("data", {}).get(
                "set_cuda_visible_devices", False
            )
            if pin_devices:
                logging.info(
                    "Setting `CUDA_VISIBLE_DEVICES` environment variables "
                    "and disabling NCCL_P2P_DISABLE"
                )
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
                os.environ["NCCL_P2P_DISABLE"] = "1"

        # Seed every training process identically; with horovod, each worker
        # still receives a unique shard of the dataset.
        EstimatorTrialController.set_random_seed(env.trial_seed)

        # Estimators rely on TF1 semantics, so switch them on under TF2+.
        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()

        # Install a default session before importing user code; otherwise,
        # user code that detects GPUs could map this process onto all of
        # them. The default session has no effect inside the Estimator
        # itself.
        EstimatorTrialController._set_default_tensorflow_session(
            env=env, hvd_config=hvd_config, session_config=None
        )

        logging.debug("Applying tf.estimator patches.")

        @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate, "_evaluate")
        def patch_estimator_eval_on_checkpoint(original, *args, **kwargs):  # type: ignore
            # With a single worker and multiple devices,
            # `tf.estimator.train_and_evaluate` tries to run `eval_spec` even
            # when `input_fn` or `steps` is None, which breaks evaluation of
            # the model function. Patch out the internal routine that would
            # ultimately run the evaluation.
            logging.info("Skipping %s(*%s, **%s)", original.__name__, args, kwargs)
Code example #3
0
    def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
        """Prepare horovod, RNG seeds, and TF compatibility before user code.

        Seeding differs by input source: tensorpack dataflows get a distinct
        seed per process, while dataset input uses one shared seed (horovod
        already gives each worker a unique dataset shard).
        """
        # Horovod must be initialized before anything else uses it.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

        # Seed the RNGs.
        if env.experiment_config.input_from_dataflow():
            logging.debug("Using tensorpack dataflows as input.")
            rank = hvd.rank() if hvd_config.use else 0
            EstimatorTrialController.set_random_seed(env.trial_seed + rank)
        else:
            # One identical seed across all training processes; sharding
            # comes from horovod, not the seed.
            EstimatorTrialController.set_random_seed(env.trial_seed)

        # Estimators rely on TF1 semantics, so force them under TF2+.
        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()
Code example #4
0
    def pre_execute_hook(
        cls: Type["EstimatorTrialController"],
        env: det.EnvContext,
        distributed_backend: det._DistributedBackend,
    ) -> None:
        """Set up the process before any user code is imported.

        Initializes horovod (when that backend is selected), seeds the RNGs,
        forces TF1 semantics under TF2, installs a default TF session, and
        patches ``tf.estimator`` internals.
        """
        # Horovod has to come up before any other setup.
        if distributed_backend.use_horovod():
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

        # Seed all training processes identically; under horovod each worker
        # still receives a unique shard of the dataset.
        cls.set_random_seed(env.trial_seed)

        # Estimators rely on TF1 semantics, so switch them on under TF2+.
        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()

        # Install the default session before user code is imported; without
        # it, user code that detects GPUs could map this process onto all of
        # them. The default session has no effect inside the Estimator
        # itself.
        cls._set_default_tensorflow_session(
            env=env,
            session_config=None,
            use_horovod=distributed_backend.use_horovod(),
        )

        logging.debug("Applying tf.estimator patches.")

        @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate, "_evaluate")
        def patch_estimator_eval_on_checkpoint(original, *args, **kwargs):  # type: ignore
            # With a single worker and multiple devices,
            # `tf.estimator.train_and_evaluate` tries to run `eval_spec` even
            # when `input_fn` or `steps` is None, which breaks evaluation of
            # the model function. Patch out the internal routine that would
            # ultimately run the evaluation.
            logging.info("Skipping %s(*%s, **%s)", original.__name__, args, kwargs)