Example #1
    def __init__(self,
                 observation_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 config: TrainerConfigDict,
                 sess: "tf1.Session",
                 obs_input: TensorType,
                 sampled_action: TensorType,
                 loss: TensorType,
                 loss_inputs: List[Tuple[str, TensorType]],
                 model: Optional[ModelV2] = None,
                 sampled_action_logp: Optional[TensorType] = None,
                 action_input: Optional[TensorType] = None,
                 log_likelihood: Optional[TensorType] = None,
                 dist_inputs: Optional[TensorType] = None,
                 dist_class: Optional[type] = None,
                 state_inputs: Optional[List[TensorType]] = None,
                 state_outputs: Optional[List[TensorType]] = None,
                 prev_action_input: Optional[TensorType] = None,
                 prev_reward_input: Optional[TensorType] = None,
                 seq_lens: Optional[TensorType] = None,
                 max_seq_len: int = 20,
                 batch_divisibility_req: int = 1,
                 update_ops: Optional[List[TensorType]] = None,
                 explore: Optional[Union[TensorType, bool]] = None,
                 timestep: Optional[TensorType] = None):
        """Initializes a Policy object.

        Args:
            observation_space (gym.spaces.Space): Observation space of the env.
            action_space (gym.spaces.Space): Action space of the env.
            config (TrainerConfigDict): The Policy config dict.
            sess (tf1.Session): The TensorFlow session to use.
            obs_input (TensorType): Input placeholder for observations, of
                shape [BATCH_SIZE, obs...].
            sampled_action (TensorType): Tensor for sampling an action, of
                shape [BATCH_SIZE, action...]
            loss (TensorType): Scalar policy loss output tensor.
            loss_inputs (List[Tuple[str, TensorType]]): A (name, placeholder)
                tuple for each loss input argument. Each placeholder name must
                correspond to a SampleBatch column key returned by
                postprocess_trajectory(), and each placeholder has shape
                [BATCH_SIZE, data...]. These keys will be read from
                postprocessed sample batches and fed into the specified
                placeholders during loss computation.
            model (ModelV2): Used to integrate custom losses and stats
                from user-defined RLlib models.
            sampled_action_logp (Optional[TensorType]): log probability of the
                sampled action.
            action_input (Optional[TensorType]): Input placeholder for actions
                for logp/log-likelihood calculations.
            log_likelihood (Optional[TensorType]): Tensor to calculate the
                log_likelihood (given action_input and obs_input).
            dist_inputs (Optional[TensorType]): Tensor to calculate the
                distribution inputs/parameters.
            dist_class (Optional[type]): An optional ActionDistribution class
                to use for generating a dist object from distribution inputs.
            state_inputs (Optional[List[TensorType]]): List of RNN state input
                Tensors.
            state_outputs (Optional[List[TensorType]]): List of RNN state
                output Tensors.
            prev_action_input (Optional[TensorType]): Placeholder for previous
                actions.
            prev_reward_input (Optional[TensorType]): Placeholder for previous
                rewards.
            seq_lens (Optional[TensorType]): Placeholder for RNN sequence
                lengths, of shape [NUM_SEQUENCES].
                Note that NUM_SEQUENCES << BATCH_SIZE. See
                policy/rnn_sequencing.py for more information.
            max_seq_len (int): Max sequence length for LSTM training.
            batch_divisibility_req (int): Pad all agent experience batches to
                multiples of this value. This only has an effect if not using
                an LSTM model.
            update_ops (List[TensorType]): Override the batchnorm update ops
                to run when applying gradients. Otherwise, all update ops
                found in the current variable scope are run.
            explore (Optional[Union[TensorType, bool]]): Placeholder for the
                `explore` parameter passed to
                Exploration.get_exploration_action. Explicitly set this to
                False to skip creating an Exploration component.
            timestep (Optional[TensorType]): Placeholder for the global
                sampling timestep.
        """
        self.framework = "tf"
        super().__init__(observation_space, action_space, config)

        # Get devices to build the graph on.
        worker_idx = self.config.get("worker_index", 0)
        if not config["_fake_gpus"] and \
                ray.worker._mode() == ray.worker.LOCAL_MODE:
            num_gpus = 0
        elif worker_idx == 0:
            num_gpus = config["num_gpus"]
        else:
            num_gpus = config["num_gpus_per_worker"]
        gpu_ids = get_gpu_devices()

        # Place on one or more CPU(s) when either:
        # - Fake GPU mode.
        # - num_gpus=0 (either set by user or we are in local_mode=True).
        # - no GPUs available.
        if config["_fake_gpus"] or num_gpus == 0 or not gpu_ids:
            logger.info("TFPolicy (worker={}) running on {}.".format(
                worker_idx if worker_idx > 0 else "local",
                f"{num_gpus} fake-GPUs" if config["_fake_gpus"] else "CPU"))
            self.devices = [
                "/cpu:0" for _ in range(int(math.ceil(num_gpus)) or 1)
            ]
        # Place on one or more actual GPU(s), when:
        # - num_gpus > 0 (set by user) AND
        # - local_mode=False AND
        # - actual GPUs available AND
        # - non-fake GPU mode.
        else:
            logger.info("TFPolicy (worker={}) running on {} GPU(s).".format(
                worker_idx if worker_idx > 0 else "local", num_gpus))

            # We are a remote worker (WORKER_MODE=1):
            # GPUs should be assigned to us by ray.
            if ray.worker._mode() == ray.worker.WORKER_MODE:
                gpu_ids = ray.get_gpu_ids()

            if len(gpu_ids) < num_gpus:
                raise ValueError(
                    "TFPolicy was not able to find enough GPU IDs! Found "
                    f"{gpu_ids}, but num_gpus={num_gpus}.")

            self.devices = [
                f"/gpu:{i}" for i, _ in enumerate(gpu_ids) if i < num_gpus
            ]

        # Disable env-info placeholder.
        if SampleBatch.INFOS in self.view_requirements:
            self.view_requirements[SampleBatch.INFOS].used_for_training = False
            self.view_requirements[
                SampleBatch.INFOS].used_for_compute_actions = False

        assert model is None or isinstance(model, (ModelV2, tf.keras.Model)), \
            "Model classes for TFPolicy other than `ModelV2|tf.keras.Model` " \
            "not allowed! You passed in {}.".format(model)
        self.model = model
        # Auto-update model's inference view requirements, if recurrent.
        if self.model is not None:
            self._update_model_view_requirements_from_init_state()

        # If `explore` is explicitly set to False, don't create an exploration
        # component.
        self.exploration = self._create_exploration() if explore is not False \
            else None

        self._sess = sess
        self._obs_input = obs_input
        self._prev_action_input = prev_action_input
        self._prev_reward_input = prev_reward_input
        self._sampled_action = sampled_action
        self._is_training = self._get_is_training_placeholder()
        self._is_exploring = explore if explore is not None else \
            tf1.placeholder_with_default(True, (), name="is_exploring")
        self._sampled_action_logp = sampled_action_logp
        self._sampled_action_prob = (tf.math.exp(self._sampled_action_logp)
                                     if self._sampled_action_logp is not None
                                     else None)
        self._action_input = action_input  # For logp calculations.
        self._dist_inputs = dist_inputs
        self.dist_class = dist_class

        self._state_inputs = state_inputs or []
        self._state_outputs = state_outputs or []
        self._seq_lens = seq_lens
        self._max_seq_len = max_seq_len

        if self._state_inputs and self._seq_lens is None:
            raise ValueError(
                "seq_lens tensor must be given if state inputs are defined")

        self._batch_divisibility_req = batch_divisibility_req
        self._update_ops = update_ops
        self._apply_op = None
        self._stats_fetches = {}
        self._timestep = timestep if timestep is not None else \
            tf1.placeholder_with_default(
                tf.zeros((), dtype=tf.int64), (), name="timestep")

        self._optimizer = None
        self._grads_and_vars = None
        self._grads = None
        # Policy tf-variables (weights), whose values to get/set via
        # get_weights/set_weights.
        self._variables = None
        # Local optimizer's tf-variables (e.g. state vars for Adam).
        # Will be stored alongside `self._variables` when checkpointing.
        self._optimizer_variables = None

        # The loss tf-op.
        self._loss = None
        # A batch dict passed into loss function as input.
        self._loss_input_dict = {}
        if loss is not None:
            self._initialize_loss(loss, loss_inputs)

        # The log-likelihood calculator op.
        self._log_likelihood = log_likelihood
        if self._log_likelihood is None and self._dist_inputs is not None and \
                self.dist_class is not None:
            self._log_likelihood = self.dist_class(
                self._dist_inputs, self.model).logp(self._action_input)
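
The constructor above is normally reached through a TFPolicy subclass (e.g. DynamicTFPolicy), but the (name, placeholder) contract of `loss_inputs` can be illustrated on its own. The following is a minimal, hypothetical sketch (the placeholder names, the toy surrogate loss, and the "advantages" column are assumptions for illustration only); it only shows how SampleBatch column keys are paired with the placeholders that would be handed to TFPolicy.__init__ as obs_input, sampled_action, loss, and loss_inputs.

import numpy as np
import gym

from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()
tf1.disable_eager_execution()

# Hypothetical spaces: 4-dim observations, 2 discrete actions.
obs_space = gym.spaces.Box(-1.0, 1.0, (4,), dtype=np.float32)
action_space = gym.spaces.Discrete(2)

# Placeholders that would become `obs_input` plus one extra loss input.
obs_ph = tf1.placeholder(tf.float32, [None, 4], name="observations")
adv_ph = tf1.placeholder(tf.float32, [None], name="advantages")

# Toy linear "model" and sampled action (stands in for `sampled_action`).
weights = tf1.get_variable("logits_w", shape=[4, action_space.n])
logits = tf.matmul(obs_ph, weights)
sampled_action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)

# Toy surrogate loss; a real policy would use its action distribution here.
logp_all = tf.nn.log_softmax(logits)
loss = -tf.reduce_mean(adv_ph * tf.reduce_max(logp_all, axis=1))

# Each (name, placeholder) pair must match a column key that
# postprocess_trajectory() writes into the SampleBatch.
loss_inputs = [
    (SampleBatch.CUR_OBS, obs_ph),
    ("advantages", adv_ph),
]
# obs_ph, sampled_action, loss, and loss_inputs would then be passed to
# TFPolicy.__init__ together with a tf1.Session and a config dict.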
Example #2
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = config.get("framework", "tfe")
            Policy.__init__(self, observation_space, action_space, config)

            # Log device and worker index.
            from ray.rllib.evaluation.rollout_worker import get_global_worker
            worker = get_global_worker()
            worker_idx = worker.worker_index if worker else 0
            if get_gpu_devices():
                logger.info(
                    "TF-eager Policy (worker={}) running on GPU.".format(
                        worker_idx if worker_idx > 0 else "local"))
            else:
                logger.info(
                    "TF-eager Policy (worker={}) running on CPU.".format(
                        worker_idx if worker_idx > 0 else "local"))

            self._is_training = False
            self._loss_initialized = False

            self._loss = loss_fn
            self.batch_divisibility_req = get_batch_divisibility_req(self) if \
                callable(get_batch_divisibility_req) else \
                (get_batch_divisibility_req or 1)
            self._max_seq_len = config["model"]["max_seq_len"]

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Lock used for locking some methods on the object-level.
            # This prevents possible race conditions when calling the model
            # first, then its value function (e.g. in a loss function), in
            # between of which another model call is made (e.g. to compute an
            # action).
            self._lock = threading.RLock()

            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_view_requirements_from_init_state()

            self.exploration = self._create_exploration()
            self._state_inputs = self.model.get_initial_state()
            self._is_recurrent = len(self._state_inputs) > 0

            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(self.model.view_requirements)

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if getattr(self, "exploration", None):
                optimizers = self.exploration.get_exploration_optimizer(
                    optimizers)

            # The list of local (tf) optimizers (one per loss term).
            self._optimizers: List[LocalOptimizer] = optimizers
            # Backward compatibility: A user's policy may only support a single
            # loss term and optimizer (no lists).
            self._optimizer: LocalOptimizer = \
                optimizers[0] if optimizers else None

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Need to reset global_timestep again after the fake run-throughs.
            self.global_timestep = 0
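
The free variables referenced in Example #2 (loss_fn, make_model, optimizer_fn, stats_fn, etc.) indicate that this __init__ is defined inside a policy-class factory, most likely RLlib's build_eager_tf_policy. Assuming that helper, the sketch below shows one minimal, hypothetical way to produce such an eager policy class; toy_loss_fn and its REINFORCE-style surrogate are illustrative assumptions, not RLlib code.

from ray.rllib.policy.eager_tf_policy import build_eager_tf_policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()


def toy_loss_fn(policy, model, dist_class, train_batch):
    # Re-run the model on the train batch and rebuild the action distribution.
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    logp = action_dist.logp(train_batch[SampleBatch.ACTIONS])
    # REINFORCE-style surrogate on the per-step rewards (illustration only).
    return -tf.reduce_mean(logp * train_batch[SampleBatch.REWARDS])


# Returns a Policy subclass whose __init__ is the closure shown above; a
# Trainer would instantiate it with framework="tfe" (or "tf2") in its config.
MyEagerPolicy = build_eager_tf_policy(
    name="MyEagerPolicy",
    loss_fn=toy_loss_fn,
)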