Example #1
    def _step(self, time_step, policy_state):
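        # Reset the policy state for environments that are at the first step
        # of a new episode.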
        policy_state = common.reset_state_if_necessary(policy_state,
                                                       self._initial_state,
                                                       time_step.is_first())
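        # Run one step of the algorithm to produce a policy step holding an
        # action distribution and the updated policy state.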
        policy_step = common.algorithm_step(
            self._algorithm,
            self._observation_transformer,
            time_step,
            state=policy_state,
            training=self._training,
            greedy_predict=self._greedy_predict)
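        # Sample a concrete action from the distribution and step the
        # environment with it.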
        action = common.sample_action_distribution(policy_step.action)
        next_time_step = self._env_step(action)
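        # Feed the completed transition to every trajectory observer.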
        if self._observers:
            traj = from_transition(time_step,
                                   policy_step._replace(action=action),
                                   next_time_step)
            for observer in self._observers:
                observer(traj)
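        # Feed the experience, along with the action distribution parameters,
        # to every experience observer.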
        if self._exp_observers:
            action_distribution_param = common.get_distribution_params(
                policy_step.action)
            exp = make_experience(
                time_step,
                policy_step._replace(action=action),
                action_distribution=action_distribution_param,
                state=policy_state if self._use_rollout_state else ())
            for observer in self._exp_observers:
                observer(exp)

        return next_time_step, policy_step, action
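Each observer above is just a callable that receives the trajectory built by from_transition. Below is a minimal sketch of one, assuming only that calling convention; the TrajectoryCounter name is illustrative, not part of the library:

class TrajectoryCounter(object):
    """Counts the trajectories produced by the driver (illustrative only)."""

    def __init__(self):
        self.count = 0

    def __call__(self, traj):
        # The driver invokes each observer once per _step() with the
        # batched trajectory.
        self.count += 1

How the counter gets registered depends on how the surrounding driver class exposes its observer list, so the sketch stops short of wiring it in.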

    def get_training_exps(self):
        """
        Get training experiences from the learning queue

        Returns:
            exp (Experience):
            env_id (tf.tensor): if not None, has the shape of (`num_envs`). Each
                element of `env_ids` indicates which batched env the data come from.
            steps (int): how many environment steps this batch of exps contain
        """
        batch = make_learning_batch(*self._tfq.learn_queue.dequeue_all())
        # convert the batch to the experience format
        exp = make_experience(batch.time_step, batch.policy_step,
                              batch.act_dist_param)
        # Transpose the experience to be batch major for each environment
        # (swap the unroll/time axis with the env batch axis).
        exp = tf.nest.map_structure(lambda e: common.transpose2(e, 1, 2), exp)
        num_envs, unroll_length, env_batch_size = (
            batch.time_step.observation.shape[:3])
        steps = num_envs * unroll_length * env_batch_size
        return exp, batch.env_id, steps
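For reference, the transpose above swaps the unroll (time) axis with the env batch axis so that each environment's experience becomes batch major. Below is a minimal sketch of the same axis swap using plain tf.transpose, assuming common.transpose2(t, 1, 2) swaps dimensions 1 and 2; all shapes here are illustrative:

import tensorflow as tf

# A batch shaped [num_envs, unroll_length, env_batch_size, obs_dim].
obs = tf.zeros([4, 8, 16, 10])

# Swap axes 1 and 2, yielding
# [num_envs, env_batch_size, unroll_length, obs_dim].
batch_major = tf.transpose(obs, perm=[0, 2, 1, 3])
assert batch_major.shape == (4, 16, 8, 10)

# The step count that get_training_exps() would report for this batch:
steps = 4 * 8 * 16  # num_envs * unroll_length * env_batch_size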