Example #1
    def evaluate_batch(
        self, mini_batch: Dict[str, np.ndarray]
    ) -> RewardSignalResult:
        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.model.batch_size: len(mini_batch["actions"]),
            self.policy.model.sequence_length: self.policy.sequence_length,
        }
        if self.model.use_vail:
            feed_dict[self.model.use_noise] = [0]

        if self.policy.use_vec_obs:
            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
        if self.policy.model.vis_obs_size > 0:
            for i in range(len(self.policy.model.visual_in)):
                _obs = mini_batch["visual_obs%d" % i]
                feed_dict[self.policy.model.visual_in[i]] = _obs

        if self.policy.use_continuous_act:
            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
        else:
            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
        feed_dict[self.model.done_policy_holder] = np.array(
            mini_batch["done"]).flatten()
        unscaled_reward = self.policy.sess.run(self.model.intrinsic_reward,
                                               feed_dict=feed_dict)
        scaled_reward = unscaled_reward * float(
            self.has_updated) * self.strength
        return RewardSignalResult(scaled_reward, unscaled_reward)
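
To make the expected input concrete, here is a minimal sketch of a mini_batch dictionary for this method, assuming a batch of two steps with 8-dimensional vector observations and 3-dimensional continuous actions; the keys match the ones read above, but the shapes and dtypes are illustrative assumptions.

import numpy as np

# Hypothetical shapes: 2 steps, 8-dim vector observations, 3-dim continuous actions.
mini_batch = {
    "actions": np.zeros((2, 3), dtype=np.float32),     # fed into selected_actions
    "vector_obs": np.zeros((2, 8), dtype=np.float32),  # fed into vector_in
    "done": np.zeros((2, 1), dtype=np.float32),        # flattened into done_policy_holder
}
# evaluate_batch(mini_batch) would feed these arrays into the TensorFlow graph and
# return a RewardSignalResult(scaled_reward, unscaled_reward).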
Example #2
    def evaluate_batch(
        self, mini_batch: Dict[str, np.ndarray]
    ) -> RewardSignalResult:
        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.batch_size_ph: len(mini_batch["actions"]),
            self.policy.sequence_length_ph: self.policy.sequence_length,
        }
        if self.policy.use_vec_obs:
            feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
            feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
        if self.policy.vis_obs_size > 0:
            for i in range(len(self.policy.visual_in)):
                _obs = mini_batch["visual_obs%d" % i]
                _next_obs = mini_batch["next_visual_obs%d" % i]
                feed_dict[self.policy.visual_in[i]] = _obs
                feed_dict[self.model.next_visual_in[i]] = _next_obs

        if self.policy.use_continuous_act:
            feed_dict[self.policy.selected_actions] = mini_batch["actions"]
        else:
            feed_dict[self.policy.output] = mini_batch["actions"]
        unscaled_reward = self.policy.sess.run(self.model.intrinsic_reward,
                                               feed_dict=feed_dict)
        scaled_reward = np.clip(
            unscaled_reward * float(self.has_updated) * self.strength, 0, 1)
        return RewardSignalResult(scaled_reward, unscaled_reward)
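
Note that this variant clips the scaled curiosity reward to the range [0, 1]. A minimal sketch of just that scaling step, with made-up values for strength and has_updated:

import numpy as np

# Illustration only: strength and has_updated are assumed attributes of the signal.
unscaled_reward = np.array([0.3, 2.5, -0.1], dtype=np.float32)
strength = 0.5
has_updated = True  # the curiosity model has been updated at least once

scaled_reward = np.clip(unscaled_reward * float(has_updated) * strength, 0, 1)
# roughly [0.15, 1.0, 0.0]: large rewards are capped at 1 and negatives at 0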
Example #3

def test_add_rewards_output(dummy_config):
    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0",
                         False)
    rewardsout = AllRewardsOutput(
        reward_signals={
            "extrinsic":
            RewardSignalResult(scaled_reward=np.array([1.0, 1.0]),
                               unscaled_reward=np.array([1.0, 1.0]))
        },
        environment=np.array([1.0, 1.0]),
    )
    values = {"extrinsic": np.array([[2.0]])}
    agent_id = "123"
    idx = 0
    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
    next_idx = 1
    trainer.add_rewards_outputs(
        rewardsout,
        values=values,
        agent_id=agent_id,
        agent_idx=idx,
        agent_next_idx=next_idx,
    )
    assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][
        0] == 2.0
    assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
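
The test above constructs RewardSignalResult by keyword, so it behaves like a two-field named tuple. A minimal stand-in (not the library's definition) that shows how the scaled and unscaled fields line up:

from typing import NamedTuple
import numpy as np

# Stand-in for illustration; the real class lives in the ml-agents reward_signals code.
class RewardSignalResult(NamedTuple):
    scaled_reward: np.ndarray
    unscaled_reward: np.ndarray

result = RewardSignalResult(
    scaled_reward=np.array([1.0, 1.0]),
    unscaled_reward=np.array([1.0, 1.0]),
)
assert result.scaled_reward[0] == 1.0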
Example #4

    def evaluate(self, current_info: BrainInfo,
                 next_info: BrainInfo) -> RewardSignalResult:
        """
        Evaluates the reward for the agents present in current_info given the next_info
        :param current_info: The current BrainInfo.
        :param next_info: The BrainInfo from the next timestep.
        :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
        """
        if len(current_info.agents) == 0:
            return RewardSignalResult([], [])
        mini_batch: Dict[str, np.ndarray] = {}
        # Construct the batch and use evaluate_batch
        mini_batch["actions"] = next_info.previous_vector_actions
        mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
        for i in range(len(current_info.visual_observations)):
            mini_batch["visual_obs%d" %
                       i] = current_info.visual_observations[i]
            mini_batch["next_visual_obs%d" %
                       i] = next_info.visual_observations[i]
        if self.policy.use_vec_obs:
            mini_batch["vector_obs"] = current_info.vector_observations
            mini_batch["next_vector_in"] = next_info.vector_observations

        result = self.evaluate_batch(mini_batch)
        return result
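
For reference, a small sketch of how the per-camera keys end up laid out when there is more than one visual observation; the camera count and frame shapes here are made up.

import numpy as np

# Hypothetical: two cameras, four agents, 84x84 RGB frames.
current_visual = [np.zeros((4, 84, 84, 3), dtype=np.float32) for _ in range(2)]
next_visual = [np.zeros((4, 84, 84, 3), dtype=np.float32) for _ in range(2)]

mini_batch = {}
for i in range(len(current_visual)):
    mini_batch["visual_obs%d" % i] = current_visual[i]
    mini_batch["next_visual_obs%d" % i] = next_visual[i]

# Keys: visual_obs0, next_visual_obs0, visual_obs1, next_visual_obs1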
Example #5

    def evaluate(
        self, current_info: BrainInfo, next_info: BrainInfo
    ) -> RewardSignalResult:
        if len(current_info.agents) == 0:
            return RewardSignalResult([], [])

        feed_dict: Dict[tf.Tensor, Any] = {
            self.policy.model.batch_size: len(next_info.vector_observations),
            self.policy.model.sequence_length: 1,
        }
        if self.model.use_vail:
            feed_dict[self.model.use_noise] = [0]

        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
        feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1])
        if self.policy.use_continuous_act:
            feed_dict[
                self.policy.model.selected_actions
            ] = next_info.previous_vector_actions
        else:
            feed_dict[
                self.policy.model.action_holder
            ] = next_info.previous_vector_actions
        if self.policy.use_recurrent:
            if current_info.memories.shape[1] == 0:
                current_info.memories = self.policy.make_empty_memory(
                    len(current_info.agents)
                )
            feed_dict[self.policy.model.memory_in] = current_info.memories
        unscaled_reward = self.policy.sess.run(
            self.model.intrinsic_reward, feed_dict=feed_dict
        )
        scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
        return RewardSignalResult(scaled_reward, unscaled_reward)
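
The float(self.has_updated) factor acts as a gate: until the reward model has been updated at least once, the intrinsic reward is zeroed out. A tiny numeric illustration with made-up values:

import numpy as np

unscaled_reward = np.array([0.8, 0.2], dtype=np.float32)
strength = 1.0

# Before the first update the gate multiplies the reward by 0.0 ...
print(unscaled_reward * float(False) * strength)  # [0. 0.]
# ... and afterwards it passes the reward through, scaled by strength.
print(unscaled_reward * float(True) * strength)   # [0.8 0.2]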
Example #6
    def evaluate(self, current_info: BrainInfo, action: np.ndarray,
                 next_info: BrainInfo) -> RewardSignalResult:
        """
        Evaluates the reward for the agents present in current_info given the next_info
        :param current_info: The current BrainInfo.
        :param action: The action taken by the agents between current_info and next_info.
        :param next_info: The BrainInfo from the next timestep.
        :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
        """
        unscaled_reward = np.array(next_info.rewards)
        scaled_reward = self.strength * unscaled_reward
        return RewardSignalResult(scaled_reward, unscaled_reward)
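
This extrinsic signal does no learning at all; it simply rescales the environment reward by strength. A one-line illustration with made-up numbers:

import numpy as np

rewards_from_env = np.array([0.0, 1.0, -0.5], dtype=np.float32)
strength = 2.0
scaled = strength * rewards_from_env  # [ 0.  2. -1.]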
Example #7
    def evaluate(self, current_info: BrainInfo, action: np.ndarray,
                 next_info: BrainInfo) -> RewardSignalResult:
        if len(current_info.agents) == 0:
            return RewardSignalResult([], [])
        mini_batch: Dict[str, np.ndarray] = {}
        # Construct the batch
        mini_batch["actions"] = action
        mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
        for i, obs in enumerate(current_info.visual_observations):
            mini_batch["visual_obs%d" % i] = obs
        if self.policy.use_vec_obs:
            mini_batch["vector_obs"] = current_info.vector_observations

        result = self.evaluate_batch(mini_batch)
        return result
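
The np.reshape(next_info.local_done, [-1, 1]) call above turns the flat list of per-agent done flags into a column, which is the layout the batch expects; a quick illustration:

import numpy as np

local_done = [False, True, False]
done_column = np.reshape(local_done, [-1, 1])
# array([[False],
#        [ True],
#        [False]])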
Example #8

    def evaluate(
        self, current_info: BrainInfo, next_info: BrainInfo
    ) -> RewardSignalResult:
        """
        Evaluates the reward for the agents present in current_info given the next_info
        :param current_info: The current BrainInfo.
        :param next_info: The BrainInfo from the next timestep.
        :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
        """
        if len(current_info.agents) == 0:
            return RewardSignalResult([], [])

        feed_dict = {
            self.policy.model.batch_size: len(next_info.vector_observations),
            self.policy.model.sequence_length: 1,
        }
        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
        if self.policy.use_continuous_act:
            feed_dict[
                self.policy.model.selected_actions
            ] = next_info.previous_vector_actions
        else:
            feed_dict[
                self.policy.model.action_holder
            ] = next_info.previous_vector_actions
        for i in range(self.policy.model.vis_obs_size):
            feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
        if self.policy.use_vec_obs:
            feed_dict[self.model.next_vector_in] = next_info.vector_observations
        if self.policy.use_recurrent:
            if current_info.memories.shape[1] == 0:
                current_info.memories = self.policy.make_empty_memory(
                    len(current_info.agents)
                )
            feed_dict[self.policy.model.memory_in] = current_info.memories
        unscaled_reward = self.policy.sess.run(
            self.model.intrinsic_reward, feed_dict=feed_dict
        )
        scaled_reward = np.clip(
            unscaled_reward * float(self.has_updated) * self.strength, 0, 1
        )
        return RewardSignalResult(scaled_reward, unscaled_reward)
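
When the policy is recurrent, an "empty" memory matrix (zero columns) is replaced with zeros of the proper width before being fed into memory_in. A sketch of that check with a stand-in helper; the real make_empty_memory belongs to the policy, and the memory size here is an assumption:

import numpy as np

def make_empty_memory(num_agents, memory_size=128):
    # Stand-in for the policy helper; memory_size is an assumed value.
    return np.zeros((num_agents, memory_size), dtype=np.float32)

memories = np.zeros((3, 0), dtype=np.float32)  # zero-width memories for 3 agents
if memories.shape[1] == 0:
    memories = make_empty_memory(3)
print(memories.shape)  # (3, 128)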
Example #9
def test_add_rewards_output(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0",
                         False)
    rewardsout = AllRewardsOutput(
        reward_signals={
            "extrinsic":
            RewardSignalResult(
                scaled_reward=np.array([1.0, 1.0], dtype=np.float32),
                unscaled_reward=np.array([1.0, 1.0], dtype=np.float32),
            )
        },
        environment=np.array([1.0, 1.0], dtype=np.float32),
    )
    values = {"extrinsic": np.array([[2.0]], dtype=np.float32)}
    agent_id = "123"
    idx = 0
    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
    next_idx = 1
    trainer.add_rewards_outputs(
        rewardsout,
        values=values,
        agent_id=agent_id,
        agent_idx=idx,
        agent_next_idx=next_idx,
    )
    assert trainer.processing_buffer[agent_id]["extrinsic_value_estimates"][
        0] == 2.0
    assert trainer.processing_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
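
The comment about next_idx is the point of the test: the reward written into the buffer is taken from the agent's next index in the batch, while the value estimate is read at the current index. A small indexing sketch with the same made-up numbers as the test:

import numpy as np

environment_rewards = np.array([1.0, 1.0], dtype=np.float32)
value_estimates = np.array([[2.0]], dtype=np.float32)
idx, next_idx = 0, 1

reward_for_agent = environment_rewards[next_idx]  # grabbed from the *next* index
value_for_agent = value_estimates[idx][0]         # grabbed at the current index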
Example #10
    def evaluate_batch(
        self, mini_batch: Dict[str, np.ndarray]
    ) -> RewardSignalResult:
        env_rews = np.array(mini_batch["environment_rewards"])
        return RewardSignalResult(self.strength * env_rews, env_rews)
Example #11
    def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
        env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
        return RewardSignalResult(self.strength * env_rews, env_rews)
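
The later variant above takes an AgentBuffer instead of a plain dictionary, but both only read the "environment_rewards" entry, so a plain dict is enough to exercise the same logic in isolation; the real AgentBuffer type comes from the ml-agents buffer module.

import numpy as np

# A dict stands in for AgentBuffer here, purely for illustration.
mini_batch = {"environment_rewards": [0.0, 1.0, 0.5]}
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
strength = 1.0
scaled, unscaled = strength * env_rews, env_rews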