def create_reward_signals(self, reward_signal_configs):
    """
    Create reward signals
    :param reward_signal_configs: Reward signal config.
    """
    with self.graph.as_default():
        with tf.variable_scope(TOWER_SCOPE_NAME, reuse=tf.AUTO_REUSE):
            # Build one set of reward signals per device (tower), suffixing
            # each update-dict key with the device id.
            for device_id, device in enumerate(self.devices):
                with tf.device(device):
                    reward_tower = {}
                    for reward_signal, config in reward_signal_configs.items():
                        reward_tower[reward_signal] = create_reward_signal(
                            self, self.towers[device_id], reward_signal, config
                        )
                        for k, v in reward_tower[reward_signal].update_dict.items():
                            self.update_dict[k + "_" + str(device_id)] = v
                    self.reward_signal_towers.append(reward_tower)
            # Average each reward-signal statistic across towers and store it
            # under the unsuffixed key.
            for _, reward_tower in self.reward_signal_towers[0].items():
                for _, update_key in reward_tower.stats_name_to_update_name.items():
                    all_reward_signal_stats = tf.stack(
                        [
                            self.update_dict[update_key + "_" + str(i)]
                            for i in range(len(self.towers))
                        ]
                    )
                    mean_reward_signal_stats = tf.reduce_mean(all_reward_signal_stats, 0)
                    self.update_dict.update({update_key: mean_reward_signal_stats})

        self.reward_signals = self.reward_signal_towers[0]
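The per-device bookkeeping above boils down to a naming convention: each tower stores its statistics under a key suffixed with its device id, and the unsuffixed key holds the mean across towers. Below is a TensorFlow-free sketch of that convention, using an illustrative stat name ("gail_loss") and plain floats in place of tensors.

# Sketch only: plain-Python stand-in for the tower averaging done with
# tf.stack / tf.reduce_mean above. "gail_loss" is an illustrative key name.
num_towers = 2
update_dict = {}
for device_id in range(num_towers):
    # Each tower would contribute its own loss tensor; floats stand in here.
    update_dict["gail_loss_" + str(device_id)] = 0.5 + 0.25 * device_id

# The unsuffixed key stores the cross-tower mean, as in the last loop above.
update_dict["gail_loss"] = sum(
    update_dict["gail_loss_" + str(i)] for i in range(num_towers)
) / num_towers
print(update_dict["gail_loss"])  # 0.625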
Example #2
def create_reward_signals(
    self, reward_signal_configs: Dict[RewardSignalType, Any]
) -> None:
    """
    Create reward signals
    :param reward_signal_configs: Reward signal config.
    """
    # Create reward signals
    for reward_signal, settings in reward_signal_configs.items():
        # Name reward signals by string in case we have duplicates later
        self.reward_signals[reward_signal.value] = create_reward_signal(
            self.policy, reward_signal, settings
        )
        self.update_dict.update(self.reward_signals[reward_signal.value].update_dict)
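This variant keys the incoming config by an enum and registers the created signals under the enum's string value. The sketch below shows why .value is used as the dict key; the enum is a stand-in with assumed members, not the real ML-Agents RewardSignalType.

from enum import Enum
from typing import Any, Dict

# Stand-in for ML-Agents' RewardSignalType; the members here are assumed.
class RewardSignalType(Enum):
    EXTRINSIC = "extrinsic"
    CURIOSITY = "curiosity"

reward_signal_configs: Dict[RewardSignalType, Any] = {
    RewardSignalType.EXTRINSIC: {"strength": 1.0, "gamma": 0.99},
}

reward_signals = {}
for reward_signal, settings in reward_signal_configs.items():
    # Keying by .value yields plain string names ("extrinsic"), which keeps
    # downstream dicts readable and easy to serialize.
    reward_signals[reward_signal.value] = settings

print(list(reward_signals))  # ['extrinsic']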
Example #3
def create_reward_signals(self, reward_signal_configs):
    """
    Create reward signals
    :param reward_signal_configs: Reward signal config.
    """
    self.reward_signals = {}
    with self.graph.as_default():
        # Create reward signals
        for reward_signal, config in reward_signal_configs.items():
            self.reward_signals[reward_signal] = create_reward_signal(
                self, self.model, reward_signal, config
            )
            self.update_dict.update(self.reward_signals[reward_signal].update_dict)
Example #4
def create_reward_signals(self, reward_signal_configs: Dict[str, Any]) -> None:
    """
    Create reward signals
    :param reward_signal_configs: Reward signal config.
    """
    self.reward_signals: Dict[str, RewardSignal] = {}
    with self.graph.as_default():
        # Create reward signals
        for reward_signal, config in reward_signal_configs.items():
            if type(config) is dict:
                self.reward_signals[reward_signal] = create_reward_signal(
                    self, self.model, reward_signal, config
                )
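The type(config) is dict check means a reward signal is only created for entries that are actually nested mappings; anything else in the config is skipped. A small, hypothetical illustration (keys and values assumed):

# Hypothetical input: one well-formed entry and one stray scalar.
reward_signal_configs = {
    "extrinsic": {"strength": 1.0, "gamma": 0.99},
    "curiosity": 0.01,  # not a dict, so no reward signal would be created
}

for reward_signal, config in reward_signal_configs.items():
    if type(config) is dict:
        print("would create reward signal:", reward_signal)
# prints only: would create reward signal: extrinsic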
Example #5
def create_reward_signals(self, reward_signal_configs: Dict[str, Any]) -> None:
    """
    Create reward signals
    :param reward_signal_configs: Reward signal config.
    """
    self.reward_signals = {}
    # Create reward signals
    for reward_signal, config in reward_signal_configs.items():
        self.reward_signals[reward_signal] = create_reward_signal(
            self.policy, reward_signal, config
        )
        self.update_dict.update(self.reward_signals[reward_signal].update_dict)
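Across these variants the pattern is the same: each created reward signal exposes an update_dict of named update ops and statistics, and the policy merges them all into one shared update_dict. Below is a TensorFlow-free mock of that merge; the class and key names are illustrative, not the ML-Agents API.

# Mock reward signal exposing an update_dict, as the real ones do above.
class MockRewardSignal:
    def __init__(self, name: str):
        self.update_dict = {name + "_loss": "<" + name + " update op>"}

reward_signals = {}
update_dict = {}
for name in ("extrinsic", "curiosity"):
    reward_signals[name] = MockRewardSignal(name)
    # Same merge as self.update_dict.update(...) in the snippets above.
    update_dict.update(reward_signals[name].update_dict)

print(sorted(update_dict))  # ['curiosity_loss', 'extrinsic_loss']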
Example #6
def __init__(self, seed, brain, trainer_params, is_training, load):
        """
        Policy for Proximal Policy Optimization Networks.
        :param seed: Random seed.
        :param brain: Assigned Brain object.
        :param trainer_params: Defined training parameters.
        :param is_training: Whether the model should be trained.
        :param load: Whether a pre-trained model will be loaded or a new one created.
        """
        super().__init__(seed, brain, trainer_params)

        reward_signal_configs = trainer_params["reward_signals"]

        self.reward_signals = {}
        with self.graph.as_default():
            self.model = PPOModel(
                brain,
                lr=float(trainer_params["learning_rate"]),
                h_size=int(trainer_params["hidden_units"]),
                epsilon=float(trainer_params["epsilon"]),
                beta=float(trainer_params["beta"]),
                max_step=float(trainer_params["max_steps"]),
                normalize=trainer_params["normalize"],
                use_recurrent=trainer_params["use_recurrent"],
                num_layers=int(trainer_params["num_layers"]),
                m_size=self.m_size,
                seed=seed,
                stream_names=list(reward_signal_configs.keys()),
                vis_encode_type=EncoderType(
                    trainer_params.get("vis_encode_type", "simple")),
            )
            self.model.create_ppo_optimizer()

            # Create reward signals
            for reward_signal, config in reward_signal_configs.items():
                self.reward_signals[reward_signal] = create_reward_signal(
                    self, reward_signal, config)

            # Create pretrainer if needed
            if "pretraining" in trainer_params:
                BCModule.check_config(trainer_params["pretraining"])
                self.bc_module = BCModule(
                    self,
                    policy_learning_rate=trainer_params["learning_rate"],
                    default_batch_size=trainer_params["batch_size"],
                    default_num_epoch=trainer_params["num_epoch"],
                    **trainer_params["pretraining"],
                )
            else:
                self.bc_module = None

        if load:
            self._load_graph()
        else:
            self._initialize_graph()

        self.inference_dict = {
            "action": self.model.output,
            "log_probs": self.model.all_log_probs,
            "value": self.model.value_heads,
            "entropy": self.model.entropy,
            "learning_rate": self.model.learning_rate,
        }
        if self.use_continuous_act:
            self.inference_dict["pre_action"] = self.model.output_pre
        if self.use_recurrent:
            self.inference_dict["memory_out"] = self.model.memory_out
        if (is_training and self.use_vec_obs and trainer_params["normalize"]
                and not load):
            self.inference_dict[
                "update_mean"] = self.model.update_normalization

        self.total_policy_loss = self.model.policy_loss

        self.update_dict = {
            "value_loss": self.model.value_loss,
            "policy_loss": self.total_policy_loss,
            "update_batch": self.model.update_batch,
        }
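For reference, a minimal trainer_params dict covering the keys this constructor reads directly; the base-class __init__ and the trainer typically require more, and the values below are placeholders rather than tuned hyperparameters.

# Placeholder values only; the keys match what the constructor above reads.
trainer_params = {
    "learning_rate": 3.0e-4,
    "hidden_units": 128,
    "epsilon": 0.2,
    "beta": 5.0e-3,
    "max_steps": 5.0e5,
    "normalize": False,
    "use_recurrent": False,
    "num_layers": 2,
    "vis_encode_type": "simple",  # optional; defaults to "simple" via .get()
    "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}},
    # batch_size and num_epoch are only read when "pretraining" is present.
    "batch_size": 1024,
    "num_epoch": 3,
    # "pretraining": {...},  # optional; enables the BCModule branch
}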