def __init__(
    self,
    seed: int,
    brain: BrainParameters,
    trainer_params: Dict[str, Any],
    is_training: bool,
    load: bool,
):
    """
    Policy for Proximal Policy Optimization Networks.
    :param seed: Random seed.
    :param brain: Assigned Brain object.
    :param trainer_params: Defined training parameters.
    :param is_training: Whether the model should be trained.
    :param load: Whether a pre-trained model will be loaded or a new one created.
    """
    super().__init__(seed, brain, trainer_params)
    reward_signal_configs = trainer_params["reward_signals"]

    self.inference_dict: Dict[str, tf.Tensor] = {}
    self.update_dict: Dict[str, tf.Tensor] = {}
    self.stats_name_to_update_name = {
        "Losses/Value Loss": "value_loss",
        "Losses/Policy Loss": "policy_loss",
    }

    self.create_model(
        brain, trainer_params, reward_signal_configs, is_training, load, seed
    )
    self.create_reward_signals(reward_signal_configs)

    with self.graph.as_default():
        self.bc_module: Optional[BCModule] = None
        # Create pretrainer if needed
        if "behavioral_cloning" in trainer_params:
            BCModule.check_config(trainer_params["behavioral_cloning"])
            self.bc_module = BCModule(
                self,
                policy_learning_rate=trainer_params["learning_rate"],
                default_batch_size=trainer_params["batch_size"],
                default_num_epoch=3,
                **trainer_params["behavioral_cloning"],
            )

    if load:
        self._load_graph()
    else:
        self._initialize_graph()
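
# A minimal sketch of the trainer_params dictionary the constructor above
# expects. The key names ("reward_signals", "behavioral_cloning",
# "learning_rate", "batch_size") come straight from the lookups in the code;
# every value shown here is a hypothetical placeholder, not a recommended
# setting.
example_ppo_trainer_params = {
    "learning_rate": 3.0e-4,
    "batch_size": 1024,
    "reward_signals": {
        # Hypothetical extrinsic reward signal configuration.
        "extrinsic": {"strength": 1.0, "gamma": 0.99},
    },
    # Optional: the presence of this key triggers BCModule creation above.
    "behavioral_cloning": {
        "demo_path": "demos/example.demo",  # hypothetical path
        "strength": 0.5,
        "steps": 10000,
    },
}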
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
    """
    Wraps an existing TFPolicy and sets up its reward signals and, if
    configured, a behavioral cloning (pretraining) module.
    :param policy: The TFPolicy to be updated.
    :param trainer_params: Defined training parameters.
    """
    self.sess = policy.sess
    self.policy = policy
    self.update_dict: Dict[str, tf.Tensor] = {}
    self.value_heads: Dict[str, tf.Tensor] = {}
    self.create_reward_signals(trainer_params["reward_signals"])
    self.memory_in: Optional[tf.Tensor] = None
    self.memory_out: Optional[tf.Tensor] = None
    self.m_size: int = 0

    self.bc_module: Optional[BCModule] = None
    # Create pretrainer if needed
    if "behavioral_cloning" in trainer_params:
        BCModule.check_config(trainer_params["behavioral_cloning"])
        self.bc_module = BCModule(
            self.policy,
            policy_learning_rate=trainer_params["learning_rate"],
            default_batch_size=trainer_params["batch_size"],
            default_num_epoch=3,
            **trainer_params["behavioral_cloning"],
        )
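
# Hedged usage sketch: unlike the policy constructors in this section, the
# __init__ above wraps an already-built TFPolicy. The class name
# `PolicyOptimizer` below is hypothetical; only the signature is taken from
# the source.
#
#     policy = ...  # an already-constructed TFPolicy
#     optimizer = PolicyOptimizer(policy, trainer_params)
#     optimizer.bc_module  # None unless "behavioral_cloning" was configured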
def __init__(self, seed, brain, trainer_params, is_training, load):
    """
    Policy for Proximal Policy Optimization Networks.
    :param seed: Random seed.
    :param brain: Assigned Brain object.
    :param trainer_params: Defined training parameters.
    :param is_training: Whether the model should be trained.
    :param load: Whether a pre-trained model will be loaded or a new one created.
    """
    super().__init__(seed, brain, trainer_params)
    reward_signal_configs = trainer_params["reward_signals"]

    self.reward_signals = {}
    with self.graph.as_default():
        self.model = PPOModel(
            brain,
            lr=float(trainer_params["learning_rate"]),
            h_size=int(trainer_params["hidden_units"]),
            epsilon=float(trainer_params["epsilon"]),
            beta=float(trainer_params["beta"]),
            max_step=float(trainer_params["max_steps"]),
            normalize=trainer_params["normalize"],
            use_recurrent=trainer_params["use_recurrent"],
            num_layers=int(trainer_params["num_layers"]),
            m_size=self.m_size,
            seed=seed,
            stream_names=list(reward_signal_configs.keys()),
            vis_encode_type=EncoderType(
                trainer_params.get("vis_encode_type", "simple")
            ),
        )
        self.model.create_ppo_optimizer()

        # Create reward signals
        for reward_signal, config in reward_signal_configs.items():
            self.reward_signals[reward_signal] = create_reward_signal(
                self, reward_signal, config
            )

        # Create pretrainer if needed
        if "pretraining" in trainer_params:
            BCModule.check_config(trainer_params["pretraining"])
            self.bc_module = BCModule(
                self,
                policy_learning_rate=trainer_params["learning_rate"],
                default_batch_size=trainer_params["batch_size"],
                default_num_epoch=trainer_params["num_epoch"],
                **trainer_params["pretraining"],
            )
        else:
            self.bc_module = None

    if load:
        self._load_graph()
    else:
        self._initialize_graph()

    self.inference_dict = {
        "action": self.model.output,
        "log_probs": self.model.all_log_probs,
        "value": self.model.value_heads,
        "entropy": self.model.entropy,
        "learning_rate": self.model.learning_rate,
    }
    if self.use_continuous_act:
        self.inference_dict["pre_action"] = self.model.output_pre
    if self.use_recurrent:
        self.inference_dict["memory_out"] = self.model.memory_out
    if is_training and self.use_vec_obs and trainer_params["normalize"] and not load:
        self.inference_dict["update_mean"] = self.model.update_normalization

    self.total_policy_loss = self.model.policy_loss

    self.update_dict = {
        "value_loss": self.model.value_loss,
        "policy_loss": self.total_policy_loss,
        "update_batch": self.model.update_batch,
    }
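
# Sketch of the hyperparameter keys this older constructor reads when building
# PPOModel directly. The key names mirror the lookups above; all values are
# hypothetical placeholders.
example_legacy_ppo_params = {
    "learning_rate": 3.0e-4,
    "hidden_units": 128,
    "epsilon": 0.2,
    "beta": 5.0e-3,
    "max_steps": 5.0e5,
    "normalize": False,
    "use_recurrent": False,
    "num_layers": 2,
    "num_epoch": 3,
    "batch_size": 1024,
    "vis_encode_type": "simple",  # falls back to "simple" via .get() above
    "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}},
}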
def __init__(
    self,
    seed: int,
    brain: BrainParameters,
    trainer_params: Dict[str, Any],
    is_training: bool,
    load: bool,
) -> None:
    """
    Policy for Soft Actor-Critic Networks.
    :param seed: Random seed.
    :param brain: Assigned Brain object.
    :param trainer_params: Defined training parameters.
    :param is_training: Whether the model should be trained.
    :param load: Whether a pre-trained model will be loaded or a new one created.
    """
    super().__init__(seed, brain, trainer_params)
    reward_signal_configs = {}
    for key, rsignal in trainer_params["reward_signals"].items():
        if isinstance(rsignal, dict):
            reward_signal_configs[key] = rsignal

    self.inference_dict: Dict[str, tf.Tensor] = {}
    self.update_dict: Dict[str, tf.Tensor] = {}

    self.create_model(
        brain, trainer_params, reward_signal_configs, is_training, load, seed
    )
    self.create_reward_signals(reward_signal_configs)

    self.stats_name_to_update_name = {
        "Losses/Value Loss": "value_loss",
        "Losses/Policy Loss": "policy_loss",
        "Losses/Q1 Loss": "q1_loss",
        "Losses/Q2 Loss": "q2_loss",
        "Policy/Entropy Coeff": "entropy_coef",
    }

    with self.graph.as_default():
        # Create pretrainer if needed
        self.bc_module: Optional[BCModule] = None
        if "pretraining" in trainer_params:
            BCModule.check_config(trainer_params["pretraining"])
            pretraining_params = dict(trainer_params["pretraining"])
            # SAC-specific setting - we don't want to do a whole epoch each
            # update! Pop any user-supplied samples_per_update so it cannot
            # collide with the explicit keyword argument below.
            user_samples_per_update = pretraining_params.pop(
                "samples_per_update", None
            )
            self.bc_module = BCModule(
                self,
                policy_learning_rate=trainer_params["learning_rate"],
                default_batch_size=trainer_params["batch_size"],
                default_num_epoch=1,
                samples_per_update=trainer_params["batch_size"],
                **pretraining_params,
            )
            if user_samples_per_update is not None:
                logger.warning(
                    "Pretraining: Samples Per Update is not a valid setting for SAC."
                )
                self.bc_module.samples_per_update = 1

    if load:
        self._load_graph()
    else:
        self._initialize_graph()
        self.sess.run(self.model.target_init_op)

    # Disable terminal states for certain reward signals to avoid survivor bias
    for name, reward_signal in self.reward_signals.items():
        if not reward_signal.use_terminal_states:
            self.sess.run(self.model.disable_use_dones[name])
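
# Hedged sketch of the SAC-specific pretraining override above: if the user
# supplies "samples_per_update" in the pretraining section, the constructor
# logs a warning and forces it to 1 so behavioral cloning does not run a whole
# epoch on every SAC update. The config below is hypothetical.
example_sac_pretraining = {
    "demo_path": "demos/example.demo",  # hypothetical path
    "strength": 0.5,
    "steps": 10000,
    "samples_per_update": 256,  # triggers the warning and is reset to 1
}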