def create_policy(self, brain_parameters: BrainParameters) -> TFPolicy:
    """
    Creates a PPO policy to add to the trainer's list of policies.
    :param brain_parameters: Specifications for policy construction.
    :return: The newly created policy.
    """
    # Use the multi-GPU policy only when more than one device is available.
    if self.multi_gpu and len(get_devices()) > 1:
        policy: PPOPolicy = MultiGpuPPOPolicy(
            self.seed,
            brain_parameters,
            self.trainer_parameters,
            self.is_training,
            self.load,
        )
    else:
        policy = PPOPolicy(
            self.seed,
            brain_parameters,
            self.trainer_parameters,
            self.is_training,
            self.load,
        )

    # Track cumulative rewards per reward signal (e.g. extrinsic, curiosity).
    for _reward_signal in policy.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)

    return policy
def __init__(
    self,
    brain,
    reward_buff_cap,
    trainer_parameters,
    training,
    load,
    seed,
    run_id,
    multi_gpu,
):
    """
    Responsible for collecting experiences and training the PPO model.
    :param brain: The brain whose agents this trainer is responsible for.
    :param reward_buff_cap: Max reward history to track in the reward buffer.
    :param trainer_parameters: The parameters for the trainer (dictionary).
    :param training: Whether the trainer is set for training.
    :param load: Whether the model should be loaded.
    :param seed: The seed the model will be initialized with.
    :param run_id: The identifier of the current run.
    :param multi_gpu: Whether to train the policy across multiple GPUs.
    """
    super(PPOTrainer, self).__init__(
        brain, trainer_parameters, training, run_id, reward_buff_cap
    )
    # Hyperparameter keys that must be present in trainer_parameters.
    self.param_keys = [
        "batch_size",
        "beta",
        "buffer_size",
        "epsilon",
        "hidden_units",
        "lambd",
        "learning_rate",
        "max_steps",
        "normalize",
        "num_epoch",
        "num_layers",
        "time_horizon",
        "sequence_length",
        "summary_freq",
        "use_recurrent",
        "summary_path",
        "memory_size",
        "model_path",
        "reward_signals",
    ]
    self.check_param_keys()

    # Use the multi-GPU policy only when more than one device is available.
    if multi_gpu and len(get_devices()) > 1:
        self.ppo_policy = MultiGpuPPOPolicy(
            seed, brain, trainer_parameters, self.is_training, load
        )
    else:
        self.ppo_policy = PPOPolicy(
            seed, brain, trainer_parameters, self.is_training, load
        )
    self.policy = self.ppo_policy

    # Track cumulative rewards per reward signal (e.g. extrinsic, curiosity).
    for _reward_signal in self.policy.reward_signals.keys():
        self.collected_rewards[_reward_signal] = {}
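# Usage sketch (illustrative only, not part of ml-agents): a minimal, hypothetical
# helper showing how a trainer_parameters dictionary containing the keys listed in
# param_keys above might be assembled and passed to PPOTrainer. The hyperparameter
# values, paths, and the helper name itself are assumptions for illustration; the
# `brain` object is expected to come from the environment's external brains.
def make_ppo_trainer(brain, run_id="ppo_run"):
    trainer_parameters = {
        "batch_size": 1024,
        "beta": 5.0e-3,
        "buffer_size": 10240,
        "epsilon": 0.2,
        "hidden_units": 128,
        "lambd": 0.95,
        "learning_rate": 3.0e-4,
        "max_steps": 5.0e5,
        "normalize": False,
        "num_epoch": 3,
        "num_layers": 2,
        "time_horizon": 64,
        "sequence_length": 64,
        "summary_freq": 1000,
        "use_recurrent": False,
        "summary_path": "./summaries/" + run_id,
        "memory_size": 256,
        "model_path": "./models/" + run_id,
        "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}},
    }
    return PPOTrainer(
        brain,
        reward_buff_cap=100,
        trainer_parameters=trainer_parameters,
        training=True,
        load=False,
        seed=0,
        run_id=run_id,
        multi_gpu=False,
    )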