def __init__(
    self,
    seed: int,
    behavior_spec: BehaviorSpec,
    trainer_settings: TrainerSettings,
    model_path: str,
    load: bool = False,
):
    """
    Initializes the policy.
    :param seed: Random seed to use for TensorFlow.
    :param behavior_spec: The corresponding BehaviorSpec for this policy.
    :param trainer_settings: The trainer parameters.
    :param model_path: Where to load/save the model.
    :param load: If True, load model from model_path. Otherwise, create new model.
    """
    self.m_size = 0
    self.trainer_settings = trainer_settings
    self.network_settings: NetworkSettings = trainer_settings.network_settings
    # For ghost trainer save/load snapshots
    self.assign_phs: List[tf.Tensor] = []
    self.assign_ops: List[tf.Operation] = []
    self.inference_dict: Dict[str, tf.Tensor] = {}
    self.update_dict: Dict[str, tf.Tensor] = {}
    self.sequence_length = 1
    self.seed = seed
    self.behavior_spec = behavior_spec
    self.act_size = (
        list(behavior_spec.discrete_action_branches)
        if behavior_spec.is_action_discrete()
        else [behavior_spec.action_size]
    )
    self.vec_obs_size = sum(
        shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
    )
    self.vis_obs_size = sum(
        1 for shape in behavior_spec.observation_shapes if len(shape) == 3
    )
    self.use_recurrent = self.network_settings.memory is not None
    self.memory_dict: Dict[str, np.ndarray] = {}
    self.num_branches = self.behavior_spec.action_size
    self.previous_action_dict: Dict[str, np.ndarray] = {}
    self.normalize = self.network_settings.normalize
    self.use_continuous_act = behavior_spec.is_action_continuous()
    self.model_path = model_path
    self.initialize_path = self.trainer_settings.init_path
    self.keep_checkpoints = self.trainer_settings.keep_checkpoints
    self.graph = tf.Graph()
    self.sess = tf.Session(
        config=tf_utils.generate_session_config(), graph=self.graph
    )
    self.saver: Optional[tf.Operation] = None
    if self.network_settings.memory is not None:
        self.m_size = self.network_settings.memory.memory_size
        self.sequence_length = self.network_settings.memory.sequence_length
    self._initialize_tensorflow_references()
    self.load = load
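# --- Illustrative sketch (not part of the original source) ---
# A minimal, self-contained example of how the derived fields above behave.
# `FakeSpec` is a hypothetical stand-in for BehaviorSpec (the real class lives
# in mlagents_envs.base_env); only the attributes used by __init__ are mocked.
from typing import List, Tuple

class FakeSpec:
    def __init__(
        self,
        observation_shapes: List[Tuple[int, ...]],
        discrete_branches: Tuple[int, ...] = (),
        action_size: int = 0,
    ):
        self.observation_shapes = observation_shapes
        self.discrete_action_branches = discrete_branches
        self.action_size = action_size

    def is_action_discrete(self) -> bool:
        return len(self.discrete_action_branches) > 0

spec = FakeSpec(
    observation_shapes=[(8,), (84, 84, 3)],  # one vector obs, one visual obs
    discrete_branches=(3, 2),                # two discrete action branches
)
# Mirrors the computations in __init__ above:
act_size = (
    list(spec.discrete_action_branches)
    if spec.is_action_discrete()
    else [spec.action_size]
)
vec_obs_size = sum(s[0] for s in spec.observation_shapes if len(s) == 1)  # -> 8
vis_obs_size = sum(1 for s in spec.observation_shapes if len(s) == 3)     # -> 1
assert act_size == [3, 2] and vec_obs_size == 8 and vis_obs_size == 1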
def __init__(
    self,
    seed: int,
    behavior_spec: BehaviorSpec,
    trainer_settings: TrainerSettings,
    model_path: str,
    load: bool = False,
    tanh_squash: bool = False,
    reparameterize: bool = False,
    condition_sigma_on_obs: bool = True,
):
    self.behavior_spec = behavior_spec
    self.trainer_settings = trainer_settings
    self.network_settings: NetworkSettings = trainer_settings.network_settings
    self.seed = seed
    self.act_size = (
        list(behavior_spec.discrete_action_branches)
        if behavior_spec.is_action_discrete()
        else [behavior_spec.action_size]
    )
    self.vec_obs_size = sum(
        shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
    )
    self.vis_obs_size = sum(
        1 for shape in behavior_spec.observation_shapes if len(shape) == 3
    )
    self.model_path = model_path
    self.initialize_path = self.trainer_settings.init_path
    self._keep_checkpoints = self.trainer_settings.keep_checkpoints
    self.use_continuous_act = behavior_spec.is_action_continuous()
    self.num_branches = self.behavior_spec.action_size
    self.previous_action_dict: Dict[str, np.ndarray] = {}
    self.memory_dict: Dict[str, np.ndarray] = {}
    self.normalize = trainer_settings.network_settings.normalize
    self.use_recurrent = self.network_settings.memory is not None
    self.load = load
    self.h_size = self.network_settings.hidden_units
    # Clamp the layer count so the network always has at least one layer.
    self.num_layers = max(1, self.network_settings.num_layers)
    self.vis_encode_type = self.network_settings.vis_encode_type
    self.tanh_squash = tanh_squash
    self.reparameterize = reparameterize
    self.condition_sigma_on_obs = condition_sigma_on_obs
    self.m_size = 0
    self.sequence_length = 1
    if self.network_settings.memory is not None:
        self.m_size = self.network_settings.memory.memory_size
        self.sequence_length = self.network_settings.memory.sequence_length
    # Non-exposed parameters; these aren't exposed because they don't have a
    # good explanation and usually shouldn't be touched.
    self.log_std_min = -20
    self.log_std_max = 2
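# --- Illustrative sketch (assumption, not from the original source) ---
# The log-std bounds above are typically used to clamp a Gaussian policy's
# log standard deviation into a numerically safe range before exponentiation.
# The clamping shown here is a plausible use of the bounds, not a claim about
# where the original code applies them.
import math

log_std_min, log_std_max = -20, 2
raw_log_std = 5.7  # hypothetical unclamped network output
log_std = min(max(raw_log_std, log_std_min), log_std_max)
sigma = math.exp(log_std)  # bounded to [exp(-20) ~ 2e-9, exp(2) ~ 7.39]
print(f"clamped log_std={log_std}, sigma={sigma:.4f}")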
def create_torch_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
    """
    Creates a policy with a PyTorch backend and PPO hyperparameters.
    :param parsed_behavior_id: Behavior identifiers for the behavior this policy belongs to.
    :param behavior_spec: Specifications for policy construction.
    :return: The constructed TorchPolicy.
    """
    policy = TorchPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        condition_sigma_on_obs=False,  # Faster training for PPO
        separate_critic=behavior_spec.is_action_continuous(),
    )
    return policy
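# --- Illustrative sketch (not part of the original source) ---
# The separate_critic flag above mirrors the action type: continuous-action
# behaviors get a critic separate from the actor, while discrete ones share a
# body. `_SpecStub` is a hypothetical stand-in for BehaviorSpec's action-type
# query, used only to demonstrate the flag's derivation.
class _SpecStub:
    def __init__(self, continuous: bool):
        self._continuous = continuous

    def is_action_continuous(self) -> bool:
        return self._continuous

for stub in (_SpecStub(continuous=True), _SpecStub(continuous=False)):
    separate_critic = stub.is_action_continuous()
    print(f"continuous={stub.is_action_continuous()} -> separate_critic={separate_critic}")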