def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    """
    Build a BCModule wired to a freshly constructed TFPolicy (test helper).

    :param mock_behavior_specs: Behavior specs handed to the TFPolicy constructor.
    :param bc_settings: Settings object forwarded to BCModule as ``settings``.
    :param use_rnn: When truthy, default MemorySettings are installed on the
        network settings; otherwise memory is set to None.
    :param tanhresample: Passed as both the sixth and the seventh positional
        argument of TFPolicy.
    :return: The BCModule created inside the policy's graph.
    """
    settings = TrainerSettings()
    memory = NetworkSettings.MemorySettings() if use_rnn else None
    settings.network_settings.memory = memory

    policy = TFPolicy(
        0, mock_behavior_specs, settings, "test", False, tanhresample, tanhresample
    )

    with policy.graph.as_default():
        hyper = settings.hyperparameters
        module = BCModule(
            policy,
            policy_learning_rate=hyper.learning_rate,
            default_batch_size=hyper.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )

    # The optimizer would normally do this once the BCModule exists.
    policy.initialize_or_load()
    return module
def _load_graph(
    self, policy: TFPolicy, model_path: str, reset_global_steps: bool = False
) -> None:
    """
    Restore a policy's variables from the checkpoint state found in model_path.

    :param policy: Policy whose graph and session receive the restored variables.
    :param model_path: Directory that is searched for checkpoint state.
    :param reset_global_steps: If True, the policy's step is set back to 0 after
        loading; otherwise training resumes from the loaded step.
    :raises UnityPolicyException: If no checkpoint state exists in model_path, or
        if restoring fails with ``tf.errors.NotFoundError`` (the original error
        is attached as the exception's cause).
    """
    # This prevents normalizer init up from executing on load
    policy.first_normalization_update = False
    with policy.graph.as_default():
        logger.info(f"Loading model from {model_path}.")
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt is None:
            raise UnityPolicyException(
                "The model {} could not be loaded. Make "
                "sure you specified the right "
                "--run-id and that the previous run you are loading from had the same "
                "behavior names.".format(model_path)
            )
        if self.tf_saver:
            try:
                self.tf_saver.restore(policy.sess, ckpt.model_checkpoint_path)
            except tf.errors.NotFoundError as err:
                # Chain the TensorFlow error so the traceback shows which
                # variable/key was missing from the checkpoint.
                raise UnityPolicyException(
                    "The model {} was found but could not be loaded. Make "
                    "sure the model is from the same version of ML-Agents, has the same behavior parameters, "
                    "and is using the same trainer configuration as the current run.".format(
                        model_path
                    )
                ) from err
        self._check_model_version(__version__)
        if reset_global_steps:
            policy.set_step(0)
            logger.info(
                "Starting training from step 0 and saving to {}.".format(
                    self.model_path
                )
            )
        else:
            logger.info(f"Resuming training from step {policy.get_current_step()}.")
def add_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
    """
    Register a policy with this trainer.

    The first policy seen fixes the wrapped trainer's team, so that all agents
    of one multi-agent team are grouped together. Every policy belonging to
    that team is also handed to the wrapped trainer for training.

    :param parsed_behavior_id: Parsed identifiers of the behavior the policy
        belongs to (its behavior id, team id and brain name are read here).
    :param policy: Policy to associate with the behavior id.
    """
    behavior_id = parsed_behavior_id.behavior_id
    team = parsed_behavior_id.team_id

    self.controller.subscribe_team_id(team, self)
    self.policies[behavior_id] = policy
    policy.create_tf_graph()
    self._name_to_parsed_behavior_id[behavior_id] = parsed_behavior_id

    # Needed for saving/swapping snapshots
    policy.init_load_weights()

    # Only the first policy, or a new agent on the already wrapped team,
    # goes on to the wrapped trainer.
    if self.wrapped_trainer_team is not None and team != self.wrapped_trainer_team:
        return

    brain_name = parsed_behavior_id.brain_name
    self.current_policy_snapshot[brain_name] = policy.get_weights()
    # Need to save after trainer initializes policy
    self._save_snapshot()
    self.trainer.add_policy(parsed_behavior_id, policy)
    self._learning_team = self.controller.get_learning_team
    self.wrapped_trainer_team = team
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Register a policy with this trainer.

    The first policy added is handed to the wrapped trainer, becomes the
    learning behavior, and has its team id reported as a stats property.
    Later policies only get their weight-loading ops initialized.

    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    self.policies[name_behavior_id] = policy
    policy.create_tf_graph()

    if self.learning_behavior_name:
        # Not the first policy: only prepare it for saving/swapping snapshots
        policy.init_load_weights()
        return

    # First policy encountered
    self.current_policy_snapshot = policy.get_weights()
    self.trainer.add_policy(name_behavior_id, policy)
    # Need to save after trainer initializes policy
    self._save_snapshot(policy)
    self.learning_behavior_name = name_behavior_id

    parsed = BehaviorIdentifiers.from_name_behavior_id(self.learning_behavior_name)
    self._stats_reporter.add_property(
        StatsPropertyType.SELF_PLAY_TEAM, parsed.behavior_ids["team"]
    )
def _compare_two_policies(policy1: TFPolicy, policy2: TFPolicy) -> None:
    """
    Assert that both policies produce identical ``log_probs`` for one input.

    A single-agent decision step is generated from policy1's behavior spec and
    evaluated by each policy in turn.

    :param policy1: First policy; its behavior spec defines the test input.
    :param policy2: Second policy, evaluated on the same decision step.
    """
    steps = mb.create_steps_from_behavior_spec(policy1.behavior_spec, num_agents=1)
    decision_step = steps[0]

    first_out = policy1.evaluate(decision_step, list(decision_step.agent_id))
    second_out = policy2.evaluate(decision_step, list(decision_step.agent_id))

    np.testing.assert_array_equal(second_out["log_probs"], first_out["log_probs"])
def test_normalizer_after_load(tmp_path):
    """
    Check that normalization statistics survive a checkpoint save/load cycle.

    A policy is updated with six observations (half zeroed), checkpointed,
    restored into a second policy, and that second policy is updated with ten
    more observations; the running step count, mean and variance are asserted
    after each update.
    """
    spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    settings = TrainerSettings(network_settings=NetworkSettings(normalize=True))

    first_trajectory = make_fake_trajectory(
        length=6,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=spec.action_spec,
    )
    # Zero out the first half of the observations
    for idx in range(3):
        first_trajectory.steps[idx].obs[0] = np.zeros(1, dtype=np.float32)

    original_policy = TFPolicy(0, spec, settings)
    first_buffer = first_trajectory.to_agentbuffer()
    original_policy.update_normalization(first_buffer["vector_obs"])

    # Running statistics after the first update
    steps, mean, variance = original_policy.sess.run(
        [
            original_policy.normalization_steps,
            original_policy.running_mean,
            original_policy.running_variance,
        ]
    )
    assert steps == 6
    assert mean[0] == 0.5
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)

    # Write a checkpoint and restore it into a second policy
    checkpoint_dir = os.path.join(tmp_path, "runid1")
    saver = TFModelSaver(settings, checkpoint_dir)
    saver.register(original_policy)
    saver.save_checkpoint("MockBrain", 6)
    assert os.listdir(tmp_path)

    restored_policy = TFPolicy(0, spec, settings)
    saver = TFModelSaver(settings, checkpoint_dir, load=True)
    saver.register(restored_policy)
    saver.initialize_or_load(restored_policy)

    # Second update goes to the restored policy, this time with all 1's
    second_trajectory = make_fake_trajectory(
        length=10,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=spec.action_spec,
    )
    second_buffer = second_trajectory.to_agentbuffer()
    restored_policy.update_normalization(second_buffer["vector_obs"])

    # Running statistics must continue from the restored values
    steps, mean, variance = restored_policy.sess.run(
        [
            restored_policy.normalization_steps,
            restored_policy.running_mean,
            restored_policy.running_variance,
        ]
    )
    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    model_path: str = "",
    load: bool = False,
    seed: int = 0,
) -> TFPolicy:
    """
    Construct a TFPolicy on top of mocked behavior specs (test helper).

    Note that dummy_config is modified in place: ``keep_checkpoints`` is set
    to 3 and the network memory settings are overwritten.

    :param dummy_config: Trainer settings used (and mutated) for the policy.
    :param use_rnn: Install default MemorySettings when True, else None.
    :param use_discrete: Selects the discrete or the vector action space.
    :param use_visual: Forwarded to the mock behavior spec setup.
    :param model_path: Forwarded to TFPolicy as ``model_path``.
    :param load: Forwarded to TFPolicy as ``load``.
    :param seed: Seed given to TFPolicy.
    :return: The constructed policy.
    """
    action_space = DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
    spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=action_space,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    dummy_config.keep_checkpoints = 3
    memory = NetworkSettings.MemorySettings() if use_rnn else None
    dummy_config.network_settings.memory = memory

    return TFPolicy(seed, spec, dummy_config, model_path=model_path, load=load)
def create_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
    """
    Create the policy for this trainer and, when resuming, reload the replay buffer.

    The policy is built with tanh squashing and reparameterization enabled and
    without creating its TF graph.

    :param parsed_behavior_id: Identifiers of the behavior; not used in this method.
    :param behavior_spec: Specifications for policy construction.
    :return: The newly created policy.
    """
    policy = TFPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        self.artifact_path,
        self.load,
        tanh_squash=True,
        reparameterize=True,
        create_tf_graph=False,
    )
    # Load the replay buffer if load
    if self.load and self.checkpoint_replay_buffer:
        try:
            self.load_replay_buffer()
        except (AttributeError, FileNotFoundError):
            logger.warning(
                "Replay buffer was unable to load, starting from scratch."
            )
        else:
            # Only report a loaded buffer when loading actually succeeded.
            logger.debug(
                "Loaded update buffer with {} sequences".format(
                    self.update_buffer.num_experiences
                )
            )
    return policy
def add_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
    """
    Adds policy to trainer and builds the SAC optimizer around it.

    A warning is logged when a policy has already been set, since this trainer
    keeps a single policy.

    :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
    :param policy: Policy to associate with the behavior id.
    """
    if self.policy:
        # Adjacent literals are concatenated; a backslash continuation inside
        # the literal would leak a backslash/whitespace into the message.
        logger.warning(
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial "
            "games.".format(self.__class__.__name__)
        )
    self.policy = policy
    self.policies[parsed_behavior_id.behavior_id] = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    # Assume steps were updated at the correct ratio before
    self.update_steps = int(max(1, self.step / self.steps_per_update))
    self.reward_signal_update_steps = int(
        max(1, self.step / self.reward_signal_steps_per_update)
    )
def create_optimizer_mock(
    trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    """
    Build a SAC or PPO optimizer over a mocked policy (test helper).

    trainer_config is modified in place: its reward signals and network memory
    settings are overwritten.

    :param trainer_config: Trainer settings; ``trainer_type`` selects the optimizer.
    :param reward_signal_config: Assigned to ``trainer_config.reward_signals``.
    :param use_rnn: Install MemorySettings(sequence_length=16, memory_size=10)
        when truthy, else None.
    :param use_discrete: Selects the discrete or the vector action space.
    :param use_visual: When truthy, the vector observation space is 0.
    :return: The optimizer, with its policy initialized.
    """
    action_space = DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
    obs_space = 0 if use_visual else VECTOR_OBS_SPACE
    specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=action_space,
        vector_obs_space=obs_space,
    )

    trainer_config.reward_signals = reward_signal_config
    if use_rnn:
        memory = NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
    else:
        memory = None
    trainer_config.network_settings.memory = memory

    policy = TFPolicy(0, specs, trainer_config, "test", False, create_tf_graph=False)

    is_sac = trainer_config.trainer_type == TrainerType.SAC
    optimizer_cls = SACOptimizer if is_sac else PPOOptimizer
    optimizer = optimizer_cls(policy, trainer_config)
    optimizer.policy.initialize()
    return optimizer
def _save_snapshot(self, policy: TFPolicy) -> None:
    """
    Store the policy's current weights in the rotating snapshot window.

    The weights overwrite the slot at ``snapshot_counter`` or are appended if
    that slot does not exist yet. The current ELO is recorded for the same
    slot, and the counter advances modulo ``window``.

    :param policy: Policy whose ``get_weights()`` result is stored.
    """
    snapshot = policy.get_weights()
    slot = self.snapshot_counter
    try:
        self.policy_snapshots[slot] = snapshot
    except IndexError:
        # The window has not been filled up to this slot yet.
        self.policy_snapshots.append(snapshot)
    self.policy_elos[slot] = self.current_elo
    self.snapshot_counter = (slot + 1) % self.window
def initialize_or_load(self, policy: Optional[TFPolicy] = None) -> None:
    """
    Initialize the policy from scratch or load it from a checkpoint.

    An initialize path takes precedence over the model path. Steps are reset
    to 0 unless ``self.load`` is set, which lets a user resume from an
    initialize path. Global variables are broadcast afterwards in every case.

    :param policy: Policy to initialize or load; defaults to ``self.policy``.
    """
    target = cast(TFPolicy, self.policy if policy is None else policy)
    reset_steps = not self.load

    if self.initialize_path is not None:
        self._load_graph(
            target, self.initialize_path, reset_global_steps=reset_steps
        )
    elif self.load:
        self._load_graph(target, self.model_path, reset_global_steps=reset_steps)
    else:
        target.initialize()

    TFPolicy.broadcast_global_variables(0)
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Register a policy with this trainer.

    The first policy added is handed to the wrapped trainer and becomes the
    learning behavior. Later policies only get their weight-loading ops
    initialized.

    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    self.policies[name_behavior_id] = policy
    policy.create_tf_graph()

    if self.learning_behavior_name:
        # Not the first policy: only prepare it for saving/swapping snapshots
        policy.init_load_weights()
        return

    # First policy encountered
    self.current_policy_snapshot = policy.get_weights()
    self.trainer.add_policy(name_behavior_id, policy)
    # Need to save after trainer initializes policy
    self._save_snapshot(policy)
    self.learning_behavior_name = name_behavior_id
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    """
    Build a SACOptimizer over a mocked policy (test helper).

    dummy_config is modified in place: its network memory settings are
    overwritten.

    :param dummy_config: Trainer settings used for the policy and the optimizer.
    :param use_rnn: Install MemorySettings(sequence_length=16, memory_size=10)
        when truthy, else None.
    :param use_discrete: Selects the discrete or the vector action space.
    :param use_visual: When truthy, the vector observation space is 0.
    :return: The optimizer, with its policy initialized.
    """
    action_space = DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
    obs_space = 0 if use_visual else VECTOR_OBS_SPACE
    specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=action_space,
        vector_obs_space=obs_space,
    )

    if use_rnn:
        memory = NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
    else:
        memory = None
    dummy_config.network_settings.memory = memory

    policy = TFPolicy(0, specs, dummy_config, "test", False, create_tf_graph=False)
    sac_optimizer = SACOptimizer(policy, dummy_config)
    policy.initialize()
    return sac_optimizer
def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
    """
    Build a PPOOptimizer over a mocked policy (test helper).

    The settings are an ``attr.evolve`` copy of dummy_config with the
    TensorFlow framework selected; the memory settings are then assigned on
    that copy's ``network_settings``.

    :param dummy_config: Base trainer settings to evolve.
    :param use_rnn: Install MemorySettings(sequence_length=16, memory_size=10)
        when truthy, else None.
    :param use_discrete: Selects the discrete or the vector action space.
    :param use_visual: Forwarded to the mock behavior spec setup.
    :return: The optimizer, with its policy initialized.
    """
    action_space = DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
    specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=action_space,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    settings = attr.evolve(dummy_config, framework=FrameworkType.TENSORFLOW)
    if use_rnn:
        memory = NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
    else:
        memory = None
    settings.network_settings.memory = memory

    policy = TFPolicy(0, specs, settings, "test", False, create_tf_graph=False)
    ppo_optimizer = PPOOptimizer(policy, settings)
    policy.initialize()
    return ppo_optimizer
def test_normalization():
    """
    Check the running normalization statistics across two consecutive updates.

    The first update uses six observations with the first three zeroed, the
    second uses ten more; step count, mean and variance are asserted after each.
    """
    spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )

    first_trajectory = make_fake_trajectory(
        length=6,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    # Zero out the first half of the observations
    for idx in range(3):
        first_trajectory.steps[idx].obs[0] = np.zeros(1, dtype=np.float32)

    settings = TrainerSettings(network_settings=NetworkSettings(normalize=True))
    policy = TFPolicy(0, spec, settings, "testdir", False)

    first_buffer = first_trajectory.to_agentbuffer()
    policy.update_normalization(first_buffer["vector_obs"])

    # Running statistics after the first update
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Second update, this time with all 1's
    second_trajectory = make_fake_trajectory(
        length=10,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    second_buffer = second_trajectory.to_agentbuffer()
    policy.update_normalization(second_buffer["vector_obs"])

    # Running statistics after the second update
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
def _check_model_version(self, version: str) -> None:
    """
    Warn when the loaded model was saved by a different ML-Agents version.

    Nothing happens when there is no policy or the policy has no version
    tensors. Otherwise the version tensors are evaluated in ``self.sess`` and
    compared with the parsed form of ``version``.

    :param version: Version string of the currently running ML-Agents.
    """
    if self.policy is not None and self.policy.version_tensors is not None:
        loaded_ver = tuple(
            num.eval(session=self.sess) for num in self.policy.version_tensors
        )
        if loaded_ver != TFPolicy._convert_version_string(version):
            # Each piece ends with a space so the concatenated message
            # does not run words together.
            logger.warning(
                f"The model checkpoint you are loading from was saved with ML-Agents version "
                f"{loaded_ver[0]}.{loaded_ver[1]}.{loaded_ver[2]} but your current ML-Agents "
                f"version is {version}. Model may not behave properly."
            )
def create_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TFPolicy:
    """
    Create the PPO policy for this trainer.

    :param parsed_behavior_id: Identifiers of the behavior; not used in this method.
    :param behavior_spec: Specifications for policy construction.
    :return: The newly created policy.
    """
    return TFPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        # Faster training for PPO
        condition_sigma_on_obs=False,
        # The TF graph is created later, in the Optimizer
        create_tf_graph=False,
    )
def add_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
    """
    Adds policy to trainer and builds the PPO optimizer around it.

    A warning is logged when a policy has already been set, since this trainer
    keeps a single policy.

    :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
    :param policy: Policy to associate with the behavior id.
    """
    if self.policy:
        # Adjacent literals are concatenated; a backslash continuation inside
        # the literal would leak a backslash/whitespace into the message.
        logger.warning(
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial "
            "games.".format(self.__class__.__name__)
        )
    self.policy = policy
    self.policies[parsed_behavior_id.behavior_id] = policy
    self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer and builds the SAC optimizer around it.

    A warning is logged when a policy has already been set, since this trainer
    keeps a single policy.

    :param name_behavior_id: Behavior ID of the policy; not used in this method.
    :param policy: Policy to train; must be an NNPolicy.
    :raises RuntimeError: If policy is not an NNPolicy instance.
    """
    if self.policy:
        # Adjacent literals are concatenated; a backslash continuation inside
        # the literal would leak a backslash/whitespace into the message.
        logger.warning(
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial "
            "games.".format(self.__class__.__name__)
        )
    if not isinstance(policy, NNPolicy):
        # The message names the type that is actually checked.
        raise RuntimeError("Non-NNPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer and builds the SAC optimizer around it.

    A warning is logged when a policy has already been set, since this is not
    a multi-agent trainer.

    :param name_behavior_id: Behavior ID of the policy; not used in this method.
    :param policy: Policy to train; must be an NNPolicy.
    :raises RuntimeError: If policy is not an NNPolicy instance.
    """
    if self.policy:
        logger.warning(
            "add_policy has been called twice. {} is not a multi-agent trainer".format(
                self.__class__.__name__
            )
        )
    if not isinstance(policy, NNPolicy):
        # The message names the type that is actually checked.
        raise RuntimeError("Non-NNPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer and builds the PPO optimizer around it.

    A warning is logged when a policy has already been set, since this is not
    a multi-agent trainer.

    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id; must be an NNPolicy.
    :raises RuntimeError: If policy is not an NNPolicy instance.
    """
    if self.policy:
        trainer_name = self.__class__.__name__
        logger.warning(
            "add_policy has been called twice. {} is not a multi-agent trainer".format(
                trainer_name
            )
        )
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")

    self.policy = policy
    self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
    # One reward accumulator per reward signal of the optimizer
    for signal_name in self.optimizer.reward_signals.keys():
        self.collected_rewards[signal_name] = defaultdict(lambda: 0)

    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def create_tf_policy(
    self,
    parsed_behavior_id: BehaviorIdentifiers,
    behavior_spec: BehaviorSpec,
    create_graph: bool = False,
) -> TFPolicy:
    """
    Build a TensorFlow-backed policy configured for PPO.

    :param parsed_behavior_id: Identifiers of the behavior; not used in this method.
    :param behavior_spec: Specifications for policy construction.
    :param create_graph: Whether to create the Tensorflow graph on construction.
    :return: The newly created policy.
    """
    return TFPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        # Faster training for PPO
        condition_sigma_on_obs=False,
        create_tf_graph=create_graph,
    )
def create_tf_policy(
    self,
    parsed_behavior_id: BehaviorIdentifiers,
    behavior_spec: BehaviorSpec,
    create_graph: bool = False,
) -> TFPolicy:
    """
    Build a TensorFlow-backed policy configured for SAC.

    The policy is created with tanh squashing and reparameterization enabled;
    afterwards ``maybe_load_replay_buffer`` is called on the trainer.

    :param parsed_behavior_id: Identifiers of the behavior; not used in this method.
    :param behavior_spec: Specifications for policy construction.
    :param create_graph: Whether to create the Tensorflow graph on construction.
    :return: The newly created policy.
    """
    sac_policy = TFPolicy(
        self.seed,
        behavior_spec,
        self.trainer_settings,
        tanh_squash=True,
        reparameterize=True,
        create_tf_graph=create_graph,
    )
    self.maybe_load_replay_buffer()
    return sac_policy
def add_policy(
    self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
) -> None:
    """
    Adds policy to trainer and builds the SAC optimizer around it.

    A warning is logged when a policy has already been set, since this trainer
    keeps a single policy.

    :param parsed_behavior_id: Behavior identifiers of the policy; not used in
        this method.
    :param policy: Policy to train; must be an NNPolicy.
    :raises RuntimeError: If policy is not an NNPolicy instance.
    """
    if self.policy:
        # Adjacent literals are concatenated; a backslash continuation inside
        # the literal would leak a backslash/whitespace into the message.
        logger.warning(
            "Your environment contains multiple teams, but {} doesn't support "
            "adversarial games. Enable self-play to train adversarial "
            "games.".format(self.__class__.__name__)
        )
    if not isinstance(policy, NNPolicy):
        # The message names the type that is actually checked.
        raise RuntimeError("Non-NNPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    # Assume steps were updated at the correct ratio before
    self.update_steps = int(max(1, self.step / self.steps_per_update))
    self.reward_signal_update_steps = int(
        max(1, self.step / self.reward_signal_steps_per_update)
    )
def test_large_normalization():
    """
    Check normalization statistics for large-magnitude observations.

    Two batches of observations around 1800 are fed to the policy one after
    the other; after each batch the running mean and variance are compared
    with NumPy's float32 mean and variance of everything fed so far.
    """
    spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Taken from Walker seed 3713 which causes NaN without proper initialization
    large_obs1 = [
        1800.00036621,
        1799.96972656,
        1800.01245117,
        1800.07214355,
        1800.02758789,
        1799.98303223,
        1799.88647461,
        1799.89575195,
        1800.03479004,
        1800.14025879,
        1800.17675781,
        1800.20581055,
        1800.33740234,
        1800.36450195,
        1800.43457031,
        1800.45544434,
        1800.44604492,
        1800.56713867,
        1800.73901367,
    ]
    large_obs2 = [
        1799.99975586,
        1799.96679688,
        1799.92980957,
        1799.89550781,
        1799.93774414,
        1799.95300293,
        1799.94067383,
        1799.92993164,
        1799.84057617,
        1799.69873047,
        1799.70605469,
        1799.82849121,
        1799.85095215,
        1799.76977539,
        1799.78283691,
        1799.76708984,
        1799.67163086,
        1799.59191895,
        1799.5135498,
        1799.45556641,
        1799.3717041,
    ]
    policy = TFPolicy(
        0,
        spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )

    def feed_and_read(observations):
        # Build a trajectory carrying these observations, update the
        # normalizer with it and return (steps, mean, variance).
        trajectory = make_fake_trajectory(
            length=len(observations),
            max_step_complete=True,
            observation_shapes=[(1,)],
            action_space=[2],
        )
        for idx, value in enumerate(observations):
            trajectory.steps[idx].obs[0] = np.array([value], dtype=np.float32)
        buffer = trajectory.to_agentbuffer()
        policy.update_normalization(buffer["vector_obs"])
        return policy.sess.run(
            [policy.normalization_steps, policy.running_mean, policy.running_variance]
        )

    # Statistics after the first batch
    steps, mean, variance = feed_and_read(large_obs1)
    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )

    # Statistics after the second batch cover both batches
    steps, mean, variance = feed_and_read(large_obs2)
    everything = large_obs1 + large_obs2
    assert mean[0] == pytest.approx(np.mean(everything, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(everything, dtype=np.float32), abs=0.01
    )
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
    """
    Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
    The PPO optimizer has a value estimator and a loss function.

    The keys "learning_rate", "hidden_units", "epsilon", "beta", "max_steps" and
    "num_layers" are read with direct indexing and must be present. The keys
    "learning_rate_schedule" (default "linear"), "vis_encode_type" (default
    "simple") and "burn_in_ratio" (default 0.0) are optional.

    :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
    :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope("optimizer/"):
            super().__init__(policy, trainer_params)

            # Pull the hyperparameters out of the dict, coercing their types.
            lr = float(trainer_params["learning_rate"])
            lr_schedule = LearningRateSchedule(
                trainer_params.get("learning_rate_schedule", "linear"))
            h_size = int(trainer_params["hidden_units"])
            epsilon = float(trainer_params["epsilon"])
            beta = float(trainer_params["beta"])
            max_step = float(trainer_params["max_steps"])
            num_layers = int(trainer_params["num_layers"])
            vis_encode_type = EncoderType(
                trainer_params.get("vis_encode_type", "simple"))
            self.burn_in_ratio = float(
                trainer_params.get("burn_in_ratio", 0.0))

            self.stream_names = list(self.reward_signals.keys())

            # Placeholders, initialized to None here.
            # NOTE(review): presumably filled in by _create_ppo_optimizer_ops() - confirm.
            self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
            self.grads = None
            self.update_batch: Optional[tf.Operation] = None

            # Maps reported stat names to keys of update_dict.
            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Policy/Learning Rate": "learning_rate",
            }
            if self.policy.use_recurrent:
                self.m_size = self.policy.m_size
                self.memory_in = tf.placeholder(
                    shape=[None, self.m_size],
                    dtype=tf.float32,
                    name="recurrent_value_in",
                )

            # Use at least one layer.
            if num_layers < 1:
                num_layers = 1
            # Continuous and discrete action spaces get different critic builders.
            if policy.use_continuous_act:
                self._create_cc_critic(h_size, num_layers, vis_encode_type)
            else:
                self._create_dc_critic(h_size, num_layers, vis_encode_type)

            self.learning_rate = ModelUtils.create_learning_rate(
                lr_schedule, lr, self.policy.global_step, int(max_step))
            self._create_losses(
                self.policy.total_log_probs,
                self.old_log_probs,
                self.value_heads,
                self.policy.entropy,
                beta,
                epsilon,
                lr,
                max_step,
            )
            self._create_ppo_optimizer_ops()

        self.update_dict.update({
            "value_loss": self.value_loss,
            "policy_loss": self.abs_policy_loss,
            "update_batch": self.update_batch,
            "learning_rate": self.learning_rate,
        })

        # Called last, after all optimizer ops have been added to the graph.
        self.policy.initialize_or_load()
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
    """
    Takes a Policy and trainer settings and creates a SAC Optimizer around the policy.

    The policy's TF graph is created first. Inside that graph this builds a
    SACPolicyNetwork and a SACTargetNetwork, the learning rate schedule, the
    losses and the optimizer ops. Learning rate, schedule, tau and the initial
    entropy coefficient come from ``trainer_params.hyperparameters`` (cast to
    SACSettings); hidden units, layer count and visual encoder type come from
    ``policy.network_settings``.

    :param policy: A TFPolicy object that will be updated by this SAC Optimizer.
    :param trainer_params: Trainer settings; ``hyperparameters``, ``max_steps``
        and ``reward_signals`` are read here.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope(""):
            super().__init__(policy, trainer_params)
            hyperparameters: SACSettings = cast(
                SACSettings, trainer_params.hyperparameters)
            lr = hyperparameters.learning_rate
            lr_schedule = hyperparameters.learning_rate_schedule
            max_step = trainer_params.max_steps
            self.tau = hyperparameters.tau
            self.init_entcoef = hyperparameters.init_entcoef

            self.policy = policy
            self.act_size = policy.act_size
            policy_network_settings = policy.network_settings
            h_size = policy_network_settings.hidden_units
            num_layers = policy_network_settings.num_layers
            vis_encode_type = policy_network_settings.vis_encode_type

            # NOTE(review): tau was already assigned above; this repeats it.
            self.tau = hyperparameters.tau
            self.burn_in_ratio = 0.0

            # Non-exposed SAC parameters
            self.discrete_target_entropy_scale = (
                0.2  # Roughly equal to e-greedy 0.05
            )
            self.continuous_target_entropy_scale = 1.0

            stream_names = list(self.reward_signals.keys())
            # Use to reduce "survivor bonus" when using Curiosity or GAIL.
            self.gammas = [
                _val.gamma for _val in trainer_params.reward_signals.values()
            ]
            # One variable per reward stream, initialized to 1.0 ...
            self.use_dones_in_backup = {
                name: tf.Variable(1.0) for name in stream_names
            }
            # ... and one op per stream that assigns 0.0 to that variable.
            self.disable_use_dones = {
                name: self.use_dones_in_backup[name].assign(0.0) for name in stream_names
            }

            # Use at least one layer.
            if num_layers < 1:
                num_layers = 1

            # Empty/None here.
            # NOTE(review): presumably populated by later graph-building calls - confirm.
            self.target_init_op: List[tf.Tensor] = []
            self.target_update_op: List[tf.Tensor] = []

            self.update_batch_policy: Optional[tf.Operation] = None
            self.update_batch_value: Optional[tf.Operation] = None
            self.update_batch_entropy: Optional[tf.Operation] = None

            self.policy_network = SACPolicyNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 3x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            self.target_network = SACTargetNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 1x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
            self.m_size = 3 * self.policy.m_size
            self._create_inputs_and_outputs()
            self.learning_rate = ModelUtils.create_schedule(
                lr_schedule,
                lr,
                self.policy.global_step,
                int(max_step),
                min_value=1e-10,
            )
            self._create_losses(
                self.policy_network.q1_heads,
                self.policy_network.q2_heads,
                lr,
                int(max_step),
                stream_names,
                discrete=not self.policy.use_continuous_act,
            )
            self._create_sac_optimizer_ops()

            self.selected_actions = (self.policy.selected_actions
                                     )  # For GAIL and other reward signals
            if self.policy.normalize:
                target_update_norm = self.target_network.copy_normalization(
                    self.policy.running_mean,
                    self.policy.running_variance,
                    self.policy.normalization_steps,
                )
                # Update the normalization of the optimizer when the policy does.
                self.policy.update_normalization_op = tf.group([
                    self.policy.update_normalization_op, target_update_norm
                ])

        # Maps reported stat names to keys of update_dict.
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Losses/Q1 Loss": "q1_loss",
            "Losses/Q2 Loss": "q2_loss",
            "Policy/Entropy Coeff": "entropy_coef",
            "Policy/Learning Rate": "learning_rate",
        }

        # Unlike the stats mapping, this also holds the three update ops.
        self.update_dict = {
            "value_loss": self.total_value_loss,
            "policy_loss": self.policy_loss,
            "q1_loss": self.q1_loss,
            "q2_loss": self.q2_loss,
            "entropy_coef": self.ent_coef,
            "update_batch": self.update_batch_policy,
            "update_value": self.update_batch_value,
            "update_entropy": self.update_batch_entropy,
            "learning_rate": self.learning_rate,
        }
def test_convert_version_string():
    """
    Check that version strings parse to a three-int tuple, with and without
    a trailing dev suffix.
    """
    expected = (200, 300, 100)

    release = TFPolicy._convert_version_string("200.300.100")
    assert release == expected

    # Dev versions must yield the same three numbers
    dev = TFPolicy._convert_version_string("200.300.100.dev0")
    assert dev == expected
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
    """
    Takes a Policy and trainer settings and creates an Optimizer around the policy.
    The PPO optimizer has a value estimator and a loss function.

    Learning rate, schedule, epsilon and beta come from
    ``trainer_params.hyperparameters`` (cast to PPOSettings); hidden units,
    layer count and visual encoder type come from ``policy.network_settings``.

    :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
    :param trainer_params: Trainer settings that specify the properties of the trainer.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope("optimizer/"):
            super().__init__(policy, trainer_params)
            hyperparameters: PPOSettings = cast(
                PPOSettings, trainer_params.hyperparameters)
            lr = float(hyperparameters.learning_rate)
            self._schedule = hyperparameters.learning_rate_schedule
            epsilon = float(hyperparameters.epsilon)
            beta = float(hyperparameters.beta)
            max_step = float(trainer_params.max_steps)

            policy_network_settings = policy.network_settings
            h_size = int(policy_network_settings.hidden_units)
            num_layers = policy_network_settings.num_layers
            vis_encode_type = policy_network_settings.vis_encode_type
            self.burn_in_ratio = 0.0

            self.stream_names = list(self.reward_signals.keys())

            # Placeholders, initialized to None here.
            # NOTE(review): presumably filled in by _create_ppo_optimizer_ops() - confirm.
            self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
            self.grads = None
            self.update_batch: Optional[tf.Operation] = None

            # Maps reported stat names to keys of update_dict.
            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Policy/Learning Rate": "learning_rate",
                "Policy/Epsilon": "decay_epsilon",
                "Policy/Beta": "decay_beta",
            }
            if self.policy.use_recurrent:
                self.m_size = self.policy.m_size
                self.memory_in = tf.placeholder(
                    shape=[None, self.m_size],
                    dtype=tf.float32,
                    name="recurrent_value_in",
                )

            # Use at least one layer.
            if num_layers < 1:
                num_layers = 1
            # Continuous and discrete action spaces get different critic builders.
            if policy.use_continuous_act:
                self._create_cc_critic(h_size, num_layers, vis_encode_type)
            else:
                self._create_dc_critic(h_size, num_layers, vis_encode_type)

            self.learning_rate = ModelUtils.create_schedule(
                self._schedule,
                lr,
                self.policy.global_step,
                int(max_step),
                min_value=1e-10,
            )
            self._create_losses(
                self.policy.total_log_probs,
                self.old_log_probs,
                self.value_heads,
                self.policy.entropy,
                beta,
                epsilon,
                lr,
                max_step,
            )
            self._create_ppo_optimizer_ops()

        # NOTE(review): decay_epsilon/decay_beta are not assigned in this
        # method; presumably set by _create_losses() - confirm.
        self.update_dict.update({
            "value_loss": self.value_loss,
            "policy_loss": self.abs_policy_loss,
            "update_batch": self.update_batch,
            "learning_rate": self.learning_rate,
            "decay_epsilon": self.decay_epsilon,
            "decay_beta": self.decay_beta,
        })