def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    brain_info_with_agents = BrainInfo([], [], [], agents=["an-agent-id"])
    result = policy.get_action(brain_info_with_agents)
    assert result == ActionInfo(None, None, None, None, {})
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    # Doesn't really matter what this is
    dummy_groupspec = AgentGroupSpec([(1,)], "continuous", 1)
    no_agent_step = BatchedStepResult.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
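A minimal sketch of the test fixtures assumed above: basic_mock_brain() and basic_params() are not included in this excerpt, so the bodies below are illustrative stand-ins, not the project's exact helpers.

from unittest.mock import MagicMock


def basic_mock_brain():
    # A mocked BrainParameters-like object with just enough attributes for TFPolicy.
    mock_brain = MagicMock()
    mock_brain.vector_action_space_type = "continuous"
    mock_brain.vector_observation_space_size = 1
    mock_brain.vector_action_space_size = [1]
    return mock_brain


def basic_params():
    # Minimal trainer parameters; these keys are assumptions for illustration only.
    return {"use_recurrent": False, "model_path": "my/path"}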
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    # for saving/swapping snapshots
    policy.init_load_weights()
    self.policies[name_behavior_id] = policy

    # First policy encountered
    if not self.learning_behavior_name:
        weights = policy.get_weights()
        self.current_policy_snapshot = weights
        self._save_snapshot(policy)
        self.trainer.add_policy(name_behavior_id, policy)
        self.learning_behavior_name = name_behavior_id
def test_take_action_returns_nones_on_missing_values():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy.evaluate = MagicMock(return_value={})
    policy.save_memories = MagicMock()
    step_with_agents = BatchedStepResult(
        [],
        np.array([], dtype=np.float32),
        np.array([False], dtype=np.bool),
        np.array([], dtype=np.bool),
        np.array([0]),
        None,
    )
    result = policy.get_action(step_with_agents, worker_id=0)
    assert result == ActionInfo(None, None, {}, [0])
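The plain == assertions above rely on ActionInfo being a NamedTuple, so instances compare field by field. A minimal sketch of the four-field variant assumed by the BatchedStepResult-based tests (an assumption, not the project's exact definition):

from typing import Any, Dict, List, NamedTuple


class ActionInfo(NamedTuple):
    action: Any
    value: Any
    outputs: Dict[str, Any]
    agent_ids: List[Any]

    @staticmethod
    def empty() -> "ActionInfo":
        # Used by the no-agent test above; the exact empty values are an assumption.
        return ActionInfo(None, None, {}, [])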
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    brain_info_with_agents = BrainInfo(
        [], [], [], agents=["an-agent-id"], local_done=[False]
    )
    result = policy.get_action(brain_info_with_agents)
    expected = ActionInfo(
        policy_eval_out["action"], policy_eval_out["value"], policy_eval_out
    )
    assert result == expected
def _save_snapshot(self, policy: TFPolicy) -> None:
    weights = policy.get_weights()
    try:
        self.policy_snapshots[self.snapshot_counter] = weights
    except IndexError:
        self.policy_snapshots.append(weights)
    self.policy_elos[self.snapshot_counter] = self.current_elo
    self.snapshot_counter = (self.snapshot_counter + 1) % self.window
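The try/except above turns policy_snapshots into a fixed-size ring buffer of length self.window. A self-contained illustration with hypothetical values (not part of the trainer code):

policy_snapshots = []
window = 3
snapshot_counter = 0
for weights in ["w0", "w1", "w2", "w3"]:
    try:
        # Overwrite the slot if it already exists...
        policy_snapshots[snapshot_counter] = weights
    except IndexError:
        # ...otherwise grow the buffer until it reaches the window size.
        policy_snapshots.append(weights)
    snapshot_counter = (snapshot_counter + 1) % window
# The fourth snapshot wraps around and replaces the first.
assert policy_snapshots == ["w3", "w1", "w2"]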
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer. For the first policy added, add a trainer
    to the policy and set the learning behavior name to name_behavior_id.
    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    self.policies[name_behavior_id] = policy
    policy.create_tf_graph()

    # First policy encountered
    if not self.learning_behavior_name:
        weights = policy.get_weights()
        self.current_policy_snapshot = weights
        self.trainer.add_policy(name_behavior_id, policy)
        self._save_snapshot(policy)  # Need to save after trainer initializes policy
        self.learning_behavior_name = name_behavior_id
    else:
        # for saving/swapping snapshots
        policy.init_load_weights()
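For context, a hedged sketch of how a stored snapshot might later be swapped back into one of these policies; the helper name and snapshot selection are hypothetical, and it assumes load_weights() accepts the list returned by get_weights():

def _swap_snapshot_into(self, name_behavior_id: str, snapshot_index: int) -> None:
    # Hypothetical helper: illustrates why init_load_weights() is called when a
    # non-learning policy is registered above.
    snapshot = self.policy_snapshots[snapshot_index]
    policy = self.policies[name_behavior_id]
    policy.load_weights(snapshot)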
def test_take_action_returns_action_info_when_available():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    policy_eval_out = {
        "action": np.array([1.0], dtype=np.float32),
        "memory_out": np.array([[2.5]], dtype=np.float32),
        "value": np.array([1.1], dtype=np.float32),
    }
    policy.evaluate = MagicMock(return_value=policy_eval_out)
    step_with_agents = BatchedStepResult(
        [],
        np.array([], dtype=np.float32),
        np.array([False], dtype=np.bool),
        np.array([], dtype=np.bool),
        np.array([0]),
        None,
    )
    result = policy.get_action(step_with_agents)
    expected = ActionInfo(
        policy_eval_out["action"], policy_eval_out["value"], policy_eval_out, [0]
    )
    assert result == expected
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.
    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "add_policy has been called twice. {} is not a multi-agent trainer".format(
                self.__class__.__name__
            )
        )
    if not isinstance(policy, SACPolicy):
        raise RuntimeError("Non-SACPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.step = policy.get_current_step()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.
    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "add_policy has been called twice. {} is not a multi-agent trainer".format(
                self.__class__.__name__
            )
        )
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-NNPolicy passed to SACTrainer.add_policy()")
    self.policy = policy
    self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
    """
    Adds policy to trainer.
    :param name_behavior_id: Behavior ID that the policy should belong to.
    :param policy: Policy to associate with name_behavior_id.
    """
    if self.policy:
        logger.warning(
            "add_policy has been called twice. {} is not a multi-agent trainer".format(
                self.__class__.__name__
            )
        )
    if not isinstance(policy, NNPolicy):
        raise RuntimeError("Non-NNPolicy passed to PPOTrainer.add_policy()")
    self.policy = policy
    self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
    for _reward_signal in self.optimizer.reward_signals.keys():
        self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
    # Needed to resume loads properly
    self.step = policy.get_current_step()
    self.next_summary_step = self._get_next_summary_step()
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
    """
    Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
    The PPO optimizer has a value estimator and a loss function.
    :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
    :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope("optimizer/"):
            super().__init__(policy, trainer_params)

            lr = float(trainer_params["learning_rate"])
            lr_schedule = LearningRateSchedule(
                trainer_params.get("learning_rate_schedule", "linear")
            )
            h_size = int(trainer_params["hidden_units"])
            epsilon = float(trainer_params["epsilon"])
            beta = float(trainer_params["beta"])
            max_step = float(trainer_params["max_steps"])
            num_layers = int(trainer_params["num_layers"])
            vis_encode_type = EncoderType(
                trainer_params.get("vis_encode_type", "simple")
            )
            self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))

            self.stream_names = list(self.reward_signals.keys())

            self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
            self.grads = None
            self.update_batch: Optional[tf.Operation] = None

            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Policy/Learning Rate": "learning_rate",
            }
            if self.policy.use_recurrent:
                self.m_size = self.policy.m_size
                self.memory_in = tf.placeholder(
                    shape=[None, self.m_size],
                    dtype=tf.float32,
                    name="recurrent_value_in",
                )

            if num_layers < 1:
                num_layers = 1

            if policy.use_continuous_act:
                self._create_cc_critic(h_size, num_layers, vis_encode_type)
            else:
                self._create_dc_critic(h_size, num_layers, vis_encode_type)

            self.learning_rate = ModelUtils.create_learning_rate(
                lr_schedule, lr, self.policy.global_step, int(max_step)
            )
            self._create_losses(
                self.policy.log_probs,
                self.old_log_probs,
                self.value_heads,
                self.policy.entropy,
                beta,
                epsilon,
                lr,
                max_step,
            )
            self._create_ppo_optimizer_ops()

            self.update_dict.update(
                {
                    "value_loss": self.value_loss,
                    "policy_loss": self.abs_policy_loss,
                    "update_batch": self.update_batch,
                    "learning_rate": self.learning_rate,
                }
            )

            self.policy.initialize_or_load()
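For reference, an illustrative trainer_params dictionary containing the keys the PPO optimizer reads above; the values are placeholder examples, not the shipped defaults:

example_ppo_trainer_params = {
    "learning_rate": 3.0e-4,
    "learning_rate_schedule": "linear",
    "hidden_units": 128,
    "epsilon": 0.2,
    "beta": 5.0e-3,
    "max_steps": 5.0e5,
    "num_layers": 2,
    "vis_encode_type": "simple",
    "burn_in_ratio": 0.0,
    # Assumed to be consumed by the base optimizer (super().__init__) to build
    # self.reward_signals, from which self.stream_names is derived above.
    "reward_signals": {"extrinsic": {"gamma": 0.99, "strength": 1.0}},
}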
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = TFPolicy(test_seed, basic_mock_brain(), basic_params())
    no_agent_brain_info = BrainInfo([], [], [], agents=[])
    result = policy.get_action(no_agent_brain_info)
    assert result == ActionInfo([], [], None)
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
    """
    Takes a Policy and a Dict of trainer parameters and creates a SAC Optimizer
    around the policy. Relevant hyperparameters are read from trainer_params,
    including:
    - learning_rate and learning_rate_schedule: learning rate and its decay schedule.
    - hidden_units: size of hidden layers.
    - num_layers: number of hidden layers between the encoded input and the
      policy and value layers.
    - max_steps: total number of training steps.
    - tau: strength of the soft-Q target update.
    - init_entcoef: initial value for the entropy coefficient. Set lower to
      learn faster, set higher to explore more.
    :param policy: A TFPolicy object that will be updated by this SAC Optimizer.
    :param trainer_params: Trainer parameters dictionary that specifies the
        properties of the trainer.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope(""):
            super().__init__(policy, trainer_params)
            lr = float(trainer_params["learning_rate"])
            lr_schedule = LearningRateSchedule(
                trainer_params.get("learning_rate_schedule", "constant")
            )
            self.policy = policy
            self.act_size = self.policy.act_size
            h_size = int(trainer_params["hidden_units"])
            max_step = float(trainer_params["max_steps"])
            num_layers = int(trainer_params["num_layers"])
            vis_encode_type = EncoderType(
                trainer_params.get("vis_encode_type", "simple")
            )
            self.tau = trainer_params.get("tau", 0.005)
            self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))

            # Non-exposed SAC parameters
            self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
            self.continuous_target_entropy_scale = 1.0

            self.init_entcoef = trainer_params.get("init_entcoef", 1.0)
            stream_names = list(self.reward_signals.keys())
            self.gammas = [
                _val["gamma"] for _val in trainer_params["reward_signals"].values()
            ]
            # Use to reduce "survivor bonus" when using Curiosity or GAIL.
            self.use_dones_in_backup = {
                name: tf.Variable(1.0) for name in stream_names
            }
            self.disable_use_dones = {
                name: self.use_dones_in_backup[name].assign(0.0)
                for name in stream_names
            }

            if num_layers < 1:
                num_layers = 1

            self.target_init_op: List[tf.Tensor] = []
            self.target_update_op: List[tf.Tensor] = []
            self.update_batch_policy: Optional[tf.Operation] = None
            self.update_batch_value: Optional[tf.Operation] = None
            self.update_batch_entropy: Optional[tf.Operation] = None

            self.policy_network = SACPolicyNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 3x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            self.target_network = SACTargetNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 1x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
            self.m_size = 3 * self.policy.m_size
            self._create_inputs_and_outputs()
            self.learning_rate = ModelUtils.create_learning_rate(
                lr_schedule, lr, self.policy.global_step, int(max_step)
            )
            self._create_losses(
                self.policy_network.q1_heads,
                self.policy_network.q2_heads,
                lr,
                int(max_step),
                stream_names,
                discrete=not self.policy.use_continuous_act,
            )
            self._create_sac_optimizer_ops()

            self.selected_actions = (
                self.policy.selected_actions
            )  # For GAIL and other reward signals
            if self.policy.normalize:
                target_update_norm = self.target_network.copy_normalization(
                    self.policy.running_mean,
                    self.policy.running_variance,
                    self.policy.normalization_steps,
                )
                # Update the normalization of the optimizer when the policy does.
                self.policy.update_normalization_op = tf.group(
                    [self.policy.update_normalization_op, target_update_norm]
                )

            self.policy.initialize_or_load()

            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Losses/Q1 Loss": "q1_loss",
                "Losses/Q2 Loss": "q2_loss",
                "Policy/Entropy Coeff": "entropy_coef",
                "Policy/Learning Rate": "learning_rate",
            }

            self.update_dict = {
                "value_loss": self.total_value_loss,
                "policy_loss": self.policy_loss,
                "q1_loss": self.q1_loss,
                "q2_loss": self.q2_loss,
                "entropy_coef": self.ent_coef,
                "entropy": self.policy.entropy,
                "update_batch": self.update_batch_policy,
                "update_value": self.update_batch_value,
                "update_entropy": self.update_batch_entropy,
                "learning_rate": self.learning_rate,
            }
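Similarly, an illustrative trainer_params dictionary with the keys the SAC optimizer reads above (tau, burn_in_ratio, and init_entcoef fall back to the defaults shown in the constructor when omitted); the values are placeholder examples, not the shipped defaults:

example_sac_trainer_params = {
    "learning_rate": 3.0e-4,
    "learning_rate_schedule": "constant",
    "hidden_units": 128,
    "max_steps": 5.0e5,
    "num_layers": 2,
    "vis_encode_type": "simple",
    "tau": 0.005,
    "burn_in_ratio": 0.0,
    "init_entcoef": 1.0,
    # Each reward signal's gamma feeds self.gammas in the constructor above.
    "reward_signals": {"extrinsic": {"gamma": 0.99, "strength": 1.0}},
}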