Example #1
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (NetworkSettings.MemorySettings()
                                              if use_rnn else None)
    policy = TFPolicy(
        0,
        mock_behavior_specs,
        trainer_config,
        "test",
        False,
        tanhresample,
        tanhresample,
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config.hyperparameters.learning_rate,
            default_batch_size=trainer_config.hyperparameters.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )
    policy.initialize_or_load()  # Normally the optimizer calls this after the BCModule is created
    return bc_module
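A minimal usage sketch for this helper, assuming the test utilities shown elsewhere on this page (mb.setup_test_behavior_specs) and a BehavioralCloningSettings whose demo_path points at a placeholder file; the settings, the placeholder path, and the assertion are illustrative and not part of the original example.

# Hypothetical usage of create_bc_module; names and values below are assumptions.
def test_bcmodule_update_sketch():
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete=False, use_visual=False, vector_action_space=2, vector_obs_space=1
    )
    # "test.demo" is a placeholder; a real test must point at an existing recorded demo file.
    bc_settings = BehavioralCloningSettings(demo_path="test.demo")
    bc_module = create_bc_module(mock_specs, bc_settings, use_rnn=False, tanhresample=False)
    stats = bc_module.update()  # assumed to return a dict of loss statistics
    assert isinstance(stats, dict)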
Example #2
 def _load_graph(self,
                 policy: TFPolicy,
                 model_path: str,
                 reset_global_steps: bool = False) -> None:
     # This prevents the first normalization update from executing on load
     policy.first_normalization_update = False
     with policy.graph.as_default():
         logger.info(f"Loading model from {model_path}.")
         ckpt = tf.train.get_checkpoint_state(model_path)
         if ckpt is None:
             raise UnityPolicyException(
                 "The model {} could not be loaded. Make "
                 "sure you specified the right "
                 "--run-id and that the previous run you are loading from had the same "
                 "behavior names.".format(model_path))
         if self.tf_saver:
             try:
                 self.tf_saver.restore(policy.sess,
                                       ckpt.model_checkpoint_path)
             except tf.errors.NotFoundError:
                 raise UnityPolicyException(
                     "The model {} was found but could not be loaded. Make "
                     "sure the model is from the same version of ML-Agents, has the same behavior parameters, "
                     "and is using the same trainer configuration as the current run."
                     .format(model_path))
         self._check_model_version(__version__)
         if reset_global_steps:
             policy.set_step(0)
             logger.info(
                 "Starting training from step 0 and saving to {}.".format(
                     self.model_path))
         else:
             logger.info(
                 f"Resuming training from step {policy.get_current_step()}."
             )
Example #3
    def add_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                   policy: TFPolicy) -> None:
        """
        Adds policy to trainer. The first policy encountered sets the wrapped
        trainer team.  This is to ensure that all agents from the same multi-agent
        team are grouped. All policies associated with this team are added to the
        wrapped trainer to be trained.
        :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
        :param policy: Policy to associate with name_behavior_id.
        """
        name_behavior_id = parsed_behavior_id.behavior_id
        team_id = parsed_behavior_id.team_id
        self.controller.subscribe_team_id(team_id, self)
        self.policies[name_behavior_id] = policy
        policy.create_tf_graph()

        self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id
        # for saving/swapping snapshots
        policy.init_load_weights()

        # First policy or a new agent on the same team encountered
        if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team:
            self.current_policy_snapshot[
                parsed_behavior_id.brain_name] = policy.get_weights()

            self._save_snapshot()  # Need to save after trainer initializes policy
            self.trainer.add_policy(parsed_behavior_id, policy)
            self._learning_team = self.controller.get_learning_team
            self.wrapped_trainer_team = team_id
Example #4
    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
        """
        Adds policy to trainer. For the first policy added, add a trainer
        to the policy and set the learning behavior name to name_behavior_id.
        :param name_behavior_id: Behavior ID that the policy should belong to.
        :param policy: Policy to associate with name_behavior_id.
        """
        self.policies[name_behavior_id] = policy
        policy.create_tf_graph()

        # First policy encountered
        if not self.learning_behavior_name:
            weights = policy.get_weights()
            self.current_policy_snapshot = weights
            self.trainer.add_policy(name_behavior_id, policy)
            self._save_snapshot(policy)  # Need to save after trainer initializes policy
            self.learning_behavior_name = name_behavior_id
            behavior_id_parsed = BehaviorIdentifiers.from_name_behavior_id(
                self.learning_behavior_name)
            team_id = behavior_id_parsed.behavior_ids["team"]
            self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY_TEAM,
                                              team_id)
        else:
            # for saving/swapping snapshots
            policy.init_load_weights()
Example #5
def _compare_two_policies(policy1: TFPolicy, policy2: TFPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1)
    run_out1 = policy1.evaluate(decision_step, list(decision_step.agent_id))
    run_out2 = policy2.evaluate(decision_step, list(decision_step.agent_id))

    np.testing.assert_array_equal(run_out2["log_probs"], run_out1["log_probs"])
Example #6
def test_normalizer_after_load(tmp_path):
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)

    trainer_params = TrainerSettings(network_settings=NetworkSettings(normalize=True))
    policy = TFPolicy(0, behavior_spec, trainer_params)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 6
    assert mean[0] == 0.5
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)
    # Save ckpt and load into another policy
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TFModelSaver(trainer_params, path1)
    model_saver.register(policy)
    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 6)
    assert len(os.listdir(tmp_path)) > 0
    policy1 = TFPolicy(0, behavior_spec, trainer_params)
    model_saver = TFModelSaver(trainer_params, path1, load=True)
    model_saver.register(policy1)
    model_saver.initialize_or_load(policy1)

    # Make another update to new policy, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy1.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy1.sess.run(
        [policy1.normalization_steps, policy1.running_mean, policy1.running_variance]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
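For reference, the expected values asserted above follow directly from the observation sequence: 6 steps containing three 0's and three 1's, then 10 more steps of all 1's. A quick stand-alone arithmetic check:

# Plain-Python check of the running statistics the test expects after 16 steps.
obs = [0.0] * 3 + [1.0] * 3 + [1.0] * 10              # 16 observations in total
mean = sum(obs) / len(obs)                            # 13 / 16 = 0.8125
var = sum((x - mean) ** 2 for x in obs) / len(obs)    # ~0.152
print(mean, var)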
Example #7
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    model_path: str = "",
    load: bool = False,
    seed: int = 0,
) -> TFPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TFPolicy(
        seed, mock_spec, trainer_settings, model_path=model_path, load=load
    )
    return policy
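A sketch of how a factory like this is typically driven from a parametrized test. The TrainerSettings() default config, the explicit initialize() call, and the "action" key in the evaluate output are assumptions, not taken from the original example.

# Hypothetical parametrized test around create_policy_mock; defaults are assumptions.
@pytest.mark.parametrize("use_discrete", [True, False])
@pytest.mark.parametrize("use_rnn", [True, False])
def test_policy_evaluate_sketch(use_rnn, use_discrete):
    policy = create_policy_mock(TrainerSettings(), use_rnn=use_rnn, use_discrete=use_discrete)
    policy.initialize()  # assumed to be needed before evaluation in this sketch
    decision_step, _ = mb.create_steps_from_behavior_spec(policy.behavior_spec, num_agents=1)
    run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
    assert "action" in run_out  # evaluate() is assumed to expose the sampled action here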
Example #8
    def create_policy(
        self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
    ) -> TFPolicy:
        policy = TFPolicy(
            self.seed,
            behavior_spec,
            self.trainer_settings,
            self.artifact_path,
            self.load,
            tanh_squash=True,
            reparameterize=True,
            create_tf_graph=False,
        )
        # Load the replay buffer if load
        if self.load and self.checkpoint_replay_buffer:
            try:
                self.load_replay_buffer()
            except (AttributeError, FileNotFoundError):
                logger.warning(
                    "Replay buffer was unable to load, starting from scratch."
                )
            logger.debug(
                "Loaded update buffer with {} sequences".format(
                    self.update_buffer.num_experiences
                )
            )

        return policy
Example #9
 def add_policy(
     self, parsed_behavior_id: BehaviorIdentifiers, policy: TFPolicy
 ) -> None:
     """
     Adds policy to trainer.
     """
     if self.policy:
          logger.warning(
              "Your environment contains multiple teams, but {} doesn't support "
              "adversarial games. Enable self-play to train adversarial games.".format(
                  self.__class__.__name__
              )
          )
     self.policy = policy
     self.policies[parsed_behavior_id.behavior_id] = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     # Assume steps were updated at the correct ratio before
     self.update_steps = int(max(1, self.step / self.steps_per_update))
     self.reward_signal_update_steps = int(
         max(1, self.step / self.reward_signal_steps_per_update)
     )
Example #10
def create_optimizer_mock(trainer_config, reward_signal_config, use_rnn,
                          use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = TFPolicy(0,
                      mock_specs,
                      trainer_settings,
                      "test",
                      False,
                      create_tf_graph=False)
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    optimizer.policy.initialize()
    return optimizer
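A minimal smoke-test sketch for the helper above. It assumes TrainerSettings() defaults to PPO, that RewardSignalType and RewardSignalSettings come from mlagents.trainers.settings, and that the extrinsic signal registers under the key "extrinsic"; none of this is stated on this page.

# Hypothetical smoke test for create_optimizer_mock; settings below are assumptions.
def test_optimizer_mock_sketch():
    reward_cfg = {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
    optimizer = create_optimizer_mock(
        TrainerSettings(), reward_cfg, use_rnn=False, use_discrete=True, use_visual=False
    )
    assert optimizer.policy is not None
    assert "extrinsic" in optimizer.reward_signals  # assumed reward-signal key naming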
Example #11
 def _save_snapshot(self, policy: TFPolicy) -> None:
     weights = policy.get_weights()
     try:
         self.policy_snapshots[self.snapshot_counter] = weights
     except IndexError:
         self.policy_snapshots.append(weights)
     self.policy_elos[self.snapshot_counter] = self.current_elo
     self.snapshot_counter = (self.snapshot_counter + 1) % self.window
Example #12
 def initialize_or_load(self, policy: Optional[TFPolicy] = None) -> None:
     # If there is an initialize path, load from that. Else, load from the set model path.
     # If load is set to True, don't reset steps to 0. Else, do. This allows a user to,
     # e.g., resume from an initialize path.
     if policy is None:
         policy = self.policy
     policy = cast(TFPolicy, policy)
     reset_steps = not self.load
     if self.initialize_path is not None:
         self._load_graph(policy,
                          self.initialize_path,
                          reset_global_steps=reset_steps)
     elif self.load:
         self._load_graph(policy,
                          self.model_path,
                          reset_global_steps=reset_steps)
     else:
         policy.initialize()
     TFPolicy.broadcast_global_variables(0)
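A sketch of the checkpoint round trip that ends in initialize_or_load above, mirroring Example #6; the directory name, behavior name, and step count are illustrative assumptions.

# Hypothetical save/resume flow exercising initialize_or_load; paths are assumptions.
def checkpoint_round_trip_sketch(trainer_settings, behavior_spec, tmp_path):
    save_path = os.path.join(tmp_path, "runid1")

    # First run: register a policy, initialize it, and write a checkpoint.
    policy = TFPolicy(0, behavior_spec, trainer_settings)
    saver = TFModelSaver(trainer_settings, save_path)
    saver.register(policy)
    saver.initialize_or_load(policy)  # no checkpoint yet, so this falls through to policy.initialize()
    saver.save_checkpoint("MockBrain", 100)

    # Resumed run: load=True routes initialize_or_load through _load_graph and,
    # since reset_steps is False, the global step is restored rather than reset to 0.
    resumed_policy = TFPolicy(0, behavior_spec, trainer_settings)
    resumed_saver = TFModelSaver(trainer_settings, save_path, load=True)
    resumed_saver.register(resumed_policy)
    resumed_saver.initialize_or_load(resumed_policy)
    return resumed_policy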
Example #13
    def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
        """
        Adds policy to trainer. For the first policy added, add a trainer
        to the policy and set the learning behavior name to name_behavior_id.
        :param name_behavior_id: Behavior ID that the policy should belong to.
        :param policy: Policy to associate with name_behavior_id.
        """
        self.policies[name_behavior_id] = policy
        policy.create_tf_graph()

        # First policy encountered
        if not self.learning_behavior_name:
            weights = policy.get_weights()
            self.current_policy_snapshot = weights
            self.trainer.add_policy(name_behavior_id, policy)
            self._save_snapshot(policy)  # Need to save after trainer initializes policy
            self.learning_behavior_name = name_behavior_id
        else:
            # for saving/swapping snapshots
            policy.init_load_weights()
Example #14
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = TFPolicy(0,
                      mock_brain,
                      trainer_settings,
                      "test",
                      False,
                      create_tf_graph=False)
    optimizer = SACOptimizer(policy, trainer_settings)
    policy.initialize()
    return optimizer
Example #15
def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config, framework=FrameworkType.TENSORFLOW)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TFPolicy(
        0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
    )
    optimizer = PPOOptimizer(policy, trainer_settings)
    policy.initialize()
    return optimizer
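A parametrized smoke-test sketch built on the helper above; using TrainerSettings() as the dummy config is an assumption, while the "Losses/Value Loss" key comes from the PPOOptimizer constructor shown in the last example on this page.

# Hypothetical smoke test for _create_ppo_optimizer_ops_mock; the config default is an assumption.
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_ppo_optimizer_creation_sketch(rnn, discrete):
    optimizer = _create_ppo_optimizer_ops_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=False
    )
    # The optimizer is expected to expose the loss-reporting map set up in its constructor.
    assert "Losses/Value Loss" in optimizer.stats_name_to_update_name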
Example #16
def test_normalization():
    behavior_spec = mb.setup_test_behavior_specs(use_discrete=True,
                                                 use_visual=False,
                                                 vector_action_space=[2],
                                                 vector_obs_space=1)

    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run([
        policy.normalization_steps, policy.running_mean,
        policy.running_variance
    ])

    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run([
        policy.normalization_steps, policy.running_mean,
        policy.running_variance
    ])

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
Example #17
 def _check_model_version(self, version: str) -> None:
     """
     Checks whether the model being loaded was created with the same version of
     ML-Agents, and logs a warning if not.
     """
     if self.policy is not None and self.policy.version_tensors is not None:
         loaded_ver = tuple(
             num.eval(session=self.sess)
             for num in self.policy.version_tensors)
         if loaded_ver != TFPolicy._convert_version_string(version):
             logger.warning(
                 f"The model checkpoint you are loading from was saved with ML-Agents version "
                 f"{loaded_ver[0]}.{loaded_ver[1]}.{loaded_ver[2]} but your current ML-Agents"
                 f"version is {version}. Model may not behave properly.")
Example #18
    def create_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                      behavior_spec: BehaviorSpec) -> TFPolicy:
        """
        Creates a PPO policy to add to the trainer's list of policies.
        :param behavior_spec: specifications for policy construction
        :return policy
        """
        policy = TFPolicy(
            self.seed,
            behavior_spec,
            self.trainer_settings,
            condition_sigma_on_obs=False,  # Faster training for PPO
            create_tf_graph=False,  # We will create the TF graph in the Optimizer
        )

        return policy
Example #19
 def add_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
     :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
     :param policy: Policy to associate with name_behavior_id.
     """
     if self.policy:
          logger.warning(
              "Your environment contains multiple teams, but {} doesn't support "
              "adversarial games. Enable self-play to train adversarial games.".format(
                  self.__class__.__name__))
     self.policy = policy
     self.policies[parsed_behavior_id.behavior_id] = policy
     self.optimizer = PPOOptimizer(self.policy, self.trainer_settings)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
Example #20
 def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
      :param name_behavior_id: Behavior ID that the policy should belong to.
      :param policy: Policy to associate with name_behavior_id.
     """
     if self.policy:
          logger.warning(
              "Your environment contains multiple teams, but {} doesn't support "
              "adversarial games. Enable self-play to train adversarial games.".format(
                  self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-SACPolicy passed to SACTrainer.add_policy()")
     self.policy = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     self.next_summary_step = self._get_next_summary_step()
Example #21
 def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
      :param name_behavior_id: Behavior ID that the policy should belong to.
      :param policy: Policy to associate with name_behavior_id.
     """
     if self.policy:
         logger.warning(
             "add_policy has been called twice. {} is not a multi-agent trainer"
             .format(self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-SACPolicy passed to SACTrainer.add_policy()")
     self.policy = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_parameters)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     self.next_summary_step = self._get_next_summary_step()
Example #22
 def add_policy(self, name_behavior_id: str, policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
     :param name_behavior_id: Behavior ID that the policy should belong to.
     :param policy: Policy to associate with name_behavior_id.
     """
     if self.policy:
         logger.warning(
             "add_policy has been called twice. {} is not a multi-agent trainer"
             .format(self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-NNPolicy passed to PPOTrainer.add_policy()")
     self.policy = policy
     self.optimizer = PPOOptimizer(self.policy, self.trainer_parameters)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     self.next_summary_step = self._get_next_summary_step()
Example #23
 def create_tf_policy(
     self,
     parsed_behavior_id: BehaviorIdentifiers,
     behavior_spec: BehaviorSpec,
     create_graph: bool = False,
 ) -> TFPolicy:
     """
     Creates a policy with a Tensorflow backend and PPO hyperparameters
     :param parsed_behavior_id:
     :param behavior_spec: specifications for policy construction
     :param create_graph: whether to create the Tensorflow graph on construction
     :return policy
     """
     policy = TFPolicy(
         self.seed,
         behavior_spec,
         self.trainer_settings,
         condition_sigma_on_obs=False,  # Faster training for PPO
         create_tf_graph=create_graph,
     )
     return policy
Example #24
 def create_tf_policy(
     self,
     parsed_behavior_id: BehaviorIdentifiers,
     behavior_spec: BehaviorSpec,
     create_graph: bool = False,
 ) -> TFPolicy:
     """
     Creates a policy with a Tensorflow backend and SAC hyperparameters
     :param parsed_behavior_id:
     :param behavior_spec: specifications for policy construction
     :param create_graph: whether to create the Tensorflow graph on construction
     :return policy
     """
     policy = TFPolicy(
         self.seed,
         behavior_spec,
         self.trainer_settings,
         tanh_squash=True,
         reparameterize=True,
         create_tf_graph=create_graph,
     )
     self.maybe_load_replay_buffer()
     return policy
Example #25
 def add_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                policy: TFPolicy) -> None:
     """
     Adds policy to trainer.
      :param parsed_behavior_id: Behavior identifiers that the policy should belong to.
      :param policy: Policy to associate with name_behavior_id.
     """
     if self.policy:
          logger.warning(
              "Your environment contains multiple teams, but {} doesn't support "
              "adversarial games. Enable self-play to train adversarial games.".format(
                  self.__class__.__name__))
     if not isinstance(policy, NNPolicy):
         raise RuntimeError(
             "Non-SACPolicy passed to SACTrainer.add_policy()")
     self.policy = policy
     self.optimizer = SACOptimizer(self.policy, self.trainer_settings)
     for _reward_signal in self.optimizer.reward_signals.keys():
         self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
     # Needed to resume loads properly
     self.step = policy.get_current_step()
     # Assume steps were updated at the correct ratio before
     self.update_steps = int(max(1, self.step / self.steps_per_update))
     self.reward_signal_update_steps = int(
         max(1, self.step / self.reward_signal_steps_per_update))
Example #26
def test_large_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Taken from Walker seed 3713 which causes NaN without proper initialization
    large_obs1 = [
        1800.00036621,
        1799.96972656,
        1800.01245117,
        1800.07214355,
        1800.02758789,
        1799.98303223,
        1799.88647461,
        1799.89575195,
        1800.03479004,
        1800.14025879,
        1800.17675781,
        1800.20581055,
        1800.33740234,
        1800.36450195,
        1800.43457031,
        1800.45544434,
        1800.44604492,
        1800.56713867,
        1800.73901367,
    ]
    large_obs2 = [
        1799.99975586,
        1799.96679688,
        1799.92980957,
        1799.89550781,
        1799.93774414,
        1799.95300293,
        1799.94067383,
        1799.92993164,
        1799.84057617,
        1799.69873047,
        1799.70605469,
        1799.82849121,
        1799.85095215,
        1799.76977539,
        1799.78283691,
        1799.76708984,
        1799.67163086,
        1799.59191895,
        1799.5135498,
        1799.45556641,
        1799.3717041,
    ]
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )
    time_horizon = len(large_obs1)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )

    time_horizon = len(large_obs2)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert mean[0] == pytest.approx(
        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
Example #27
    def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
        """
        Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
                super().__init__(policy, trainer_params)

                lr = float(trainer_params["learning_rate"])
                lr_schedule = LearningRateSchedule(
                    trainer_params.get("learning_rate_schedule", "linear"))
                h_size = int(trainer_params["hidden_units"])
                epsilon = float(trainer_params["epsilon"])
                beta = float(trainer_params["beta"])
                max_step = float(trainer_params["max_steps"])
                num_layers = int(trainer_params["num_layers"])
                vis_encode_type = EncoderType(
                    trainer_params.get("vis_encode_type", "simple"))
                self.burn_in_ratio = float(
                    trainer_params.get("burn_in_ratio", 0.0))

                self.stream_names = list(self.reward_signals.keys())

                self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
                self.grads = None
                self.update_batch: Optional[tf.Operation] = None

                self.stats_name_to_update_name = {
                    "Losses/Value Loss": "value_loss",
                    "Losses/Policy Loss": "policy_loss",
                    "Policy/Learning Rate": "learning_rate",
                }
                if self.policy.use_recurrent:
                    self.m_size = self.policy.m_size
                    self.memory_in = tf.placeholder(
                        shape=[None, self.m_size],
                        dtype=tf.float32,
                        name="recurrent_value_in",
                    )

                if num_layers < 1:
                    num_layers = 1
                if policy.use_continuous_act:
                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
                else:
                    self._create_dc_critic(h_size, num_layers, vis_encode_type)

                self.learning_rate = ModelUtils.create_learning_rate(
                    lr_schedule, lr, self.policy.global_step, int(max_step))
                self._create_losses(
                    self.policy.total_log_probs,
                    self.old_log_probs,
                    self.value_heads,
                    self.policy.entropy,
                    beta,
                    epsilon,
                    lr,
                    max_step,
                )
                self._create_ppo_optimizer_ops()

            self.update_dict.update({
                "value_loss": self.value_loss,
                "policy_loss": self.abs_policy_loss,
                "update_batch": self.update_batch,
                "learning_rate": self.learning_rate,
            })

            self.policy.initialize_or_load()
Example #28
    def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate PPO agent model for the environment.
        :param brain: Brain parameters used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers
        :param init_entcoef: Initial value for entropy coefficient. Set lower to learn faster,
            set higher to explore more.
        :return: a sub-class of PPOAgent tailored to the environment.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between encoded input and policy & value layers
        :param tau: Strength of soft-Q update.
        :param m_size: Size of brain memory.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope(""):
                super().__init__(policy, trainer_params)
                hyperparameters: SACSettings = cast(
                    SACSettings, trainer_params.hyperparameters)
                lr = hyperparameters.learning_rate
                lr_schedule = hyperparameters.learning_rate_schedule
                max_step = trainer_params.max_steps
                self.tau = hyperparameters.tau
                self.init_entcoef = hyperparameters.init_entcoef

                self.policy = policy
                self.act_size = policy.act_size
                policy_network_settings = policy.network_settings
                h_size = policy_network_settings.hidden_units
                num_layers = policy_network_settings.num_layers
                vis_encode_type = policy_network_settings.vis_encode_type

                self.tau = hyperparameters.tau
                self.burn_in_ratio = 0.0

                # Non-exposed SAC parameters
                self.discrete_target_entropy_scale = (
                    0.2  # Roughly equal to e-greedy 0.05
                )
                self.continuous_target_entropy_scale = 1.0

                stream_names = list(self.reward_signals.keys())
                # Used to reduce "survivor bonus" when using Curiosity or GAIL.
                self.gammas = [
                    _val.gamma
                    for _val in trainer_params.reward_signals.values()
                ]
                self.use_dones_in_backup = {
                    name: tf.Variable(1.0)
                    for name in stream_names
                }
                self.disable_use_dones = {
                    name: self.use_dones_in_backup[name].assign(0.0)
                    for name in stream_names
                }

                if num_layers < 1:
                    num_layers = 1

                self.target_init_op: List[tf.Tensor] = []
                self.target_update_op: List[tf.Tensor] = []
                self.update_batch_policy: Optional[tf.Operation] = None
                self.update_batch_value: Optional[tf.Operation] = None
                self.update_batch_entropy: Optional[tf.Operation] = None

                self.policy_network = SACPolicyNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 3x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                self.target_network = SACTargetNetwork(
                    policy=self.policy,
                    m_size=self.policy.m_size,  # 1x policy.m_size
                    h_size=h_size,
                    normalize=self.policy.normalize,
                    use_recurrent=self.policy.use_recurrent,
                    num_layers=num_layers,
                    stream_names=stream_names,
                    vis_encode_type=vis_encode_type,
                )
                # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
                self.m_size = 3 * self.policy.m_size
                self._create_inputs_and_outputs()
                self.learning_rate = ModelUtils.create_schedule(
                    lr_schedule,
                    lr,
                    self.policy.global_step,
                    int(max_step),
                    min_value=1e-10,
                )
                self._create_losses(
                    self.policy_network.q1_heads,
                    self.policy_network.q2_heads,
                    lr,
                    int(max_step),
                    stream_names,
                    discrete=not self.policy.use_continuous_act,
                )
                self._create_sac_optimizer_ops()

                self.selected_actions = self.policy.selected_actions  # For GAIL and other reward signals
                if self.policy.normalize:
                    target_update_norm = self.target_network.copy_normalization(
                        self.policy.running_mean,
                        self.policy.running_variance,
                        self.policy.normalization_steps,
                    )
                    # Update the normalization of the optimizer when the policy does.
                    self.policy.update_normalization_op = tf.group([
                        self.policy.update_normalization_op, target_update_norm
                    ])

        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
            "Losses/Q1 Loss": "q1_loss",
            "Losses/Q2 Loss": "q2_loss",
            "Policy/Entropy Coeff": "entropy_coef",
            "Policy/Learning Rate": "learning_rate",
        }

        self.update_dict = {
            "value_loss": self.total_value_loss,
            "policy_loss": self.policy_loss,
            "q1_loss": self.q1_loss,
            "q2_loss": self.q2_loss,
            "entropy_coef": self.ent_coef,
            "update_batch": self.update_batch_policy,
            "update_value": self.update_batch_value,
            "update_entropy": self.update_batch_entropy,
            "learning_rate": self.learning_rate,
        }
Example #29
def test_convert_version_string():
    result = TFPolicy._convert_version_string("200.300.100")
    assert result == (200, 300, 100)
    # Test dev versions
    result = TFPolicy._convert_version_string("200.300.100.dev0")
    assert result == (200, 300, 100)
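An illustrative re-implementation of the parsing behavior this test expects; the real TFPolicy._convert_version_string may differ in detail (for instance, in how it reports malformed strings).

# Illustrative version-string parser matching the test above (not the actual implementation).
from typing import Tuple

def convert_version_string(version_string: str) -> Tuple[int, ...]:
    parts = []
    for field in version_string.split("."):
        if not field.isdigit():
            break  # stop at suffixes such as "dev0"
        parts.append(int(field))
    return tuple(parts)

assert convert_version_string("200.300.100") == (200, 300, 100)
assert convert_version_string("200.300.100.dev0") == (200, 300, 100)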
Example #30
    def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
        """
        Takes a Policy and TrainerSettings and creates a PPO Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
        :param trainer_params: Trainer settings that specify the properties of the trainer.
        """
        # Create the graph here to give more granular control of the TF graph to the Optimizer.
        policy.create_tf_graph()

        with policy.graph.as_default():
            with tf.variable_scope("optimizer/"):
                super().__init__(policy, trainer_params)
                hyperparameters: PPOSettings = cast(
                    PPOSettings, trainer_params.hyperparameters)
                lr = float(hyperparameters.learning_rate)
                self._schedule = hyperparameters.learning_rate_schedule
                epsilon = float(hyperparameters.epsilon)
                beta = float(hyperparameters.beta)
                max_step = float(trainer_params.max_steps)

                policy_network_settings = policy.network_settings
                h_size = int(policy_network_settings.hidden_units)
                num_layers = policy_network_settings.num_layers
                vis_encode_type = policy_network_settings.vis_encode_type
                self.burn_in_ratio = 0.0

                self.stream_names = list(self.reward_signals.keys())

                self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
                self.grads = None
                self.update_batch: Optional[tf.Operation] = None

                self.stats_name_to_update_name = {
                    "Losses/Value Loss": "value_loss",
                    "Losses/Policy Loss": "policy_loss",
                    "Policy/Learning Rate": "learning_rate",
                    "Policy/Epsilon": "decay_epsilon",
                    "Policy/Beta": "decay_beta",
                }
                if self.policy.use_recurrent:
                    self.m_size = self.policy.m_size
                    self.memory_in = tf.placeholder(
                        shape=[None, self.m_size],
                        dtype=tf.float32,
                        name="recurrent_value_in",
                    )

                if num_layers < 1:
                    num_layers = 1
                if policy.use_continuous_act:
                    self._create_cc_critic(h_size, num_layers, vis_encode_type)
                else:
                    self._create_dc_critic(h_size, num_layers, vis_encode_type)

                self.learning_rate = ModelUtils.create_schedule(
                    self._schedule,
                    lr,
                    self.policy.global_step,
                    int(max_step),
                    min_value=1e-10,
                )
                self._create_losses(
                    self.policy.total_log_probs,
                    self.old_log_probs,
                    self.value_heads,
                    self.policy.entropy,
                    beta,
                    epsilon,
                    lr,
                    max_step,
                )
                self._create_ppo_optimizer_ops()

            self.update_dict.update({
                "value_loss": self.value_loss,
                "policy_loss": self.abs_policy_loss,
                "update_batch": self.update_batch,
                "learning_rate": self.learning_rate,
                "decay_epsilon": self.decay_epsilon,
                "decay_beta": self.decay_beta,
            })