Example #1
        def __init__(
            self,
            stream_names: List[str],
            sensor_specs: List[SensorSpec],
            network_settings: NetworkSettings,
            action_spec: ActionSpec,
        ):
            super().__init__()
            num_value_outs = max(sum(action_spec.discrete_branches), 1)
            num_action_ins = int(action_spec.continuous_size)

            self.q1_network = ValueNetwork(
                stream_names,
                sensor_specs,
                network_settings,
                num_action_ins,
                num_value_outs,
            )
            self.q2_network = ValueNetwork(
                stream_names,
                sensor_specs,
                network_settings,
                num_action_ins,
                num_value_outs,
            )
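The two ValueNetwork instances above form the clipped double-Q pair used by SAC: each takes the continuous action as an extra input (num_action_ins) and emits one output per discrete branch, or a single value for purely continuous actions. Below is a hypothetical usage sketch, assuming ValueNetwork is callable with a list of observation tensors plus an optional actions tensor and returns a dict of per-stream values and memories, as the test in Example #5 suggests; q_net stands in for an instance of the module above.

import torch

# Hypothetical sketch (q_net, observation and action sizes are assumptions, not from the source)
obs = [torch.ones((1, 8))]        # batch of 1, a single 8-dim vector observation
cont_act = torch.zeros((1, 2))    # continuous actions routed into both Q-heads
q1_out, _ = q_net.q1_network(obs, actions=cont_act)  # `actions` kwarg is assumed
q2_out, _ = q_net.q2_network(obs, actions=cont_act)
for stream, q1 in q1_out.items():
    min_q = torch.min(q1, q2_out[stream])  # clipped double-Q: back up on the minimum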
Example #2
        def __init__(
            self,
            stream_names: List[str],
            observation_shapes: List[Tuple[int, ...]],
            network_settings: NetworkSettings,
            action_spec: ActionSpec,
        ):
            super().__init__()
            self.action_spec = action_spec
            if self.action_spec.is_continuous():
                self.act_size = self.action_spec.continuous_size
                num_value_outs = 1
                num_action_ins = self.act_size

            else:
                self.act_size = self.action_spec.discrete_branches
                num_value_outs = sum(self.act_size)
                num_action_ins = 0
            self.q1_network = ValueNetwork(
                stream_names,
                observation_shapes,
                network_settings,
                num_action_ins,
                num_value_outs,
            )
            self.q2_network = ValueNetwork(
                stream_names,
                observation_shapes,
                network_settings,
                num_action_ins,
                num_value_outs,
            )
Example #3
        def __init__(
            self,
            stream_names: List[str],
            observation_shapes: List[Tuple[int, ...]],
            network_settings: NetworkSettings,
            action_spec: ActionSpec,
        ):
            super().__init__()
            num_value_outs = max(sum(action_spec.discrete_branches), 1)
            num_action_ins = int(action_spec.continuous_size)

            self.q1_network = ValueNetwork(
                stream_names,
                observation_shapes,
                network_settings,
                num_action_ins,
                num_value_outs,
            )
            self.q2_network = ValueNetwork(
                stream_names,
                observation_shapes,
                network_settings,
                num_action_ins,
                num_value_outs,
            )
Example #4
    def __init__(
        self,
        stream_names: List[str],
        observation_shapes: List[Tuple[int, ...]],
        network_settings: NetworkSettings,
        act_type: ActionType,
        act_size: List[int],
    ):
        super().__init__()
        if act_type == ActionType.CONTINUOUS:
            num_value_outs = 1
            num_action_ins = sum(act_size)
        else:
            num_value_outs = sum(act_size)
            num_action_ins = 0
        self.q1_network = ValueNetwork(
            stream_names,
            observation_shapes,
            network_settings,
            num_action_ins,
            num_value_outs,
        )
        self.q2_network = ValueNetwork(
            stream_names,
            observation_shapes,
            network_settings,
            num_action_ins,
            num_value_outs,
        )
Example #5
def test_valuenetwork():
    torch.manual_seed(0)
    obs_size = 4
    num_outputs = 2
    network_settings = NetworkSettings()
    obs_spec = create_observation_specs_with_shapes([(obs_size, )])

    stream_names = [f"stream_name{n}" for n in range(4)]
    value_net = ValueNetwork(stream_names,
                             obs_spec,
                             network_settings,
                             outputs_per_stream=num_outputs)
    optimizer = torch.optim.Adam(value_net.parameters(), lr=3e-3)

    for _ in range(50):
        sample_obs = torch.ones((1, obs_size))
        values, _ = value_net([sample_obs])
        loss = 0
        for s_name in stream_names:
            assert values[s_name].shape == (1, num_outputs)
            # Try to force output to 1
            loss += torch.nn.functional.mse_loss(values[s_name],
                                                 torch.ones((1, num_outputs)))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for value in values.values():
        for _out in value.tolist():
            assert _out[0] == pytest.approx(1.0, abs=0.1)
Example #6
    def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings):
        """
        Takes a Policy and a TrainerSettings object and creates an Optimizer around the policy.
        The PPO optimizer has a value estimator and a loss function.
        :param policy: A TorchPolicy object that will be updated by this PPO Optimizer.
        :param trainer_settings: Trainer settings object that specifies the
        properties of the trainer.
        """
        # Create the critic here to give the Optimizer more granular control over it.

        super().__init__(policy, trainer_settings)
        reward_signal_configs = trainer_settings.reward_signals
        reward_signal_names = [
            key.value for key, _ in reward_signal_configs.items()
        ]

        if policy.shared_critic:
            self._critic = policy.actor
        else:
            self._critic = ValueNetwork(
                reward_signal_names,
                policy.behavior_spec.observation_specs,
                network_settings=trainer_settings.network_settings,
            )
            self._critic.to(default_device())

        params = list(self.policy.actor.parameters()) + list(
            self._critic.parameters())
        self.hyperparameters: PPOSettings = cast(
            PPOSettings, trainer_settings.hyperparameters)
        self.decay_learning_rate = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.learning_rate,
            1e-10,
            self.trainer_settings.max_steps,
        )
        self.decay_epsilon = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.epsilon,
            0.1,
            self.trainer_settings.max_steps,
        )
        self.decay_beta = ModelUtils.DecayedValue(
            self.hyperparameters.learning_rate_schedule,
            self.hyperparameters.beta,
            1e-5,
            self.trainer_settings.max_steps,
        )

        self.optimizer = torch.optim.Adam(
            params, lr=self.trainer_settings.hyperparameters.learning_rate)
        self.stats_name_to_update_name = {
            "Losses/Value Loss": "value_loss",
            "Losses/Policy Loss": "policy_loss",
        }

        self.stream_names = list(self.reward_signals.keys())
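The three DecayedValue helpers above schedule the learning rate, the PPO clip epsilon, and the entropy coefficient beta from their initial values down to the listed floors over max_steps. A hypothetical sketch of how they might be queried once per update, assuming DecayedValue exposes a get_value(step) method and setting the learning rate through PyTorch's standard param_groups:

# Hypothetical sketch: `opt` stands in for an instance of the optimizer class above.
step = 10_000
decay_lr = opt.decay_learning_rate.get_value(step)    # assumed get_value(step) API
decay_eps = opt.decay_epsilon.get_value(step)         # would feed the clipped surrogate
decay_beta = opt.decay_beta.get_value(step)           # would scale the entropy bonus
for group in opt.optimizer.param_groups:              # standard PyTorch way to set the LR
    group["lr"] = decay_lr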
Example #7
def test_actor_critic(lstm, shared):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None,
        normalize=True)
    obs_spec = create_observation_specs_with_shapes([(obs_size, )])
    act_size = 2
    mask = torch.ones([1, act_size * 2])
    stream_names = [f"stream_name{n}" for n in range(4)]
    action_spec = ActionSpec(act_size,
                             tuple(act_size for _ in range(act_size)))
    if shared:
        actor = critic = SharedActorCritic(obs_spec, network_settings,
                                           action_spec, stream_names,
                                           network_settings)
    else:
        actor = SimpleActor(obs_spec, network_settings, action_spec)
        critic = ValueNetwork(stream_names, obs_spec, network_settings)
    if lstm:
        sample_obs = torch.ones(
            (1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size))
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None; the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = critic.critic_pass([sample_obs],
                                                 memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1, )

    # Test get_action_and_stats
    action, log_probs, entropies, mem_out = actor.get_action_and_stats(
        [sample_obs], memories=memories, masks=mask)
    if lstm:
        assert action.continuous_tensor.shape == (64, 2)
    else:
        assert action.continuous_tensor.shape == (1, 2)

    assert len(action.discrete_list) == 2
    for _disc in action.discrete_list:
        if lstm:
            assert _disc.shape == (64, 1)
        else:
            assert _disc.shape == (1, 1)

    if mem_out is not None:
        assert mem_out.shape == memories.shape
Example #8
    def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
        super().__init__(policy, trainer_params)
        hyperparameters: SACSettings = cast(SACSettings,
                                            trainer_params.hyperparameters)
        self.tau = hyperparameters.tau
        self.init_entcoef = hyperparameters.init_entcoef

        self.policy = policy
        policy_network_settings = policy.network_settings

        self.burn_in_ratio = 0.0

        # Non-exposed SAC parameters
        self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
        self.continuous_target_entropy_scale = 1.0

        self.stream_names = list(self.reward_signals.keys())
        # Used to reduce the "survivor bonus" when using Curiosity or GAIL.
        self.gammas = [
            _val.gamma for _val in trainer_params.reward_signals.values()
        ]
        self.use_dones_in_backup = {
            name: int(not self.reward_signals[name].ignore_done)
            for name in self.stream_names
        }
        self._action_spec = self.policy.behavior_spec.action_spec

        self.value_network = TorchSACOptimizer.PolicyValueNetwork(
            self.stream_names,
            self.policy.behavior_spec.sensor_specs,
            policy_network_settings,
            self._action_spec,
        )

        self.target_network = ValueNetwork(
            self.stream_names,
            self.policy.behavior_spec.sensor_specs,
            policy_network_settings,
        )
        ModelUtils.soft_update(self.policy.actor_critic.critic,
                               self.target_network, 1.0)

        # We create one entropy coefficient per action, whether discrete or continuous.
        _disc_log_ent_coef = torch.nn.Parameter(
            torch.log(
                torch.as_tensor([self.init_entcoef] *
                                len(self._action_spec.discrete_branches))),
            requires_grad=True,
        )
        _cont_log_ent_coef = torch.nn.Parameter(torch.log(
            torch.as_tensor([self.init_entcoef])),
                                                requires_grad=True)
        self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
            discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef)
        _cont_target = (
            -1 * self.continuous_target_entropy_scale *
            np.prod(self._action_spec.continuous_size).astype(np.float32))
        _disc_target = [
            self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
            for i in self._action_spec.discrete_branches
        ]
        self.target_entropy = TorchSACOptimizer.TargetEntropy(
            continuous=_cont_target, discrete=_disc_target)
        policy_params = list(
            self.policy.actor_critic.network_body.parameters()) + list(
                self.policy.actor_critic.action_model.parameters())
        value_params = list(self.value_network.parameters()) + list(
            self.policy.actor_critic.critic.parameters())

        logger.debug("value_vars")
        for param in value_params:
            logger.debug(param.shape)
        logger.debug("policy_vars")
        for param in policy_params:
            logger.debug(param.shape)

        self.decay_learning_rate = ModelUtils.DecayedValue(
            hyperparameters.learning_rate_schedule,
            hyperparameters.learning_rate,
            1e-10,
            self.trainer_settings.max_steps,
        )
        self.policy_optimizer = torch.optim.Adam(
            policy_params, lr=hyperparameters.learning_rate)
        self.value_optimizer = torch.optim.Adam(
            value_params, lr=hyperparameters.learning_rate)
        self.entropy_optimizer = torch.optim.Adam(
            self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate)
        self._move_to_device(default_device())
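The soft_update call above with a factor of 1.0 hard-copies the critic's weights into the target network at construction; during training the same operation is repeated with the small factor tau. A minimal, generic sketch of what such a Polyak soft update computes, assuming both networks are plain torch.nn.Module instances (an illustration, not ml-agents' own implementation):

import torch

def polyak_update(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    with torch.no_grad():
        for src, tgt in zip(source.parameters(), target.parameters()):
            tgt.data.mul_(1.0 - tau)
            tgt.data.add_(tau * src.data)

# e.g. polyak_update(critic, target_network, tau=0.005) after each gradient step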
Example #9
    def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
        super().__init__(policy, trainer_params)
        hyperparameters: SACSettings = cast(SACSettings, trainer_params.hyperparameters)
        self.tau = hyperparameters.tau
        self.init_entcoef = hyperparameters.init_entcoef

        self.policy = policy
        self.act_size = policy.act_size
        policy_network_settings = policy.network_settings

        self.burn_in_ratio = 0.0

        # Non-exposed SAC parameters
        self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
        self.continuous_target_entropy_scale = 1.0

        self.stream_names = list(self.reward_signals.keys())
        # Used to reduce the "survivor bonus" when using Curiosity or GAIL.
        self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()]
        self.use_dones_in_backup = {
            name: int(not self.reward_signals[name].ignore_done)
            for name in self.stream_names
        }

        # Critics should have 1/2 of the memory of the policy
        critic_memory = policy_network_settings.memory
        if critic_memory is not None:
            critic_memory = attr.evolve(
                critic_memory, memory_size=critic_memory.memory_size // 2
            )
        value_network_settings = attr.evolve(
            policy_network_settings, memory=critic_memory
        )

        self.value_network = TorchSACOptimizer.PolicyValueNetwork(
            self.stream_names,
            self.policy.behavior_spec.observation_shapes,
            value_network_settings,
            self.policy.behavior_spec.action_type,
            self.act_size,
        )

        self.target_network = ValueNetwork(
            self.stream_names,
            self.policy.behavior_spec.observation_shapes,
            value_network_settings,
        )
        self.soft_update(self.policy.actor_critic.critic, self.target_network, 1.0)

        self._log_ent_coef = torch.nn.Parameter(
            torch.log(torch.as_tensor([self.init_entcoef] * len(self.act_size))),
            requires_grad=True,
        )
        if self.policy.use_continuous_act:
            self.target_entropy = torch.as_tensor(
                -1
                * self.continuous_target_entropy_scale
                * np.prod(self.act_size[0]).astype(np.float32)
            )
        else:
            self.target_entropy = [
                self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
                for i in self.act_size
            ]

        policy_params = list(self.policy.actor_critic.network_body.parameters()) + list(
            self.policy.actor_critic.distribution.parameters()
        )
        value_params = list(self.value_network.parameters()) + list(
            self.policy.actor_critic.critic.parameters()
        )

        logger.debug("value_vars")
        for param in value_params:
            logger.debug(param.shape)
        logger.debug("policy_vars")
        for param in policy_params:
            logger.debug(param.shape)

        self.decay_learning_rate = ModelUtils.DecayedValue(
            hyperparameters.learning_rate_schedule,
            hyperparameters.learning_rate,
            1e-10,
            self.trainer_settings.max_steps,
        )
        self.policy_optimizer = torch.optim.Adam(
            policy_params, lr=hyperparameters.learning_rate
        )
        self.value_optimizer = torch.optim.Adam(
            value_params, lr=hyperparameters.learning_rate
        )
        self.entropy_optimizer = torch.optim.Adam(
            [self._log_ent_coef], lr=hyperparameters.learning_rate
        )
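This older variant also halves the critic's recurrent memory relative to the policy by rebuilding the settings with attr.evolve. A small standalone illustration of that pattern, using a hypothetical attrs class in place of NetworkSettings.MemorySettings:

import attr

@attr.s(auto_attribs=True)
class MemorySettings:  # hypothetical stand-in for NetworkSettings.MemorySettings
    memory_size: int = 128
    sequence_length: int = 64

policy_memory = MemorySettings()
# attr.evolve returns a copy with only the named fields changed
critic_memory = attr.evolve(policy_memory, memory_size=policy_memory.memory_size // 2)
assert critic_memory.memory_size == 64 and critic_memory.sequence_length == 64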