Example #1
    def init(
        self,
        discrete_state,
        discrete_action,
        dim_state,
        dim_action,
        deterministic=False,
    ):

        # Discrete spaces: (num_states, scalar shape ()); continuous: (-1, (dim,)).
        self.num_states, self.dim_state = (
            (dim_state, ()) if discrete_state else (-1, (dim_state,))
        )
        self.num_actions, self.dim_action = (
            (dim_action, ()) if discrete_action else (-1, (dim_action,))
        )

        self.policy = NNPolicy(
            dim_state=self.dim_state,
            dim_action=self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
            layers=[32, 32],
            deterministic=deterministic,
        )
Example #2
def get_default_policy(environment, function_approximation):
    """Get default policy."""
    if function_approximation == "tabular":
        policy = TabularPolicy.default(environment)
    elif function_approximation == "linear":
        policy = NNPolicy.default(environment, layers=[200])
        freeze_hidden_layers(policy)
    else:
        policy = NNPolicy.default(environment)
    return policy
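A brief usage sketch of get_default_policy (not part of the original example). The import path, environment name, and approximation choices below are illustrative assumptions; environment is assumed to be an rllib-style environment wrapper.

# Hedged usage sketch -- GymEnvironment, "NChain-v0", and the strings passed as
# function_approximation are assumptions for illustration only.
from rllib.environment import GymEnvironment

env = GymEnvironment("NChain-v0")             # assumed small discrete environment
policy = get_default_policy(env, "tabular")   # -> TabularPolicy.default(env)
policy = get_default_policy(env, "linear")    # -> NNPolicy with frozen hidden layer
policy = get_default_policy(env, "mlp")       # any other string -> plain NNPolicy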
Example #3
    def default(cls, environment, *args, **kwargs):
        """See AbstractPolicy.default()."""
        true_policy = NNPolicy.default(environment, *args, **kwargs)
        hallucination_policy = NNPolicy.default(
            environment, dim_action=environment.dim_state, *args, **kwargs
        )
        hallucination_policy.action_scale = torch.ones(environment.dim_state)
        return cls(
            true_policy=true_policy,
            hallucination_policy=hallucination_policy,
            *args,
            **kwargs,
        )
Example #4
    def default(
        cls,
        environment,
        critic=None,
        policy=None,
        lr=3e-4,
        deterministic=True,
        exploration_noise=None,
        policy_update_frequency=2,
        clip_gradient_val=10,
        *args,
        **kwargs,
    ):
        """See `AbstractAgent.default'."""
        if critic is None:
            critic = NNQFunction.default(environment)
        if policy is None:
            policy = NNPolicy.default(environment, deterministic=deterministic)
        optimizer = Adam(chain(policy.parameters(), critic.parameters()), lr=lr)
        if exploration_noise is None:
            exploration_noise = OUNoise(dim=environment.dim_action)
        return super().default(
            environment=environment,
            critic=critic,
            policy=policy,
            optimizer=optimizer,
            exploration_noise=exploration_noise,
            policy_update_frequency=policy_update_frequency,
            clip_gradient_val=clip_gradient_val,
            *args,
            **kwargs,
        )
Example #5
    def default(cls, environment, *args, **kwargs):
        """See AbstractValueFunction.default."""
        q_function = NNQFunction.default(environment, *args, **kwargs)
        policy = NNPolicy.default(environment, *args, **kwargs)
        return super().default(environment, q_function=q_function, policy=policy)
Example #6
    def test_from_nn(self, discrete_state, dim_state, dim_action, batch_size):
        self.init(discrete_state, False, dim_state, dim_action)
        policy = NNPolicy.from_nn(
            HomoGaussianNN(
                self.policy.nn.kwargs["in_dim"],
                self.policy.nn.kwargs["out_dim"],
                layers=[20, 20],
                biased_head=False,
            ),
            self.dim_state,
            self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
        )

        state = random_tensor(discrete_state, dim_state, batch_size)
        action = tensor_to_distribution(policy(state)).sample()
        embeddings = policy.embeddings(state)

        assert action.shape == torch.Size(
            [batch_size, dim_action] if batch_size else [dim_action])
        assert embeddings.shape == torch.Size(
            [batch_size, 20] if batch_size else [20])
        assert action.dtype is torch.get_default_dtype()
        assert embeddings.dtype is torch.get_default_dtype()
Example #7
    def default(
        cls,
        environment,
        policy=None,
        critic=None,
        critic_lr=1e-3,
        actor_lr=3e-4,
        *args,
        **kwargs,
    ):
        """See `AbstractAgent.default'."""
        if policy is None:
            policy = NNPolicy.default(environment)
        if critic is None:
            critic = NNQFunction.default(environment)

        optimizer = Adam(
            [
                {"params": policy.parameters(), "lr": actor_lr},
                {"params": critic.parameters(), "lr": critic_lr},
            ]
        )

        return super().default(
            environment=environment,
            policy=policy,
            critic=critic,
            optimizer=optimizer,
            *args,
            **kwargs,
        )
Example #8
    def default(
        cls,
        environment,
        critic=None,
        policy=None,
        lr=3e-4,
        policy_update_frequency=2,
        clip_gradient_val=10,
        *args,
        **kwargs,
    ):
        """See `AbstractAgent.default'."""
        if critic is None:
            critic = NNEnsembleQFunction.default(environment)
        if policy is None:
            policy = NNPolicy.default(environment)

        optimizer = Adam(chain(policy.parameters(), critic.parameters()), lr=lr)

        return super().default(
            environment,
            critic=critic,
            policy=policy,
            optimizer=optimizer,
            policy_update_frequency=policy_update_frequency,
            clip_gradient_val=clip_gradient_val,
            *args,
            **kwargs,
        )
Example #9
    def test_goal(self, batch_size):
        goal = random_tensor(False, 3, None)
        policy = NNPolicy(dim_state=(4, ),
                          dim_action=(2, ),
                          layers=[32, 32],
                          goal=goal)
        state = random_tensor(False, 4, batch_size)
        pi = tensor_to_distribution(policy(state))
        action = pi.sample()
        assert action.shape == torch.Size(
            [batch_size, 2] if batch_size else [2])
        assert action.dtype is torch.get_default_dtype()

        other_goal = random_tensor(False, 3, None)
        policy.set_goal(other_goal)
        other_pi = tensor_to_distribution(policy(state))

        assert not torch.any(other_pi.mean == pi.mean)
Example #10
    def init(
        self,
        discrete_state,
        discrete_action,
        dim_state,
        dim_action,
        num_heads,
        num_samples=1,
        layers=None,
        biased_head=True,
    ):
        self.num_states, self.dim_state = (
            (dim_state, ()) if discrete_state else (-1, (dim_state,))
        )
        self.num_actions, self.dim_action = (
            (dim_action, ()) if discrete_action else (-1, (dim_action,))
        )

        layers = layers if layers is not None else [32, 32]

        if num_heads is None:
            self.q_function = NNQFunction(
                dim_state=self.dim_state,
                dim_action=self.dim_action,
                num_states=self.num_states,
                num_actions=self.num_actions,
                layers=layers,
                biased_head=biased_head,
            )
        else:
            self.q_function = NNEnsembleQFunction(
                dim_state=self.dim_state,
                dim_action=self.dim_action,
                num_states=self.num_states,
                num_actions=self.num_actions,
                num_heads=num_heads,
                layers=layers,
                biased_head=biased_head,
            )

        self.policy = NNPolicy(
            dim_state=self.dim_state,
            dim_action=self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
            layers=layers,
            biased_head=biased_head,
        )

        self.value_function = IntegrateQValueFunction(
            q_function=self.q_function,
            policy=self.policy,
            num_samples=num_samples)
Example #11
    def test_input_transform(self, batch_size):
        policy = NNPolicy(
            dim_state=(2,),
            dim_action=(4,),
            layers=[64, 64],
            input_transform=StateTransform(),
        )
        out = tensor_to_distribution(policy(random_tensor(False, 2, batch_size)))
        action = out.sample()
        assert action.shape == torch.Size([batch_size, 4] if batch_size else [4])
        assert action.dtype is torch.get_default_dtype()
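The StateTransform passed as input_transform above is defined elsewhere in the test suite and is not shown here. A minimal, purely illustrative stand-in (an assumption, not the library's actual transform) only needs to be a torch.nn.Module mapping raw states to the features the policy network consumes; this sketch keeps the output dimension equal to dim_state so the policy head is unchanged.

import torch.nn as nn

class StateTransform(nn.Module):
    """Illustrative input transform (assumption): rescale the raw state."""

    def forward(self, state):
        # Any state-to-feature mapping works; here we simply rescale.
        return 2.0 * state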
Example #12
    def default(cls, environment, policy=None, critic=None, lr=3e-4, *args, **kwargs):
        """See `AbstractAgent.default'."""
        if critic is None:
            critic = NNEnsembleQFunction.default(environment, jit_compile=False)
        if policy is None:
            policy = NNPolicy.default(environment, jit_compile=False)

        optimizer = Adam(chain(policy.parameters(), critic.parameters()), lr=lr)

        return super().default(
            environment,
            critic=critic,
            policy=policy,
            optimizer=optimizer,
            *args,
            **kwargs,
        )
Example #13
    def init(
        self,
        discrete_state,
        discrete_action,
        dim_state,
        dim_action,
        deterministic=False,
        goal=None,
    ):

        self.num_states, self.dim_state = (
            (dim_state, ()) if discrete_state else (-1, (dim_state,))
        )

        self.num_actions, self.dim_action = (
            (dim_action, ()) if discrete_action else (-1, (dim_action,))
        )

        if discrete_state:
            base_dim = 1
        else:
            base_dim = self.dim_state[0]

        if discrete_action:
            base_dim += 1
        else:
            base_dim += self.dim_action[0]

        base_policy = NNPolicy(
            dim_state=self.dim_state,
            dim_action=(base_dim,),
            num_states=self.num_states,
            num_actions=self.num_actions,
            layers=[32, 32],
            deterministic=deterministic,
            goal=goal,
        )

        self.policy = DerivedPolicy(base_policy, self.dim_action)
Example #14
def _get_nn_policy(dim_state,
                   dim_action,
                   params,
                   action_scale,
                   input_transform=None):
    if params.exploration == "optimistic":
        dim_action = (dim_action[0] + dim_state[0], )

    policy = NNPolicy(
        dim_state=dim_state,
        dim_action=dim_action,
        layers=params.policy_layers,
        biased_head=not params.policy_unbiased_head,
        non_linearity=params.policy_non_linearity,
        squashed_output=True,
        input_transform=input_transform,
        action_scale=action_scale,
        deterministic=params.policy_deterministic,
        tau=params.policy_tau,
    )
    params.update({"policy": policy.__class__.__name__})
    # policy = torch.jit.script(policy)
    return policy
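A hedged sketch of how _get_nn_policy might be called. The Params container and every value below are illustrative assumptions; the real code presumably passes its own attribute-style parameter object exposing the attributes read above plus an update() method.

import torch

class Params(dict):
    """Minimal attribute-style config stand-in (illustrative only)."""
    __getattr__ = dict.get

params = Params(
    exploration="greedy",          # anything but "optimistic" keeps dim_action unchanged
    policy_layers=[64, 64],
    policy_unbiased_head=False,
    policy_non_linearity="Tanh",
    policy_deterministic=False,
    policy_tau=5e-3,
)
policy = _get_nn_policy(
    dim_state=(3,),
    dim_action=(1,),
    params=params,
    action_scale=torch.ones(1),
)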
Example #15
    def default(cls,
                environment,
                critic=None,
                policy=None,
                lr=5e-3,
                *args,
                **kwargs):
        """See `AbstractAgent.default'."""
        if critic is None:
            critic = NNValueFunction.default(environment)
        if policy is None:
            policy = NNPolicy.default(environment)

        optimizer = Adam(critic.parameters(), lr=lr)

        return super().default(
            environment,
            policy=policy,
            critic=critic,
            optimizer=optimizer,
            *args,
            **kwargs,
        )
Example #16
    def default(cls,
                environment,
                policy=None,
                critic=None,
                lr=5e-4,
                *args,
                **kwargs):
        """See `AbstractAgent.default'."""
        if critic is None:
            critic = NNQFunction.default(environment)
        if policy is None:
            policy = NNPolicy.default(environment, layers=[100, 100])

        optimizer = Adam(chain(policy.parameters(), critic.parameters()),
                         lr=lr)

        return super().default(
            environment,
            policy=policy,
            critic=critic,
            optimizer=optimizer,
            *args,
            **kwargs,
        )
Example #17
class TestMLPPolicy(object):
    def init(
        self,
        discrete_state,
        discrete_action,
        dim_state,
        dim_action,
        deterministic=False,
    ):

        self.num_states, self.dim_state = (
            (dim_state, ()) if discrete_state else (-1, (dim_state,))
        )
        self.num_actions, self.dim_action = (
            (dim_action, ()) if discrete_action else (-1, (dim_action,))
        )

        self.policy = NNPolicy(
            dim_state=self.dim_state,
            dim_action=self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
            layers=[32, 32],
            deterministic=deterministic,
        )

    def test_property_values(self, discrete_state, discrete_action, dim_state,
                             dim_action):
        self.init(discrete_state, discrete_action, dim_state, dim_action)
        assert (self.num_states if self.num_states is not None else
                -1) == self.policy.num_states
        assert (self.num_actions if self.num_actions is not None else
                -1) == self.policy.num_actions
        assert discrete_state == self.policy.discrete_state
        assert discrete_action == self.policy.discrete_action

    def test_random_action(self, discrete_state, discrete_action, dim_state,
                           dim_action):
        self.init(discrete_state, discrete_action, dim_state, dim_action)

        distribution = tensor_to_distribution(self.policy.random())
        sample = distribution.sample()

        if distribution.has_enumerate_support:  # Discrete
            assert distribution.logits.shape == (self.num_actions, )
            assert sample.shape == ()
        else:  # Continuous
            assert distribution.mean.shape == self.dim_action
            assert sample.shape == (dim_action, )

    def test_forward(
        self,
        discrete_state,
        discrete_action,
        dim_state,
        dim_action,
        batch_size,
        deterministic,
    ):
        self.init(discrete_state, discrete_action, dim_state, dim_action,
                  deterministic)
        state = random_tensor(discrete_state, dim_state, batch_size)
        distribution = tensor_to_distribution(self.policy(state))
        sample = distribution.sample()

        if distribution.has_enumerate_support:  # Discrete
            assert isinstance(distribution, Categorical)
            if batch_size:
                assert distribution.logits.shape == (batch_size,
                                                     self.num_actions)
                assert sample.shape == (batch_size, )
            else:
                assert distribution.logits.shape == (self.num_actions, )
                assert sample.shape == ()
        else:  # Continuous
            if deterministic:
                assert isinstance(distribution, Delta)
            else:
                assert isinstance(distribution, MultivariateNormal)

            if batch_size:
                assert distribution.mean.shape == (
                    batch_size, ) + self.dim_action
                if not deterministic:
                    assert distribution.covariance_matrix.shape == (
                        batch_size,
                        self.dim_action[0],
                        self.dim_action[0],
                    )
                assert sample.shape == (batch_size, dim_action)
            else:
                assert distribution.mean.shape == self.dim_action
                if not deterministic:
                    assert distribution.covariance_matrix.shape == (
                        self.dim_action[0],
                        self.dim_action[0],
                    )
                assert sample.shape == (dim_action, )

    def test_embeddings(self, discrete_state, discrete_action, dim_state,
                        dim_action, batch_size):
        self.init(discrete_state, discrete_action, dim_state, dim_action)
        state = random_tensor(discrete_state, dim_state, batch_size)
        embeddings = self.policy.embeddings(state)

        assert embeddings.shape == torch.Size(
            [batch_size, 33] if batch_size else [33])
        assert embeddings.dtype is torch.get_default_dtype()

    def test_input_transform(self, batch_size):
        policy = NNPolicy(
            dim_state=(2, ),
            dim_action=(4, ),
            layers=[64, 64],
            input_transform=StateTransform(),
        )
        out = tensor_to_distribution(
            policy(random_tensor(False, 2, batch_size)))
        action = out.sample()
        assert action.shape == torch.Size(
            [batch_size, 4] if batch_size else [4])
        assert action.dtype is torch.get_default_dtype()

    def test_goal(self, batch_size):
        goal = random_tensor(False, 3, None)
        policy = NNPolicy(dim_state=(4, ),
                          dim_action=(2, ),
                          layers=[32, 32],
                          goal=goal)
        state = random_tensor(False, 4, batch_size)
        pi = tensor_to_distribution(policy(state))
        action = pi.sample()
        assert action.shape == torch.Size(
            [batch_size, 2] if batch_size else [2])
        assert action.dtype is torch.get_default_dtype()

        other_goal = random_tensor(False, 3, None)
        policy.set_goal(other_goal)
        other_pi = tensor_to_distribution(policy(state))

        assert not torch.any(other_pi.mean == pi.mean)

    def test_from_other(self, discrete_state, discrete_action, dim_state,
                        dim_action):
        self.init(discrete_state, discrete_action, dim_state, dim_action)
        _test_from_other(self.policy, NNPolicy)
        _test_from_other_with_copy(self.policy, NNPolicy)

    def test_from_nn(self, discrete_state, dim_state, dim_action, batch_size):
        self.init(discrete_state, False, dim_state, dim_action)
        policy = NNPolicy.from_nn(
            HomoGaussianNN(
                self.policy.nn.kwargs["in_dim"],
                self.policy.nn.kwargs["out_dim"],
                layers=[20, 20],
                biased_head=False,
            ),
            self.dim_state,
            self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
        )

        state = random_tensor(discrete_state, dim_state, batch_size)
        action = tensor_to_distribution(policy(state)).sample()
        embeddings = policy.embeddings(state)

        assert action.shape == torch.Size(
            [batch_size, dim_action] if batch_size else [dim_action])
        assert embeddings.shape == torch.Size(
            [batch_size, 20] if batch_size else [20])
        assert action.dtype is torch.get_default_dtype()
        assert embeddings.dtype is torch.get_default_dtype()
Example #18
    def default(
        cls,
        environment,
        gamma=0.99,
        exploration_steps=0,
        exploration_episodes=0,
        tensorboard=False,
        test=False,
    ):
        """See `AbstractAgent.default'."""
        model = EnsembleModel(
            dim_state=environment.dim_state,
            dim_action=environment.dim_action,
            num_heads=5,
            layers=[200, 200],
            biased_head=False,
            non_linearity="ReLU",
            input_transform=None,
            deterministic=False,
        )
        dynamical_model = TransformedModel(model, list())
        model_optimizer = Adam(dynamical_model.parameters(), lr=5e-4)

        reward_model = QuadraticReward(
            torch.eye(environment.dim_state[0]),
            torch.eye(environment.dim_action[0]),
            goal=environment.goal,
        )

        policy = NNPolicy(
            dim_state=environment.dim_state,
            dim_action=environment.dim_action,
            layers=[100, 100],
            biased_head=True,
            non_linearity="ReLU",
            squashed_output=True,
            input_transform=None,
            action_scale=environment.action_scale,
            goal=environment.goal,
            deterministic=False,
            tau=5e-3,
        )

        value_function = NNValueFunction(
            dim_state=environment.dim_state,
            layers=[200, 200],
            biased_head=True,
            non_linearity="ReLU",
            input_transform=None,
            tau=5e-3,
        )

        optimizer = Adam(chain(policy.parameters(),
                               value_function.parameters()),
                         lr=5e-3)

        return cls(
            model_optimizer,
            policy,
            value_function,
            dynamical_model,
            reward_model,
            optimizer,
            mpo_value_learning_criterion=loss.MSELoss,
            termination_model=None,
            initial_distribution=None,
            plan_horizon=1,
            plan_samples=8,
            plan_elites=1,
            max_memory=10000,
            model_learn_batch_size=64,
            model_learn_num_iter=4 if test else 30,
            bootstrap=True,
            mpo_epsilon=0.1,
            mpo_epsilon_mean=0.1,
            mpo_epsilon_var=0.0001,
            mpo_regularization=False,
            mpo_num_iter=5 if test else 200,
            mpo_gradient_steps=50,
            mpo_batch_size=None,
            mpo_num_action_samples=15,
            mpo_target_update_frequency=4,
            sim_num_steps=5 if test else 200,
            sim_initial_states_num_trajectories=8,
            sim_initial_dist_num_trajectories=0,
            sim_memory_num_trajectories=0,
            sim_max_memory=100000,
            sim_num_subsample=1,
            sim_refresh_interval=1,
            thompson_sampling=False,
            gamma=gamma,
            exploration_steps=exploration_steps,
            exploration_episodes=exploration_episodes,
            tensorboard=tensorboard,
            comment=environment.name,
        )