Code example #1
    def test_output_properties(self, out_dim, num_heads, batch_size,
                               deterministic):
        in_dim = (4, )
        net = Ensemble(in_dim,
                       out_dim,
                       num_heads=num_heads,
                       deterministic=deterministic)

        if batch_size is None:
            t = torch.randn(in_dim)
        else:
            t = torch.randn((batch_size, 2) + in_dim)

        o = tensor_to_distribution(net(t))
        assert isinstance(o, torch.distributions.MultivariateNormal)
        assert o.has_rsample
        assert not o.has_enumerate_support
        assert o.batch_shape == torch.Size((
            batch_size, 2) if batch_size is not None else ())

        net.set_prediction_strategy("set_head")
        net.set_head(0)
        o = tensor_to_distribution(net(t))
        if deterministic:
            assert isinstance(o, Delta)
        else:
            assert isinstance(o, torch.distributions.MultivariateNormal)
        assert o.batch_shape == torch.Size((
            batch_size, 2) if batch_size is not None else ())

        assert o.has_rsample
        assert not o.has_enumerate_support
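
Note on the shared convention: every example in this list feeds a network output into tensor_to_distribution. The sketch below captures that convention as it appears from these usages only (an assumption, not the rllib implementation; the real function also covers deterministic Delta heads and the extra dist_params keyword arguments seen in later examples).

import torch
from torch.distributions import Categorical, MultivariateNormal


def tensor_to_distribution_sketch(args):
    """Hypothetical sketch: a plain tensor is read as Categorical logits,
    a (mean, cholesky) tuple as a MultivariateNormal."""
    if torch.is_tensor(args):
        return Categorical(logits=args)
    mean, chol = args
    return MultivariateNormal(mean, scale_tril=chol)


# Usage mirroring the assertions above (2-d Gaussian head).
pi = tensor_to_distribution_sketch((torch.zeros(2), torch.eye(2)))
assert pi.has_rsample and not pi.has_enumerate_support
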
Code example #2
    def get_log_p_and_ope_weight(self, state, action):
        """Get log_p of a state-action and the off-pol weight w.r.t. the old policy."""
        pi = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
        pi_o = tensor_to_distribution(self.old_policy(state), **self.policy.dist_params)
        _, log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)
        _, log_p_old = get_entropy_and_log_p(pi_o, action, self.policy.action_scale)
        ratio = torch.exp(log_p - log_p_old)
        return log_p, ratio
Code example #3
    def test_output_shape(self, net, in_dim, out_dim, batch_size):
        net = torch.jit.script(net(in_dim, out_dim))
        if batch_size is None:
            t = torch.randn(in_dim)
            o = tensor_to_distribution(net(t)).sample()
            assert o.shape == torch.Size(out_dim)
        else:
            t = torch.randn((batch_size, ) + in_dim)
            o = tensor_to_distribution(net(t)).sample()
            assert o.shape == torch.Size((batch_size, ) + out_dim)
Code example #4
File: svg.py (project: sebimarkgraf/rllib)
    def actor_loss(self, observation):
        """Use the model to compute the gradient loss."""
        state, action = observation.state, observation.action
        next_state, done = observation.next_state, observation.done

        # Infer eta.
        action_mean, action_chol = self.policy(state)
        with torch.no_grad():
            eta = torch.inverse(action_chol) @ (
                (action - action_mean).unsqueeze(-1))

        # Compute entropy and log_probability.
        pi = tensor_to_distribution((action_mean, action_chol))
        _, log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)

        # Compute off-policy weight.
        with torch.no_grad():
            weight = self.get_ope_weight(state, action,
                                         observation.log_prob_action)

        with DisableGradient(
                self.dynamical_model,
                self.reward_model,
                self.termination_model,
                self.critic_target,
        ):
            # Compute re-parameterized policy sample.
            action = (action_mean + (action_chol @ eta).squeeze(-1)).clamp(
                -1, 1)

            # Infer xi.
            ns_mean, ns_chol = self.dynamical_model(state, action)
            with torch.no_grad():
                xi = torch.inverse(ns_chol) @ (
                    (next_state - ns_mean).unsqueeze(-1))

            # Compute re-parameterized next-state sample.
            ns = ns_mean + (ns_chol @ xi).squeeze(-1)

            # Compute reward.
            r = tensor_to_distribution(self.reward_model(state, action,
                                                         ns)).rsample()
            r = r[..., 0]

            next_v = self.value_function(ns)
            if isinstance(self.critic, NNEnsembleValueFunction) or isinstance(
                    self.critic, NNEnsembleQFunction):
                next_v = next_v[..., 0]

            v = r + self.gamma * next_v * (1 - done)

        return Loss(policy_loss=-(weight * v)).reduce(self.criterion.reduction)
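
The "Infer eta" and "Infer xi" steps above are a re-parameterization trick: the noise that produced a stored sample is recovered under no_grad, and the sample is then rebuilt as a differentiable function of the current distribution parameters. A minimal, self-contained sketch of that identity (plain torch.distributions, not rllib code):

import torch
from torch.distributions import MultivariateNormal

mean = torch.zeros(2, requires_grad=True)
chol = torch.eye(2)
action = MultivariateNormal(mean, scale_tril=chol).sample()  # stored, non-differentiable draw

with torch.no_grad():
    eta = torch.inverse(chol) @ (action - mean).unsqueeze(-1)  # recover the noise behind the draw

action_reparam = mean + (chol @ eta).squeeze(-1)  # same value, now a differentiable function of mean
assert torch.allclose(action, action_reparam)
assert action_reparam.requires_grad and not action.requires_grad
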
Code example #5
    def test_goal(self, batch_size):
        goal = random_tensor(False, 3, None)
        self.init(False, False, 4, 2, goal=goal)
        state = random_tensor(False, 4, batch_size)
        pi = tensor_to_distribution(self.policy(state))
        action = pi.sample()
        assert action.shape == torch.Size([batch_size, 2] if batch_size else [2])
        assert action.dtype is torch.get_default_dtype()

        other_goal = random_tensor(False, 3, None)
        self.policy.set_goal(other_goal)
        other_pi = tensor_to_distribution(self.policy(state))

        assert not torch.any(other_pi.mean == pi.mean)
Code example #6
    def test_output_shape(self, out_dim, batch_size, num_heads, deterministic):
        in_dim = (4, )
        net = Ensemble(in_dim,
                       out_dim,
                       num_heads=num_heads,
                       deterministic=deterministic)
        if batch_size is None:
            t = torch.randn(in_dim)
            o = tensor_to_distribution(net(t)).sample()
            assert o.shape == torch.Size(out_dim)
        else:
            t = torch.randn((batch_size, ) + in_dim)
            o = tensor_to_distribution(net(t)).sample()
            assert o.shape == torch.Size((batch_size, ) + out_dim)
Code example #7
    def test_class_method(self, net, batch_size, out_dim, num_heads):
        layers = [64, 64]
        in_dim = (4, )
        try:
            n1 = net(in_dim, out_dim, layers=layers, num_heads=num_heads)
        except TypeError:
            base_net = net(in_dim, out_dim, layers=layers)
            n1 = Ensemble.from_feedforward(base_net, num_heads=num_heads)

            if isinstance(base_net, DeterministicNN):
                assert n1.deterministic
            else:
                assert not n1.deterministic
        _test_from_other(n1, Ensemble)
        _test_from_other_with_copy(n1, Ensemble)

        # Test layers
        layers = layers or list()

        # Check nn.parameters (+1: head)
        assert 2 * (len(layers) + 2) == len([*n1.parameters()])

        # Check shapes
        layers.append(out_dim[0] * num_heads)
        layers.append(out_dim[0] * num_heads)
        i = 0
        for name, param in n1.named_parameters():
            if name.startswith("_scale"):
                assert param.shape[0] == out_dim[0] * num_heads  # * out_dim
            else:
                assert param.shape[0] == layers[i // 2]
                i += 1

        # Check output
        if batch_size is None:
            t = torch.randn(in_dim)
            o = tensor_to_distribution(n1(t))
            assert o.sample().shape == torch.Size(out_dim)
            assert o.batch_shape == torch.Size([])

        else:
            t = torch.randn((batch_size, 2) + in_dim)
            o = tensor_to_distribution(n1(t))
            assert o.sample().shape == torch.Size((batch_size, 2) + out_dim)
            assert o.batch_shape == torch.Size((batch_size, 2))

        assert isinstance(o, torch.distributions.MultivariateNormal)
        assert o.has_rsample
        assert not o.has_enumerate_support
Code example #8
    def act(self, state):
        """Ask the agent for an action to interact with the environment."""
        if self.total_steps < self.exploration_steps or (
            self.total_episodes < self.exploration_episodes
        ):
            policy = self.policy.random()
        else:
            if not isinstance(state, torch.Tensor):
                state = torch.tensor(
                    state, dtype=torch.get_default_dtype(), device=self.device
                )
            policy = self.policy(state)

        self.pi = tensor_to_distribution(policy, **self.policy.dist_params)
        if self.training:
            action = self.pi.sample()
        elif self.pi.has_enumerate_support:
            action = torch.argmax(self.pi.probs)
        else:
            try:
                action = self.pi.mean
            except NotImplementedError:
                action = self.pi.sample((100,)).mean(dim=0)

        if not self.policy.discrete_action:
            action = action.clamp(-1.0, 1.0)
            action = self.policy.action_scale * action
        return action.detach().to("cpu").numpy()
Code example #9
    def get_ope_weight(self, state, action, log_prob_action):
        """Get off-policy weight of a given transition."""
        pi = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
        _, log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)

        weight = off_policy_weight(log_p, log_prob_action, full_trajectory=False)
        return weight
Code example #10
def mdp2mrp(transitions, rewards, policy, terminal_states=None):
    """Transform MDP and Policy to an MRP.

    Parameters
    ----------
    transitions: Tensor.
        Transition kernel of shape [num_states, num_actions, num_states].
    rewards: Tensor.
        Reward table of shape [num_states, num_actions].
    policy: AbstractPolicy.
    terminal_states: list of int, optional.
        States that are turned into absorbing, zero-reward states.

    Returns
    -------
    mrp_kernel: Tensor.
        MRP transition kernel of shape [num_states, 1, num_states].
    mrp_reward: Tensor.
        MRP reward of shape [num_states, 1].
    """
    num_states, num_actions = rewards.shape
    mrp_kernel = torch.zeros((num_states, 1, num_states))
    mrp_reward = torch.zeros((num_states, 1))

    if terminal_states is None:
        terminal_states = []

    for state in range(num_states):
        if state in terminal_states:
            mrp_kernel[state, 0, state] = 1
            mrp_reward[state] = 0
            continue

        state = torch.tensor(state).long()
        policy_ = tensor_to_distribution(policy(state), **policy.dist_params)

        for action, p_action in enumerate(policy_.probs):
            for next_state, p_next_state in enumerate(transitions[state, action]):
                mrp_reward[state, 0] += p_action * p_next_state * rewards[state, action]
                mrp_kernel[state, 0, next_state] += p_action * p_next_state

    return mrp_kernel, mrp_reward
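
A hypothetical usage on a 2-state, 2-action MDP. The UniformPolicy stub below only mimics the pieces of the AbstractPolicy interface that mdp2mrp touches (a call returning per-state action logits plus a dist_params dict); it is not rllib API.

import torch


class UniformPolicy:
    """Stub policy: equal logits for both actions in every state."""

    dist_params = {}

    def __call__(self, state):
        return torch.zeros(2)


transitions = torch.tensor([[[0.9, 0.1], [0.1, 0.9]],   # P(s' | s=0, a=0/1)
                            [[0.5, 0.5], [0.0, 1.0]]])  # P(s' | s=1, a=0/1)
rewards = torch.tensor([[1.0, 0.0], [0.0, 1.0]])        # r(s, a)

mrp_kernel, mrp_reward = mdp2mrp(transitions, rewards, UniformPolicy(), terminal_states=[1])
assert mrp_kernel.shape == (2, 1, 2) and mrp_reward.shape == (2, 1)
assert mrp_kernel[1, 0, 1] == 1.0 and mrp_reward[1, 0] == 0.0  # terminal state stays absorbing
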
Code example #11
    def test_forward(self, dim_state, dim_action, batch_size, deterministic):
        self.init(False, False, dim_state, dim_action, deterministic)
        state = random_tensor(False, dim_state, batch_size)
        distribution = tensor_to_distribution(self.policy(state))
        sample = distribution.sample()

        if deterministic:
            assert isinstance(distribution, Delta)
        else:
            assert isinstance(distribution, MultivariateNormal)

        if batch_size:
            assert distribution.mean.shape == (batch_size,) + self.dim_action
            if not deterministic:
                assert distribution.covariance_matrix.shape == (
                    batch_size,
                    self.dim_action[0],
                    self.dim_action[0],
                )
            assert sample.shape == (batch_size, dim_action)
        else:
            assert distribution.mean.shape == self.dim_action
            if not deterministic:
                assert distribution.covariance_matrix.shape == (
                    self.dim_action[0],
                    self.dim_action[0],
                )
            assert sample.shape == torch.Size((dim_action,))
Code example #12
    def test_call(self, discrete_state, discrete_action, dim_state, dim_action,
                  batch_size):
        self.init(discrete_state, discrete_action, dim_state, dim_action)
        state = random_tensor(discrete_state, dim_state, batch_size)
        distribution = tensor_to_distribution(self.policy(state))
        sample = distribution.sample()

        if distribution.has_enumerate_support:  # Discrete
            assert isinstance(distribution, Categorical)
            if batch_size:
                assert distribution.logits.shape == (batch_size,
                                                     self.num_actions)
                assert sample.shape == (batch_size, )
            else:
                assert distribution.logits.shape == (self.num_actions, )
                assert sample.shape == ()
        else:  # Continuous
            assert isinstance(distribution, MultivariateNormal)
            if batch_size:
                assert distribution.mean.shape == (
                    batch_size, ) + self.dim_action
                assert distribution.covariance_matrix.shape == (
                    batch_size,
                    self.dim_action[0],
                    self.dim_action[0],
                )
                assert sample.shape == (batch_size, dim_action)
            else:
                assert distribution.mean.shape == self.dim_action
                assert distribution.covariance_matrix.shape == (
                    self.dim_action[0],
                    self.dim_action[0],
                )
                assert sample.shape == (dim_action, )
Code example #13
File: test_nn_policy.py (project: sebimarkgraf/rllib)
    def test_from_nn(self, discrete_state, dim_state, dim_action, batch_size):
        self.init(discrete_state, False, dim_state, dim_action)
        policy = NNPolicy.from_nn(
            HomoGaussianNN(
                self.policy.nn.kwargs["in_dim"],
                self.policy.nn.kwargs["out_dim"],
                layers=[20, 20],
                biased_head=False,
            ),
            self.dim_state,
            self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
        )

        state = random_tensor(discrete_state, dim_state, batch_size)
        action = tensor_to_distribution(policy(state)).sample()
        embeddings = policy.embeddings(state)

        assert action.shape == torch.Size(
            [batch_size, dim_action] if batch_size else [dim_action])
        assert embeddings.shape == torch.Size(
            [batch_size, 20] if batch_size else [20])
        assert action.dtype is torch.get_default_dtype()
        assert embeddings.dtype is torch.get_default_dtype()
Code example #14
def test_discrete(t_start, q_function):
    policy = SoftMax(q_function, t_start)
    for _ in range(100):
        state = torch.randint(4, ())
        logits = q_function(state)
        probs = torch.softmax(logits / t_start, dim=0)
        torch.testing.assert_allclose(
            tensor_to_distribution(policy(state)).probs, probs)
Code example #15
    def test_random_action(self, dim_state, dim_action):
        self.init(False, False, dim_state, dim_action)

        distribution = tensor_to_distribution(self.policy.random())
        sample = distribution.sample()

        assert distribution.mean.shape == self.dim_action
        assert sample.shape == (dim_action,)
Code example #16
File: mpo.py (project: sebimarkgraf/rllib)
    def actor_loss(self, observation):
        """Compute actor loss."""
        state = repeat_along_dimension(
            observation.state, number=self.num_samples, dim=0
        )
        pi = tensor_to_distribution(self.old_policy(state), **self.policy.dist_params)
        action = self.policy.action_scale * pi.sample().clamp(-1.0, 1.0)
        return self.compute_mpo_loss(state, action)
Code example #17
File: model_system.py (project: sebimarkgraf/rllib)
    def step(self, action):
        """See `AbstractSystem.step'."""
        if not isinstance(action, torch.Tensor):
            action = torch.tensor(action, dtype=torch.get_default_dtype())
        state = torch.tensor(self.state, dtype=torch.get_default_dtype())
        self.state = (tensor_to_distribution(
            self.dynamical_model(state, action)).sample().numpy())
        return self.state
Code example #18
File: test_nn_policy.py (project: sebimarkgraf/rllib)
    def test_goal(self, batch_size):
        goal = random_tensor(False, 3, None)
        policy = NNPolicy(dim_state=(4, ),
                          dim_action=(2, ),
                          layers=[32, 32],
                          goal=goal)
        state = random_tensor(False, 4, batch_size)
        pi = tensor_to_distribution(policy(state))
        action = pi.sample()
        assert action.shape == torch.Size(
            [batch_size, 2] if batch_size else [2])
        assert action.dtype is torch.get_default_dtype()

        other_goal = random_tensor(False, 3, None)
        policy.set_goal(other_goal)
        other_pi = tensor_to_distribution(policy(state))

        assert not torch.any(other_pi.mean == pi.mean)
Code example #19
def test_discrete(eps_start, q_function):
    policy = EpsGreedy(q_function, eps_start)
    for _ in range(100):
        state = torch.randint(4, ())
        action = q_function(state).argmax(dim=-1)
        probs = eps_start / 2 * torch.ones(2)
        probs[action] += 1 - eps_start

        assert (tensor_to_distribution(policy(state)).probs == probs).all()
Code example #20
    def step(self, action):
        """See `AbstractEnvironment.step'."""
        self._time += 1
        state = self.system.state  # this might be noisy.
        reward = float("nan")
        if self.reward is not None:
            reward = (tensor_to_distribution(self.reward(
                state, action, None)).sample().squeeze(-1))

        next_state = self.system.step(action)
        if self.termination_model is not None:
            done = (tensor_to_distribution(
                self.termination_model(state, action,
                                       next_state)).sample().squeeze(-1))
        else:
            done = False

        return next_state, reward, done, {}
Code example #21
    def get_kl_entropy(self, state):
        """Get kl divergence and current policy at a given state.

        Compute the separated KL divergence between the current and old policy.
        When the policy is a MultivariateNormal distribution, it computes the
        divergences that correspond to the mean and the covariance separately.

        When the policy is a Categorical distribution, it computes the divergence and
        assigns it to the mean component. The variance component is set to zero.

        Parameters
        ----------
        state: torch.Tensor
            Empirical state distribution.

        Returns
        -------
        kl_mean: torch.Tensor
            KL-Divergence due to the change in the mean between current and
            previous policy.
        kl_var: torch.Tensor
            KL-Divergence due to the change in the variance between current and
            previous policy.
        entropy: torch.Tensor
            Entropy of the current policy at the given state.
        """
        pi = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
        pi_old = tensor_to_distribution(
            self.old_policy(state), **self.policy.dist_params
        )
        try:
            action = pi.rsample()
        except NotImplementedError:
            action = pi.sample()
        if not self.policy.discrete_action:
            action = self.policy.action_scale * (action.clamp(-1.0, 1.0))

        entropy, log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)
        _, log_p_old = get_entropy_and_log_p(pi_old, action, self.policy.action_scale)

        kl_mean, kl_var = separated_kl(p=pi_old, q=pi, log_p=log_p_old, log_q=log_p)

        return kl_mean, kl_var, entropy
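
For reference, one standard way to split the Gaussian KL into a mean term and a covariance term. Whether rllib's separated_kl uses exactly this split is an assumption, and gaussian_kl_split below is a hypothetical helper, not library API.

import torch
from torch.distributions import MultivariateNormal, kl_divergence


def gaussian_kl_split(mu_p, cov_p, mu_q, cov_q):
    """Split KL(p || q) between Gaussians into a mean term and a covariance term."""
    k = mu_p.shape[-1]
    cov_q_inv = torch.inverse(cov_q)
    delta = (mu_q - mu_p).unsqueeze(-1)
    kl_mean = 0.5 * (delta.transpose(-1, -2) @ cov_q_inv @ delta).squeeze(-1).squeeze(-1)
    kl_var = 0.5 * (
        torch.diagonal(cov_q_inv @ cov_p, dim1=-2, dim2=-1).sum(-1)
        - k + torch.logdet(cov_q) - torch.logdet(cov_p)
    )
    return kl_mean, kl_var  # kl_mean + kl_var == KL(p || q)


# Sanity check against the full KL computed by torch.
p = MultivariateNormal(torch.zeros(2), 2.0 * torch.eye(2))
q = MultivariateNormal(torch.ones(2), torch.eye(2))
kl_mean, kl_var = gaussian_kl_split(p.mean, p.covariance_matrix, q.mean, q.covariance_matrix)
assert torch.allclose(kl_mean + kl_var, kl_divergence(p, q))
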
Code example #22
File: collect_data.py (project: sebimarkgraf/rllib)
def collect_model_transitions(state_dist, policy, dynamical_model,
                              reward_model, num_samples):
    """Collect transitions by interacting with an environment.

    Parameters
    ----------
    state_dist: Distribution.
        State distribution.
    policy: AbstractPolicy or Distribution.
        Policy to interact with the environment.
    dynamical_model: AbstractModel.
        Model with which to interact.
    reward_model: AbstractReward.
        Reward model with which to interact.
    num_samples: int.
        Number of transitions.

    Returns
    -------
    transitions: List[Observation]
        List of 1-step transitions.

    """
    state = state_dist.sample((num_samples, ))
    if isinstance(policy, AbstractPolicy):
        action_dist = tensor_to_distribution(policy(state),
                                             **policy.dist_params)
        action = action_dist.sample()
    else:  # action_distribution
        action_dist = policy
        action = action_dist.sample((num_samples, ))

    next_state = tensor_to_distribution(dynamical_model(state,
                                                        action)).sample()
    reward = tensor_to_distribution(reward_model(state, action,
                                                 next_state)).sample()

    transitions = []
    for state_, action_, reward_, next_state_ in zip(state, action, reward,
                                                     next_state):
        transitions.append(
            Observation(state_, action_, reward_, next_state_).to_torch())
    return transitions
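
A hypothetical usage: dynamical_model and reward_model stand for any already-constructed model pair, and the state/action distributions are arbitrary placeholders.

import torch
from torch.distributions import MultivariateNormal, Uniform

state_dist = MultivariateNormal(torch.zeros(3), torch.eye(3))
action_dist = Uniform(-torch.ones(1), torch.ones(1))  # a plain Distribution, so the else-branch is taken

transitions = collect_model_transitions(
    state_dist, action_dist, dynamical_model, reward_model, num_samples=32
)
assert len(transitions) == 32  # one 1-step Observation per sampled state
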
Code example #23
File: reps.py (project: sebimarkgraf/rllib)
    def _policy_weighted_nll(self, state, action, weights):
        """Return weighted policy negative log-likelihood."""
        pi = tensor_to_distribution(self.policy(state),
                                    **self.policy.dist_params)
        _, action_log_p = get_entropy_and_log_p(pi, action,
                                                self.policy.action_scale)
        weighted_log_p = weights.detach() * action_log_p

        # Clamping is crucial for stability so that it does not converge to a delta.
        log_likelihood = torch.mean(weighted_log_p.clamp_max(1e-3))
        return -log_likelihood
Code example #24
def build_empirical_y0(observation, support, policy=None):
    """Build empirical distribution over samples."""
    state = observation.state
    num_states_ = state.shape[0]
    if support == "state-action":
        y0 = torch.ones(num_states_) / float(num_states_)
    elif support == "state":
        pi = tensor_to_distribution(policy(state).detach())
        y0 = pi.probs / float(num_states_)
    else:
        raise NotImplementedError(f"{support} not implemented.")
    return y0
Code example #25
File: rollout.py (project: sebimarkgraf/rllib)
def rollout_policy(environment, policy, num_episodes=1, max_steps=1000, render=False):
    """Conduct a rollout of a policy in an environment.

    Parameters
    ----------
    environment: AbstractEnvironment
        Environment with which the policy interacts.
    policy: AbstractPolicy
        Policy that interacts with the environment.
    num_episodes: int, optional (default=1)
        Number of episodes.
    max_steps: int.
        Maximum number of steps per episode.
    render: bool.
        Flag that indicates whether to render the environment or not.

    Returns
    -------
    trajectories: List[Trajectory]=List[List[Observation]]
        A list of trajectories.

    """
    trajectories = []
    for _ in tqdm(range(num_episodes)):
        state = environment.reset()
        done = False
        trajectory = []
        with torch.no_grad():
            time_step = 0
            while not done:
                pi = tensor_to_distribution(
                    policy(torch.tensor(state, dtype=torch.get_default_dtype())),
                    **policy.dist_params,
                )
                action = pi.sample()
                if not policy.discrete_action:
                    action = policy.action_scale * action.clamp_(-1.0, 1.0)
                obs, state, done, info = step_env(
                    environment=environment,
                    state=state,
                    action=action.detach().numpy(),
                    action_scale=policy.action_scale,
                    pi=pi,
                    render=render,
                )
                trajectory.append(obs)

                time_step += 1
                if max_steps <= time_step:
                    break

        trajectories.append(trajectory)
    return trajectories
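
A hypothetical usage; environment and policy are placeholders for any AbstractEnvironment / AbstractPolicy pair.

trajectories = rollout_policy(environment, policy, num_episodes=2, max_steps=200)
assert len(trajectories) == 2
episode_return = sum(obs.reward for obs in trajectories[0])  # sum of rewards along the first trajectory
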
Code example #26
File: mpo.py (project: sebascuri/rllib)
    def compute_mpo_loss(self, state, action):
        """Compute mpo loss for a given set of state/action pairs."""
        pi_dist = tensor_to_distribution(self.policy(state),
                                         **self.policy.dist_params)
        log_p = pi_dist.log_prob(action)

        q_values = self.critic_target(state, action)

        mpo_loss = self.mpo_loss(q_values=q_values, action_log_p=log_p).reduce(
            self.criterion.reduction)
        self._info.update(mpo_eta=self.mpo_loss.eta)
        return mpo_loss
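
As context for what self.mpo_loss presumably computes, a heavily hedged sketch of the standard MPO E-step re-weighting followed by a weighted maximum-likelihood policy loss (an assumption about the algorithm, not code taken from rllib):

import torch

q_values = torch.randn(16)  # hypothetical Q-values for 16 sampled actions at one state
log_p = torch.randn(16)     # hypothetical log-probabilities of those actions under the policy

eta = 1.0                                       # temperature; in MPO a learned dual variable
weights = torch.softmax(q_values / eta, dim=0)  # re-weight samples by exponentiated Q-values
mpo_policy_loss = -(weights.detach() * log_p).sum(dim=0)  # weighted maximum likelihood
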
Code example #27
File: model_learning.py (project: sebascuri/rllib)
def train_exact_gp_type2mll_step(model, observation, optimizer):
    """Train a GP using type-2 Marginal-Log-Likelihood optimization."""
    optimizer.zero_grad()
    output = tensor_to_distribution(
        model(observation.state[:, 0], observation.action[:, 0]))
    with gpytorch.settings.fast_pred_var():
        val = torch.stack(tuple([gp.train_targets for gp in model.gp]), 0)
        loss = exact_mll(output, val, model.gp)
    loss.backward()
    optimizer.step()
    model.eval()
    return loss
Code example #28
File: test_nn_policy.py (project: sebimarkgraf/rllib)
    def test_input_transform(self, batch_size):
        policy = NNPolicy(
            dim_state=(2, ),
            dim_action=(4, ),
            layers=[64, 64],
            input_transform=StateTransform(),
        )
        out = tensor_to_distribution(
            policy(random_tensor(False, 2, batch_size)))
        action = out.sample()
        assert action.shape == torch.Size(
            [batch_size, 4] if batch_size else [4])
        assert action.dtype is torch.get_default_dtype()
Code example #29
    def test_random_action(self, discrete_state, discrete_action, dim_state,
                           dim_action):
        self.init(discrete_state, discrete_action, dim_state, dim_action)

        distribution = tensor_to_distribution(self.policy.random())
        sample = distribution.sample()

        if distribution.has_enumerate_support:  # Discrete
            assert distribution.logits.shape == (self.num_actions, )
            assert sample.shape == ()
        else:  # Continuous
            assert distribution.mean.shape == self.dim_action
            assert sample.shape == (dim_action, )
Code example #30
    def test_output_properties(self, net, in_dim, out_dim, batch_size):
        net = torch.jit.script(net(in_dim, out_dim))
        if batch_size is None:
            t = torch.randn(in_dim)
        else:
            t = torch.randn((batch_size, 2) + in_dim)

        o = tensor_to_distribution(net(t))
        assert isinstance(o, torch.distributions.MultivariateNormal)
        assert o.has_rsample
        assert not o.has_enumerate_support
        assert o.batch_shape == torch.Size((
            batch_size, 2) if batch_size is not None else ())