def generalised_advantage_estimate(n_step_summary,
                                   discount_factor=0.99,
                                   tau=0.95,
                                   device='cpu'):
    '''
Compute GAE(lambda) advantage estimates for an n-step rollout summary.

:param n_step_summary: rollout summary with per-step signal, non_terminal and value_estimate fields
:type n_step_summary: U.ValuedTransition
:param discount_factor: discount factor (gamma)
:type discount_factor: float
:param tau: GAE lambda parameter trading off bias against variance
:type tau: float
:param device: device on which intermediate tensors are created
:type device: str or torch.device
:return: per-step advantage estimates
:rtype: torch.Tensor
'''

    signals = U.to_tensor(n_step_summary.signal,
                          device=device,
                          dtype=torch.float)

    non_terminals = U.to_tensor(n_step_summary.non_terminal,
                                device=device,
                                dtype=torch.float)

    value_estimates = U.to_tensor(n_step_summary.value_estimate,
                                  device=device,
                                  dtype=torch.float)

    T = signals.size(0)  # number of steps in the rollout
    num_workers = 1  # single worker; for batched rollouts: T, num_workers, _ = signals.size()

    advs = torch.zeros(T, num_workers, 1).to(device)
    advantage_now = torch.zeros(num_workers, 1).to(device)

    for t in reversed(range(T - 1)):
        signal_now = signals[t]
        value_future = value_estimates[t + 1]
        value_now = value_estimates[t]
        non_terminal_now = non_terminals[t]

        # One-step TD error: delta_t = r_t + gamma * V(s_{t+1}) * non_terminal_t - V(s_t)
        td_error = signal_now + value_future * discount_factor * non_terminal_now - value_now

        # GAE recursion: A_t = delta_t + gamma * lambda * non_terminal_t * A_{t+1}
        advantage_now = advantage_now * discount_factor * tau * non_terminal_now + td_error

        advs[t] = advantage_now

    advantages = advs.squeeze()

    return advantages
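The loop above implements the GAE recursion A_t = delta_t + gamma * lambda * nt_t * A_{t+1} with delta_t = r_t + gamma * V(s_{t+1}) * nt_t - V(s_t). Below is a minimal standalone sketch of the same recursion in plain PyTorch; the names (gae_sketch, rewards, values, non_terminals) are illustrative and not part of the library above, and values carries one extra bootstrap entry at the end.

import torch


def gae_sketch(rewards, values, non_terminals, discount_factor=0.99, tau=0.95):
  # rewards, non_terminals: shape (T,); values: shape (T + 1,) including a bootstrap value
  T = rewards.size(0)
  advantages = torch.zeros(T)
  advantage_now = torch.zeros(())
  for t in reversed(range(T)):
    # One-step TD error: delta_t = r_t + gamma * V(s_{t+1}) * nt_t - V(s_t)
    td_error = rewards[t] + discount_factor * values[t + 1] * non_terminals[t] - values[t]
    # GAE recursion: A_t = delta_t + gamma * lambda * nt_t * A_{t+1}
    advantage_now = td_error + discount_factor * tau * non_terminals[t] * advantage_now
    advantages[t] = advantage_now
  return advantages


if __name__ == '__main__':
  rewards = torch.tensor([1., 1., 1.])
  values = torch.tensor([.5, .5, .5, .5])     # length T + 1 (bootstrap value)
  non_terminals = torch.tensor([1., 1., 0.])  # the last step ends the episode
  print(gae_sketch(rewards, values, non_terminals))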
Example 2
  def _sample_model(self, state, **kwargs):
    model_input = U.to_tensor([state], device=self._device, dtype=self._state_type)

    with torch.no_grad():
      action_value_estimates = self._value_model(model_input)
    max_value_action_idx = action_value_estimates.max(1)[1].item()
    return max_value_action_idx
Example 3
    def __defaults__(self) -> None:

        self._policy_arch = U.CategoricalMLP
        self._accumulated_error = U.to_tensor(0.0, device=self._device)
        self._evaluation_function = torch.nn.CrossEntropyLoss()
        self._trajectory_trace = U.TrajectoryTraceBuffer()

        self._policy_arch_params = U.ConciseArchSpecification(
            **{
                'input_size': None,  # Obtain from environment
                'hidden_layers': [64, 32, 16],
                'output_size': None,  # Obtain from environment
                'activation': F.relu,
                'use_bias': True,
            })

        self._use_cuda = False
        self._discount_factor = 0.99
        self._use_batched_updates = False
        self._batch_size = 5
        self._pg_entropy_reg = 1e-4
        self._signal_clipping = False

        self._optimiser_learning_rate = 1e-4
        self._optimiser_type = torch.optim.Adam
        self._optimiser_weight_decay = 1e-5

        self._state_type = torch.float
        self._signals_tensor_type = torch.float
Example 4
    def evaluate(self, **kwargs):
        R = 0
        policy_loss = []
        signals = []

        trajectory = self._trajectory_trace.retrieve_trajectory()
        t_signal = trajectory.signal
        log_probs = trajectory.log_prob
        entropies = trajectory.entropy
        self._trajectory_trace.clear()

        # Accumulate discounted returns back to front: R_t = r_t + gamma * R_{t+1}
        for r in t_signal[::-1]:
            R = r + self._discount_factor * R
            signals.insert(0, R)

        signals = U.to_tensor(signals,
                              device=self._device,
                              dtype=self._signals_tensor_type)

        if signals.shape[0] > 1:
            stddev = signals.std()
            signals = (signals -
                       signals.mean()) / (stddev + self._divide_by_zero_safety)

        for log_prob, signal, entropy in zip(log_probs, signals, entropies):
            policy_loss.append(-log_prob * signal -
                               self._pg_entropy_reg * entropy)

        loss = torch.cat(policy_loss).sum()
        return loss
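The reversed loop accumulates discounted returns back to front, R_t = r_t + gamma * R_{t+1}, before normalising them. A tiny standalone check of that accumulation, with illustrative reward values only:

def discounted_returns_sketch(rewards, discount_factor=0.99):
  # R_t = r_t + gamma * R_{t+1}, computed from the last step backwards
  R = 0.0
  returns = []
  for r in reversed(rewards):
    R = r + discount_factor * R
    returns.insert(0, R)
  return returns


assert discounted_returns_sketch([1.0, 1.0, 1.0], discount_factor=0.5) == [1.75, 1.5, 1.0]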
Example 5
  def trace_back_steps(self, transitions):
    n_step_summary = U.ValuedTransition(*zip(*transitions))

    advantages = U.generalised_advantage_estimate(n_step_summary, self._discount_factor, tau=self._gae_tau)

    value_estimates = U.to_tensor(n_step_summary.value_estimate, device=self._device, dtype=torch.float)

    discounted_returns = value_estimates + advantages

    advantage_memories = []
    for i, step in enumerate(zip(*n_step_summary)):
      step = U.ValuedTransition(*step)
      advantage_memories.append(
          U.AdvantageMemory(
              step.state,
              step.action,
              step.action_prob,
              step.value_estimate,
              advantages[i],
              discounted_returns[i],
              )
          )

    return advantage_memories
Example 6
  def evaluate(self, batch, *args, **kwargs):
    '''

:param batch:
:type batch:
:return:
:rtype:
'''
    states = U.to_tensor(batch.state, dtype=self._state_type, device=self._device) \
      .view(-1, *self._input_size)

    action_indices = U.to_tensor(batch.action, dtype=self._action_type, device=self._device) \
      .view(-1, 1)
    true_signals = U.to_tensor(batch.signal, dtype=self._value_type, device=self._device).view(-1, 1)

    non_terminal_mask = U.to_tensor(batch.non_terminal, dtype=torch.uint8, device=self._device)
    nts = [state
           for (state, non_terminal) in zip(batch.successor_state, batch.non_terminal)
           if non_terminal]
    non_terminal_successors = U.to_tensor(nts, dtype=self._state_type, device=self._device) \
      .view(-1, *self._input_size)

    if len(non_terminal_successors) == 0:
      return 0  # Nothing to be learned, all states are terminal

    # Calculate Q of successors
    with torch.no_grad():
      Q_successors = self._value_model(non_terminal_successors)
    Q_successors_max_action_indices = Q_successors.max(1)[1].view(-1, 1)
    if self._use_double_dqn:
      with torch.no_grad():
        Q_successors = self._target_value_model(non_terminal_successors)
    Q_max_successor = torch.zeros(
      self._batch_size, dtype=self._value_type, device=self._device
      )
    Q_max_successor[non_terminal_mask] = Q_successors.gather(
      1, Q_successors_max_action_indices
      ).squeeze()

    # Integrate with the true signal
    Q_expected = true_signals + (self._discount_factor * Q_max_successor).view(
      -1, 1
      )

    # Calculate Q of state
    Q_state = self._value_model(states).gather(1, action_indices)

    return self._evaluation_function(Q_state, Q_expected)
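The update above targets the (double) DQN rule Q_expected = r + gamma * Q_target(s', argmax_a Q_online(s', a)) for non-terminal successors. A compact standalone sketch of that target computation follows; the linear stand-in networks and random tensors are illustrative only, not the agent's models:

import torch

torch.manual_seed(0)
online_q = torch.nn.Linear(4, 3)   # stand-in online value model: 4-dim state, 3 actions
target_q = torch.nn.Linear(4, 3)   # stand-in target value model
successors = torch.randn(5, 4)     # batch of successor states
signals = torch.randn(5, 1)        # rewards
non_terminal = torch.tensor([1., 1., 0., 1., 1.]).view(-1, 1)

with torch.no_grad():
  best_actions = online_q(successors).max(1)[1].view(-1, 1)       # argmax under the online net
  q_max_successor = target_q(successors).gather(1, best_actions)  # evaluated under the target net

q_expected = signals + 0.99 * non_terminal * q_max_successor      # zero bootstrap for terminal steps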
Example 7
  def _sample_model(self, state, **kwargs):
    state_tensor = U.to_tensor([state],
                               device=self._device,
                               dtype=self._state_type)
    with torch.no_grad():
      probs = self._policy(state_tensor)
    m = Categorical(probs)
    action = m.sample()
    return action.item()
Example 8
  def evaluate(
      self,
      state_batch,
      action_batch,
      signal_batch,
      next_state_batch,
      non_terminal_batch,
      *args,
      **kwargs,
      ):
    '''
Compute the critic TD error for a batch of transitions.

:type kwargs: object
:return: the critic TD error and the state batch
'''
    states = U.to_tensor(state_batch, device=self._device, dtype=self._state_type) \
      .view(-1, self._input_size[0])
    next_states = U.to_tensor(next_state_batch, device=self._device, dtype=self._state_type) \
      .view(-1, self._input_size[0])
    actions = U.to_tensor(action_batch, device=self._device, dtype=self._action_type) \
      .view(-1, self._output_size[0])
    signals = U.to_tensor(signal_batch, device=self._device, dtype=self._value_type)

    non_terminal_mask = U.to_tensor(non_terminal_batch, device=self._device, dtype=torch.float)

    ### Critic ###
    # Compute current Q value, critic takes state and action chosen
    Q_current = self._critic(states, actions)
    # Compute the next Q value from the action the target actor would choose in the successor state
    # Detach from the current graph since we don't want gradients for the next Q to propagate
    with torch.no_grad():
      target_actions = self._target_actor(next_states)
      next_max_q = self._target_critic(next_states, target_actions).max(1)[0]

    next_Q_values = non_terminal_mask * next_max_q

    Q_target = signals + (self._discount_factor * next_Q_values)  # Compute the target of the current Q values

    td_error = self._evaluation_function(Q_current,
                                         Q_target.view(-1, 1))  # Compute Bellman error (using Huber loss)

    return td_error, states
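The critic target here is the standard DDPG rule y = r + gamma * Q'(s', mu'(s')), with the non-terminal mask zeroing the bootstrap on terminal transitions. A compact standalone sketch using placeholder modules (not the agent's actual actor and critic):

import torch

torch.manual_seed(0)
target_actor = torch.nn.Linear(3, 1)        # placeholder: state -> action
target_critic = torch.nn.Bilinear(3, 1, 1)  # placeholder: (state, action) -> Q
next_states = torch.randn(4, 3)
signals = torch.randn(4)
non_terminal = torch.tensor([1., 0., 1., 1.])

with torch.no_grad():
  target_actions = target_actor(next_states)
  next_q = target_critic(next_states, target_actions).squeeze(1)

q_target = signals + 0.99 * non_terminal * next_q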
Example 9
    def update(self, *args, **kwargs):
        error = self.evaluate()

        if error is not None:
            if self._use_batched_updates:
                self._accumulated_error += error
                if self._rollout_i % self._batch_size == 0:
                    self._optimise_wrt(self._accumulated_error /
                                       self._batch_size)
                    self._accumulated_error = U.to_tensor(0.0,
                                                          device=self._device)
            else:
                self._optimise_wrt(error)
Example 10
    def sample_discrete_action(self, state):
        state_var = U.to_tensor([state],
                                device=self._device,
                                dtype=self._state_type)

        probs = self._policy(state_var)

        # action = np.argmax(probs)

        m = Categorical(probs)
        action_sample = m.sample()
        action = action_sample.item()

        return action, m.log_prob(action_sample), m.entropy()
Example 11
def plot_durations(episode_durations):
    plt.figure(2)
    plt.clf()
    durations_t = U.to_tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())
Example 12
  def _sample_model(self, state, continuous=True, **kwargs):
    '''
Sample an action from the actor-critic model.

If continuous, the action is drawn from a normal distribution whose mean and log standard
deviation are produced by the policy network, with shape [batch, action_size]. Otherwise a
discrete action is sampled from the categorical distribution over the softmax outputs.

:param state: current environment state
:param continuous: whether the action space is continuous
:param kwargs: additional keyword arguments (unused here)
:return: the sampled action, the value estimate, and the action log std (continuous) or the
         action log-probability (discrete)
'''

    model_input = U.to_tensor([state], device=self._device, dtype=self._state_type)

    if continuous:
      with torch.no_grad():
        action_mean, action_log_std, value_estimate = self._actor_critic(model_input)

        action_log_std = action_log_std.expand_as(action_mean)
        action_std = torch.exp(action_log_std)
        action = torch.normal(action_mean, action_std)

        a = action.to('cpu').numpy()[0]
      return a, value_estimate, action_log_std
    else:

      softmax_probs, value_estimate = self._actor_critic(model_input)
      # action = torch.multinomial(softmax_probs)
      m = Categorical(softmax_probs)
      action = m.sample()
      a = action.to('cpu').data.numpy()[0]
      return a, value_estimate, m.log_prob(action)
Example 13
    def sample_continuous_action(self, state):
        model_input = U.to_tensor([state],
                                  device=self._device,
                                  dtype=self._state_type)

        with torch.no_grad():
            mu, sigma_sq = self._policy(model_input)

            mu, sigma_sq = mu[0], sigma_sq[0]

        # std = self.sigma.exp().expand_as(mu)
        # dist = torch.Normal(mu, std)
        # return dist, value

        eps = torch.randn(mu.size(), device=self._device)
        # Sample an action and evaluate its probability under N(mu, sigma_sq)
        action = (mu + sigma_sq.sqrt() * eps).data
        prob = U.normal(action, mu, sigma_sq)
        # Differential entropy of a Gaussian: 0.5 * (log(2 * pi * sigma_sq) + 1)
        entropy = 0.5 * (
            (sigma_sq *
             2 * U.pi_torch(self._device).expand_as(sigma_sq)).log() + 1)

        log_prob = prob.log()
        return action, log_prob, entropy
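The manual log-probability and entropy above can be cross-checked against torch.distributions.Normal, which implements both in closed form; the mu and sigma_sq values below are illustrative, not outputs of the policy:

import torch
from torch.distributions import Normal

mu = torch.tensor([0.1, -0.2])
sigma_sq = torch.tensor([0.5, 1.5])
dist = Normal(mu, sigma_sq.sqrt())

action = dist.sample()
log_prob = dist.log_prob(action)  # log N(action | mu, sigma_sq)
entropy = dist.entropy()          # equals 0.5 * (log(2 * pi * sigma_sq) + 1)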
Example 14
  def evaluate(self, batch, discrete=False, **kwargs):

    states = U.to_tensor(batch.state, device=self._device, dtype=torch.float).view(-1, self._input_size[0])

    value_estimates = U.to_tensor(batch.value_estimate, device=self._device, dtype=torch.float)

    advantages = U.to_tensor(batch.advantage, device=self._device, dtype=torch.float)

    discounted_returns = U.to_tensor(batch.discounted_return, device=self._device, dtype=torch.float)

    value_error = (value_estimates - discounted_returns).pow(2).mean()

    advantage = (advantages - advantages.mean()) / (advantages.std() + self._divide_by_zero_safety)

    action_probs = U.to_tensor(batch.action_prob, device=self._device, dtype=torch.float) \
      .view(-1, self._output_size[0])
    _, _, action_probs_target, *_ = self._actor_critic_target(states)

    if discrete:
      # gather requires integer (long) action indices
      actions = U.to_tensor(batch.action, device=self._device, dtype=torch.long) \
        .view(-1, self._output_size[0])
      action_probs = action_probs.gather(1, actions)
      action_probs_target = action_probs_target.gather(1, actions)

    ratio = torch.exp(action_probs - action_probs_target)  # importance ratio, assuming stored action probabilities are log-probabilities

    surrogate = ratio * advantage

    clamped_ratio = torch.clamp(ratio, min=1. - self._surrogate_clip, max=1. + self._surrogate_clip)
    surrogate_clipped = clamped_ratio * advantage  # (L^CLIP)

    policy_loss = -torch.min(surrogate, surrogate_clipped).mean()

    entropy_loss = U.entropy(action_probs).mean()

    collective_cost = policy_loss + value_error * self._value_reg_coef + entropy_loss * self._entropy_reg_coef

    return collective_cost, policy_loss, value_error
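The surrogate terms above follow the PPO clipped objective L^CLIP = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)]. A minimal standalone sketch of that objective with illustrative tensors:

import torch

surrogate_clip = 0.2
advantage = torch.tensor([1.0, -0.5, 2.0])
log_probs_new = torch.tensor([-0.9, -1.2, -0.3])
log_probs_old = torch.tensor([-1.0, -1.0, -1.0])

ratio = torch.exp(log_probs_new - log_probs_old)  # importance ratio pi_new / pi_old
surrogate = ratio * advantage
surrogate_clipped = torch.clamp(ratio, 1.0 - surrogate_clip, 1.0 + surrogate_clip) * advantage
policy_loss = -torch.min(surrogate, surrogate_clipped).mean()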
Example 15
  def _sample_model(self, state, **kwargs):
    state = U.to_tensor([state], device=self._device, dtype=self._state_type)
    with torch.no_grad():
      action = self._actor(state)
    a = action.to('cpu').numpy()
    return a[0]
Example 16
  def infer(self, state, **kwargs):
    model_input = U.to_tensor([state], device=self._device, dtype=self._state_type)
    with torch.no_grad():
      value = self._value_model(model_input)
    return value