def generalised_advantage_estimate(n_step_summary, discount_factor=0.99, tau=0.95, device='cpu'):
    '''
    Compute GAE(lambda) advantages from an n-step transition summary.

    :param n_step_summary: named tuple holding per-step signals, value estimates and non-terminal masks
    :param discount_factor: the discount factor gamma
    :param tau: the GAE lambda parameter
    :param device: torch device used for the intermediate tensors
    :return: per-step advantage estimates
    '''
    signals = U.to_tensor(n_step_summary.signal, device=device, dtype=torch.float)
    non_terminals = U.to_tensor(n_step_summary.non_terminal, device=device, dtype=torch.float)
    value_estimates = U.to_tensor(n_step_summary.value_estimate, device=device, dtype=torch.float)

    T = signals.size(0)
    num_workers = 1
    # T, num_workers, _ = signals.size()

    advs = torch.zeros(T, num_workers, 1).to(device)
    advantage_now = torch.zeros(num_workers, 1).to(device)

    for t in reversed(range(T - 1)):
        signal_now = signals[t]
        value_future = value_estimates[t + 1]
        value_now = value_estimates[t]
        non_terminal_now = non_terminals[t]

        # One-step TD error, masked at episode boundaries
        td_error = signal_now + value_future * discount_factor * non_terminal_now - value_now
        # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}
        advantage_now = advantage_now * discount_factor * tau * non_terminal_now + td_error

        advs[t] = advantage_now

    advantages = advs.squeeze()

    return advantages
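# A minimal, self-contained sketch of the same GAE(lambda) recursion on toy data, illustrating
# delta_t = r_t + gamma * V(s_{t+1}) * n_t - V(s_t) and A_t = delta_t + gamma * lambda * n_t * A_{t+1}.
# The rollout values below are made up purely for illustration; only plain torch is assumed.
def _gae_sketch():
    import torch

    gamma, lam = 0.99, 0.95
    rewards = torch.tensor([1.0, 0.0, 1.0, 0.0])        # hypothetical signals
    values = torch.tensor([0.5, 0.4, 0.6, 0.2])         # hypothetical V(s_t)
    non_terminal = torch.tensor([1.0, 1.0, 1.0, 0.0])   # last step ends the episode

    advantages = torch.zeros_like(rewards)
    advantage = 0.0
    for t in reversed(range(len(rewards) - 1)):
        td_error = rewards[t] + gamma * values[t + 1] * non_terminal[t] - values[t]
        advantage = advantage * gamma * lam * non_terminal[t] + td_error
        advantages[t] = advantage
    return advantages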
def _sample_model(self, state, **kwargs):
    model_input = U.to_tensor([state], device=self._device, dtype=self._state_type)

    with torch.no_grad():
        action_value_estimates = self._value_model(model_input)

    max_value_action_idx = action_value_estimates.max(1)[1].item()
    return max_value_action_idx
def __defaults__(self) -> None:
    self._policy_arch = U.CategoricalMLP
    self._accumulated_error = U.to_tensor(0.0, device=self._device)
    self._evaluation_function = torch.nn.CrossEntropyLoss()
    self._trajectory_trace = U.TrajectoryTraceBuffer()

    self._policy_arch_params = U.ConciseArchSpecification(**{
        'input_size': None,  # Obtain from environment
        'hidden_layers': [64, 32, 16],
        'output_size': None,  # Obtain from environment
        'activation': F.relu,
        'use_bias': True,
    })

    self._use_cuda = False

    self._discount_factor = 0.99
    self._use_batched_updates = False
    self._batch_size = 5
    self._pg_entropy_reg = 1e-4
    self._signal_clipping = False

    self._optimiser_learning_rate = 1e-4
    self._optimiser_type = torch.optim.Adam
    self._optimiser_weight_decay = 1e-5

    self._state_type = torch.float
    self._signals_tensor_type = torch.float
def evaluate(self, **kwargs):
    R = 0
    policy_loss = []
    signals = []

    trajectory = self._trajectory_trace.retrieve_trajectory()
    t_signal = trajectory.signal
    log_probs = trajectory.log_prob
    entropies = trajectory.entropy
    self._trajectory_trace.clear()

    # Accumulate discounted returns, walking the trajectory backwards
    for r in t_signal[::-1]:
        R = r + self._discount_factor * R
        signals.insert(0, R)

    signals = U.to_tensor(signals, device=self._device, dtype=self._signals_tensor_type)

    # Normalise the returns to reduce gradient variance
    if signals.shape[0] > 1:
        stddev = signals.std()
        signals = (signals - signals.mean()) / (stddev + self._divide_by_zero_safety)

    # REINFORCE loss with an entropy regularisation term
    for log_prob, signal, entropy in zip(log_probs, signals, entropies):
        policy_loss.append(-log_prob * signal - self._pg_entropy_reg * entropy)

    loss = torch.cat(policy_loss).sum()
    return loss
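# A small illustration of the backward discounted-return accumulation used above, on made-up
# rewards and assuming only plain Python. With gamma = 0.9 and rewards (1, 0, 1) the returns
# come out as [1 + 0.9 * (0 + 0.9 * 1), 0 + 0.9 * 1, 1] = [1.81, 0.9, 1.0].
def _discounted_returns_sketch(rewards=(1.0, 0.0, 1.0), gamma=0.9):
    returns = []
    R = 0.0
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return returns  # [1.81, 0.9, 1.0] for the defaults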
def trace_back_steps(self, transitions):
    n_step_summary = U.ValuedTransition(*zip(*transitions))

    advantages = U.generalised_advantage_estimate(n_step_summary,
                                                  self._discount_factor,
                                                  tau=self._gae_tau)

    value_estimates = U.to_tensor(n_step_summary.value_estimate, device=self._device, dtype=torch.float)
    discounted_returns = value_estimates + advantages

    advantage_memories = []
    for i, step in enumerate(zip(*n_step_summary)):
        step = U.ValuedTransition(*step)
        advantage_memories.append(
            U.AdvantageMemory(
                step.state,
                step.action,
                step.action_prob,
                step.value_estimate,
                advantages[i],
                discounted_returns[i],
            )
        )

    return advantage_memories
def evaluate(self, batch, *args, **kwargs):
    '''
    Compute the temporal-difference error for a batch of transitions.

    :param batch: a batch of transitions with states, actions, signals, successor states and non-terminal flags
    :return: the loss between the expected and predicted Q values
    '''
    states = U.to_tensor(batch.state, dtype=self._state_type, device=self._device) \
        .view(-1, *self._input_size)
    action_indices = U.to_tensor(batch.action, dtype=self._action_type, device=self._device) \
        .view(-1, 1)
    true_signals = U.to_tensor(batch.signal, dtype=self._value_type, device=self._device) \
        .view(-1, 1)

    non_terminal_mask = U.to_tensor(batch.non_terminal, dtype=torch.uint8, device=self._device)

    nts = [state
           for (state, non_terminal) in zip(batch.successor_state, batch.non_terminal)
           if non_terminal]
    non_terminal_successors = U.to_tensor(nts, dtype=self._state_type, device=self._device) \
        .view(-1, *self._input_size)

    if not len(non_terminal_successors) > 0:
        return 0  # Nothing to be learned, all successor states are terminal

    # Calculate Q values of the non-terminal successors
    with torch.no_grad():
        Q_successors = self._value_model(non_terminal_successors)

    Q_successors_max_action_indices = Q_successors.max(1)[1].view(-1, 1)
    if self._use_double_dqn:
        # Double DQN: select actions with the online model, evaluate them with the target model
        with torch.no_grad():
            Q_successors = self._target_value_model(non_terminal_successors)

    Q_max_successor = torch.zeros(self._batch_size, dtype=self._value_type, device=self._device)
    Q_max_successor[non_terminal_mask] = Q_successors.gather(1, Q_successors_max_action_indices).squeeze()

    # Integrate with the true signal to form the bootstrapped target
    Q_expected = true_signals + (self._discount_factor * Q_max_successor).view(-1, 1)

    # Calculate Q of the current states for the actions actually taken
    Q_state = self._value_model(states).gather(1, action_indices)

    return self._evaluation_function(Q_state, Q_expected)
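# A hedged sketch of the (double) DQN target computed above, with stand-in callables `online_q`
# and `target_q` (both assumed to map a state batch to per-action Q values). It isolates the split
# between action selection (online network) and action evaluation (target network):
# target = r + gamma * n_t * Q_target(s', argmax_a Q_online(s', a)).
def _double_dqn_target_sketch(online_q, target_q, rewards, successors, non_terminal, gamma=0.99):
    import torch

    with torch.no_grad():
        best_actions = online_q(successors).max(1)[1].view(-1, 1)          # select with the online net
        q_next = target_q(successors).gather(1, best_actions).squeeze(1)   # evaluate with the target net
    return rewards + gamma * non_terminal * q_next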
def _sample_model(self, state, **kwargs):
    state_tensor = U.to_tensor([state], device=self._device, dtype=self._state_type)

    with torch.no_grad():
        probs = self._policy(state_tensor)

    m = Categorical(probs)
    action = m.sample()
    return action.item()
def evaluate(
        self,
        state_batch,
        action_batch,
        signal_batch,
        next_state_batch,
        non_terminal_batch,
        *args,
        **kwargs,
):
    '''
    Compute the critic's temporal-difference error for a batch of transitions.

    :type kwargs: object
    '''
    states = U.to_tensor(state_batch, device=self._device, dtype=self._state_type) \
        .view(-1, self._input_size[0])
    next_states = U.to_tensor(next_state_batch, device=self._device, dtype=self._state_type) \
        .view(-1, self._input_size[0])
    actions = U.to_tensor(action_batch, device=self._device, dtype=self._action_type) \
        .view(-1, self._output_size[0])
    signals = U.to_tensor(signal_batch, device=self._device, dtype=self._value_type)
    non_terminal_mask = U.to_tensor(non_terminal_batch, device=self._device, dtype=torch.float)

    ### Critic ###
    # Compute the current Q value; the critic takes the state and the action chosen
    Q_current = self._critic(states, actions)

    # Compute the next Q value based on the action the target actor would choose in the successor state.
    # Detach from the current graph since we do not want gradients of the next Q to be propagated.
    with torch.no_grad():
        target_actions = self._target_actor(next_states)
        next_max_q = self._target_critic(next_states, target_actions).max(1)[0]

    next_Q_values = non_terminal_mask * next_max_q

    # Compute the target of the current Q values
    Q_target = signals + (self._discount_factor * next_Q_values)

    # Compute the Bellman error (using the configured evaluation function, e.g. Huber loss)
    td_error = self._evaluation_function(Q_current, Q_target.view(-1, 1))

    return td_error, states
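# A minimal sketch of the deterministic-policy (DDPG-style) critic target formed above, with
# stand-in `target_actor` and `target_critic` callables: y = r + gamma * n_t * Q'(s', mu'(s')),
# masked where the episode terminated. Names and shapes are assumptions for illustration only.
def _ddpg_target_sketch(target_actor, target_critic, signals, next_states, non_terminal, gamma=0.99):
    import torch

    with torch.no_grad():
        next_actions = target_actor(next_states)
        next_q = target_critic(next_states, next_actions).squeeze(1)
    return signals + gamma * non_terminal * next_q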
def update(self, *args, **kwargs):
    error = self.evaluate()

    if error is not None:
        if self._use_batched_updates:
            self._accumulated_error += error
            if self._rollout_i % self._batch_size == 0:
                self._optimise_wrt(self._accumulated_error / self._batch_size)
                self._accumulated_error = U.to_tensor(0.0, device=self._device)
        else:
            self._optimise_wrt(error)
def sample_discrete_action(self, state):
    state_var = U.to_tensor([state], device=self._device, dtype=self._state_type)

    probs = self._policy(state_var)
    # action = np.argmax(probs)
    m = Categorical(probs)
    action_sample = m.sample()
    action = action_sample.item()

    return action, m.log_prob(action_sample), m.entropy()
def plot_durations(episode_durations):
    plt.figure(2)
    plt.clf()
    durations_t = U.to_tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())

    # Take 100-episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # Pause a bit so that the plots are updated

    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())
def _sample_model(self, state, continuous=True, **kwargs):
    '''
    If continuous, sample from a normal distribution whose mean and log standard deviation come
    from the policy head, shaped [batch, action_size]; otherwise sample from a categorical
    distribution over the softmax output.

    :param state: the current state
    :param continuous: whether the action space is continuous
    :param kwargs: unused
    :return: the sampled action, the value estimate and the distribution statistics
    '''
    model_input = U.to_tensor([state], device=self._device, dtype=self._state_type)

    if continuous:
        with torch.no_grad():
            action_mean, action_log_std, value_estimate = self._actor_critic(model_input)

        action_log_std = action_log_std.expand_as(action_mean)
        action_std = torch.exp(action_log_std)
        action = torch.normal(action_mean, action_std)
        a = action.to('cpu').numpy()[0]
        return a, value_estimate, action_log_std
    else:
        softmax_probs, value_estimate = self._actor_critic(model_input)
        # action = torch.multinomial(softmax_probs)
        m = Categorical(softmax_probs)
        action = m.sample()
        a = action.to('cpu').data.numpy()[0]
        return a, value_estimate, m.log_prob(action)
def sample_continuous_action(self, state):
    model_input = U.to_tensor([state], device=self._device, dtype=self._state_type)

    with torch.no_grad():
        mu, sigma_sq = self._policy(model_input)

    mu, sigma_sq = mu[0], sigma_sq[0]

    # std = self.sigma.exp().expand_as(mu)
    # dist = torch.Normal(mu, std)
    # return dist, value

    # Reparameterised sample: a = mu + sigma * eps, eps ~ N(0, 1)
    eps = torch.randn(mu.size())
    action = (mu + sigma_sq.sqrt() * eps).data

    # Probability density of the sampled action under N(mu, sigma_sq)
    prob = U.normal(action, mu, sigma_sq)
    log_prob = prob.log()

    # Differential entropy of a Gaussian: 0.5 * (log(2 * pi * sigma^2) + 1)
    entropy = 0.5 * ((sigma_sq * 2 * U.pi_torch(self._device).expand_as(sigma_sq)).log() + 1)

    return action, log_prob, entropy
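# A quick, hedged check of the closed-form Gaussian entropy used above against
# torch.distributions.Normal: entropy = 0.5 * (log(2 * pi * sigma^2) + 1). The numeric
# values of mu and sigma_sq below are arbitrary and only serve the comparison.
def _gaussian_entropy_check(mu=0.5, sigma_sq=0.25):
    import math
    import torch

    dist = torch.distributions.Normal(torch.tensor(mu), torch.tensor(sigma_sq) ** 0.5)
    closed_form = 0.5 * (math.log(2 * math.pi * sigma_sq) + 1)
    return dist.entropy().item(), closed_form  # the two values should agree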
def evaluate(self, batch, discrete=False, **kwargs):
    states = U.to_tensor(batch.state, device=self._device, dtype=torch.float) \
        .view(-1, self._input_size[0])
    value_estimates = U.to_tensor(batch.value_estimate, device=self._device, dtype=torch.float)
    advantages = U.to_tensor(batch.advantage, device=self._device, dtype=torch.float)
    discounted_returns = U.to_tensor(batch.discounted_return, device=self._device, dtype=torch.float)

    value_error = (value_estimates - discounted_returns).pow(2).mean()

    # Normalise the advantages to reduce variance
    advantage = (advantages - advantages.mean()) / (advantages.std() + self._divide_by_zero_safety)

    action_probs = U.to_tensor(batch.action_prob, device=self._device, dtype=torch.float) \
        .view(-1, self._output_size[0])
    _, _, action_probs_target, *_ = self._actor_critic_target(states)

    if discrete:
        actions = U.to_tensor(batch.action, device=self._device, dtype=torch.long) \
            .view(-1, self._output_size[0])  # gather requires integer indices
        action_probs = action_probs.gather(1, actions)
        action_probs_target = action_probs_target.gather(1, actions)

    # Probability ratio between the current and the target policy (from log probabilities)
    ratio = torch.exp(action_probs - action_probs_target)

    surrogate = ratio * advantage
    clamped_ratio = torch.clamp(ratio,
                                min=1. - self._surrogate_clip,
                                max=1. + self._surrogate_clip)
    surrogate_clipped = clamped_ratio * advantage  # (L^CLIP)

    policy_loss = -torch.min(surrogate, surrogate_clipped).mean()
    entropy_loss = U.entropy(action_probs).mean()
    collective_cost = policy_loss + value_error * self._value_reg_coef + entropy_loss * self._entropy_reg_coef

    return collective_cost, policy_loss, value_error
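# A small numeric sketch of the clipped surrogate objective (L^CLIP) computed above, on made-up
# ratios and advantages, assuming only plain torch. With clip = 0.2 a ratio of 1.5 is clamped to
# 1.2 before being multiplied by its advantage, and the per-sample objective takes the minimum of
# the clipped and unclipped terms before the sign flip into a loss.
def _clipped_surrogate_sketch(clip=0.2):
    import torch

    ratio = torch.tensor([0.5, 1.0, 1.5])        # hypothetical policy ratios
    advantage = torch.tensor([1.0, -1.0, 2.0])   # hypothetical advantages

    surrogate = ratio * advantage
    surrogate_clipped = torch.clamp(ratio, 1. - clip, 1. + clip) * advantage
    return -torch.min(surrogate, surrogate_clipped).mean()  # policy loss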
def _sample_model(self, state, **kwargs):
    state = U.to_tensor([state], device=self._device, dtype=self._state_type)

    with torch.no_grad():
        action = self._actor(state)

    a = action.to('cpu').numpy()
    return a[0]
def infer(self, state, **kwargs):
    model_input = U.to_tensor([state], device=self._device, dtype=self._state_type)

    with torch.no_grad():
        value = self._value_model(model_input)

    return value