def learn(self, experiences): """Update critics and actors""" rewards = to_tensor(experiences['reward']).float().to(self.device) dones = to_tensor(experiences['done']).type(torch.int).to(self.device) states = to_tensor(experiences['state']).float().to(self.device) actions = to_tensor(experiences['action']).to(self.device) next_states = to_tensor(experiences['next_state']).float().to(self.device) assert rewards.shape == dones.shape == (self.batch_size, 1) assert states.shape == next_states.shape == (self.batch_size, self.state_size) assert actions.shape == (self.batch_size, self.action_size) indices = None if hasattr(self.buffer, 'priority_update'): # When using PER buffer indices = experiences['index'] loss_critic = self.compute_value_loss(states, actions, next_states, rewards, dones, indices) # Value (critic) optimization self.critic_optimizer.zero_grad() loss_critic.backward() nn.utils.clip_grad_norm_(self.actor_params, self.max_grad_norm_critic) self.critic_optimizer.step() self._loss_critic = float(loss_critic.item()) # Policy (actor) optimization loss_actor = self.compute_policy_loss(states) self.actor_optimizer.zero_grad() loss_actor.backward() nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm_actor) self.actor_optimizer.step() self._loss_actor = float(loss_actor.item()) # Networks gradual sync soft_update(self.target_actor, self.actor, self.tau) soft_update(self.target_critic, self.critic, self.tau)
def step(self, state, action, reward, next_state, done) -> None:
    """Lets the agent take a step.

    On some steps the agent will initiate a learning step. This is dependent on
    the `update_freq` value.

    Parameters:
        state: S(t)
        action: A(t)
        reward: R(t)
        next_state: S(t+1)
        done: (bool) Whether the state is terminal.

    """
    self.iteration += 1
    state = to_tensor(self.state_transform(state)).float().to("cpu")
    next_state = to_tensor(self.state_transform(next_state)).float().to("cpu")
    reward = self.reward_transform(reward)

    # Delay adding to buffer to account for n_steps (particularly the reward)
    self.n_buffer.add(state=state.numpy(), action=[int(action)], reward=[reward], done=[done],
                      next_state=next_state.numpy())
    if not self.n_buffer.available:
        return

    self.buffer.add(**self.n_buffer.get().get_dict())

    if self.iteration < self.warm_up:
        return

    if len(self.buffer) >= self.batch_size and (self.iteration % self.update_freq) == 0:
        for _ in range(self.number_updates):
            self.learn(self.buffer.sample())
        # Update networks only once - sync local & target
        soft_update(self.target_net, self.net, self.tau)
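# `n_buffer` above is assumed to collect transitions until `n_steps` of them are available
# and to fold their rewards into a single n-step return before the result is pushed to the
# replay buffer. A simplified sketch; the class name and the fact that `get()` here returns
# a plain dict (rather than an Experience object with `get_dict()`) are assumptions, not
# this codebase's API:
from collections import deque


class NStepBuffer:
    """Emits n-step transitions with reward R = r_t + g*r_{t+1} + ... + g^(n-1)*r_{t+n-1}."""

    def __init__(self, n_steps: int, gamma: float):
        self.n_steps = n_steps
        self.gamma = gamma
        # n_gammas[-1] == gamma**n_steps, i.e. the bootstrap discount used in `learn`.
        self.n_gammas = [gamma ** (i + 1) for i in range(n_steps)]
        self._buffer = deque()

    @property
    def available(self) -> bool:
        return len(self._buffer) >= self.n_steps

    def add(self, **experience) -> None:
        self._buffer.append(experience)

    def get(self) -> dict:
        # Pop the oldest transition and fold the following (n-1) rewards into it.
        experience = self._buffer.popleft()
        reward = float(experience['reward'][0])
        discount = self.gamma
        for following in self._buffer:
            if experience['done'][0]:
                break  # Do not accumulate rewards past a terminal state.
            reward += discount * float(following['reward'][0])
            discount *= self.gamma
            experience['next_state'] = following['next_state']
            experience['done'] = following['done']
        experience['reward'] = [reward]
        return experience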
def learn(self, experiences: Dict[str, List]) -> None:
    """
    Parameters:
        experiences: Contains all experiences for the agent. Typically sampled from the memory
            buffer. Five keys are expected, i.e. `state`, `action`, `reward`, `next_state`, `done`.
            Each key contains an array and all arrays have to have the same length.

    """
    rewards = to_tensor(experiences['reward']).float().to(self.device)
    dones = to_tensor(experiences['done']).type(torch.int).to(self.device)
    states = to_tensor(experiences['state']).float().to(self.device)
    next_states = to_tensor(experiences['next_state']).float().to(self.device)
    actions = to_tensor(experiences['action']).type(torch.long).to(self.device)

    assert rewards.shape == dones.shape == (self.batch_size, 1)
    assert states.shape == next_states.shape == (self.batch_size, self.state_size)
    assert actions.shape == (self.batch_size, 1)  # Discrete domain

    with torch.no_grad():
        prob_next = self.target_net.act(next_states)
        q_next = (prob_next * self.z_atoms).sum(-1) * self.z_delta
        if self.using_double_q:
            duel_prob_next = self.net.act(next_states)
            a_next = torch.argmax((duel_prob_next * self.z_atoms).sum(-1), dim=-1)
        else:
            a_next = torch.argmax(q_next, dim=-1)

        prob_next = prob_next[self.__batch_indices, a_next, :]

    m = self.net.dist_projection(rewards, 1 - dones, self.gamma ** self.n_steps, prob_next)
    assert m.shape == (self.batch_size, self.num_atoms)

    log_prob = self.net(states, log_prob=True)
    assert log_prob.shape == (self.batch_size, self.action_size, self.num_atoms)
    log_prob = log_prob[self.__batch_indices, actions.squeeze(), :]
    assert log_prob.shape == m.shape == (self.batch_size, self.num_atoms)

    # Cross-entropy error per sample; the loss is the batch mean
    error = -torch.sum(m * log_prob, 1)
    assert error.shape == (self.batch_size,)
    loss = error.mean()
    assert loss >= 0

    self.optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
    self.optimizer.step()
    self._loss = float(loss.item())

    if hasattr(self.buffer, 'priority_update'):
        assert (~torch.isnan(error)).any()
        self.buffer.priority_update(experiences['index'], error.detach().cpu().numpy())

    # Update networks - sync local & target
    soft_update(self.target_net, self.net, self.tau)
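# `dist_projection` above corresponds to the categorical (C51) projection of the Bellman
# target distribution onto the fixed support `z_atoms` (Bellemare et al., 2017). A sketch of
# the standard projection; the free-function signature and the explicit `v_min`/`v_max`
# arguments are assumptions, not necessarily how this codebase's method is written:
import torch


def dist_projection(rewards, masks, discount, prob_next, z_atoms, v_min: float, v_max: float):
    batch_size, num_atoms = prob_next.shape
    z_delta = (v_max - v_min) / (num_atoms - 1)

    # Apply the Bellman operator to every atom and clamp onto the support range.
    Tz = torch.clamp(rewards + discount * masks * z_atoms.view(1, -1), v_min, v_max)  # (B, A)
    b = (Tz - v_min) / z_delta
    lower = b.floor().long()
    upper = b.ceil().long()
    # When b lands exactly on an atom, split it between two atoms instead of losing the mass.
    lower[(upper > 0) & (lower == upper)] -= 1
    upper[(lower < num_atoms - 1) & (lower == upper)] += 1

    # Distribute each projected atom's probability onto its two nearest support atoms.
    m = torch.zeros_like(prob_next)
    offset = torch.arange(batch_size, device=prob_next.device).unsqueeze(1) * num_atoms
    m.view(-1).index_add_(0, (lower + offset).view(-1), (prob_next * (upper.float() - b)).view(-1))
    m.view(-1).index_add_(0, (upper + offset).view(-1), (prob_next * (b - lower.float())).view(-1))
    return m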
def test_to_tensor_numpy():
    # Assign
    int_l = np.array([0, 1, 2, 3])
    float_l = np.random.random(4)

    # Act
    int_t = to_tensor(int_l)
    float_t = to_tensor(float_l)

    # Assert
    assert torch.equal(torch.tensor(int_l), int_t)
    assert torch.equal(torch.tensor(float_l), float_t)
def learn(self, experiences, agent_name: str) -> None:
    """Update the critics and actors of all the agents"""
    # TODO: Just look at this mess.
    agent_number = list(self.agents).index(agent_name)
    agent_rewards = to_tensor(experiences['reward']).select(1, agent_number).unsqueeze(-1).float().to(self.device)
    agent_dones = to_tensor(experiences['done']).select(1, agent_number).unsqueeze(-1).type(torch.int).to(self.device)
    states = to_tensor(experiences['state']).to(self.device).view(self.batch_size, self.num_agents, self.state_size)
    actions = to_tensor(experiences['action']).to(self.device)
    next_states = to_tensor(experiences['next_state']).float().to(self.device).view(
        self.batch_size, self.num_agents, self.state_size)
    flat_states = states.view(-1, self.num_agents * self.state_size)
    flat_next_states = next_states.view(-1, self.num_agents * self.state_size)
    flat_actions = actions.view(-1, self.num_agents * self.action_size)

    assert agent_rewards.shape == agent_dones.shape == (self.batch_size, 1)
    assert states.shape == next_states.shape == (self.batch_size, self.num_agents, self.state_size)
    assert actions.shape == (self.batch_size, self.num_agents, self.action_size)
    assert flat_actions.shape == (self.batch_size, self.num_agents * self.action_size)

    agent = self.agents[agent_name]

    next_actions = actions.detach().clone()
    next_actions.data[:, agent_number] = agent.target_actor(next_states[:, agent_number, :])
    assert next_actions.shape == (self.batch_size, self.num_agents, self.action_size)

    # Critic loss
    Q_target_next = self.target_critic(flat_next_states, self.__flatten_actions(next_actions))
    Q_target = agent_rewards + (self.gamma * Q_target_next * (1 - agent_dones))
    Q_expected = self.critic(flat_states, flat_actions)
    loss_critic = F.mse_loss(Q_expected, Q_target)

    # Minimize the loss
    self.critic_optimizer.zero_grad()
    loss_critic.backward()
    if self.gradient_clip:
        nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip)
    self.critic_optimizer.step()
    self._loss_critic = float(loss_critic.mean().item())

    # Compute actor loss
    pred_actions = actions.detach().clone()
    # pred_actions.data[:, agent_number] = agent.actor(flat_states)
    pred_actions.data[:, agent_number] = agent.actor(states[:, agent_number, :])

    loss_actor = -self.critic(flat_states, self.__flatten_actions(pred_actions)).mean()
    agent.actor_optimizer.zero_grad()
    loss_actor.backward()
    agent.actor_optimizer.step()
    self._loss_actor[agent_name] = loss_actor.mean().item()
def act(self, state, epsilon: float = 0.):
    actions = []
    logprobs = []
    values = []
    state = to_tensor(state).view(self.num_workers, self.state_size).float().to(self.device)
    for worker in range(self.num_workers):
        actor_est = self.actor.act(state[worker].unsqueeze(0))
        assert not torch.any(torch.isnan(actor_est))

        dist = self.policy(actor_est)
        action = dist.sample()
        value = self.critic.act(state[worker].unsqueeze(0))  # Shape: (1, 1)
        logprob = self.policy.log_prob(dist, action)  # Shape: (1,)
        values.append(value)
        logprobs.append(logprob)

        if self.is_discrete:  # *Technically* it's the max of Softmax but that's monotonic.
            action = int(torch.argmax(action))
        else:
            action = torch.clamp(action * self.action_scale, self.action_min, self.action_max)
            action = action.cpu().numpy().flatten().tolist()
        actions.append(action)

    self.local_memory_buffer['value'] = torch.cat(values)
    self.local_memory_buffer['logprob'] = torch.stack(logprobs)

    assert len(actions) == self.num_workers
    return actions if self.num_workers > 1 else actions[0]
def target_act(self, staten, noise: float = 0.0):
    with torch.no_grad():
        staten = to_tensor(staten).float().to(self.device)
        action = self.target_actor(staten) + noise * self.noise.sample()
        return torch.clamp(action, self.action_min, self.action_max).cpu().numpy().astype(np.float32)
def act(self, state, epsilon: float = 0.) -> List[float]:
    """
    Returns actions for given state as per current policy.

    Parameters:
        state: Current available state from the environment.
        epsilon: Epsilon value in the epsilon-greedy policy.

    """
    actions = []
    state = to_tensor(state).view(self.num_workers, self.state_size).float().to(self.device)
    for worker in range(self.num_workers):
        if self._rng.random() < epsilon:
            action = self.action_scale * (torch.rand(self.action_size) - 0.5)
        else:
            action_seed = self.actor.act(state[worker].view(1, -1))
            action_dist = self.policy(action_seed)
            action = action_dist.sample()
            action *= self.action_scale
        action = torch.clamp(action.squeeze(), self.action_min, self.action_max).cpu()
        actions.append(action.tolist())
    assert len(actions) == self.num_workers
    return actions
def act(self, state, epsilon: float = 0.0) -> List[float]:
    """
    Returns actions for given state as per current policy.

    Parameters:
        state: Current available state from the environment.
        epsilon: Epsilon value in the epsilon-greedy policy.

    """
    state = to_tensor(state).float().to(self.device)
    if self._rng.random() < epsilon:
        action = self.action_scale * (torch.rand(self.action_size) - 0.5)
    else:
        action_seed = self.actor.act(state).view(1, -1)
        action_dist = self.policy(action_seed)
        action = action_dist.sample()
        action *= self.action_scale
        action = action.squeeze()

    # Purely for logging
    self._display_dist = self.target_critic.act(state, action.to(self.device)).squeeze().cpu()
    self._display_dist = F.softmax(self._display_dist, dim=0)

    return torch.clamp(action, self.action_min, self.action_max).cpu().tolist()
def test_to_tensor_list_tensors():
    # Assign
    int_l = [torch.tensor([0, 1, 2, 3]), torch.tensor([1, 2, 5, 10])]
    float_l = [
        torch.tensor([0.5, 1.1, 2.9, 3.0]),
        torch.tensor([0.1, 0.1, 0.1, 10.0]),
    ]

    # Act
    int_t = to_tensor(int_l)
    float_t = to_tensor(float_l)

    # Assert
    assert torch.equal(torch.stack(int_l), int_t)
    assert torch.equal(torch.stack(float_l), float_t)
    assert int_t.shape == (2, 4)
    assert float_t.shape == (2, 4)
def test_to_tensor_tensor():
    # Assign
    t = torch.tensor([0, 1, 2, 3])

    # Act
    new_t = to_tensor(t)

    # Assert
    assert torch.equal(new_t, t)
def test_to_tensor_list():
    # Assign
    int_l = [0, 1, 2, 3]
    float_l = [0.5, 1.1, 2.9, 3.0]
    int_l_shape = [list(range(4 * i, 4 * (i + 1))) for i in range(5)]

    # Act
    int_t = to_tensor(int_l)
    float_t = to_tensor(float_l)
    int_t_shape = to_tensor(int_l_shape)

    # Assert
    assert torch.equal(torch.tensor(int_l), int_t)
    assert torch.equal(torch.tensor(float_l), float_t)
    assert torch.equal(torch.tensor(int_l_shape), int_t_shape)
    assert int_t.shape == (4,)
    assert float_t.shape == (4,)
    assert int_t_shape.shape == (5, 4)
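# A minimal `to_tensor` consistent with the tests above (only a sketch; the codebase's own
# helper may handle more input types): tensors pass through unchanged, lists of tensors are
# stacked, and everything else goes through `torch.tensor`.
import torch


def to_tensor(data) -> torch.Tensor:
    if isinstance(data, torch.Tensor):
        return data
    if isinstance(data, (list, tuple)) and len(data) > 0 and isinstance(data[0], torch.Tensor):
        return torch.stack(data)
    return torch.tensor(data)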
def learn(self, experiences): """Update critics and actors""" rewards = to_tensor(experiences['reward']).float().to( self.device).unsqueeze(1) dones = to_tensor(experiences['done']).type(torch.int).to( self.device).unsqueeze(1) states = to_tensor(experiences['state']).float().to(self.device) actions = to_tensor(experiences['action']).to(self.device) next_states = to_tensor(experiences['next_state']).float().to( self.device) if (self.iteration % self.update_freq) == 0: self._update_value_function(states, actions, rewards, next_states, dones) if (self.iteration % self.update_policy_freq) == 0: self._update_policy(states) soft_update(self.target_actor, self.actor, self.tau) soft_update(self.target_critic, self.critic, self.tau)
def act(self, obs, noise: float = 0.0) -> List[float]:
    """Acting on the observations. Returns action.

    Returns:
        action: (list float) Action values.

    """
    obs = to_tensor(obs).float().to(self.device)
    action = self.actor(obs)
    action += noise * self.noise.sample()
    action = torch.clamp(action * self.action_scale, self.action_min, self.action_max)
    return action.cpu().numpy().tolist()
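# The `noise` object sampled for exploration above is only used through `sample()`; its exact
# type is not shown in this section. A minimal Gaussian noise process with the same interface
# (purely an illustrative assumption; DDPG-style agents commonly use Gaussian or
# Ornstein-Uhlenbeck noise):
import torch


class GaussianNoise:
    def __init__(self, size: int, mu: float = 0.0, sigma: float = 0.2):
        self.size = size
        self.mu = mu
        self.sigma = sigma

    def sample(self) -> torch.Tensor:
        # Independent samples from N(mu, sigma^2) for each action dimension.
        return self.mu + self.sigma * torch.randn(self.size)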
def learn(self, samples):
    """Update the critics and actors of all the agents"""
    rewards = to_tensor(samples['reward']).float().to(self.device).view(self.batch_size, 1)
    dones = to_tensor(samples['done']).int().to(self.device).view(self.batch_size, 1)
    states = to_tensor(samples['state']).float().to(self.device).view(self.batch_size, self.state_size)
    next_states = to_tensor(samples['next_state']).float().to(self.device).view(self.batch_size, self.state_size)
    actions = to_tensor(samples['action']).to(self.device).view(self.batch_size, self.action_size)

    # Critic (value) update
    for _ in range(self.critic_number_updates):
        value_loss, error = self.compute_value_loss(states, actions, rewards, next_states, dones)
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        nn.utils.clip_grad_norm_(self.critic_params, self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = value_loss.item()

    # Actor (policy) update
    for _ in range(self.actor_number_updates):
        policy_loss = self.compute_policy_loss(states)
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor_params, self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = policy_loss.item()

    if hasattr(self.memory, 'priority_update'):
        assert any(~torch.isnan(error))
        self.memory.priority_update(samples['index'], error.abs())

    soft_update(self.target_double_critic, self.double_critic, self.tau)
def train(self):
    """
    Main loop that initiates the training.
    """
    experiences = self.buffer.all_samples()
    rewards = to_tensor(experiences['reward']).to(self.device)
    dones = to_tensor(experiences['done']).type(torch.int).to(self.device)
    states = to_tensor(experiences['state']).to(self.device)
    actions = to_tensor(experiences['action']).to(self.device)
    values = to_tensor(experiences['value']).to(self.device)
    logprobs = to_tensor(experiences['logprob']).to(self.device)
    assert rewards.shape == dones.shape == values.shape == logprobs.shape
    assert states.shape == (self.rollout_length, self.num_workers, self.state_size), \
        f"Wrong states shape: {states.shape}"
    assert actions.shape == (self.rollout_length, self.num_workers, self.action_size), \
        f"Wrong action shape: {actions.shape}"

    with torch.no_grad():
        if self.using_gae:
            next_value = self.critic.act(states[-1])
            advantages = compute_gae(rewards, dones, values, next_value, self.gamma, self.gae_lambda)
            advantages = normalize(advantages)
            returns = advantages + values
            # returns = normalize(advantages + values)
            assert advantages.shape == returns.shape == values.shape
        else:
            returns = revert_norm_returns(rewards, dones, self.gamma)
            returns = returns.float()
            advantages = normalize(returns - values)
            assert advantages.shape == returns.shape == values.shape

    for _ in range(self.num_epochs):
        idx = 0
        self.kl_div = 0
        while idx < self.rollout_length:
            _states = states[idx:idx + self.batch_size].view(-1, self.state_size).detach()
            _actions = actions[idx:idx + self.batch_size].view(-1, self.action_size).detach()
            _logprobs = logprobs[idx:idx + self.batch_size].view(-1, 1).detach()
            _returns = returns[idx:idx + self.batch_size].view(-1, 1).detach()
            _advantages = advantages[idx:idx + self.batch_size].view(-1, 1).detach()
            idx += self.batch_size
            self.learn((_states, _actions, _logprobs, _returns, _advantages))

        self.kl_div = abs(self.kl_div) / (self.actor_number_updates * self.rollout_length / self.batch_size)

        if self.kl_div > self.target_kl * 1.75:
            self.kl_beta = min(2 * self.kl_beta, 1e2)  # Max 100
        if self.kl_div < self.target_kl / 1.75:
            self.kl_beta = max(0.5 * self.kl_beta, 1e-6)  # Min 0.000001
        self._metrics['policy/kl_beta'] = self.kl_beta
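# `compute_gae` is assumed to implement Generalized Advantage Estimation (Schulman et al.,
# 2016): delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t), and
# A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}. A sketch over the
# (rollout_length, num_workers)-shaped inputs used in `train` above; the codebase's own
# helper may differ in signature and details:
import torch


def compute_gae(rewards, dones, values, next_value, gamma: float = 0.99, lamb: float = 0.95):
    masks = 1 - dones
    next_value = next_value.view_as(values[-1])  # Align bootstrap value with one row of `values`.
    advantages = torch.zeros_like(values)
    gae = torch.zeros_like(values[-1])
    # Sweep backwards through the rollout, accumulating the exponentially-weighted TD errors.
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        gae = delta + gamma * lamb * gae * masks[t]
        advantages[t] = gae
        next_value = values[t]
    return advantages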
def learn(self, experiences: Dict[str, list]) -> None:
    """Updates agent's networks based on provided experience.

    Parameters:
        experiences: Samples experiences from the experience buffer.

    """
    rewards = to_tensor(experiences['reward']).type(torch.float32).to(self.device)
    dones = to_tensor(experiences['done']).type(torch.int).to(self.device)
    states = to_tensor(experiences['state']).type(torch.float32).to(self.device)
    next_states = to_tensor(experiences['next_state']).type(torch.float32).to(self.device)
    actions = to_tensor(experiences['action']).type(torch.long).to(self.device)

    with torch.no_grad():
        Q_targets_next = self.target_net.act(next_states).detach()
        if self.using_double_q:
            _a = torch.argmax(self.net(next_states), dim=-1).unsqueeze(-1)
            max_Q_targets_next = Q_targets_next.gather(1, _a)
        else:
            max_Q_targets_next = Q_targets_next.max(1)[0].unsqueeze(1)
    Q_targets = rewards + self.n_buffer.n_gammas[-1] * max_Q_targets_next * (1 - dones)
    Q_expected: torch.Tensor = self.net(states).gather(1, actions)
    loss = F.mse_loss(Q_expected, Q_targets)

    self.optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
    self.optimizer.step()
    self._loss = float(loss.item())

    if hasattr(self.buffer, 'priority_update'):
        error = Q_expected - Q_targets
        assert any(~torch.isnan(error))
        self.buffer.priority_update(experiences['index'], error.abs())
def learn(self, experiences) -> None:
    """Update critics and actors"""
    rewards = to_tensor(experiences['reward']).float().to(self.device).unsqueeze(1)
    dones = to_tensor(experiences['done']).type(torch.int).to(self.device).unsqueeze(1)
    states = to_tensor(experiences['state']).float().to(self.device)
    actions = to_tensor(experiences['action']).to(self.device)
    next_states = to_tensor(experiences['next_state']).float().to(self.device)

    assert rewards.shape == dones.shape == (self.batch_size, 1)
    assert states.shape == next_states.shape == (self.batch_size, self.state_size)
    assert actions.shape == (self.batch_size, self.action_size)

    # Value (critic) optimization
    loss_critic = self.compute_value_loss(states, actions, next_states, rewards, dones)
    self.critic_optimizer.zero_grad()
    loss_critic.backward()
    nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm_critic)
    self.critic_optimizer.step()
    self._loss_critic = float(loss_critic.item())

    # Policy (actor) optimization
    loss_actor = self.compute_policy_loss(states)
    self.actor_optimizer.zero_grad()
    loss_actor.backward()
    nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm_actor)
    self.actor_optimizer.step()
    self._loss_actor = loss_actor.item()

    # Soft update target weights
    soft_update(self.target_actor, self.actor, self.tau)
    soft_update(self.target_critic, self.critic, self.tau)
def act(self, state, epsilon: float = 0.0, deterministic=False) -> List[float]:
    if self.iteration < self.warm_up or self._rng.random() < epsilon:
        # Uniform sample in [action_min, action_max]
        random_action = torch.rand(self.action_size) * (self.action_max - self.action_min) + self.action_min
        return random_action.cpu().tolist()

    state = to_tensor(state).view(1, self.state_size).float().to(self.device)
    proto_action = self.actor(state)
    action = self.policy(proto_action, deterministic)
    return action.flatten().tolist()
def act(self, state, eps: float = 0.) -> int: """ Returns actions for given state as per current policy. Parameters: state: Current available state from the environment. epislon: Epsilon value in the epislon-greedy policy. """ # Epsilon-greedy action selection if self._rng.random() < eps: return self._rng.randint(0, self.action_size-1) state = to_tensor(self.state_transform(state)).float().unsqueeze(0).to(self.device) # state = to_tensor(self.state_transform(state)).float().to(self.device) self.dist_probs = self.net.act(state) q_values = (self.dist_probs * self.z_atoms).sum(-1) return int(q_values.argmax(-1)) # Action maximizes state-action value Q(s, a)
def act(self, state, eps: float = 0.) -> int: """Returns actions for given state as per current policy. Parameters: state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection Returns: Categorical value for the action. """ # Epsilon-greedy action selection if self._rng.random() < eps: return self._rng.randint(0, self.action_size - 1) state = to_tensor(self.state_transform(state)).float() state = state.unsqueeze(0).to(self.device) action_values = self.net.act(state) return int(torch.argmax(action_values.cpu()))
def act(self, state, epsilon: float = 0.0, training_mode=True) -> List[float]:
    """
    Agent acting on observations.

    When `training_mode` is True (default), noise is added to each action.
    """
    # Epsilon greedy
    if self._rng.random() < epsilon:
        # Uniform sample in [action_min, action_max]
        rnd_actions = torch.rand(self.action_size) * (self.action_max - self.action_min) + self.action_min
        return rnd_actions.tolist()

    with torch.no_grad():
        state = to_tensor(state).float().to(self.device)
        action = self.actor(state)
        if training_mode:
            action += self.noise.sample()
        return (self.action_scale * torch.clamp(action, self.action_min, self.action_max)).tolist()