def get_loss(self):
    # do not compute a training loss until enough warm-up episodes are collected
    if self.n_episodes <= self.episodes_before_train:
        return

    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # get the actor network loss
    self.actor_optimizer.zero_grad()
    action_log_probs = self.actor(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)
    values = self.critic(states_var, actions_var)
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg

    # get the critic network loss
    self.critic_optimizer.zero_grad()
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)

    combined_loss = {'actor_loss': actor_loss, 'critic_loss': critic_loss}
    return combined_loss
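# The loss above relies on a few small utilities (entropy, index_to_one_hot,
# to_tensor_var). The definitions below are only a minimal sketch of the
# behavior assumed by this code, not the repo's exact implementations.
import numpy as np
import torch as th


def entropy(p):
    """Entropy of a batch of probability vectors of shape (batch, action_dim)."""
    return -th.sum(p * th.log(p + 1e-8), dim=1)


def index_to_one_hot(index, dim):
    """Convert an action index, or a sequence of indices, to one-hot vector(s)."""
    if np.isscalar(index):
        one_hot = np.zeros(dim)
        one_hot[index] = 1.0
        return one_hot
    index = np.asarray(index)
    one_hot = np.zeros((len(index), dim))
    one_hot[np.arange(len(index)), index] = 1.0
    return one_hot


def to_tensor_var(x, use_cuda=True):
    """Wrap a numpy array or (nested) list as a float tensor on CPU or GPU."""
    tensor = th.as_tensor(np.asarray(x), dtype=th.float32)
    return tensor.cuda() if use_cuda else tensor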
def train(self):
    # do not train until enough warm-up episodes are collected
    if self.n_episodes <= self.episodes_before_train:
        return

    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # update actor network
    self.actor_optimizer.zero_grad()
    action_log_probs = self.actor(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)
    values = self.critic(states_var, actions_var)
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg
    actor_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
    self.actor_optimizer.step()

    # update critic network
    self.critic_optimizer.zero_grad()
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)
    critic_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
    self.critic_optimizer.step()
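# The memory sampled in train() is assumed to store whole rollouts pushed by
# interact() and to return a batch as a namedtuple with .states, .actions and
# .rewards fields. A minimal sketch of such a buffer (an assumption, not the
# repo's actual memory class):
import random
from collections import namedtuple

Experience = namedtuple("Experience", ("states", "actions", "rewards"))


class RolloutMemory(object):
    """Fixed-capacity buffer of (state, action, discounted-return) transitions."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, states, actions, rewards):
        # each step of the rollout becomes one stored transition
        for transition in zip(states, actions, rewards):
            if len(self.buffer) < self.capacity:
                self.buffer.append(None)
            self.buffer[self.position] = transition
            self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        return Experience(*zip(*batch))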
def value(self, state, action):
    state_var = to_tensor_var([state], self.use_cuda)
    action = index_to_one_hot(action, self.action_dim)
    action_var = to_tensor_var([action], self.use_cuda)
    value_var = self.critic(state_var, action_var)
    if self.use_cuda:
        value = value_var.data.cpu().numpy()[0]
    else:
        value = value_var.data.numpy()[0]
    return value
def interact(self):
    if (self.max_steps is not None) and (self.n_steps >= self.max_steps):
        self.env_state = self.env.reset()
        self.n_steps = 0

    states = []
    actions = []
    rewards = []

    # take n steps
    for i in range(self.roll_out_n_steps):
        states.append(self.env_state)
        action = self.exploration_action(self.env_state)
        next_state, reward, done, _ = self.env.step(action)
        done = done[0]
        actions.append([index_to_one_hot(a, self.action_dim) for a in action])
        rewards.append(reward)
        final_state = next_state
        self.env_state = next_state
        if done:
            self.env_state = self.env.reset()
            break

    # discount reward
    if done:
        final_r = [0.0] * self.n_agents
        self.n_episodes += 1
        self.episode_done = True
    else:
        self.episode_done = False
        final_action = self.action(final_state)
        one_hot_action = [index_to_one_hot(a, self.action_dim) for a in final_action]
        final_r = self.value(final_state, one_hot_action)

    rewards = np.array(rewards)
    for agent_id in range(self.n_agents):
        rewards[:, agent_id] = self._discount_reward(rewards[:, agent_id], final_r[agent_id])
    rewards = rewards.tolist()
    self.n_steps += 1
    self.memory.push(states, actions, rewards)
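# _discount_reward(rewards, final_r) used above is assumed to compute n-step
# discounted returns for one agent, bootstrapping from the final value estimate.
# A standalone sketch (the agent's method presumably reads gamma from an
# attribute such as self.reward_gamma):
import numpy as np


def discount_reward(rewards, gamma, final_value):
    """Backward-accumulate gamma-discounted returns, starting from final_value."""
    discounted = np.zeros(len(rewards))
    running = final_value
    for t in reversed(range(len(rewards))):
        running = running * gamma + rewards[t]
        discounted[t] = running
    return discounted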
def train(self):
    # do not train until enough warm-up episodes are collected
    if self.n_episodes <= self.episodes_before_train:
        return

    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # update actor network
    self.actor_optimizer.zero_grad()
    values = self.critic_target(states_var, actions_var).detach()
    advantages = rewards_var - values
    # normalizing advantages does not seem to work correctly here
    # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
    action_log_probs = self.actor(states_var)
    action_log_probs = th.sum(action_log_probs * actions_var, 1)
    old_action_log_probs = self.actor_target(states_var).detach()
    old_action_log_probs = th.sum(old_action_log_probs * actions_var, 1)
    ratio = th.exp(action_log_probs - old_action_log_probs)
    surr1 = ratio * advantages
    surr2 = th.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantages
    # PPO's pessimistic surrogate (L^CLIP)
    actor_loss = -th.mean(th.min(surr1, surr2))
    actor_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
    self.actor_optimizer.step()

    # update critic network
    self.critic_optimizer.zero_grad()
    target_values = rewards_var
    values = self.critic(states_var, actions_var)
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)
    critic_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
    self.critic_optimizer.step()

    # update actor target network and critic target network
    if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
        super(PPO, self)._soft_update_target(self.actor_target, self.actor)
        super(PPO, self)._soft_update_target(self.critic_target, self.critic)
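# _soft_update_target used above is assumed to perform a Polyak (soft) update of
# the target network toward the online network. A minimal sketch, assuming a
# mixing coefficient tau (e.g. self.target_tau) on the agent:
def soft_update_target(target, source, tau):
    """target <- (1 - tau) * target + tau * source, applied parameter-wise."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            (1.0 - tau) * target_param.data + tau * source_param.data)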
def eval_action(self, state):
    # greedy (argmax) action for each agent, returned as one-hot vectors
    actions = [0] * self.n_agents
    one_hot_actions = []
    softmax_actions = self._softmax_action(state)
    for agent_id in range(self.n_agents):
        actions[agent_id] = np.argmax(softmax_actions[agent_id])
        one_hot_actions.append(index_to_one_hot(actions[agent_id], self.act_shape_n[agent_id]))
    return one_hot_actions
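# _softmax_action used in eval_action is assumed to run the actor once and return
# per-agent action probabilities. A minimal sketch, assuming the actor outputs
# log-probabilities (consistent with th.exp(action_log_probs) in train above):
def _softmax_action(self, state):
    state_var = to_tensor_var([state], self.use_cuda)
    softmax_action_var = th.exp(self.actor(state_var))  # log-probs -> probs
    if self.use_cuda:
        return softmax_action_var.data.cpu().numpy()[0]
    return softmax_action_var.data.numpy()[0]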
def train(self):
    # do not train until enough warm-up episodes are collected
    if self.n_episodes <= self.episodes_before_train:
        return

    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # update the joint actor-critic network
    action_log_probs, values = self.actor_critic(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)

    # fisher loss: periodically accumulate curvature statistics for the optimizer
    # from the policy output and a noisy sample of the value output
    if self.optimizer.steps % self.optimizer.Ts == 0:
        self.actor_critic.zero_grad()
        pg_fisher_loss = th.mean(action_log_probs)
        values_noise = to_tensor_var(np.random.randn(values.size()[0]), self.use_cuda)
        sample_values = (values + values_noise.view(-1, 1)).detach()
        if self.critic_loss == "huber":
            vf_fisher_loss = -nn.functional.smooth_l1_loss(values, sample_values)
        else:
            vf_fisher_loss = -nn.MSELoss()(values, sample_values)
        joint_fisher_loss = pg_fisher_loss + self.vf_fisher_coef * vf_fisher_loss
        self.optimizer.acc_stats = True
        joint_fisher_loss.backward(retain_graph=True)
        self.optimizer.acc_stats = False

    self.optimizer.zero_grad()
    # actor loss
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg
    # critic loss
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)

    loss = actor_loss + critic_loss
    loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm)
    self.optimizer.step()
def interact(self):
    if (self.max_steps is not None) and (self.n_steps >= self.max_steps):
        self.env_state, self.info_state = self.env.reset()
        self.env_state = self._normalize_state(self.env_state)
        self.n_steps = 0

    states = []
    actions = []
    rewards = []
    if type(self.env_state) is dict:
        self.env_state = self._normalize_state(self.env_state)

    # take n steps
    for i in range(self.roll_out_n_steps):
        assert type(self.env_state) is not dict, f"{self.n_steps}"
        states.append(self.env_state)
        action = self.exploration_action(self.env_state)
        next_state, reward, done, info = self.env.step(action)
        # keep the previous observation for agents that are done or do not need
        # to act, and normalize the new observation for the others
        agent_obs = [None] * self.n_agents
        for agent in self.env.get_agent_handles():
            if not info['action_required'][agent] or done[agent]:
                agent_obs[agent] = self.env_state[agent]
                continue
            if next_state[agent]:
                agent_obs[agent] = normalize_observation(next_state[agent], 3, 30)
            else:
                agent_obs[agent] = self.env_state[agent]
        next_state = agent_obs
        done = done['__all__']
        actions.append([index_to_one_hot(a, self.act_shape_n) for a in action])
        rewards.append(reward)
        final_state = next_state
        self.env_state = next_state
        if done:
            self.env_state, self.info_state = self.env.reset()
            self.env_state = self._normalize_state(self.env_state)
            self.n_steps = 0
            break
        # for displaying learned policies
        # time.sleep(0.1)
        # self.env.render()

    # discount reward
    if done:
        final_r = [0.0] * self.n_agents
        self.n_episodes += 1
        self.episode_done = True
    else:
        self.episode_done = False
        final_action = self.action(final_state)
        one_hot_action = [index_to_one_hot(a, self.act_shape_n) for a in final_action]
        final_r = self.value(final_state, one_hot_action)

    # the environment returns per-agent rewards keyed by agent handle;
    # convert them to a (n_steps, n_agents) array before discounting
    rewards = np.array([[reward[agent_id] for agent_id in range(self.n_agents)]
                        for reward in rewards])
    for agent_id in range(self.n_agents):
        rewards[:, agent_id] = self._discount_reward(rewards[:, agent_id], final_r[agent_id])
    rewards = rewards.tolist()
    self.n_steps += 1
    self.memory.push(states, actions, rewards)
def interact(self):
    states = []
    actions = []
    rewards = []
    terminal = False

    # take n steps
    for i in range(self.roll_out_n_steps):
        states.append(self.env_state)
        action = self.exploration_action(self.env_state)
        one_hot_actions = [index_to_one_hot(action[agent_id], self.act_shape_n[agent_id])
                           for agent_id in range(self.n_agents)]
        next_state, reward, done, _ = self.env.step(one_hot_actions)
        done = done[0]
        actions.append(one_hot_actions)
        rewards.append(reward)
        final_state = next_state
        self.env_state = next_state
        if (self.max_steps is not None) and (self.n_steps >= self.max_steps):
            terminal = True
        if done or terminal:
            self.env_state = self.env.reset()
            self.n_steps = 0
            break
        # for displaying learned policies
        # time.sleep(0.1)
        # self.env.render()

    # discount reward
    if done or terminal:
        final_r = [0.0] * self.n_agents
        self.n_episodes += 1
        self.episode_done = True
    else:
        self.episode_done = False
        final_action = self.action(final_state)
        one_hot_action = [index_to_one_hot(final_action[agent_id], self.act_shape_n[agent_id])
                          for agent_id in range(self.n_agents)]
        final_r = self.value(final_state, one_hot_action)

    rewards = np.array(rewards)
    for agent_id in range(self.n_agents):
        rewards[:, agent_id] = self._discount_reward(rewards[:, agent_id], final_r[agent_id])
    rewards = rewards.tolist()
    self.n_steps += 1
    self.memory.push(states, actions, rewards)
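# exploration_action used in interact() is assumed to pick one stochastic action
# per agent, e.g. epsilon-greedy over the actor's softmax output with a decaying
# epsilon. A minimal sketch, assuming epsilon_start / epsilon_end / epsilon_decay
# attributes on the agent:
def exploration_action(self, state):
    softmax_actions = self._softmax_action(state)
    epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
        np.exp(-1.0 * self.n_steps / self.epsilon_decay)
    actions = []
    for agent_id in range(self.n_agents):
        if np.random.rand() < epsilon:
            actions.append(np.random.choice(self.act_shape_n[agent_id]))
        else:
            actions.append(int(np.argmax(softmax_actions[agent_id])))
    return actions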