Example #1
    def value(self, state, action):
        state_var = to_tensor_var([state], self.use_cuda)
        action_var = to_tensor_var([action], self.use_cuda)
        # self.whole_critic_state_dim / self.whole_critic_action_dim are the
        # per-agent observation and action dimensions summed over all agents
        whole_state_var = state_var.view(-1, self.whole_critic_state_dim)
        whole_action_var = action_var.view(-1, self.whole_critic_action_dim)

        values = [0] * self.n_agents
        for agent_id in range(self.n_agents):
            if self.training_strategy == "cocurrent":
                value_var = self.critics[agent_id](state_var[:, agent_id, :],
                                                   action_var[:, agent_id, :])
            elif self.training_strategy == "centralized":
                value_var = self.critics[agent_id](whole_state_var,
                                                   whole_action_var)
            if self.use_cuda:
                values[agent_id] = value_var.data.cpu().numpy()[0]
            else:
                values[agent_id] = value_var.data.numpy()[0]
        return values
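Every example on this page relies on a to_tensor_var helper that is not shown. A minimal sketch of what such a helper could look like, assuming it simply converts nested lists or NumPy arrays to a torch tensor and optionally moves it to the GPU (the dtype handling below is an assumption, not the library's verbatim code):

import numpy as np
import torch as th

def to_tensor_var(x, use_cuda=True, dtype="float"):
    # Sketch of the helper used throughout these examples; the real
    # implementation may differ. "long" is used for integer action indices.
    th_dtype = th.int64 if dtype == "long" else th.float32
    tensor = th.tensor(np.asarray(x), dtype=th_dtype)
    return tensor.cuda() if use_cuda else tensor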
Example #2
    def get_loss(self):
        if self.n_episodes <= self.episodes_before_train:
            # not enough exploration yet; skip this update
            return None

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(
            batch.states, self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(
            one_hot_actions, self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # Get the actor network loss
        self.actor_optimizer.zero_grad()
        action_log_probs = self.actor(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        values = self.critic(states_var, actions_var)
        advantages = rewards_var - values.detach()
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg

        # Get the critic network loss
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)

        combined_loss = {'actor_loss': actor_loss,
                         'critic_loss': critic_loss}

        return combined_loss
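The discrete-action examples also call index_to_one_hot and entropy helpers. A plausible sketch, assuming index_to_one_hot accepts either a single index or a sequence of indices and entropy acts row-wise on a batch of probability vectors:

import numpy as np
import torch as th

def index_to_one_hot(index, dim):
    # Map a single action index or a sequence of indices to one-hot vectors.
    # Sketch only; the library's exact implementation may differ.
    if np.isscalar(index):
        one_hot = np.zeros(dim)
        one_hot[index] = 1.0
    else:
        idx = np.asarray(index, dtype=np.int64)
        one_hot = np.zeros((len(idx), dim))
        one_hot[np.arange(len(idx)), idx] = 1.0
    return one_hot

def entropy(probs):
    # Shannon entropy of each row of a batch of categorical distributions.
    return -th.sum(probs * th.log(probs + 1e-8), dim=1)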
Example #3
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            # skip training until there has been enough exploration
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
        actions_var = to_tensor_var(batch.actions, self.use_cuda, "long").view(-1, 1)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
        next_states_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)
        dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

        # compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        current_q = self.actor(states_var).gather(1, actions_var)

        # compute Q(s_{t+1}, a) for all actions in the next states,
        # then take max_a Q(s_{t+1}, a)
        next_state_action_values = self.actor(next_states_var).detach()
        next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)
        # target q: reward_scale * r + gamma * max_a Q(s_{t+1}, a) * (1 - done)
        target_q = self.reward_scale * rewards_var + self.reward_gamma * next_q * (1. - dones_var)

        # update value network
        self.actor_optimizer.zero_grad()
        if self.critic_loss == "huber":
            loss = th.nn.functional.smooth_l1_loss(current_q, target_q)
        else:
            loss = th.nn.MSELoss()(current_q, target_q)
        loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
        self.actor_optimizer.step()
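All of the train/get_loss methods sample from self.memory, and the returned batch exposes states, actions, rewards, next_states and dones fields. A minimal replay buffer consistent with that interface could look like the following (the class name, the push method, and the capacity handling are assumptions inferred from the usage above):

import random
from collections import namedtuple

Experience = namedtuple("Experience",
                        ("states", "actions", "rewards", "next_states", "dones"))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def push(self, state, action, reward, next_state, done):
        # Drop the oldest transition once the buffer is full.
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Each field of the returned Experience is a tuple with one entry per
        # sampled transition, matching the batch.states / batch.actions
        # accesses in the examples above.
        transitions = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        return Experience(*zip(*transitions))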
Example #4
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            # skip training until there has been enough exploration
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        self.actor_optimizer.zero_grad()
        action_log_probs = self.actor(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        values = self.critic(states_var, actions_var)
        advantages = rewards_var - values.detach()
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
        self.actor_optimizer.step()

        # update critic network
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
Example #5
 def value(self, state, action):
     state_var = to_tensor_var([state], self.use_cuda)
     action = index_to_one_hot(action, self.action_dim)
     action_var = to_tensor_var([action], self.use_cuda)
     value_var = self.critic(state_var, action_var)
     if self.use_cuda:
         value = value_var.data.cpu().numpy()[0]
     else:
         value = value_var.data.numpy()[0]
     return value
Example #6
    def train(self):
        # do not train until exploration is enough
        if self.n_episodes <= self.episodes_before_train:
            return

        batch = self.memory.sample(self.batch_size)
        state_var = to_tensor_var(batch.states,
                                  self.use_cuda).view(-1, self.state_dim)
        action_var = to_tensor_var(batch.actions,
                                   self.use_cuda).view(-1, self.action_dim)
        reward_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
        next_state_var = to_tensor_var(batch.next_states,
                                       self.use_cuda).view(-1, self.state_dim)
        done_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

        # estimate the target q with actor_target network and critic_target network
        next_action_var = self.actor_target(next_state_var)
        next_q = self.critic_target(next_state_var, next_action_var).detach()
        target_q = self.reward_scale * reward_var + self.reward_gamma * next_q * (
            1. - done_var)

        # update critic network
        self.critic_optimizer.zero_grad()
        # current Q values
        current_q = self.critic(state_var, action_var)
        # regress current Q values toward the bootstrapped target Q values
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(current_q, target_q)
        else:
            critic_loss = nn.MSELoss()(current_q, target_q)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(),
                                     self.max_grad_norm)
        self.critic_optimizer.step()

        # update actor network
        self.actor_optimizer.zero_grad()
        # actions predicted by the current actor
        action = self.actor(state_var)
        # actor_loss is used to maximize the Q value for the predicted action
        actor_loss = -self.critic(state_var, action)
        actor_loss = actor_loss.mean()
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(),
                                     self.max_grad_norm)
        self.actor_optimizer.step()

        # update actor target network and critic target network
        if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
            super(DDPG, self)._soft_update_target(self.critic_target,
                                                  self.critic)
            super(DDPG, self)._soft_update_target(self.actor_target,
                                                  self.actor)
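The target-network updates in Example #6 call a _soft_update_target method inherited from the base agent class. This is the standard Polyak soft update; a sketch, where the attribute name self.target_tau is an assumption:

    def _soft_update_target(self, target, source):
        # Polyak update: target <- (1 - tau) * target + tau * source.
        # self.target_tau is assumed to be the soft-update coefficient.
        for t_param, param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_((1.0 - self.target_tau) * t_param.data
                               + self.target_tau * param.data)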
Example #7
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            # skip training until there has been enough exploration
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states,
                                   self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions,
                                    self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        self.actor_optimizer.zero_grad()
        values = self.critic_target(states_var, actions_var).detach()
        advantages = rewards_var - values
        # Note: normalizing the advantages did not seem to work correctly here
        # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
        action_log_probs = self.actor(states_var)
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        old_action_log_probs = self.actor_target(states_var).detach()
        old_action_log_probs = th.sum(old_action_log_probs * actions_var, 1)
        ratio = th.exp(action_log_probs - old_action_log_probs)
        surr1 = ratio * advantages
        surr2 = th.clamp(ratio, 1.0 - self.clip_param,
                         1.0 + self.clip_param) * advantages
        # PPO's pessimistic surrogate (L^CLIP)
        actor_loss = -th.mean(th.min(surr1, surr2))
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(),
                                     self.max_grad_norm)
        self.actor_optimizer.step()

        # update critic network
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        values = self.critic(states_var, actions_var)
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(),
                                     self.max_grad_norm)
        self.critic_optimizer.step()

        # update actor target network and critic target network
        if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
            super(PPO, self)._soft_update_target(self.actor_target, self.actor)
            super(PPO, self)._soft_update_target(self.critic_target,
                                                 self.critic)
Example #8
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            # skip training until there has been enough exploration
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states,
                                   self.use_cuda).view(-1, self.n_agents,
                                                       self.state_dim)
        actions_var = to_tensor_var(batch.actions, self.use_cuda).view(
            -1, self.n_agents, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards,
                                    self.use_cuda).view(-1, self.n_agents, 1)
        whole_states_var = states_var.view(-1, self.n_agents * self.state_dim)
        whole_actions_var = actions_var.view(-1,
                                             self.n_agents * self.action_dim)

        for agent_id in range(self.n_agents):
            # update actor network
            self.actor_optimizers[agent_id].zero_grad()
            action_log_probs = self.actors[agent_id](states_var[:,
                                                                agent_id, :])
            entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
            action_log_probs = th.sum(
                action_log_probs * actions_var[:, agent_id, :], 1)
            if self.training_strategy == "cocurrent":
                values = self.critics[agent_id](states_var[:, agent_id, :],
                                                actions_var[:, agent_id, :])
            elif self.training_strategy == "centralized":
                values = self.critics[agent_id](whole_states_var,
                                                whole_actions_var)
            advantages = rewards_var[:, agent_id, :] - values.detach()
            pg_loss = -th.mean(action_log_probs * advantages)
            actor_loss = pg_loss - entropy_loss * self.entropy_reg
            actor_loss.backward()
            if self.max_grad_norm is not None:
                nn.utils.clip_grad_norm_(self.actors[agent_id].parameters(),
                                         self.max_grad_norm)
            self.actor_optimizers[agent_id].step()

            # update critic network
            self.critic_optimizers[agent_id].zero_grad()
            target_values = rewards_var[:, agent_id, :]
            if self.critic_loss == "huber":
                critic_loss = nn.functional.smooth_l1_loss(
                    values, target_values)
            else:
                critic_loss = nn.MSELoss()(values, target_values)
            critic_loss.backward()
            if self.max_grad_norm is not None:
                nn.utils.clip_grad_norm_(self.critics[agent_id].parameters(),
                                         self.max_grad_norm)
            self.critic_optimizers[agent_id].step()
Example #9
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            # skip training until there has been enough exploration
            return

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states,
                                   self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions,
                                    self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        action_log_probs, values = self.actor_critic(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        # fisher loss
        if self.optimizer.steps % self.optimizer.Ts == 0:
            self.actor_critic.zero_grad()
            pg_fisher_loss = th.mean(action_log_probs)
            values_noise = to_tensor_var(np.random.randn(values.size()[0]),
                                         self.use_cuda)
            sample_values = (values + values_noise.view(-1, 1)).detach()
            if self.critic_loss == "huber":
                vf_fisher_loss = -nn.functional.smooth_l1_loss(
                    values, sample_values)
            else:
                vf_fisher_loss = -nn.MSELoss()(values, sample_values)
            joint_fisher_loss = pg_fisher_loss + self.vf_fisher_coef * vf_fisher_loss
            self.optimizer.acc_stats = True
            joint_fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False
        self.optimizer.zero_grad()
        # actor loss
        advantages = rewards_var - values.detach()
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg
        # critic loss
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        loss = actor_loss + critic_loss
        loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)
        self.optimizer.step()
Example #10
 def action(self, state):
     action_var = self.actor(to_tensor_var([state], self.use_cuda))
     if self.use_cuda:
         action = action_var.data.cpu().numpy()[0]
     else:
         action = action_var.data.numpy()[0]
     return action
Example #11
 def value(self, state, action):
     state_var = to_tensor_var([state], self.use_cuda)
     value_var = self.actor_critic(state_var)[1]
     if self.use_cuda:
         value = value_var.data.cpu().numpy()[0]
     else:
         value = value_var.data.numpy()[0]
     return value
Example #12
 def _softmax_action(self, state):
     state_var = to_tensor_var([state], self.use_cuda)
     softmax_action_var = th.exp(self.actor(state_var))
     if self.use_cuda:
         softmax_action = softmax_action_var.data.cpu().numpy()[0]
     else:
         softmax_action = softmax_action_var.data.numpy()[0]
     return softmax_action
Example #13
 def action(self, state):
     state_var = to_tensor_var([state], self.use_cuda)
     state_action_value_var = self.actor(state_var)
     if self.use_cuda:
         state_action_value = state_action_value_var.data.cpu().numpy()[0]
     else:
         state_action_value = state_action_value_var.data.numpy()[0]
     action = np.argmax(state_action_value)
     return action
Example #14
 def _softmax_action(self, state):
     state_var = to_tensor_var([state], self.use_cuda)
     softmax_action = np.zeros((self.n_agents, self.action_dim), dtype=np.float64)
     for agent_id in range(self.n_agents):
         softmax_action_var = self.actors[agent_id](state_var[:,agent_id,:])
         if self.use_cuda:
             softmax_action[agent_id] = softmax_action_var.data.cpu().numpy()[0]
         else:
             softmax_action[agent_id] = softmax_action_var.data.numpy()[0]
     return softmax_action
Example #15
 def value(self, state, action):
     state_var = to_tensor_var([state], self.use_cuda)
     action_var = to_tensor_var([action], self.use_cuda)
     whole_state_var = state_var.view(-1, self.n_agents*self.obs_shape_n)
     whole_action_var = action_var.view(-1, self.n_agents*self.act_shape_n)
     values = [0]*self.n_agents
     for agent_id in range(self.n_agents):
         if self.training_strategy == "cocurrent":
             value_var = self.critics[agent_id](state_var[:,agent_id,:], action_var[:,agent_id,:])
         elif self.training_strategy == "centralized":
             value_var = self.critics[agent_id](whole_state_var, whole_action_var)
         if self.use_cuda:
             values[agent_id] = value_var.data.cpu().numpy()[0]
         else:
             values[agent_id] = value_var.data.numpy()[0]
     return values
Example #16
 def _softmax_action(self, state):
     try:
         state_var = to_tensor_var([state], self.use_cuda)
     except ValueError as e:
         print([state])
         sys.exit(0)
     softmax_action = np.zeros((self.n_agents, self.act_shape_n), dtype=np.float64)
     for agent_id in range(self.n_agents):
         softmax_action_var = th.exp(self.actors[agent_id](state_var[:,agent_id,:]))
         if self.use_cuda:
             softmax_action[agent_id] = softmax_action_var.data.cpu().numpy()[0]
         else:
             softmax_action[agent_id] = softmax_action_var.data.numpy()[0]
     return softmax_action
Example #17
    def _softmax_action(self, state):
        state_var = to_tensor_var([state], self.use_cuda)
        softmax_action = np.zeros((self.n_agents, self.act_shape_n[0]),
                                  dtype=np.float64)

        for agent_id in range(self.n_agents):
            softmax_action_var = th.exp(self.actors[agent_id](
                state_var[:, agent_id, :]))
            if self.use_cuda:
                softmax_action[agent_id] = softmax_action_var.data.cpu().numpy(
                )[0]
            else:
                softmax_action[agent_id] = softmax_action_var.data.numpy()[0]
        return softmax_action
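Taken together, the action/_softmax_action methods pick actions during rollouts and the train methods run one optimization step from the replay memory. A typical single-agent interaction loop might look like this (the agent class name, its constructor arguments, the memory.push call, and the classic pre-0.26 Gym step API are all assumptions, not part of the library shown above):

import gym

env = gym.make("CartPole-v1")
agent = DQNAgent(state_dim=env.observation_space.shape[0],  # hypothetical agent class
                 action_dim=env.action_space.n)

for episode in range(1000):
    state = env.reset()          # classic Gym API: reset() returns only the observation
    done = False
    while not done:
        a = agent.action(state)                      # greedy action, as in Example #13
        next_state, reward, done, _ = env.step(a)    # classic 4-tuple step API
        agent.memory.push(state, a, reward, next_state, done)  # assumed buffer method
        state = next_state
    agent.n_episodes += 1
    agent.train()                                    # one update per episode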