Example #1
    def get_loss(self):
        if self.n_episodes <= self.episodes_before_train:
            return None  # not enough experience collected yet

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(
            batch.states, self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(
            one_hot_actions, self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # Get the actor network loss
        self.actor_optimizer.zero_grad()
        action_log_probs = self.actor(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        values = self.critic(states_var, actions_var)
        advantages = rewards_var - values.detach()
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg

        # Get the critic network loss
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)

        combined_loss = {'actor_loss': actor_loss,
                         'critic_loss': critic_loss}

        return combined_loss
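A couple of small utility functions are assumed by this and the following examples. A minimal, hypothetical sketch of index_to_one_hot and entropy is given below; the project's own implementations may differ in detail.

# Hypothetical sketch of two helpers assumed by these examples; the repository's
# own versions may differ.
import numpy as np
import torch as th

def index_to_one_hot(index, dim):
    # Map an integer action index (or a sequence of indices) to one-hot vectors.
    if isinstance(index, (int, np.integer)):
        one_hot = np.zeros(dim)
        one_hot[index] = 1.0
        return one_hot
    return np.array([index_to_one_hot(i, dim) for i in index])

def entropy(probs):
    # Per-sample Shannon entropy of a batch of probability vectors.
    return -th.sum(probs * th.log(probs + 1e-8), dim=1)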
Example #2
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            return  # not enough experience collected yet

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        self.actor_optimizer.zero_grad()
        action_log_probs = self.actor(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        values = self.critic(states_var, actions_var)
        advantages = rewards_var - values.detach()
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
        self.actor_optimizer.step()

        # update critic network
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
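to_tensor_var, used throughout these examples, is assumed to turn nested lists or numpy arrays into (optionally CUDA) float tensors. A plausible sketch, not the project's actual helper:

# Plausible to_tensor_var helper as assumed by these snippets; the real utility may also
# handle integer dtypes or older Variable-based APIs.
import numpy as np
import torch as th

def to_tensor_var(x, use_cuda=True, dtype=th.float32):
    tensor = th.as_tensor(np.asarray(x), dtype=dtype)
    return tensor.cuda() if use_cuda else tensor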
Example #3
    def value(self, state, action):
        state_var = to_tensor_var([state], self.use_cuda)
        action = index_to_one_hot(action, self.action_dim)
        action_var = to_tensor_var([action], self.use_cuda)
        value_var = self.critic(state_var, action_var)
        if self.use_cuda:
            value = value_var.data.cpu().numpy()[0]
        else:
            value = value_var.data.numpy()[0]
        return value
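value() assumes a critic module that takes a batch of states and one-hot actions and returns one scalar per row. A hypothetical architecture consistent with that call signature (layer sizes are illustrative only):

import torch as th
import torch.nn as nn

class CriticNetwork(nn.Module):
    # Hypothetical critic matching the critic(state, action) call above.
    def __init__(self, state_dim, action_dim, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size + action_dim, hidden_size)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, state, action):
        h = th.relu(self.fc1(state))
        # The one-hot action is injected after the first state layer.
        h = th.relu(self.fc2(th.cat([h, action], dim=1)))
        return self.out(h)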
Example #4
    def interact(self):
        if (self.max_steps is not None) and (self.n_steps >= self.max_steps):
            self.env_state = self.env.reset()
            self.n_steps = 0
        states = []
        actions = []
        rewards = []
        # take n steps
        for i in range(self.roll_out_n_steps):
            states.append(self.env_state)
            action = self.exploration_action(self.env_state)
            next_state, reward, done, _ = self.env.step(action)
            done = done[0]
            actions.append(
                [index_to_one_hot(a, self.action_dim) for a in action])
            rewards.append(reward)
            final_state = next_state
            self.env_state = next_state
            if done:
                self.env_state = self.env.reset()
                break
        # discount reward
        if done:
            final_r = [0.0] * self.n_agents
            self.n_episodes += 1
            self.episode_done = True
        else:
            self.episode_done = False
            final_action = self.action(final_state)
            one_hot_action = [
                index_to_one_hot(a, self.action_dim) for a in final_action
            ]
            final_r = self.value(final_state, one_hot_action)

        rewards = np.array(rewards)
        for agent_id in range(self.n_agents):
            rewards[:, agent_id] = self._discount_reward(rewards[:, agent_id],
                                                         final_r[agent_id])
        rewards = rewards.tolist()
        self.n_steps += 1
        self.memory.push(states, actions, rewards)
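interact() bootstraps each n-step rollout with _discount_reward. A minimal sketch, assuming the discount factor is stored as self.reward_gamma (an assumption, not shown in these snippets):

import numpy as np

def _discount_reward(self, rewards, final_value):
    # n-step discounted return, bootstrapped from the critic's value of the final state;
    # self.reward_gamma is an assumed attribute.
    discounted_r = np.zeros_like(rewards, dtype=np.float64)
    running_add = final_value
    for t in reversed(range(len(rewards))):
        running_add = running_add * self.reward_gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r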
Example #5
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            return  # not enough experience collected yet

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states,
                                   self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions,
                                    self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        self.actor_optimizer.zero_grad()
        values = self.critic_target(states_var, actions_var).detach()
        advantages = rewards_var - values
        # # normalizing advantages seems not working correctly here
        # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
        action_log_probs = self.actor(states_var)
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        old_action_log_probs = self.actor_target(states_var).detach()
        old_action_log_probs = th.sum(old_action_log_probs * actions_var, 1)
        ratio = th.exp(action_log_probs - old_action_log_probs)
        surr1 = ratio * advantages
        surr2 = th.clamp(ratio, 1.0 - self.clip_param,
                         1.0 + self.clip_param) * advantages
        # PPO's pessimistic surrogate (L^CLIP)
        actor_loss = -th.mean(th.min(surr1, surr2))
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor.parameters(),
                                     self.max_grad_norm)
        self.actor_optimizer.step()

        # update critic network
        self.critic_optimizer.zero_grad()
        target_values = rewards_var
        values = self.critic(states_var, actions_var)
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critic.parameters(),
                                     self.max_grad_norm)
        self.critic_optimizer.step()

        # update actor target network and critic target network
        if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
            super(PPO, self)._soft_update_target(self.actor_target, self.actor)
            super(PPO, self)._soft_update_target(self.critic_target,
                                                 self.critic)
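The PPO variant above periodically syncs its target networks via _soft_update_target. A plausible Polyak-style update, assuming the mixing rate is stored as self.target_tau (an assumed attribute):

def _soft_update_target(self, target, source):
    # Blend target weights a small step towards the online network (Polyak averaging);
    # self.target_tau is an assumed attribute.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - self.target_tau) * t_param.data +
                           self.target_tau * s_param.data)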
Example #6
    def eval_action(self, state):
        actions = [0] * self.n_agents
        one_hot_actions = []

        softmax_actions = self._softmax_action(state)
        # print(softmax_actions)
        for agent_id in range(self.n_agents):
            actions[agent_id] = np.argmax(softmax_actions[agent_id])
            one_hot_actions.append(
                index_to_one_hot(actions[agent_id],
                                 self.act_shape_n[agent_id]))
        return one_hot_actions
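eval_action relies on _softmax_action for per-agent action probabilities. A hypothetical version, assuming the actor outputs log-probabilities and is applied to all agents at once; the real helper may batch agents differently:

import torch as th

def _softmax_action(self, state):
    # Run the actor and convert its log-probabilities back to probabilities on the CPU.
    state_var = to_tensor_var([state], self.use_cuda)
    softmax_action_var = th.exp(self.actor(state_var))
    if self.use_cuda:
        return softmax_action_var.data.cpu().numpy()[0]
    return softmax_action_var.data.numpy()[0]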
Example #7
    def train(self):
        if self.n_episodes <= self.episodes_before_train:
            return  # not enough experience collected yet

        batch = self.memory.sample(self.batch_size)
        states_var = to_tensor_var(batch.states,
                                   self.use_cuda).view(-1, self.state_dim)
        one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
        actions_var = to_tensor_var(one_hot_actions,
                                    self.use_cuda).view(-1, self.action_dim)
        rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

        # update actor network
        action_log_probs, values = self.actor_critic(states_var)
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var, 1)
        # fisher loss
        if self.optimizer.steps % self.optimizer.Ts == 0:
            self.actor_critic.zero_grad()
            pg_fisher_loss = th.mean(action_log_probs)
            values_noise = to_tensor_var(np.random.randn(values.size()[0]),
                                         self.use_cuda)
            sample_values = (values + values_noise.view(-1, 1)).detach()
            if self.critic_loss == "huber":
                vf_fisher_loss = -nn.functional.smooth_l1_loss(
                    values, sample_values)
            else:
                vf_fisher_loss = -nn.MSELoss()(values, sample_values)
            joint_fisher_loss = pg_fisher_loss + self.vf_fisher_coef * vf_fisher_loss
            self.optimizer.acc_stats = True
            joint_fisher_loss.backward(retain_graph=True)
            self.optimizer.acc_stats = False
        self.optimizer.zero_grad()
        # actor loss
        advantages = rewards_var - values.detach()
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg
        # critic loss
        target_values = rewards_var
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        loss = actor_loss + critic_loss
        loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)
        self.optimizer.step()
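This ACKTR-style update assumes a joint actor-critic module whose forward pass returns action log-probabilities and a state value, together with a KFAC-like optimizer exposing steps, Ts, and acc_stats. A hypothetical network with that interface (depth and hidden size are illustrative):

import torch as th
import torch.nn as nn

class ActorCriticNetwork(nn.Module):
    # Hypothetical joint actor-critic matching the actor_critic(states_var) call above.
    def __init__(self, state_dim, action_dim, hidden_size=64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(state_dim, hidden_size), nn.ReLU())
        self.policy_head = nn.Linear(hidden_size, action_dim)
        self.value_head = nn.Linear(hidden_size, 1)

    def forward(self, state):
        h = self.body(state)
        # Log-probabilities for the discrete policy plus a scalar value estimate.
        return th.log_softmax(self.policy_head(h), dim=-1), self.value_head(h)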
Example #8
    def interact(self):
        if (self.max_steps is not None) and (self.n_steps >= self.max_steps):
            self.env_state, self.info_state = self.env.reset()
            self.env_state = self._normalize_state(self.env_state)
            self.n_steps = 0

        states = []
        actions = []
        rewards = []
        terminal = False

        if isinstance(self.env_state, dict):
            self.env_state = self._normalize_state(self.env_state)
        
        # take n steps
        for i in range(self.roll_out_n_steps):
            assert not isinstance(self.env_state, dict), f"{self.n_steps}"
            states.append(self.env_state)
            action = self.exploration_action(self.env_state)
            next_state, reward, done, info = self.env.step(action)
            # next_state = self._normalize_state(next_state)

            agent_obs = [None] * self.n_agents
            for agent in self.env.get_agent_handles():
                if not info['action_required'][agent] or done[agent]:
                    agent_obs[agent] = self.env_state[agent]
                    continue
                if next_state[agent]:
                    agent_obs[agent] = normalize_observation(next_state[agent], 3, 30)
                else:
                    agent_obs[agent] = self.env_state[agent]
            
            next_state = agent_obs

            done = done['__all__']
            actions.append([index_to_one_hot(a, self.act_shape_n) for a in action])

            # print(reward)
            rewards.append(reward)
            final_state = next_state

            self.env_state = next_state

            if done:
                self.env_state, self.info_state = self.env.reset()
                self.env_state = self._normalize_state(self.env_state)
                self.n_steps = 0
                break

        # for displaying learned policies
        # time.sleep(0.1)
        # self.env.render()
        # discount reward
        if done:
            final_r = [0.0] * self.n_agents
            self.n_episodes += 1
            self.episode_done = True
            # print("done")
        else:
            one_hot_action = []
            self.episode_done = False
            final_action = self.action(final_state)
            one_hot_action = [index_to_one_hot(a, self.act_shape_n) for a in final_action]
            final_r = self.value(final_state, one_hot_action)

        # gather the per-agent rewards into plain lists so they can be stacked below
        new_rewards = []
        for reward in rewards:
            new_reward = []
            for agent_id in range(self.n_agents):
                new_reward.append(reward[agent_id])
            new_rewards.append(new_reward)

        rewards = np.array(new_rewards)

        for agent_id in range(self.n_agents):
            rewards[:, agent_id] = self._discount_reward(rewards[:, agent_id],
                                                         final_r[agent_id])

        rewards = rewards.tolist()
        # print(rewards)
        self.n_steps += 1
        self.memory.push(states, actions, rewards)
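This Flatland-flavoured interact() normalizes tree observations via _normalize_state and normalize_observation. A minimal sketch of the former, reusing the tree depth (3) and observation radius (30) hard-coded above; agents without a fresh observation are left as None:

def _normalize_state(self, state):
    # Hypothetical sketch: normalize each agent's tree observation, keeping missing ones as None.
    normalized = [None] * self.n_agents
    for agent in range(self.n_agents):
        obs = state[agent]  # works for dict-keyed or list-indexed observations
        if obs:
            normalized[agent] = normalize_observation(obs, 3, 30)
    return normalized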
Example #9
    def interact(self):
        states = []
        actions = []
        rewards = []
        terminal = False
        # take n steps
        for i in range(self.roll_out_n_steps):
            states.append(self.env_state)
            action = self.exploration_action(self.env_state)

            one_hot_actions = []
            for agent_id in range(0, self.n_agents):
                one_hot_actions.append(
                    index_to_one_hot(action[agent_id],
                                     self.act_shape_n[agent_id]))
            # print(action)
            # print(one_hot_actions)

            next_state, reward, done, _ = self.env.step(one_hot_actions)
            done = done[0]
            # actions.append([index_to_one_hot(a, self.action_dim) for a in action])
            agents_act = []
            for agent_id in range(self.n_agents):
                agents_act.append(
                    index_to_one_hot(action[agent_id],
                                     self.act_shape_n[agent_id]))
            actions.append(agents_act)
            # print(reward)
            rewards.append(reward)
            final_state = next_state
            self.env_state = next_state

            if (self.max_steps
                    is not None) and (self.n_steps >= self.max_steps):
                terminal = True
            if done or terminal:
                self.env_state = self.env.reset()
                self.n_steps = 0
                break

        # for displaying learned policies
        # time.sleep(0.1)
        # self.env.render()
        # discount reward
        if done or terminal:
            final_r = [0.0] * self.n_agents
            self.n_episodes += 1
            self.episode_done = True
            # print("done")
        else:
            one_hot_action = []
            self.episode_done = False
            final_action = self.action(final_state)
            # one_hot_action = [index_to_one_hot(a, self.action_dim) for a in final_action]
            for agent_id in range(self.n_agents):
                one_hot_action.append(
                    index_to_one_hot(final_action[agent_id],
                                     self.act_shape_n[agent_id]))
            final_r = self.value(final_state, one_hot_action)

        rewards = np.array(rewards)
        for agent_id in range(self.n_agents):
            rewards[:, agent_id] = self._discount_reward(rewards[:, agent_id],
                                                         final_r[agent_id])
        rewards = rewards.tolist()
        # print(rewards)
        self.n_steps += 1
        self.memory.push(states, actions, rewards)
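Both interact() variants call exploration_action to pick training-time actions. A hypothetical epsilon-greedy sketch over the per-agent softmax policy; epsilon_start, epsilon_end, and epsilon_decay are assumed attributes not shown in these examples:

import numpy as np

def exploration_action(self, state):
    # Epsilon-greedy over the softmax policy; epsilon_start/end/decay are assumed attributes.
    softmax_actions = self._softmax_action(state)
    epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
        np.exp(-1.0 * self.n_steps / self.epsilon_decay)
    actions = []
    for pi in softmax_actions:
        if np.random.rand() < epsilon:
            actions.append(int(np.random.choice(len(pi))))  # explore: uniform random action
        else:
            actions.append(int(np.argmax(pi)))              # exploit: greedy action
    return actions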