    def train(self, rollout, sess, gamma, bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        values = rollout[:, 5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages, gamma)
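        # Concretely, the two lines above form the TD residuals
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        # and then accumulate them with discount() into
        #   A_t = sum_k gamma**k * delta_{t+k},
        # i.e. GAE with lambda = 1 (so the gamma*lambda factor is just gamma here).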

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.inputs: np.vstack(observations),
                     self.local_AC.actions: actions,
                     self.local_AC.advantages: advantages,
                     self.local_AC.state_in[0]: self.batch_rnn_state[0],
                     self.local_AC.state_in[1]: self.batch_rnn_state[1]}
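        # state_in holds the recurrent (LSTM) state; batch_rnn_state carries it
        # across training calls so the network sees a continuous sequence.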
        v_l, p_l, e_l, g_n, v_n, self.batch_rnn_state, _ = sess.run([self.local_AC.value_loss,
                                                                     self.local_AC.policy_loss,
                                                                     self.local_AC.entropy,
                                                                     self.local_AC.grad_norms,
                                                                     self.local_AC.var_norms,
                                                                     self.local_AC.state_out,
                                                                     self.local_AC.apply_grads],
                                                                    feed_dict=feed_dict)

        return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
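Both examples call a discount() helper that is not defined in the snippets. A minimal sketch of a common definition (a discounted cumulative sum computed with a first-order linear filter over the reversed sequence), assuming NumPy and SciPy are available:

import numpy as np
import scipy.signal


def discount(x, gamma):
    # out[t] = x[t] + gamma * x[t+1] + gamma**2 * x[t+2] + ...
    # Running lfilter with coefficients [1] / [1, -gamma] over the reversed
    # array computes exactly this discounted cumulative sum.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]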
Example #2
    def train_weights_and_get_comm_gradients(self, rollout, sess, gamma, ac_network, bootstrap_value=0):
        rollout = np.array(rollout)
        observations = np.stack(rollout[:, 0])
        observations_central = np.stack(rollout[:, 1])
        mess_received = np.stack(rollout[:, 2])  # state t
        actions = rollout[:, 3]
        sent_message = np.vstack(rollout[:, 4])
        rewards = rollout[:, 5]
        # next_observations = rollout[:, 6]
        # next_mess_received = np.stack(rollout[:, 7])  # state t+1
        # terminals = rollout[:, 8]  # whether timestep t was terminal
        values = rollout[:, 9]

        # print("VALUE GRADS")
        # for i in range(len(observations)):
        #    print("\t", observations[i], mess_received[i], "\n\t", sent_message[i])

        # for o, m, a, r in zip(observations, mess_received, actions, rewards):
        #    print(o, m, a, r)
        # print()

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(rewards_plus, gamma)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        # GAE (if epsilon=0, it's the same as doing regular advantage)
        epsilon = 0
        if epsilon == 0:
            advantages = adv(discounted_rewards, value_plus)
        else:
            advantages = gae(gamma, epsilon, rewards, value_plus)
            advantages = discount(advantages, gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {ac_network.target_v: discounted_rewards,
                     ac_network.inputs: observations,
                     ac_network.inputs_central: observations_central,
                     ac_network.inputs_comm: mess_received,
                     ac_network.actions: actions,
                     ac_network.advantages: advantages}

        v_l, p_l, grads_m, e_l, g_n, v_n, _ = sess.run([ac_network.value_loss,
                                                        ac_network.policy_loss,
                                                        ac_network.gradients_q_message,
                                                        ac_network.entropy,
                                                        ac_network.grad_norms,
                                                        ac_network.var_norms,
                                                        ac_network.apply_grads
                                                        ],
                                                       feed_dict=feed_dict)
        # print("VALUE LOSS", v_l)
        # print("MESSAGE GRADS\n", grads_m)

        # We apply gradients for the states [0, N-1], and use the gradients of the messages from [1, N]
        return observations[:-1], mess_received[:-1], sent_message[:-1], grads_m[0][1:], \
               v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
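Example #2 also calls adv() and gae() helpers that are not included in the snippet. The sketches below are hypothetical reconstructions based only on how the helpers are invoked; the bodies, and the reading of epsilon as the GAE lambda, are assumptions rather than the original implementations.

import numpy as np


def adv(discounted_rewards, value_plus):
    # Plain advantage: discounted return minus the value baseline V(s_t).
    return np.asarray(discounted_rewards) - np.asarray(value_plus)[:-1]


def gae(gamma, epsilon, rewards, value_plus):
    # TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    # Assumption: epsilon plays the role of the GAE lambda, in which case the
    # caller's follow-up discount would typically use gamma * epsilon.
    rewards = np.asarray(rewards, dtype=np.float64)
    value_plus = np.asarray(value_plus, dtype=np.float64)
    return rewards + gamma * value_plus[1:] - value_plus[:-1]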