def train(self, rollout, sess, gamma, bootstrap_value):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    actions = rollout[:, 1]
    rewards = rollout[:, 2]
    next_observations = rollout[:, 3]
    values = rollout[:, 5]

    # Here we take the rewards and values from the rollout, and use them to
    # generate the advantage and discounted returns.
    # The advantage function uses "Generalized Advantage Estimation"
    self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
    self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
    advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
    advantages = discount(advantages, gamma)

    # Update the global network using gradients from the loss.
    # Generate network statistics to periodically save.
    feed_dict = {self.local_AC.target_v: discounted_rewards,
                 self.local_AC.inputs: np.vstack(observations),
                 self.local_AC.actions: actions,
                 self.local_AC.advantages: advantages,
                 self.local_AC.state_in[0]: self.batch_rnn_state[0],
                 self.local_AC.state_in[1]: self.batch_rnn_state[1]}
    v_l, p_l, e_l, g_n, v_n, self.batch_rnn_state, _ = sess.run(
        [self.local_AC.value_loss,
         self.local_AC.policy_loss,
         self.local_AC.entropy,
         self.local_AC.grad_norms,
         self.local_AC.var_norms,
         self.local_AC.state_out,
         self.local_AC.apply_grads],
        feed_dict=feed_dict)
    return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
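# discount() is referenced above but not defined in this section. A minimal
# sketch, assuming the usual lfilter-based helper found in A3C tutorial code
# (an assumption; the repo's actual definition is not shown here):
import numpy as np
import scipy.signal

def discount(x, gamma):
    # Discounted cumulative sum along a trajectory:
    #   out[t] = x[t] + gamma * x[t + 1] + gamma^2 * x[t + 2] + ...
    # Filtering the reversed sequence computes all suffix sums in one pass.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]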
def train_weights_and_get_comm_gradients(self, rollout, sess, gamma, ac_network, bootstrap_value=0):
    rollout = np.array(rollout)
    observations = np.stack(rollout[:, 0])
    observations_central = np.stack(rollout[:, 1])
    mess_received = np.stack(rollout[:, 2])  # state t
    actions = rollout[:, 3]
    sent_message = np.vstack(rollout[:, 4])
    rewards = rollout[:, 5]
    # next_observations = rollout[:, 6]
    # next_mess_received = np.stack(rollout[:, 7])  # state t+1
    # terminals = rollout[:, 8]  # whether timestep t was terminal
    values = rollout[:, 9]

    # Here we take the rewards and values from the rollout, and use them to
    # generate the advantage and discounted returns.
    # The advantage function uses "Generalized Advantage Estimation"
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = discount(rewards_plus, gamma)[:-1]
    value_plus = np.asarray(values.tolist() + [bootstrap_value])

    # GAE (if epsilon == 0, it is the same as the regular advantage)
    epsilon = 0
    if epsilon == 0:
        advantages = adv(discounted_rewards, value_plus)
    else:
        advantages = gae(gamma, epsilon, rewards, value_plus)
        advantages = discount(advantages, gamma)

    # Update the global network using gradients from the loss.
    # Generate network statistics to periodically save.
    feed_dict = {ac_network.target_v: discounted_rewards,
                 ac_network.inputs: observations,
                 ac_network.inputs_central: observations_central,
                 ac_network.inputs_comm: mess_received,
                 ac_network.actions: actions,
                 ac_network.advantages: advantages}
    v_l, p_l, grads_m, e_l, g_n, v_n, _ = sess.run(
        [ac_network.value_loss,
         ac_network.policy_loss,
         ac_network.gradients_q_message,
         ac_network.entropy,
         ac_network.grad_norms,
         ac_network.var_norms,
         ac_network.apply_grads],
        feed_dict=feed_dict)

    # We apply gradients for states [0, N-1], using the gradients of the
    # messages received at states [1, N].
    return observations[:-1], mess_received[:-1], sent_message[:-1], grads_m[0][1:], \
        v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
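# adv() and gae() are likewise not defined in this section. With epsilon
# hard-coded to 0 above, only adv() is ever reached; a minimal sketch,
# assuming it is the plain return-minus-baseline advantage (an assumption,
# not a confirmed definition from this repo):
def adv(discounted_rewards, value_plus):
    # Empirical discounted return minus the value baseline. value_plus
    # carries the bootstrap value as its final entry, which is dropped so
    # the shapes match the per-step returns.
    return discounted_rewards - value_plus[:-1]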