    def train(self, rollout, sess, gamma, bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        values = rollout[:, 5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages, gamma)
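        # Concretely, the two lines above form the TD residuals
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        # and then accumulate them with discount() into
        #   A_t = sum_k gamma**k * delta_{t+k},
        # i.e. GAE with lambda = 1 (so the gamma*lambda factor is just gamma here).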

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.inputs: np.vstack(observations),
                     self.local_AC.actions: actions,
                     self.local_AC.advantages: advantages,
                     self.local_AC.state_in[0]: self.batch_rnn_state[0],
                     self.local_AC.state_in[1]: self.batch_rnn_state[1]}
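        # state_in holds the recurrent (LSTM) state; batch_rnn_state carries it
        # across training calls so the network sees a continuous sequence.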
        v_l, p_l, e_l, g_n, v_n, self.batch_rnn_state, _ = sess.run([self.local_AC.value_loss,
                                                                     self.local_AC.policy_loss,
                                                                     self.local_AC.entropy,
                                                                     self.local_AC.grad_norms,
                                                                     self.local_AC.var_norms,
                                                                     self.local_AC.state_out,
                                                                     self.local_AC.apply_grads],
                                                                    feed_dict=feed_dict)

        return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
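Both examples call a discount() helper that is not defined in the snippets. A minimal sketch of a common definition (a discounted cumulative sum computed with a first-order linear filter over the reversed sequence), assuming NumPy and SciPy are available:

import numpy as np
import scipy.signal


def discount(x, gamma):
    # out[t] = x[t] + gamma * x[t+1] + gamma**2 * x[t+2] + ...
    # Running lfilter with coefficients [1] / [1, -gamma] over the reversed
    # array computes exactly this discounted cumulative sum.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]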
Example #2
    def train_weights_and_get_comm_gradients(self, rollout, sess, gamma, ac_network, bootstrap_value=0):
        rollout = np.array(rollout)
        observations = np.stack(rollout[:, 0])
        observations_central = np.stack(rollout[:, 1])
        mess_received = np.stack(rollout[:, 2])  # state t
        actions = rollout[:, 3]
        sent_message = np.vstack(rollout[:, 4])
        rewards = rollout[:, 5]
        # next_observations = rollout[:, 6]
        # next_mess_received = np.stack(rollout[:, 7])  # state t+1
        # terminals = rollout[:, 8]  # whether timestep t was terminal
        values = rollout[:, 9]

        # print("VALUE GRADS")
        # for i in range(len(observations)):
        #    print("\t", observations[i], mess_received[i], "\n\t", sent_message[i])

        # for o, m, a, r in zip(observations, mess_received, actions, rewards):
        #    print(o, m, a, r)
        # print()

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(rewards_plus, gamma)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        # GAE (if epsilon=0, it's the same as doing regular advantage)
        epsilon = 0
        if epsilon == 0:
            advantages = adv(discounted_rewards, value_plus)
        else:
            advantages = gae(gamma, epsilon, rewards, value_plus)
            advantages = discount(advantages, gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {ac_network.target_v: discounted_rewards,
                     ac_network.inputs: observations,
                     ac_network.inputs_central: observations_central,
                     ac_network.inputs_comm: mess_received,
                     ac_network.actions: actions,
                     ac_network.advantages: advantages}

        v_l, p_l, grads_m, e_l, g_n, v_n, _ = sess.run([ac_network.value_loss,
                                                        ac_network.policy_loss,
                                                        ac_network.gradients_q_message,
                                                        ac_network.entropy,
                                                        ac_network.grad_norms,
                                                        ac_network.var_norms,
                                                        ac_network.apply_grads
                                                        ],
                                                       feed_dict=feed_dict)
        # print("VALUE LOSS", v_l)
        # print("MESSAGE GRADS\n", grads_m)

        # We apply gradients for the states [0, N-1], and use the gradients of the messages from [1, N]
        return observations[:-1], mess_received[:-1], sent_message[:-1], grads_m[0][1:], \
               v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
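Example #2 also calls adv() and gae() helpers that are not included in the snippet. The sketches below are hypothetical reconstructions based only on how the helpers are invoked; the bodies, and the reading of epsilon as the GAE lambda, are assumptions rather than the original implementations.

import numpy as np


def adv(discounted_rewards, value_plus):
    # Plain advantage: discounted return minus the value baseline V(s_t).
    return np.asarray(discounted_rewards) - np.asarray(value_plus)[:-1]


def gae(gamma, epsilon, rewards, value_plus):
    # TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    # Assumption: epsilon plays the role of the GAE lambda, in which case the
    # caller's follow-up discount would typically use gamma * epsilon.
    rewards = np.asarray(rewards, dtype=np.float64)
    value_plus = np.asarray(value_plus, dtype=np.float64)
    return rewards + gamma * value_plus[1:] - value_plus[:-1]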