Example #1
    def learn(self, env):
        reporter = Reporter()

        self.session.run([self.reset_accumulative_grads])

        iteration = 0  # number of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config['batch_size'])
        episode_rewards = np.zeros(self.config['batch_size'])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory(env,
                                             self.config["episode_max_length"])

            episode_rewards[episode_nr % self.config['batch_size']] = sum(
                trajectory['reward'])
            episode_lengths[episode_nr % self.config['batch_size']] = len(
                trajectory['reward'])
            episode_nr += 1
            action_taken = (np.arange(
                self.nA) == trajectory['action'][:, None]).astype(
                    np.float32)  # one-hot encoding

            discounted_episode_rewards = discount_rewards(
                trajectory['reward'], self.config['gamma'])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1  # avoid dividing by zero when all returns are equal
            discounted_episode_rewards /= std
            # Repeat each standardized return across the action dimension so
            # every output unit receives the same advantage signal
            feedback = np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            self.session.run(
                [self.accumulate_grads],
                feed_dict={
                    self.state: trajectory["state"],
                    self.action_taken: action_taken,
                    self.feedback: feedback
                })
            if episode_nr % self.config['batch_size'] == 0:  # batch is done
                iteration += 1
                self.session.run([self.apply_gradients])
                self.session.run([self.reset_accumulative_grads])
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config['draw_frequency'] == 0:
                    reporter.draw_rewards(mean_rewards)
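
Both examples call a discount_rewards helper that is not shown here. A minimal sketch of what such a helper typically computes, assuming rewards is a 1-D array of per-step rewards: the discounted return G_t = r_t + gamma * G_{t+1}, accumulated back to front.

    import numpy as np

    def discount_rewards(rewards, gamma):
        """Discounted returns G_t = r_t + gamma * G_{t+1}, computed back to front."""
        discounted = np.zeros(len(rewards), dtype=np.float32)
        running_sum = 0.0
        for t in reversed(range(len(rewards))):
            running_sum = rewards[t] + gamma * running_sum
            discounted[t] = running_sum
        return discounted

The float dtype matters here: the callers above subtract the mean and divide by the standard deviation in place, which would fail on an integer array.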
Example #2
    def learn(self):
        reporter = Reporter()

        # per-batch accumulated gradients for both weight matrices
        gradient1 = np.zeros_like(self.w1)
        gradient2 = np.zeros_like(self.w2)

        # RMSProp caches (moving averages of squared gradients)
        rmsprop1 = np.zeros_like(self.w1)
        rmsprop2 = np.zeros_like(self.w2)

        iteration = 0  # number of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config['batch_size'])
        episode_rewards = np.zeros(self.config['batch_size'])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory(self.config["episode_max_length"])

            episode_rewards[episode_nr % self.config['batch_size']] = sum(trajectory['reward'])
            episode_lengths[episode_nr % self.config['batch_size']] = len(trajectory['reward'])
            episode_nr += 1
            action_taken = (np.arange(self.nA) == trajectory['action'][:, None]).astype(np.float32)  # one-hot encoding
            # gradient of log pi(a|s) w.r.t. the output: one-hot action minus action probabilities
            epdlogp = action_taken - trajectory['prob']

            discounted_episode_rewards = discount_rewards(trajectory['reward'], self.config['gamma'])
            # standardize the returns; guard against a zero standard deviation
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            discounted_episode_rewards /= std if std > 0 else 1
            # weight each log-probability gradient by the standardized return of its timestep
            epdlogp *= np.reshape(np.repeat(discounted_episode_rewards, self.nA), (len(discounted_episode_rewards), self.nA))

            change_w1, change_w2 = self.backward_step(trajectory['state'], trajectory['x1'], epdlogp)

            gradient1 += change_w1
            gradient2 += change_w2

            if episode_nr % self.config['batch_size'] == 0:  # batch is done
                iteration += 1
                # RMSProp: keep a moving average of squared gradients and scale the update by its square root
                rmsprop1 = self.config['decay_rate'] * rmsprop1 + (1 - self.config['decay_rate']) * gradient1**2
                rmsprop2 = self.config['decay_rate'] * rmsprop2 + (1 - self.config['decay_rate']) * gradient2**2
                self.w1 += self.config['learning_rate'] * gradient1 / (np.sqrt(rmsprop1) + 1e-5)
                self.w2 += self.config['learning_rate'] * gradient2 / (np.sqrt(rmsprop2) + 1e-5)
                gradient1 = np.zeros_like(self.w1)
                gradient2 = np.zeros_like(self.w2)
                reporter.print_iteration_stats(iteration, episode_rewards, episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config['draw_frequency'] == 0:
                    reporter.draw_rewards(mean_rewards)
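
The batch update in Example #2 is a hand-rolled RMSProp step, applied as gradient ascent since the goal is to maximize expected reward. A minimal, self-contained sketch of the same pattern on a toy weight matrix; the decay_rate, learning_rate, and epsilon values are illustrative, not taken from the examples' config:

    import numpy as np

    rng = np.random.default_rng(0)
    w = rng.standard_normal((4, 3))      # toy weight matrix
    grad = rng.standard_normal((4, 3))   # accumulated gradient for one batch
    cache = np.zeros_like(w)             # running average of squared gradients

    decay_rate, learning_rate, eps = 0.99, 1e-3, 1e-5
    # keep a moving average of squared gradients and scale the step by its square root
    cache = decay_rate * cache + (1 - decay_rate) * grad**2
    w += learning_rate * grad / (np.sqrt(cache) + eps)  # ascent step, as in the example

Dividing by the root of the cache shrinks the step for parameters with consistently large gradients and enlarges it where gradients are small, which is what makes the plain accumulated policy gradient usable with a single fixed learning rate.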