Example #1
 def learn_REINFORCE(self):
     """Learn using updates like in the REINFORCE algorithm."""
     reporter = Reporter()
     total_n_trajectories = 0
     iteration = self.start_at_iter
     while iteration < self.n_iter and not self.master.stop_requested:
         iteration += 1
         # Collect trajectories until we get timesteps_per_batch total timesteps
         trajectories = self.task_runner.get_trajectories()
         total_n_trajectories += len(trajectories)
         all_state = np.concatenate(
             [trajectory["state"] for trajectory in trajectories])
         # Compute discounted sums of rewards
         rets = [
             discount_rewards(trajectory["reward"], self.config["gamma"])
             for trajectory in trajectories
         ]
         max_len = max(len(ret) for ret in rets)
         padded_rets = [
             np.concatenate([ret, np.zeros(max_len - len(ret))])
             for ret in rets
         ]
         # Compute time-dependent baseline
         baseline = np.mean(padded_rets, axis=0)
         # Compute advantage function
         advs = [ret - baseline[:len(ret)] for ret in rets]
         all_action = np.concatenate(
             [trajectory["action"] for trajectory in trajectories])
         all_adv = np.concatenate(advs)
         # Do policy gradient update step
         episode_rewards = np.array([
             trajectory["reward"].sum() for trajectory in trajectories
         ])  # episode total rewards
         episode_lengths = np.array([
             len(trajectory["reward"]) for trajectory in trajectories
         ])  # episode lengths
         results = self.master.session.run(
             [self.loss, self.apply_grad],
             feed_dict={
                 self.master.states: all_state,
                 self.master.action_taken: all_action,
                 self.master.advantage: all_adv
             })
         print("Task:", self.task_id)
         reporter.print_iteration_stats(iteration, episode_rewards,
                                        episode_lengths,
                                        total_n_trajectories)
         summary = self.master.session.run(
             [self.master.summary_op],
             feed_dict={
                 self.master.loss: results[0],
                 self.master.reward: np.mean(episode_rewards),
                 self.master.episode_length: np.mean(episode_lengths)
             })
         self.writer.add_summary(summary[0], iteration)
         self.writer.flush()
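
All of the examples on this page rely on a discount_rewards helper that is not shown. The sketch below is a minimal assumption of what it computes (the discounted cumulative return G_t = r_t + gamma * G_{t+1}); the actual helper in the source may differ in details:

import numpy as np

def discount_rewards(rewards, gamma):
    """Sketch of the assumed helper: discounted cumulative sums of rewards."""
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running_sum = 0.0
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + gamma * running_sum  # G_t = r_t + gamma * G_{t+1}
        discounted[t] = running_sum
    return discounted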
Example #2
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory.states for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory.rewards, config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory.actions for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                sum(trajectory.rewards) for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory.rewards) for trajectory in trajectories
            ])  # episode lengths
            # TODO: deal with RNN state
            summary, _ = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.states: all_state,
                    self.a_n: all_action,
                    self.adv_n: all_adv,
                    self.episode_lengths: np.mean(episode_lengths),
                    self.rewards: np.mean(episode_rewards)
                })
            self.writer.add_summary(summary, iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
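
The padding logic in the examples above implements a time-dependent baseline: the return of each episode at timestep t is compared against the average return the batch obtained at that same timestep. A small standalone illustration with made-up numbers:

import numpy as np

# Discounted returns of three episodes of unequal length (made-up values).
rets = [np.array([5.0, 3.0, 1.0]),
        np.array([4.0, 2.0]),
        np.array([6.0, 4.0, 2.0, 1.0])]

max_len = max(len(ret) for ret in rets)
# Zero-pad so every episode contributes a value at every timestep.
padded_rets = [np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets]
baseline = np.mean(padded_rets, axis=0)             # average return per timestep
advs = [ret - baseline[:len(ret)] for ret in rets]  # per-timestep advantages

Note that the zero padding pulls the baseline towards zero at late timesteps, exactly as in the methods above.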
Example #3
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        possible_actions = np.arange(self.env_runner.nA)
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_action = (possible_actions == all_action[:, None]).astype(
                np.float32)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            returns = np.concatenate([
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ])
            qw_new = self.get_critic_value(all_state)

            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths

            results = self.session.run(
                [self.summary_op, self.critic_train, self.actor_train],
                feed_dict={
                    self.states: all_state,
                    self.critic_target: returns,
                    self.actions_taken: all_action,
                    self.critic_feedback: qw_new,
                    self.critic_rewards: returns,
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
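
The broadcasting comparison used to build all_action above is a compact way to one-hot encode a vector of discrete action indices. In isolation, with made-up indices:

import numpy as np

possible_actions = np.arange(4)        # a discrete action space of size 4
all_action = np.array([2, 0, 3, 2])    # action indices taken (made-up)
# Comparing shape (4,) against shape (N, 1) broadcasts to an (N, 4) boolean
# matrix with exactly one True per row, which casts to a one-hot float matrix.
one_hot = (possible_actions == all_action[:, None]).astype(np.float32)
# one_hot[0] is [0., 0., 1., 0.]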
Example #4
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = np.zeros(len(self.envs))
        for iteration in range(config["n_iter"]):
            self.session.run([self.reset_accum_grads])
            for i, task_runner in enumerate(self.task_runners):
                if self.config["switch_at_iter"] is not None:
                    if iteration >= self.config["switch_at_iter"] and i != (len(self.task_runners) - 1):
                        continue
                    elif iteration < self.config["switch_at_iter"] and i == len(self.task_runners) - 1:
                        continue
                # Collect trajectories until we get timesteps_per_batch total timesteps
                trajectories = task_runner.get_trajectories()
                total_n_trajectories[i] += len(trajectories)
                all_state = np.concatenate([trajectory["state"] for trajectory in trajectories])
                # Compute discounted sums of rewards
                rets = [discount_rewards(trajectory["reward"], config["gamma"]) for trajectory in trajectories]
                max_len = max(len(ret) for ret in rets)
                padded_rets = [np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets]
                # Compute time-dependent baseline
                baseline = np.mean(padded_rets, axis=0)
                # Compute advantage function
                advs = [ret - baseline[:len(ret)] for ret in rets]
                all_action = np.concatenate([trajectory["action"] for trajectory in trajectories])
                all_adv = np.concatenate(advs)
                # Do policy gradient update step
                episode_rewards = np.array([trajectory["reward"].sum() for trajectory in trajectories])  # episode total rewards
                episode_lengths = np.array([len(trajectory["reward"]) for trajectory in trajectories])  # episode lengths
                results = self.session.run([self.losses[i], self.add_accum_grads[i], self.accum_grads], feed_dict={
                    self.states: all_state,
                    self.action_taken: all_action,
                    self.advantage: all_adv
                })
                summary = self.session.run([self.summary_op], feed_dict={
                    self.loss: results[0],
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })

                self.writers[i].add_summary(summary[0], iteration)
                self.writers[i].flush()
                print("Task:", i)
                reporter.print_iteration_stats(iteration, episode_rewards, episode_lengths, total_n_trajectories[i])

            # Apply the accumulated gradients once the gradients of all tasks have been summed
            self.session.run([self.apply_gradients])

        if self.config["save_model"]:
            if not os.path.exists(self.monitor_path):
                os.makedirs(self.monitor_path)
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
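
The reset_accum_grads, add_accum_grads and apply_gradients operations used above are assumed to be built elsewhere in the class. The sketch below shows one common way to construct such gradient-accumulation ops in TensorFlow 1.x; the function name and structure are assumptions, not the source's actual graph-building code, and it assumes every variable receives a gradient:

import tensorflow as tf

def build_accumulation_ops(loss, variables, optimizer):
    """Sketch: accumulate gradients over several runs, then apply them once."""
    # One non-trainable accumulator per weight, initialised to zero.
    accumulators = [tf.Variable(tf.zeros_like(v), trainable=False) for v in variables]
    grads = tf.gradients(loss, variables)
    reset_accum_grads = tf.group(*[a.assign(tf.zeros_like(a)) for a in accumulators])
    add_accum_grads = tf.group(*[a.assign_add(g) for a, g in zip(accumulators, grads)])
    apply_gradients = optimizer.apply_gradients(list(zip(accumulators, variables)))
    return reset_accum_grads, add_accum_grads, apply_gradients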
Example #5
    def learn(self):
        reporter = Reporter()

        self.session.run([self.reset_accumulative_grads])

        iteration = 0  # number of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config["batch_size"])
        episode_rewards = np.zeros(self.config["batch_size"])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory()

            episode_rewards[episode_nr % self.config["batch_size"]] = sum(
                trajectory["reward"])
            episode_lengths[episode_nr % self.config["batch_size"]] = len(
                trajectory["reward"])
            episode_nr += 1
            action_taken = (np.arange(
                self.nA) == trajectory["action"][:, None]).astype(
                    np.float32)  # one-hot encoding

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            self.session.run(
                [self.accumulate_grads],
                feed_dict={
                    self.states: trajectory["state"],
                    self.action_taken: action_taken,
                    self.feedback: feedback
                })
            if episode_nr % self.config["batch_size"] == 0:  # batch is done
                iteration += 1
                self.session.run([self.apply_gradients])
                self.session.run([self.reset_accumulative_grads])
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config["draw_frequency"] == 0:
                    reporter.draw_rewards(mean_rewards)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
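
The repeat/reshape at the end of the episode loop above broadcasts each timestep's standardized return across all nA action slots, so it can be multiplied elementwise with the one-hot action matrix inside the graph. In isolation, with made-up values:

import numpy as np

nA = 3
discounted = np.array([1.5, -0.5, 0.2])   # standardized returns (made-up)
feedback = np.reshape(np.repeat(discounted, nA), (len(discounted), nA))
# feedback[0] is [1.5, 1.5, 1.5]; equivalent to discounted[:, None] * np.ones(nA)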
Example #6
    def learn(self):
        reporter = Reporter()

        gradient1 = np.zeros_like(self.w1)
        gradient2 = np.zeros_like(self.w2)

        rmsprop1 = np.zeros_like(self.w1)
        rmsprop2 = np.zeros_like(self.w2)

        iteration = 0  # number of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config["batch_size"])
        episode_rewards = np.zeros(self.config["batch_size"])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory(self.config["episode_max_length"])

            episode_rewards[episode_nr % self.config["batch_size"]] = sum(
                trajectory["reward"])
            episode_lengths[episode_nr % self.config["batch_size"]] = len(
                trajectory["reward"])
            episode_nr += 1
            action_taken = (np.arange(
                self.nA) == trajectory["action"][:, None]).astype(
                    np.float32)  # one-hot encoding
            epdlogp = action_taken - trajectory["prob"]

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            discounted_episode_rewards /= np.std(discounted_episode_rewards)
            epdlogp *= np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            change_w1, change_w2 = self.backward_step(trajectory["state"],
                                                      trajectory['x1'],
                                                      epdlogp)

            gradient1 += change_w1
            gradient2 += change_w2

            if episode_nr % self.config["batch_size"] == 0:  # batch is done
                iteration += 1
                rmsprop1 = self.config["decay_rate"] * rmsprop1 + (
                    1 - self.config["decay_rate"]) * gradient1**2
                rmsprop2 = self.config["decay_rate"] * rmsprop2 + (
                    1 - self.config["decay_rate"]) * gradient2**2
                self.w1 += self.config["learning_rate"] * gradient1 / (
                    np.sqrt(rmsprop1) + 1e-5)
                self.w2 += self.config["learning_rate"] * gradient2 / (
                    np.sqrt(rmsprop2) + 1e-5)
                gradient1 = np.zeros_like(self.w1)
                gradient2 = np.zeros_like(self.w2)
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config["draw_frequency"] == 0:
                    reporter.draw_rewards(mean_rewards)
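
The batch update at the end of this example is a manual RMSProp-style step applied to the accumulated policy gradient; it is gradient ascent, since expected reward is being maximized. Factored out as a standalone helper, a sketch mirroring the update rule above:

import numpy as np

def rmsprop_ascent_step(weights, grad, cache, learning_rate, decay_rate, eps=1e-5):
    """One RMSProp-style gradient ascent step; returns the new weights and cache."""
    cache = decay_rate * cache + (1 - decay_rate) * grad ** 2
    weights = weights + learning_rate * grad / (np.sqrt(cache) + eps)
    return weights, cache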