Example #1
    def learn_Karpathy(self):
        """Learn using updates like in the Karpathy algorithm."""
        iteration = self.start_at_iter
        while iteration < self.n_iter and not self.master.stop_requested:  # Keep executing episodes until the master requests a stop (e.g. using SIGINT)
            iteration += 1
            trajectory = self.task_runner.get_trajectory()
            reward = sum(trajectory["reward"])
            action_taken = trajectory["action"]

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = discounted_episode_rewards

            results = self.master.session.run(
                [self.loss, self.apply_grad],
                feed_dict={
                    self.master.states: trajectory["state"],
                    self.master.action_taken: action_taken,
                    self.master.advantage: feedback
                })
            results = self.master.session.run(
                [self.master.summary_op],
                feed_dict={
                    self.master.loss: results[0],
                    self.master.reward: reward,
                    self.master.episode_length: trajectory["steps"]
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()
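
Every example on this page relies on a discount_rewards helper (and on numpy as np) imported elsewhere in the repository. A minimal sketch of what such a helper typically computes, assuming it simply returns discounted cumulative sums over a 1-D reward sequence; the real implementation may differ:

import numpy as np

def discount_rewards(rewards, gamma):
    """out[t] = sum_k gamma**k * rewards[t + k], i.e. the discounted return from step t."""
    rewards = np.asarray(rewards, dtype=np.float64)
    discounted = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted
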
Example #2
 def learn_REINFORCE(self):
     """Learn using updates like in the REINFORCE algorithm."""
     reporter = Reporter()
     total_n_trajectories = 0
     iteration = self.start_at_iter
     while iteration < self.n_iter and not self.master.stop_requested:
         iteration += 1
         # Collect trajectories until we get timesteps_per_batch total timesteps
         trajectories = self.task_runner.get_trajectories()
         total_n_trajectories += len(trajectories)
         all_state = np.concatenate(
             [trajectory["state"] for trajectory in trajectories])
         # Compute discounted sums of rewards
         rets = [
             discount_rewards(trajectory["reward"], self.config["gamma"])
             for trajectory in trajectories
         ]
         max_len = max(len(ret) for ret in rets)
         padded_rets = [
             np.concatenate([ret, np.zeros(max_len - len(ret))])
             for ret in rets
         ]
         # Compute time-dependent baseline
         baseline = np.mean(padded_rets, axis=0)
         # Compute advantage function
         advs = [ret - baseline[:len(ret)] for ret in rets]
         all_action = np.concatenate(
             [trajectory["action"] for trajectory in trajectories])
         all_adv = np.concatenate(advs)
         # Do policy gradient update step
         episode_rewards = np.array([
             trajectory["reward"].sum() for trajectory in trajectories
         ])  # episode total rewards
         episode_lengths = np.array([
             len(trajectory["reward"]) for trajectory in trajectories
         ])  # episode lengths
         results = self.master.session.run(
             [self.loss, self.apply_grad],
             feed_dict={
                 self.master.states: all_state,
                 self.master.action_taken: all_action,
                 self.master.advantage: all_adv
             })
         print("Task:", self.task_id)
         reporter.print_iteration_stats(iteration, episode_rewards,
                                        episode_lengths,
                                        total_n_trajectories)
         summary = self.master.session.run(
             [self.master.summary_op],
             feed_dict={
                 self.master.loss: results[0],
                 self.master.reward: np.mean(episode_rewards),
                 self.master.episode_length: np.mean(episode_lengths)
             })
         self.writer.add_summary(summary[0], iteration)
         self.writer.flush()
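
Examples #2, #5 and #7 share the same time-dependent baseline: the per-episode discounted returns are zero-padded to a common length and averaged per time step, and the advantage is the return minus that baseline. The same step in isolation (the two return arrays are made up):

import numpy as np

rets = [np.array([3.0, 2.0, 1.0]), np.array([2.0, 1.0])]  # discounted returns of two episodes
max_len = max(len(ret) for ret in rets)
padded_rets = [np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets]
baseline = np.mean(padded_rets, axis=0)             # [2.5, 1.5, 0.5]; short episodes contribute zeros
advs = [ret - baseline[:len(ret)] for ret in rets]  # [0.5, 0.5, 0.5] and [-0.5, -0.5]
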
Example #3
 def run(self):
     # Assume global shared parameter vectors θ and θv and global shared counter T = 0
     # Assume thread-specific parameter vectors θ' and θ'v
     sess = self.master.session
     self.runner.start_runner(sess, self.writer)
     t = 1  # thread step counter
     while self.master.T < self.config["T_max"] and not self.master.stop_requested:
         # Synchronize thread-specific parameters θ' = θ and θ'v = θv
         sess.run(self.sync_net)
         trajectory = self.pull_batch_from_queue()
         v = 0 if trajectory.terminal else self.get_critic_value(
             np.asarray(trajectory.states)[None, -1], trajectory.features[-1][0])
         rewards_plus_v = np.asarray(trajectory.rewards + [v])
         vpred_t = np.asarray(trajectory.values + [v])
         delta_t = trajectory.rewards + \
             self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
         batch_r = discount_rewards(rewards_plus_v,
                                    self.config["gamma"])[:-1]
         batch_adv = discount_rewards(delta_t, self.config["gamma"])
         fetches = self.losses + [self.train_op, self.master.global_step]
         states = np.asarray(trajectory.states)
         feed_dict = {
             self.actor_states: states,
             self.critic_states: states,
             self.actions_taken: np.asarray(trajectory.actions),
             self.adv: batch_adv,
             self.r: np.asarray(batch_r)
         }
         feature = trajectory.features[0][0]
         if feature != []:
             feed_dict[self.ac_net.rnn_state_in] = feature
         results = sess.run(fetches, feed_dict)
         n_states = states.shape[0]
         feed_dict = dict(
             zip(self.master.losses, map(lambda x: x / n_states, results)))
         summary = sess.run([self.master.summary_op], feed_dict)
         self.writer.add_summary(summary[0], results[-1])
         self.writer.flush()
         t += 1
         self.master.T += trajectory.steps
     self.runner.stop_requested = True
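
The advantage computation here (and in Examples #8 and #9) discounts one-step TD errors delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) instead of raw rewards, which corresponds to generalized advantage estimation with lambda = 1. A standalone numeric sketch, reusing the discount_rewards sketch shown after Example #1 (all values are made up):

import numpy as np

gamma = 0.99
rewards = [1.0, 0.0, 1.0]   # rewards along a rollout
values = [0.5, 0.4, 0.3]    # critic predictions V(s_t) for the same steps
v = 0.2                     # bootstrap value of the state after the last step (0 if terminal)

rewards_plus_v = np.asarray(rewards + [v])
vpred_t = np.asarray(values + [v])
delta_t = np.asarray(rewards) + gamma * vpred_t[1:] - vpred_t[:-1]
batch_r = discount_rewards(rewards_plus_v, gamma)[:-1]  # bootstrapped returns, the critic targets
batch_adv = discount_rewards(delta_t, gamma)            # discounted TD errors, the advantages
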
Example #4
    def learn(self):
        reporter = Reporter()

        self.session.run([self.reset_accumulative_grads])

        iteration = 0  # amount of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config["batch_size"])
        episode_rewards = np.zeros(self.config["batch_size"])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory()

            episode_rewards[episode_nr % self.config["batch_size"]] = sum(
                trajectory["reward"])
            episode_lengths[episode_nr % self.config["batch_size"]] = len(
                trajectory["reward"])
            episode_nr += 1
            action_taken = (np.arange(
                self.nA) == trajectory["action"][:, None]).astype(
                    np.float32)  # one-hot encoding

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            self.session.run(
                [self.accumulate_grads],
                feed_dict={
                    self.states: trajectory["state"],
                    self.action_taken: action_taken,
                    self.feedback: feedback
                })
            if episode_nr % self.config["batch_size"] == 0:  # batch is done
                iteration += 1
                self.session.run([self.apply_gradients])
                self.session.run([self.reset_accumulative_grads])
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config["draw_frequency"] == 0:
                    reporter.draw_rewards(mean_rewards)
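        # Note: the save_model block below is never reached, because the 'while True' loop above has no break.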
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
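
Example #4 (like #6 and #10) one-hot encodes the chosen actions with a broadcast comparison against np.arange(nA) and tiles the standardized return of each step across the action dimension. The same two steps in isolation, with nA = 3 chosen arbitrarily:

import numpy as np

nA = 3
actions = np.array([0, 2, 1])
action_taken = (np.arange(nA) == actions[:, None]).astype(np.float32)
# -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]
returns = np.array([1.5, -0.5, 0.2])
feedback = np.reshape(np.repeat(returns, nA), (len(returns), nA))
# -> [[1.5, 1.5, 1.5], [-0.5, -0.5, -0.5], [0.2, 0.2, 0.2]]
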
Example #5
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory.states for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory.rewards, config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory.actions for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                sum(trajectory.rewards) for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory.rewards) for trajectory in trajectories
            ])  # episode lengths
            # TODO: deal with RNN state
            summary, _ = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.states: all_state,
                    self.a_n: all_action,
                    self.adv_n: all_adv,
                    self.episode_lengths: np.mean(episode_lengths),
                    self.rewards: np.mean(episode_rewards)
                })
            self.writer.add_summary(summary, iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
Example #6
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        possible_actions = np.arange(self.env_runner.nA)
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_action = (possible_actions == all_action[:, None]).astype(
                np.float32)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            returns = np.concatenate([
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ])
            qw_new = self.get_critic_value(all_state)

            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths

            results = self.session.run(
                [self.summary_op, self.critic_train, self.actor_train],
                feed_dict={
                    self.states: all_state,
                    self.critic_target: returns,
                    self.actions_taken: all_action,
                    self.critic_feedback: qw_new,
                    self.critic_rewards: returns,
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
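
get_critic_value in Example #6 is not shown on this page; presumably it just evaluates the critic's value output for a batch of states. A tiny self-contained TF 1.x sketch of that pattern (the one-layer linear "critic" below is a stand-in, not the network from the example):

import numpy as np
import tensorflow as tf  # TF 1.x API, as in the examples above

states = tf.placeholder(tf.float32, [None, 4])
critic_value = tf.squeeze(tf.layers.dense(states, 1), axis=1)  # stand-in critic head

def get_critic_value(session, batch):
    # Run only the critic head to obtain V(s) for each state in the batch.
    return session.run(critic_value, feed_dict={states: batch})

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(get_critic_value(sess, np.random.randn(3, 4).astype(np.float32)))
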
Example #7
    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = np.zeros(len(self.envs))
        for iteration in range(config["n_iter"]):
            self.session.run([self.reset_accum_grads])
            for i, task_runner in enumerate(self.task_runners):
                if self.config["switch_at_iter"] is not None:
                    if iteration >= self.config["switch_at_iter"] and i != (len(self.task_runners) - 1):
                        continue
                    elif iteration < self.config["switch_at_iter"] and i == len(self.task_runners) - 1:
                        continue
                # Collect trajectories until we get timesteps_per_batch total timesteps
                trajectories = task_runner.get_trajectories()
                total_n_trajectories[i] += len(trajectories)
                all_state = np.concatenate([trajectory["state"] for trajectory in trajectories])
                # Compute discounted sums of rewards
                rets = [discount_rewards(trajectory["reward"], config["gamma"]) for trajectory in trajectories]
                max_len = max(len(ret) for ret in rets)
                padded_rets = [np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets]
                # Compute time-dependent baseline
                baseline = np.mean(padded_rets, axis=0)
                # Compute advantage function
                advs = [ret - baseline[:len(ret)] for ret in rets]
                all_action = np.concatenate([trajectory["action"] for trajectory in trajectories])
                all_adv = np.concatenate(advs)
                # Do policy gradient update step
                episode_rewards = np.array([trajectory["reward"].sum() for trajectory in trajectories])  # episode total rewards
                episode_lengths = np.array([len(trajectory["reward"]) for trajectory in trajectories])  # episode lengths
                results = self.session.run([self.losses[i], self.add_accum_grads[i], self.accum_grads], feed_dict={
                    self.states: all_state,
                    self.action_taken: all_action,
                    self.advantage: all_adv
                })
                summary = self.session.run([self.summary_op], feed_dict={
                    self.loss: results[0],
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })

                self.writers[i].add_summary(summary[0], iteration)
                self.writers[i].flush()
                print("Task:", i)
                reporter.print_iteration_stats(iteration, episode_rewards, episode_lengths, total_n_trajectories[i])

            # Apply accumulated gradient after all the gradients of each task are summed
            self.session.run([self.apply_gradients])

        if self.config["save_model"]:
            if not os.path.exists(self.monitor_path):
                os.makedirs(self.monitor_path)
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
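
The switch_at_iter logic above trains every task except the last one before the switch iteration, and only the last task from then on. A tiny standalone trace of that selection rule (3 tasks and a switch at iteration 2 are arbitrary choices):

n_tasks = 3
switch_at_iter = 2
for iteration in range(4):
    active = [i for i in range(n_tasks)
              if (iteration < switch_at_iter and i != n_tasks - 1)
              or (iteration >= switch_at_iter and i == n_tasks - 1)]
    print(iteration, active)
# 0 [0, 1]
# 1 [0, 1]
# 2 [2]
# 3 [2]
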
Example #8
 def learn(self):
     # Assume global shared parameter vectors θ and θv and global shared counter T = 0
     # Assume thread-specific parameter vectors θ' and θ'v
     if self.task_id != 0:
         time.sleep(5)
     with tf.train.MonitoredTrainingSession(
         master=self.server.target,
         is_chief=(self.task_id == 0),
         config=self.config_proto,
         save_summaries_secs=30,
         scaffold=self.scaffold
     ) as sess:
         self.session = sess
         sess.run(self.sync_net)
         self.runner.start_runner(sess, self.writer)
         while not sess.should_stop() and self.global_step < self.config["T_max"]:
             # Synchronize thread-specific parameters θ' = θ and θ'v = θv
             sess.run(self.sync_net)
             trajectory = self.pull_batch_from_queue()
             v = 0 if trajectory.terminal else self.get_critic_value(
                 np.asarray(trajectory.states)[None, -1], trajectory.features[-1])
             rewards_plus_v = np.asarray(trajectory.rewards + [v])
             vpred_t = np.asarray(trajectory.values + [v])
             delta_t = trajectory.rewards + self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
             batch_r = discount_rewards(rewards_plus_v, self.config["gamma"])[:-1]
             batch_adv = discount_rewards(delta_t, self.config["gamma"])
             fetches = [self.summary_op, self.train_op, self._global_step]
             states = np.asarray(trajectory.states)
             feed_dict = {
                 self.states: states,
                 self.actions_taken: np.asarray(trajectory.actions),
                 self.advantage: batch_adv,
                 self.ret: np.asarray(batch_r)
             }
             feature = trajectory.features[0]
             if feature != [] and feature is not None:
                 feed_dict[self.local_network.rnn_state_in] = feature
             summary, _, global_step = sess.run(fetches, feed_dict)
             self.writer.add_summary(summary, global_step)
             self.writer.flush()
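
Example #8 runs the same A3C-style update as Example #3, but inside a tf.train.MonitoredTrainingSession so that variable initialization and chief/worker coordination are handled by TensorFlow; the time.sleep(5) merely gives the chief worker a head start. A single-process skeleton of that session pattern (the cluster setup via tf.train.Server is omitted and the 5-step limit is arbitrary):

import tensorflow as tf  # TF 1.x API, as in the examples above

step = tf.train.get_or_create_global_step()
inc = tf.assign_add(step, 1)

# MonitoredTrainingSession initializes the variables itself; a non-chief worker
# would instead wait until the chief has done so.
with tf.train.MonitoredTrainingSession(is_chief=True) as sess:
    while not sess.should_stop() and sess.run(step) < 5:
        sess.run(inc)
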
Example #9
    def learn(self):
        """Run learning algorithm"""
        config = self.config
        for _ in range(config["n_iter"]):
            # Collect a rollout of n_local_steps environment steps
            trajectory = self.env_runner.get_steps(
                self.config["n_local_steps"])
            v = 0 if trajectory.terminals[-1] else self.get_critic_value(
                np.asarray(trajectory.states)[None, -1], trajectory.features[-1])
            rewards_plus_v = np.asarray(trajectory.rewards + [v])
            vpred_t = np.asarray(trajectory.values + [v])
            delta_t = trajectory.rewards + \
                self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
            batch_r = discount_rewards(rewards_plus_v,
                                       self.config["gamma"])[:-1]
            batch_adv = discount_rewards(delta_t, self.config["gamma"])
            fetches = [self.loss_summary_op, self.train_op, self._global_step]
            states = np.asarray(trajectory.states)
            feed_dict = {
                self.states: states,
                self.actions_taken: np.asarray(trajectory.actions),
                self.advantage: batch_adv,
                self.ret: np.asarray(batch_r)
            }
            feature = trajectory.features[0]
            if feature != [] and feature is not None:
                feed_dict[self.ac_net.rnn_state_in] = feature
            summary, _, global_step = self.session.run(fetches, feed_dict)
            self.writer.add_summary(summary, global_step)
            self.writer.flush()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
Example #10
    def learn(self):
        reporter = Reporter()

        gradient1 = np.zeros_like(self.w1)
        gradient2 = np.zeros_like(self.w2)

        rmsprop1 = np.zeros_like(self.w1)
        rmsprop2 = np.zeros_like(self.w2)

        iteration = 0  # amount of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config["batch_size"])
        episode_rewards = np.zeros(self.config["batch_size"])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory(self.config["episode_max_length"])

            episode_rewards[episode_nr % self.config["batch_size"]] = sum(
                trajectory["reward"])
            episode_lengths[episode_nr % self.config["batch_size"]] = len(
                trajectory["reward"])
            episode_nr += 1
            action_taken = (np.arange(
                self.nA) == trajectory["action"][:, None]).astype(
                    np.float32)  # one-hot encoding
            epdlogp = action_taken - trajectory["prob"]

            # episode_states = np.vstack(encountered_states)

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # print(discounted_episode_rewards)
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1  # guard against zero std, as in Examples #1 and #4
            discounted_episode_rewards /= std
            epdlogp *= np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            change_w1, change_w2 = self.backward_step(trajectory["state"],
                                                      trajectory['x1'],
                                                      epdlogp)

            gradient1 += change_w1
            gradient2 += change_w2

            if episode_nr % self.config["batch_size"] == 0:  # batch is done
                iteration += 1
                rmsprop1 = self.config["decay_rate"] * rmsprop1 + (
                    1 - self.config["decay_rate"]) * gradient1**2
                rmsprop2 = self.config["decay_rate"] * rmsprop2 + (
                    1 - self.config["decay_rate"]) * gradient2**2
                self.w1 += self.config["learning_rate"] * gradient1 / (
                    np.sqrt(rmsprop1) + 1e-5)
                self.w2 += self.config["learning_rate"] * gradient2 / (
                    np.sqrt(rmsprop2) + 1e-5)
                gradient1 = np.zeros_like(self.w1)
                gradient2 = np.zeros_like(self.w2)
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config["draw_frequency"] == 0:
                    reporter.draw_rewards(mean_rewards)
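
The batch update at the end of Example #10 is a hand-rolled RMSProp step applied as gradient ascent (the accumulated gradient already points toward higher expected reward, hence the +=). The same update in isolation, with illustrative values:

import numpy as np

decay_rate, learning_rate = 0.99, 1e-3  # illustrative hyper-parameters
w = np.zeros(4)                         # a weight vector
g = np.ones(4)                          # its accumulated policy gradient for this batch
cache = np.zeros_like(w)                # running average of squared gradients

cache = decay_rate * cache + (1 - decay_rate) * g ** 2
w += learning_rate * g / (np.sqrt(cache) + 1e-5)  # per-parameter scaled ascent step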