Example 1
class AKTThread(Thread):
    """Asynchronous knowledge transfer learner thread. Used to learn using one specific variation of a task."""
    def __init__(self, master, env, task_id, n_iter, start_at_iter=0):
        super(AKTThread, self).__init__()
        self.master = master
        self.config = self.master.config
        self.task_id = task_id
        self.nA = env.action_space.n
        self.n_iter = n_iter
        self.start_at_iter = start_at_iter
        self.add_accum_grad = None  # To be filled in later

        self.build_networks()
        self.states = self.master.states
        self.session = self.master.session
        self.task_runner = EnvRunner(env, TaskPolicy(self.action, self),
                                     self.master.config)

        # Write the summary of each task in a different directory
        self.writer = tf.summary.FileWriter(
            os.path.join(self.master.monitor_path, "task" + str(self.task_id)),
            self.master.session.graph)

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])

    def build_networks(self):
        with tf.variable_scope("task{}".format(self.task_id)):
            self.sparse_representation = tf.Variable(
                tf.truncated_normal(
                    [self.master.config["n_sparse_units"], self.nA],
                    mean=0.0,
                    stddev=0.02))
            self.probs = tf.nn.softmax(
                tf.matmul(
                    self.master.L1,
                    tf.matmul(self.master.knowledge_base,
                              self.sparse_representation)))

            self.action = tf.squeeze(tf.multinomial(tf.log(self.probs), 1),
                                     name="action")

            good_probabilities = tf.reduce_sum(
                tf.multiply(
                    self.probs,
                    tf.one_hot(tf.cast(self.master.action_taken, tf.int32),
                               self.nA)),
                axis=[1])
            eligibility = tf.log(good_probabilities +
                                 1e-10) * self.master.advantage
            self.loss = -tf.reduce_sum(eligibility)

    def run(self):
        """Run the appropriate learning algorithm."""
        if self.master.learning_method == "REINFORCE":
            self.learn_REINFORCE()
        else:
            self.learn_Karpathy()

    def learn_REINFORCE(self):
        """Learn using updates like in the REINFORCE algorithm."""
        reporter = Reporter()
        total_n_trajectories = 0
        iteration = self.start_at_iter
        while iteration < self.n_iter and not self.master.stop_requested:
            iteration += 1
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.task_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], self.config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            results = self.master.session.run(
                [self.loss, self.apply_grad],
                feed_dict={
                    self.master.states: all_state,
                    self.master.action_taken: all_action,
                    self.master.advantage: all_adv
                })
            print("Task:", self.task_id)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
            summary = self.master.session.run(
                [self.master.summary_op],
                feed_dict={
                    self.master.loss: results[0],
                    self.master.reward: np.mean(episode_rewards),
                    self.master.episode_length: np.mean(episode_lengths)
                })
            self.writer.add_summary(summary[0], iteration)
            self.writer.flush()

    def learn_Karpathy(self):
        """Learn using updates like in the Karpathy algorithm."""
        iteration = self.start_at_iter
        # Keep executing episodes until the master requests a stop (e.g. using SIGINT)
        while iteration < self.n_iter and not self.master.stop_requested:
            iteration += 1
            trajectory = self.task_runner.get_trajectory()
            reward = sum(trajectory["reward"])
            action_taken = trajectory["action"]

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = discounted_episode_rewards

            results = self.master.session.run(
                [self.loss, self.apply_grad],
                feed_dict={
                    self.master.states: trajectory["state"],
                    self.master.action_taken: action_taken,
                    self.master.advantage: feedback
                })
            results = self.master.session.run(
                [self.master.summary_op],
                feed_dict={
                    self.master.loss: results[0],
                    self.master.reward: reward,
                    self.master.episode_length: trajectory["steps"]
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()
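
All of the examples on this page compute discounted returns with a discount_rewards helper that comes from the surrounding library and is not shown here. A minimal sketch of what such a helper typically looks like, assuming the usual backward cumulative sum (the library's actual implementation may differ):

import numpy as np

def discount_rewards(rewards, gamma):
    """Return discounted cumulative sums G_t = sum_k gamma**k * r_{t+k}."""
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running_sum = 0.0
    # Walk the episode backwards, accumulating the discounted sum.
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + gamma * running_sum
        discounted[t] = running_sum
    return discounted
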
Example 2
class REINFORCE(Agent):
    """
    REINFORCE with a time-dependent baseline
    """
    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(REINFORCE, self).__init__(**usercfg)
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))
        self.env_runner = EnvRunner(self.env, self, usercfg)
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.05,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.build_network()
        self.make_trainer()

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = tf.train.Saver()
        self.rewards = tf.placeholder("float", name="Rewards")
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_loss = tf.summary.scalar("Loss", self.summary_loss)
        summary_rewards = tf.summary.scalar("Rewards", self.rewards)
        summary_episode_lengths = tf.summary.scalar("Episode_lengths",
                                                    self.episode_lengths)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)

    def choose_action(self, state):
        """Choose an action."""
        action = self.session.run([self.action],
                                  feed_dict={self.states: [state]})[0]
        return action

    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            summary, _ = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.states: all_state,
                    self.a_n: all_action,
                    self.adv_n: all_adv,
                    self.episode_lengths: np.mean(episode_lengths),
                    self.rewards: np.mean(episode_rewards)
                })
            self.writer.add_summary(summary, iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
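
A hypothetical way to drive this agent; the environment name, monitor path, and overridden configuration values below are only illustrative:

import gym

env = gym.make("CartPole-v0")
# Keyword arguments override the defaults set in __init__ (e.g. n_iter, learning_rate).
agent = REINFORCE(env, monitor_path="/tmp/reinforce_monitor", video=False,
                  n_iter=50, learning_rate=0.01)
agent.learn()
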
Example 3
class A2C(Agent):
    """Advantage Actor Critic"""
    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))
        self.env_runner = EnvRunner(self.env, self, usercfg)

        self.config.update(
            dict(timesteps_per_batch=10000,
                 trajectories_per_batch=10,
                 batch_update="timesteps",
                 n_iter=100,
                 gamma=0.99,
                 actor_learning_rate=0.01,
                 critic_learning_rate=0.05,
                 actor_n_hidden=20,
                 critic_n_hidden=20,
                 repeat_n_actions=1,
                 save_model=False))
        self.config.update(usercfg)
        self.build_networks()
        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = tf.train.Saver()
        self.rewards = tf.placeholder("float", name="Rewards")
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_actor_loss = tf.summary.scalar("Actor_loss",
                                               self.summary_actor_loss)
        summary_critic_loss = tf.summary.scalar("Critic_loss",
                                                self.summary_critic_loss)
        summary_rewards = tf.summary.scalar("Rewards", self.rewards)
        summary_episode_lengths = tf.summary.scalar("Episode_lengths",
                                                    self.episode_lengths)
        self.summary_op = tf.summary.merge([
            summary_actor_loss, summary_critic_loss, summary_rewards,
            summary_episode_lengths
        ])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)

    def get_critic_value(self, state):
        return self.session.run([self.critic_value],
                                feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state):
        """Choose an action."""
        return self.session.run([self.action],
                                feed_dict={self.states: [state]})[0]

    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        possible_actions = np.arange(self.env_runner.nA)
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_action = (possible_actions == all_action[:, None]).astype(
                np.float32)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            returns = np.concatenate([
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ])
            qw_new = self.get_critic_value(all_state)

            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths

            results = self.session.run(
                [self.summary_op, self.critic_train, self.actor_train],
                feed_dict={
                    self.states: all_state,
                    self.critic_target: returns,
                    self.actions_taken: all_action,
                    self.critic_feedback: qw_new,
                    self.critic_rewards: returns,
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
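
build_networks is not shown for A2C. A minimal, self-contained sketch of how the placeholders fed in learn() are usually wired into actor and critic losses; this is an assumption for illustration (layer sizes, layer types, and variable names are not taken from the library):

import tensorflow as tf

n_states, n_actions, n_hidden = 4, 2, 20  # illustrative sizes

states = tf.placeholder(tf.float32, [None, n_states], name="states")
actions_taken = tf.placeholder(tf.float32, [None, n_actions], name="actions_taken")
critic_target = tf.placeholder(tf.float32, [None], name="critic_target")
critic_rewards = tf.placeholder(tf.float32, [None], name="critic_rewards")
critic_feedback = tf.placeholder(tf.float32, [None], name="critic_feedback")

# Actor: a small softmax policy network.
h_actor = tf.layers.dense(states, n_hidden, activation=tf.tanh)
probs = tf.layers.dense(h_actor, n_actions, activation=tf.nn.softmax)

# Critic: a small state-value network.
h_critic = tf.layers.dense(states, n_hidden, activation=tf.tanh)
critic_value = tf.squeeze(tf.layers.dense(h_critic, 1), axis=1)

# Critic loss: regress the state value towards the discounted returns.
critic_loss = tf.reduce_mean(tf.square(critic_target - critic_value))

# Actor loss: log-probability of the taken action, weighted by the advantage
# (discounted returns minus the critic's estimate, both fed in as placeholders).
good_probs = tf.reduce_sum(probs * actions_taken, axis=1)
actor_loss = -tf.reduce_mean(tf.log(good_probs + 1e-10) *
                             (critic_rewards - critic_feedback))
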