Example #1
 def __init__(self, env, monitor_path, video=True, **usercfg):
     super(KarpathyCNN, self).__init__(**usercfg)
     self.env = wrappers.Monitor(env,
                                 monitor_path,
                                 force=True,
                                 video_callable=(None if video else False))
     self.nA = env.action_space.n
     self.monitor_path = monitor_path
     # Default configuration. Can be overwritten using keyword arguments.
     self.config.update(
         dict(
             # timesteps_per_batch=10000,
             # n_iter=100,
             n_hidden_units=200,
             learning_rate=1e-3,
             batch_size=10,  # Number of episodes after which to apply the accumulated gradients
             gamma=0.99,  # Discount past rewards by a percentage
             decay=0.99,  # Decay of RMSProp optimizer
             epsilon=1e-9,  # Epsilon of RMSProp optimizer
             draw_frequency=50  # Draw a plot every 50 episodes
         ))
     self.config.update(usercfg)
     self.build_network()
     if self.config["save_model"]:
         tf.add_to_collection("action", self.action)
         tf.add_to_collection("states", self.states)
         self.saver = FastSaver()
Example #2
    def __init__(self, envs, monitor_path, **usercfg):
        super(KnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.n_tasks = len(envs)
        self.monitor_path = monitor_path
        self.nA = envs[0].action_space.n
        self.config.update(dict(
            timesteps_per_batch=10000,
            trajectories_per_batch=10,
            batch_update="timesteps",
            n_iter=100,
            switch_at_iter=None,
            gamma=0.99,  # Discount past rewards by a percentage
            decay=0.9,  # Decay of RMSProp optimizer
            epsilon=1e-9,  # Epsilon of RMSProp optimizer
            learning_rate=0.005,
            n_hidden_units=10,
            repeat_n_actions=1,
            n_sparse_units=10,
            feature_extraction=False
        ))
        self.config.update(usercfg)

        self.build_networks()
        self.task_runners = [EnvRunner(envs[i], TaskPolicy(action, self), self.config) for i, action in enumerate(self.action_tensors)]
        if self.config["save_model"]:
            for action_tensor in self.action_tensors:
                tf.add_to_collection("action", action_tensor)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
Example #3
    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(REINFORCE, self).__init__(**usercfg)
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))
        self.env_runner = EnvRunner(self.env, self, usercfg)
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.05,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.build_network()
        self.make_trainer()

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        self.rewards = tf.placeholder("float", name="Rewards")
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_loss = tf.summary.scalar("Loss", self.summary_loss)
        summary_rewards = tf.summary.scalar("Rewards", self.rewards)
        summary_episode_lengths = tf.summary.scalar("Episode_lengths",
                                                    self.episode_lengths)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)
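The Rewards and Episode_lengths placeholders above follow a common TF1 pattern for logging Python-side statistics to TensorBoard. Below is a minimal, self-contained sketch of that pattern; the path and the fed values are made up and not part of the example.

import tensorflow as tf

rewards = tf.placeholder(tf.float32, name="Rewards")
episode_lengths = tf.placeholder(tf.float32, name="Episode_lengths")
summary_op = tf.summary.merge([
    tf.summary.scalar("Rewards", rewards),
    tf.summary.scalar("Episode_lengths", episode_lengths),
])

with tf.Session() as sess:
    writer = tf.summary.FileWriter("/tmp/task0", sess.graph)
    # Feed statistics computed in Python and write them for one iteration
    summary = sess.run(summary_op,
                       feed_dict={rewards: 100.0, episode_lengths: 200.0})
    writer.add_summary(summary, global_step=0)
    writer.flush()
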
Example #4
    def __init__(self,
                 env_id: str,
                 task_id: int,
                 cluster: tf.train.ClusterDef,
                 monitor_path: str,
                 config: dict,
                 clip_gradients: bool = True,
                 video: bool = False,
                 seed: Optional[int] = None) -> None:
        super(A3CTask, self).__init__()
        self.task_id = task_id
        self.config = config
        self.clip_gradients = clip_gradients
        self.env = make(env_id)
        self.env.seed(seed)
        if task_id == 0:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False)
            )

        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None

        worker_device = "/job:worker/task:{}/cpu:0".format(task_id)
        # Global network
        shared_device = tf.train.replica_device_setter(
            ps_tasks=1,
            worker_device=worker_device,
            cluster=cluster)
        with tf.device(shared_device):
            with tf.variable_scope("global"):
                self.global_network = self.build_networks()
                self.global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
                self._global_step = tf.get_variable(
                    "global_step",
                    [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

        # Local network
        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = self.build_networks()
                self.states = self.local_network.states
                self.actions_taken = self.local_network.actions_taken
                self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
                self.ret = tf.placeholder(tf.float32, [None], name="return")
                self.actor_loss, self.critic_loss, self.loss = self.make_loss()
                self.local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
                self.sync_net = create_sync_net_op(self.global_vars, self.local_vars)
                self.n_steps = tf.shape(self.local_network.states)[0]
                inc_step = self._global_step.assign_add(self.n_steps)

        device = shared_device if self.config["shared_optimizer"] else worker_device
        with tf.device(device):
            apply_optim_op = self.make_trainer()
            self.train_op = tf.group(apply_optim_op, inc_step)

            loss_summaries = self.create_summary_losses()
            self.reward = tf.placeholder("float", name="reward")
            tf.summary.scalar("Reward", self.reward)
            self.episode_length = tf.placeholder("float", name="episode_length")
            tf.summary.scalar("Episode_length", self.episode_length)
            self.summary_op = tf.summary.merge(loss_summaries)

        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
        saver = FastSaver(variables_to_save)
        # Write the summary of each task in a different directory
        self.writer = tf.summary.FileWriter(os.path.join(monitor_path, "task{}".format(task_id)))

        self.runner = RunnerThread(self.env, self, self.config["n_local_steps"], task_id == 0 and video)

        self.server = tf.train.Server(
            cluster,
            job_name="worker",
            task_index=task_id,
            config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)
        )

        def init_fn(scaffold, sess):
            sess.run(init_all_op)

        self.report_uninit_op = tf.report_uninitialized_variables(variables_to_save)

        self.scaffold = tf.train.Scaffold(
            init_op=init_op,
            init_fn=init_fn,
            ready_for_local_init_op=self.report_uninit_op,
            saver=saver,
            ready_op=self.report_uninit_op
        )

        self.config_proto = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(task_id)])

        self.session = None
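create_sync_net_op is called above but not shown in this example. A plausible minimal implementation (an assumption, not the library's actual code) simply groups one assign per (global, local) variable pair:

import tensorflow as tf

def create_sync_net_op(source_vars, target_vars):
    # Copy every global (source) variable into the matching local (target)
    # variable; running the returned op synchronizes the local network.
    return tf.group(*[target.assign(source)
                      for source, target in zip(source_vars, target_vars)])
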
Example #5
    def __init__(self,
                 envs,
                 monitor_path,
                 learning_method="REINFORCE",
                 **usercfg):
        super(AsyncKnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.learning_method = learning_method
        self.monitor_path = monitor_path
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=200,
                switch_at_iter=None,  # None to deactivate, otherwise the iteration at which to switch
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_task_variations=3,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.stop_requested = False

        self.session = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        self.global_step = tf.get_variable("global_step", [],
                                           tf.int32,
                                           initializer=tf.constant_initializer(
                                               0, dtype=tf.int32),
                                           trainable=False)
        self.build_networks()

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.reward = tf.placeholder("float", name="reward")
        summary_rewards = tf.summary.scalar("Reward", self.reward)
        self.episode_length = tf.placeholder("float", name="episode_length")
        summary_episode_lengths = tf.summary.scalar("Episode_length",
                                                    self.episode_length)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])

        self.jobs = []
        for i, env in enumerate(self.envs):
            self.jobs.append(
                self.make_thread(
                    env,
                    i,
                    self.config["switch_at_iter"]
                    if self.config["switch_at_iter"] is not None
                    and i != len(self.envs) - 1 else self.config["n_iter"],
                    start_at_iter=(0 if self.config["switch_at_iter"] is None
                                   or i != len(self.envs) - 1 else
                                   self.config["switch_at_iter"])))

        for i, job in enumerate(self.jobs):
            only_sparse = (self.config["switch_at_iter"] is not None
                           and i == len(self.jobs) - 1)
            grads = tf.gradients(
                job.loss, (self.shared_vars if not (only_sparse) else []) +
                [job.sparse_representation])
            job.apply_grad = job.optimizer.apply_gradients(
                zip(grads, (self.shared_vars if not (only_sparse) else []) +
                    [job.sparse_representation]),
                global_step=self.global_step)

        self.session.run(tf.global_variables_initializer())

        if self.config["save_model"]:
            for job in self.jobs:
                tf.add_to_collection("action", job.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
Example #6
class AsyncKnowledgeTransfer(Agent):
    """Asynchronous learner for variations of a task."""
    def __init__(self,
                 envs,
                 monitor_path,
                 learning_method="REINFORCE",
                 **usercfg):
        super(AsyncKnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.learning_method = learning_method
        self.monitor_path = monitor_path
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=200,
                switch_at_iter=None,  # None to deactivate, otherwise the iteration at which to switch
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_task_variations=3,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.stop_requested = False

        self.session = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        self.global_step = tf.get_variable("global_step", [],
                                           tf.int32,
                                           initializer=tf.constant_initializer(
                                               0, dtype=tf.int32),
                                           trainable=False)
        self.build_networks()

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.reward = tf.placeholder("float", name="reward")
        summary_rewards = tf.summary.scalar("Reward", self.reward)
        self.episode_length = tf.placeholder("float", name="episode_length")
        summary_episode_lengths = tf.summary.scalar("Episode_length",
                                                    self.episode_length)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])

        self.jobs = []
        for i, env in enumerate(self.envs):
            self.jobs.append(
                self.make_thread(
                    env,
                    i,
                    self.config["switch_at_iter"]
                    if self.config["switch_at_iter"] is not None
                    and i != len(self.envs) - 1 else self.config["n_iter"],
                    start_at_iter=(0 if self.config["switch_at_iter"] is None
                                   or i != len(self.envs) - 1 else
                                   self.config["switch_at_iter"])))

        for i, job in enumerate(self.jobs):
            only_sparse = (self.config["switch_at_iter"] is not None
                           and i == len(self.jobs) - 1)
            grads = tf.gradients(
                job.loss, (self.shared_vars if not (only_sparse) else []) +
                [job.sparse_representation])
            job.apply_grad = job.optimizer.apply_gradients(
                zip(grads, (self.shared_vars if not (only_sparse) else []) +
                    [job.sparse_representation]),
                global_step=self.global_step)

        self.session.run(tf.global_variables_initializer())

        if self.config["save_model"]:
            for job in self.jobs:
                tf.add_to_collection("action", job.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_networks(self):
        with tf.variable_scope("shared"):
            self.states = tf.placeholder(
                tf.float32,
                [None] + list(self.envs[0].observation_space.shape),
                name="states")
            self.action_taken = tf.placeholder(tf.float32, name="action_taken")
            self.advantage = tf.placeholder(tf.float32, name="advantage")

            if self.config["feature_extraction"]:
                self.L1 = tf.contrib.layers.fully_connected(
                    inputs=self.states,
                    num_outputs=self.config["n_hidden_units"],
                    activation_fn=tf.tanh,
                    weights_initializer=tf.truncated_normal_initializer(
                        mean=0.0, stddev=0.02),
                    biases_initializer=tf.zeros_initializer(),
                    scope="L1")
            else:
                self.L1 = self.states
            self.knowledge_base = tf.Variable(
                tf.truncated_normal(
                    [self.L1.get_shape()[-1].value, self.config["n_sparse_units"]],
                    mean=0.0,
                    stddev=0.02),
                name="knowledge_base")

            self.shared_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

    def signal_handler(self, signal, frame):
        """When a (SIGINT) signal is received, request the threads (via the master) to stop after completing an iteration."""
        logging.info("SIGINT signal received: Requesting a stop...")
        self.stop_requested = True

    def learn(self):
        signal.signal(signal.SIGINT, self.signal_handler)
        if self.config["switch_at_iter"] is None:
            idx = None
        else:
            idx = -1
        for job in self.jobs[:idx]:
            job.start()
        for job in self.jobs[:idx]:
            job.join()
        try:
            self.jobs[idx].start()
            self.jobs[idx].join()
        except TypeError:  # idx is None
            pass

        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))

    def make_thread(self, env, task_id, n_iter, start_at_iter=0):
        return AKTThread(self,
                         env,
                         task_id,
                         n_iter,
                         start_at_iter=start_at_iter)
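The signal_handler/stop_requested pair above implements a cooperative shutdown: SIGINT only sets a flag, and each worker thread checks it between iterations. A self-contained sketch of the same pattern, where the sleep stands in for one training iteration:

import signal
import threading
import time

stop_requested = False

def signal_handler(signum, frame):
    global stop_requested
    print("SIGINT received: requesting a stop...")
    stop_requested = True

def worker(task_id):
    for iteration in range(1000):
        if stop_requested:
            break
        time.sleep(0.1)  # one "training iteration"

signal.signal(signal.SIGINT, signal_handler)
threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    while t.is_alive():
        t.join(timeout=0.5)  # join with a timeout so the signal handler can run promptly
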
Example #7
    def __init__(self,
                 env,
                 monitor_path: str,
                 video: bool = True,
                 **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))

        self.config.update(
            dict(n_iter=100,
                 gamma=0.99,
                 learning_rate=0.001,
                 n_hidden_units=20,
                 n_hidden_layers=1,
                 gradient_clip_value=0.5,
                 n_local_steps=20,
                 vf_coef=0.5,
                 entropy_coef=0.01,
                 loss_reducer="mean",
                 save_model=False))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(self.config["learning_rate"],
                                                name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(grads,
                                          self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar(
            "model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar(
            "model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss",
                                         tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env,
                                    self,
                                    usercfg,
                                    summary_writer=self.writer)
        return
Example #8
class A2C(Agent):
    """Advantage Actor Critic"""
    def __init__(self,
                 env,
                 monitor_path: str,
                 video: bool = True,
                 **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))

        self.config.update(
            dict(n_iter=100,
                 gamma=0.99,
                 learning_rate=0.001,
                 n_hidden_units=20,
                 n_hidden_layers=1,
                 gradient_clip_value=0.5,
                 n_local_steps=20,
                 vf_coef=0.5,
                 entropy_coef=0.01,
                 loss_reducer="mean",
                 save_model=False))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(self.config["learning_rate"],
                                                name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(grads,
                                          self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar(
            "model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar(
            "model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss",
                                         tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env,
                                    self,
                                    usercfg,
                                    summary_writer=self.writer)
        return

    def build_networks(self):
        raise NotImplementedError("Abstract method")

    def make_loss(self):
        raise NotImplementedError("Abstract method")

    @property
    def global_step(self):
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, features):
        return self.session.run([self.ac_net.value],
                                feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, features) -> dict:
        action, value = self.session.run(
            [self.ac_net.action, self.ac_net.value],
            feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action) -> int:
        return np.argmax(action)

    def learn(self):
        """Run learning algorithm"""
        config = self.config
        for _ in range(config["n_iter"]):
            # Collect n_local_steps timesteps of experience before each update
            trajectory = self.env_runner.get_steps(
                self.config["n_local_steps"])
            v = 0 if trajectory.terminals[-1] else self.get_critic_value(
                np.asarray(trajectory.states)[None, -1], trajectory.features[-1])
            rewards_plus_v = np.asarray(trajectory.rewards + [v])
            vpred_t = np.asarray(trajectory.values + [v])
            delta_t = trajectory.rewards + \
                self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
            batch_r = discount_rewards(rewards_plus_v,
                                       self.config["gamma"])[:-1]
            batch_adv = discount_rewards(delta_t, self.config["gamma"])
            fetches = [self.loss_summary_op, self.train_op, self._global_step]
            states = np.asarray(trajectory.states)
            feed_dict = {
                self.states: states,
                self.actions_taken: np.asarray(trajectory.actions),
                self.advantage: batch_adv,
                self.ret: np.asarray(batch_r)
            }
            feature = trajectory.features[0]
            if feature != [] and feature is not None:
                feed_dict[self.ac_net.rnn_state_in] = feature
            summary, _, global_step = self.session.run(fetches, feed_dict)
            self.writer.add_summary(summary, global_step)
            self.writer.flush()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
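discount_rewards is used in learn() above but not defined in this example. The standard implementation it relies on is a discounted cumulative sum; the sketch below (with made-up numbers) mirrors how learn() builds the return and advantage targets. Discounting the TD errors by gamma in this way corresponds to generalized advantage estimation with lambda = 1.

import numpy as np

def discount_rewards(x, gamma):
    # out[t] = sum over k >= t of gamma**(k - t) * x[k]
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

gamma = 0.99
rewards = [1.0, 0.0, 1.0]
values = [0.5, 0.4, 0.9]   # critic predictions for the visited states
bootstrap_v = 0.7          # critic value of the state after the last step
rewards_plus_v = np.asarray(rewards + [bootstrap_v])
vpred_t = np.asarray(values + [bootstrap_v])
delta_t = np.asarray(rewards) + gamma * vpred_t[1:] - vpred_t[:-1]
batch_r = discount_rewards(rewards_plus_v, gamma)[:-1]  # bootstrapped n-step returns
batch_adv = discount_rewards(delta_t, gamma)            # discounted TD errors (advantages)
print(batch_r, batch_adv)
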
Example #9
    def __init__(self, env, monitor_path, **usercfg):
        super(DPPO, self).__init__()
        self.env = env
        self.env_name: str = env.spec.id
        self.monitor_path: str = monitor_path

        self.comm = MPI.COMM_SELF

        self.config.update(dict(
            n_workers=3,
            n_hidden_units=20,
            n_hidden_layers=2,
            gamma=0.99,
            gae_lambda=0.95,
            learning_rate=2.5e-4,
            n_iter=10000,
            n_epochs=4,
            n_local_steps=128,
            gradient_clip_value=0.5,
            vf_coef=0.5,
            entropy_coef=0.01,
            cso_epsilon=0.1,  # Clipped surrogate objective epsilon
            learn_method="batches",
            batch_size=64,
            save_model=False
        ))
        self.config.update(usercfg)

        self.task_type = None  # To be filled in by subclasses

        self.n_updates: int = 0

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        self._global_step = tf.get_variable(
            "global_step",
            [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        self.set_old_to_new = tf.group(
            *[v1.assign(v2) for v1, v2 in zip(self.old_network_vars, self.new_network_vars)])

        # Reduces by taking the mean instead of summing
        self.actor_loss = -tf.reduce_mean(self.make_actor_loss(self.old_network, self.new_network, self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self.n_steps = tf.shape(self.states)[0]
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        summary_actor_loss = tf.summary.scalar(
            "model/Actor_loss", self.actor_loss)
        summary_critic_loss = tf.summary.scalar(
            "model/Critic_loss", self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)
        summary_entropy = tf.summary.scalar("model/Entropy", -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar(
            "model/grad_global_norm", tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        self.model_summary_op = tf.summary.merge([
            summary_actor_loss,
            summary_critic_loss,
            summary_loss,
            summary_entropy,
            summary_grad_norm,
            summary_var_norm
        ])
        self.writer = tf.summary.FileWriter(os.path.join(
            self.monitor_path, "master"))

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.config["gradient_clip_value"])

        with tf.variable_scope("optimizer"):
            self.optimizer = tf.train.AdamOptimizer(
                self.config["learning_rate"], name="optim")
            apply_grads = self.optimizer.apply_gradients(
                zip(grads, self.new_network_vars))

            inc_step = self._global_step.assign_add(self.n_steps)
            self.train_op = tf.group(apply_grads, inc_step)
        optimizer_variables = [var for var in tf.global_variables() if var.name.startswith("optimizer")]
        self.init_op = tf.variables_initializer(self.new_network_vars + optimizer_variables + [self._global_step])
Example #10
class KarpathyCNN(Agent):
    """Karpathy policy gradient learner using a convolutional neural network"""
    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(KarpathyCNN, self).__init__(**usercfg)
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))
        self.nA = env.action_space.n
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                # timesteps_per_batch=10000,
                # n_iter=100,
                n_hidden_units=200,
                learning_rate=1e-3,
                batch_size=10,  # Number of episodes after which to apply the accumulated gradients
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.99,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                draw_frequency=50  # Draw a plot every 50 episodes
            ))
        self.config.update(usercfg)
        self.build_network()
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_network(self):
        image_size = 80
        # Number of feature maps (e.g. 3 for RGB images); 1 here because we use grayscale images
        image_depth = 1

        self.states = tf.placeholder(
            tf.float32, [None, image_size, image_size, image_depth],
            name="states")

        # Convolution layer 1
        depth = 32
        patch_size = 4
        self.w1 = tf.Variable(
            tf.truncated_normal([patch_size, patch_size, image_depth, depth],
                                stddev=0.01))
        self.b1 = tf.Variable(tf.zeros([depth]))
        self.L1 = tf.nn.relu(
            tf.nn.conv2d(
                self.states, self.w1, strides=[1, 2, 2, 1], padding="SAME") +
            self.b1)
        self.L1 = tf.nn.max_pool(self.L1,
                                 ksize=[1, 2, 2, 1],
                                 strides=[1, 2, 2, 1],
                                 padding="SAME")

        # Convolution layer 2
        self.w2 = tf.Variable(
            tf.truncated_normal([patch_size, patch_size, depth, depth],
                                stddev=0.01))
        self.b2 = tf.Variable(tf.zeros([depth]))
        self.L2 = tf.nn.relu(
            tf.nn.conv2d(
                self.L1, self.w2, strides=[1, 2, 2, 1], padding="SAME") +
            self.b2)

        # Flatten
        shape = self.L2.get_shape().as_list()
        # -1 stands for the (unknown) batch size
        reshape = tf.reshape(self.L2, [-1, shape[1] * shape[2] * shape[3]])

        # Fully connected layer 1
        self.L3 = tf.contrib.layers.fully_connected(
            inputs=reshape,
            num_outputs=self.config["n_hidden_units"],
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                stddev=0.02),
            biases_initializer=tf.zeros_initializer())

        # Fully connected layer 2
        self.probs = tf.contrib.layers.fully_connected(
            inputs=self.L3,
            num_outputs=self.nA,
            activation_fn=tf.nn.softmax,
            weights_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                stddev=0.02),
            biases_initializer=tf.zeros_initializer())

        self.action = tf.squeeze(tf.multinomial(tf.log(self.probs), 1),
                                 name="action")

        # The fully connected layers above are built with tf.contrib.layers, so
        # their weights are not bound to attributes; collect every trainable
        # variable of the graph instead of the (undefined) w3/b3/w4/b4.
        self.vars = tf.trainable_variables()

        self.action_taken = tf.placeholder(tf.float32,
                                           shape=[None, self.nA],
                                           name="action_taken")
        self.feedback = tf.placeholder(tf.float32,
                                       shape=[None, self.nA],
                                       name="feedback")
        loss = tf.reduce_mean(
            tf.squared_difference(self.action_taken, self.probs) *
            self.feedback)

        self.create_accumulative_grads = create_accumulative_gradients_op(
            self.vars)
        self.accumulate_grads = add_accumulative_gradients_op(
            self.vars, self.create_accumulative_grads, loss)
        self.reset_accumulative_grads = reset_accumulative_gradients_op(
            self.vars, self.create_accumulative_grads)

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])

        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.create_accumulative_grads, self.vars))

        init = tf.global_variables_initializer()

        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)

    def choose_action(self, state):
        return self.session.run([self.action],
                                feed_dict={self.states: [state]})[0]

    def get_trajectory(self, render=False):
        """
        Run agent-environment loop for one whole episode (trajectory)
        Return dictionary of results
        Note that this function returns more than the get_trajectory in the EnvRunner class.
        """
        state = preprocess_image(self.env.reset())
        prev_state = state
        states = []
        actions = []
        rewards = []
        for _ in range(self.config["episode_max_length"]):
            delta = state - prev_state
            action = self.choose_action(delta)
            states.append(delta)
            prev_state = state
            state, rew, done, _ = self.env.step(action)
            state = preprocess_image(state)
            actions.append(action)
            rewards.append(rew)
            if done:
                break
            if render:
                self.env.render()
        return {
            "reward": np.array(rewards),
            "state": np.array(states),
            "action": np.array(actions),
        }

    def learn(self):
        reporter = Reporter()

        self.session.run([self.reset_accumulative_grads])

        iteration = 0  # number of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config["batch_size"])
        episode_rewards = np.zeros(self.config["batch_size"])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory()

            episode_rewards[episode_nr % self.config["batch_size"]] = sum(
                trajectory["reward"])
            episode_lengths[episode_nr % self.config["batch_size"]] = len(
                trajectory["reward"])
            episode_nr += 1
            # One-hot encode the actions that were taken
            action_taken = (np.arange(self.nA) ==
                            trajectory["action"][:, None]).astype(np.float32)

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            self.session.run(
                [self.accumulate_grads],
                feed_dict={
                    self.states: trajectory["state"],
                    self.action_taken: action_taken,
                    self.feedback: feedback
                })
            if episode_nr % self.config["batch_size"] == 0:  # batch is done
                iteration += 1
                self.session.run([self.apply_gradients])
                self.session.run([self.reset_accumulative_grads])
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config["draw_frequency"] == 0:
                    reporter.draw_rewards(mean_rewards)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
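create_accumulative_gradients_op, add_accumulative_gradients_op and reset_accumulative_gradients_op are used above but not shown. A plausible minimal implementation (assumed, not necessarily the library's code) keeps one non-trainable accumulator per variable:

import tensorflow as tf

def create_accumulative_gradients_op(variables):
    # One zero-initialized, non-trainable accumulator per trainable variable
    return [tf.Variable(tf.zeros_like(v.initialized_value()), trainable=False)
            for v in variables]

def add_accumulative_gradients_op(variables, accumulators, loss):
    # Add the gradients of `loss` w.r.t. `variables` to the accumulators
    grads = tf.gradients(loss, variables)
    return tf.group(*[acc.assign_add(grad)
                      for acc, grad in zip(accumulators, grads)])

def reset_accumulative_gradients_op(variables, accumulators):
    # Zero the accumulators again (typically right after applying them);
    # `variables` is kept only to match the call signature used above.
    return tf.group(*[acc.assign(tf.zeros_like(acc)) for acc in accumulators])
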
Example #11
    def __init__(self, env, monitor_path: str, video=False, **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2  # Clipped surrogate objective epsilon
            ))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        ratio = tf.exp(self.new_network.action_log_prob -
                       self.old_network.action_log_prob)
        ratio_clipped = tf.clip_by_value(ratio,
                                         1.0 - self.config["cso_epsilon"],
                                         1.0 + self.config["cso_epsilon"])
        cso_loss = tf.minimum(ratio * self.advantage,
                              ratio_clipped * self.advantage)
        self.actor_loss = -tf.reduce_mean(cso_loss)
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]
        self.session = tf.Session()
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        # tf.nn.moments returns (mean, variance), so take the square root
        # before logging a standard deviation
        adv_mean, adv_variance = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std",
                                            tf.sqrt(adv_variance))

        ratio_mean, ratio_variance = tf.nn.moments(ratio, axes=[0])
        summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        summary_ratio_std = tf.summary.scalar("model/ratio/std",
                                              tf.sqrt(ratio_variance))

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))

        summary_ret = tf.summary.scalar("model/return/mean",
                                        tf.reduce_mean(self.ret))
        summary_entropy = tf.summary.scalar("model/entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        summaries = []
        for v in tf.trainable_variables():
            if "new_network" in v.name:
                summaries.append(tf.summary.histogram(v.name, v))
        summaries += [
            summary_actor_loss, summary_critic_loss, summary_loss,
            summary_adv_mean, summary_adv_std, summary_ratio_mean,
            summary_ratio_std, summary_new_log_prob_mean,
            summary_old_log_prob_mean, summary_ret, summary_entropy,
            summary_grad_norm, summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        init = tf.global_variables_initializer()
        self.session.run(init)
        return
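A NumPy-only sketch of the clipped surrogate objective built above, with made-up log-probabilities and advantages, to show what the ratio clipping does:

import numpy as np

cso_epsilon = 0.2
new_log_prob = np.array([-0.9, -1.1, -0.3])
old_log_prob = np.array([-1.0, -1.0, -1.0])
advantage = np.array([1.0, -0.5, 2.0])

ratio = np.exp(new_log_prob - old_log_prob)                      # ~[1.11, 0.90, 2.01]
ratio_clipped = np.clip(ratio, 1.0 - cso_epsilon, 1.0 + cso_epsilon)
cso = np.minimum(ratio * advantage, ratio_clipped * advantage)   # third element gets capped
actor_loss = -np.mean(cso)  # minimized, which maximizes the surrogate objective
print(ratio, actor_loss)
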
Example #12
class PPO(Agent):
    """Proximal Policy Optimization agent."""
    RNN = False

    def __init__(self, env, monitor_path: str, video=False, **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2  # Clipped surrogate objective epsilon
            ))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        ratio = tf.exp(self.new_network.action_log_prob -
                       self.old_network.action_log_prob)
        ratio_clipped = tf.clip_by_value(ratio,
                                         1.0 - self.config["cso_epsilon"],
                                         1.0 + self.config["cso_epsilon"])
        cso_loss = tf.minimum(ratio * self.advantage,
                              ratio_clipped * self.advantage)
        self.actor_loss = -tf.reduce_mean(cso_loss)
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]
        self.session = tf.Session()
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        # tf.nn.moments returns (mean, variance), so take the square root
        # before logging a standard deviation
        adv_mean, adv_variance = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std",
                                            tf.sqrt(adv_variance))

        ratio_mean, ratio_variance = tf.nn.moments(ratio, axes=[0])
        summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        summary_ratio_std = tf.summary.scalar("model/ratio/std",
                                              tf.sqrt(ratio_variance))

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))

        summary_ret = tf.summary.scalar("model/return/mean",
                                        tf.reduce_mean(self.ret))
        summary_entropy = tf.summary.scalar("model/entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        summaries = []
        for v in tf.trainable_variables():
            if "new_network" in v.name:
                summaries.append(tf.summary.histogram(v.name, v))
        summaries += [
            summary_actor_loss, summary_critic_loss, summary_loss,
            summary_adv_mean, summary_adv_std, summary_ratio_mean,
            summary_ratio_std, summary_new_log_prob_mean,
            summary_old_log_prob_mean, summary_ret, summary_entropy,
            summary_grad_norm, summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        init = tf.global_variables_initializer()
        self.session.run(init)
        return

    def make_actor_loss(self, old_network, new_network, advantage):
        return ppo_loss(old_network.action_log_prob,
                        new_network.action_log_prob,
                        self.config["cso_epsilon"], advantage)

    def build_networks(self):
        raise NotImplementedError

    @property
    def global_step(self):
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, *rest):
        return self.session.run([self.value],
                                feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, *rest):
        action, value = self.session.run([self.action, self.value],
                                         feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action):
        return np.argmax(action)

    def get_processed_trajectories(self):
        experiences = self.env_runner.get_steps(self.config["n_local_steps"],
                                                stop_at_trajectory_end=False)
        T = experiences.steps
        v = 0 if experiences.terminals[-1] else self.get_critic_value(
            np.asarray(experiences.states)[None, -1], experiences.features[-1])
        vpred = np.asarray(experiences.values + [v])
        gamma = self.config["gamma"]
        lambda_ = self.config["gae_lambda"]
        gaelam = advantages = np.empty(T, 'float32')
        last_gaelam = 0
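        # Generalized Advantage Estimation (GAE), computed backwards over the batch:
        #   delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        # The returns fed to the critic are then R_t = A_t + V(s_t).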
        for t in reversed(range(T)):
            nonterminal = 1 - experiences.terminals[t]
            delta = (experiences.rewards[t]
                     + gamma * vpred[t + 1] * nonterminal - vpred[t])
            gaelam[t] = last_gaelam = (
                delta + gamma * lambda_ * nonterminal * last_gaelam)
        rs = advantages + experiences.values
        return experiences.states, experiences.actions, advantages, rs, experiences.features

    def learn(self):
        """Run learning algorithm"""
        config = self.config
        n_updates = 0
        for _ in range(config["n_iter"]):
            # Collect n_local_steps of experience and compute GAE advantages and returns
            states, actions, advs, rs, _ = self.get_processed_trajectories()
            advs = np.array(advs)
            # Standardize advantages; the small constant guards against a zero std
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            self.session.run(self.set_old_to_new)
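            # Editorial note: `set_old_to_new` is built elsewhere in this class and is
            # not shown in this example; it is assumed to group assign ops that copy
            # the current policy's weights into the "old" network, roughly
            #   tf.group(*[old.assign(new) for old, new
            #              in zip(old_network_vars, self.new_network_vars)])
            # so that the PPO ratio is taken against the pre-update policy.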

            indices = np.arange(len(states))
            for _ in range(self.config["n_epochs"]):
                np.random.shuffle(indices)

                batch_size = self.config["batch_size"]
                for j in range(0, len(states), batch_size):
                    batch_indices = indices[j:(j + batch_size)]
                    batch_states = np.array(states)[batch_indices]
                    batch_actions = np.array(actions)[batch_indices]
                    batch_advs = np.array(advs)[batch_indices]
                    batch_rs = np.array(rs)[batch_indices]
                    losses = [self.actor_loss, self.critic_loss, self.loss]
                    fetches = losses + [self.model_summary_op, self.train_op]
                    feed_dict = {
                        self.states: batch_states,
                        self.old_network.states: batch_states,
                        self.actions_taken: batch_actions,
                        self.old_network.actions_taken: batch_actions,
                        self.advantage: batch_advs,
                        self.ret: batch_rs
                    }
                    results = self.session.run(fetches, feed_dict)
                    self.writer.add_summary(results[len(losses)], n_updates)
                    n_updates += 1
                self.writer.flush()

            if self.config["save_model"]:
                self.saver.save(self.session,
                                os.path.join(self.monitor_path, "model"))
Example #13
class KnowledgeTransfer(Agent):
    """Learner for variations of a task."""
    def __init__(self, envs, monitor_path, **usercfg):
        super(KnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.n_tasks = len(envs)
        self.monitor_path = monitor_path
        self.nA = envs[0].action_space.n
        self.config.update(dict(
            timesteps_per_batch=10000,
            trajectories_per_batch=10,
            batch_update="timesteps",
            n_iter=100,
            switch_at_iter=None,
            gamma=0.99,  # Discount past rewards by a percentage
            decay=0.9,  # Decay of RMSProp optimizer
            epsilon=1e-9,  # Epsilon of RMSProp optimizer
            learning_rate=0.005,
            n_hidden_units=10,
            repeat_n_actions=1,
            n_sparse_units=10,
            feature_extraction=False
        ))
        self.config.update(usercfg)

        self.build_networks()
        self.task_runners = [EnvRunner(envs[i], TaskPolicy(action, self), self.config) for i, action in enumerate(self.action_tensors)]
        if self.config["save_model"]:
            for action_tensor in self.action_tensors:
                tf.add_to_collection("action", action_tensor)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_networks(self):
        self.session = tf.Session()

        with tf.variable_scope("shared"):
            self.states = tf.placeholder(tf.float32, [None] + list(self.envs[0].observation_space.shape), name="states")
            self.action_taken = tf.placeholder(tf.float32, name="action_taken")
            self.advantage = tf.placeholder(tf.float32, name="advantage")

            L1 = None
            if self.config["feature_extraction"]:
                L1 = tf.contrib.layers.fully_connected(
                    inputs=self.states,
                    num_outputs=self.config["n_hidden_units"],
                    activation_fn=tf.tanh,
                    weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
                    biases_initializer=tf.zeros_initializer(),
                    scope="L1")
            else:
                L1 = self.states

            knowledge_base = tf.Variable(
                tf.truncated_normal(
                    [L1.get_shape()[-1].value, self.config["n_sparse_units"]],
                    mean=0.0,
                    stddev=0.02),
                name="knowledge_base")

            self.shared_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="shared")

        # Every task has its own (sparse) representation
        sparse_representations = [
            tf.Variable(tf.truncated_normal([self.config["n_sparse_units"], self.nA], mean=0.0, stddev=0.02), name="sparse%d" % i)
            for i in range(self.n_tasks)
        ]

        self.probs_tensors = [tf.nn.softmax(tf.matmul(L1, tf.matmul(knowledge_base, s))) for s in sparse_representations]
        self.action_tensors = [tf.squeeze(tf.multinomial(tf.log(probs), 1)) for probs in self.probs_tensors]
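        # The policy of task i factorizes through the shared knowledge base:
        # logits_i = L1 @ (knowledge_base @ sparse_i), so each task's
        # (n_hidden_units x nA) weight matrix is a combination of the same
        # n_sparse_units shared basis vectors, and only the small sparse_i
        # matrix is task-specific.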

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"]
        )
        net_vars = self.shared_vars + sparse_representations
        self.accum_grads = create_accumulative_gradients_op(net_vars, 0)

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.rewards = tf.placeholder("float", name="Rewards")
        summary_rewards = tf.summary.scalar("Reward", self.rewards)
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_episode_lengths = tf.summary.scalar("Length", self.episode_lengths)
        self.summary_op = tf.summary.merge([summary_loss, summary_rewards, summary_episode_lengths])

        self.writers = []
        self.losses = []

        regularizer = tf.contrib.layers.l1_regularizer(.05)
        for i, probabilities in enumerate(self.probs_tensors):
            good_probabilities = tf.reduce_sum(
                tf.multiply(
                    probabilities,
                    tf.one_hot(tf.cast(self.action_taken, tf.int32), self.nA)),
                reduction_indices=[1])
            eligibility = tf.log(good_probabilities) * self.advantage
            loss = -tf.reduce_sum(eligibility) + regularizer(sparse_representations[i])
            self.losses.append(loss)
            writer = tf.summary.FileWriter(os.path.join(self.monitor_path, "task" + str(i)), self.session.graph)
            self.writers.append(writer)

        # An add op for every task & its loss
        self.add_accum_grads = []
        for i, loss in enumerate(self.losses):
            # Use all variables if the task-switching experiment is disabled or this is not the last task
            all_vars = self.config["switch_at_iter"] is None or i != len(self.losses) - 1
            self.add_accum_grads.append(add_accumulative_gradients_op(
                (self.shared_vars if all_vars else []) + [sparse_representations[i]],
                ([self.accum_grads[0]] if all_vars else []) + [self.accum_grads[i + 1]],
                loss,
                i
            ))

        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.accum_grads, net_vars))
        self.reset_accum_grads = reset_accumulative_gradients_op(net_vars, self.accum_grads, 0)

        init = tf.global_variables_initializer()

        # Launch the graph.
        self.session.run(init)

    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = np.zeros(len(self.envs))
        for iteration in range(config["n_iter"]):
            self.session.run([self.reset_accum_grads])
            for i, task_runner in enumerate(self.task_runners):
                if self.config["switch_at_iter"] is not None:
                    if iteration >= self.config["switch_at_iter"] and i != (len(self.task_runners) - 1):
                        continue
                    elif iteration < self.config["switch_at_iter"] and i == len(self.task_runners) - 1:
                        continue
                # Collect trajectories until we get timesteps_per_batch total timesteps
                trajectories = task_runner.get_trajectories()
                total_n_trajectories[i] += len(trajectories)
                all_state = np.concatenate([trajectory["state"] for trajectory in trajectories])
                # Compute discounted sums of rewards
                rets = [discount_rewards(trajectory["reward"], config["gamma"]) for trajectory in trajectories]
                max_len = max(len(ret) for ret in rets)
                padded_rets = [np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets]
                # Compute time-dependent baseline
                baseline = np.mean(padded_rets, axis=0)
                # Compute advantage function
                advs = [ret - baseline[:len(ret)] for ret in rets]
                all_action = np.concatenate([trajectory["action"] for trajectory in trajectories])
                all_adv = np.concatenate(advs)
                # Do policy gradient update step
                episode_rewards = np.array([trajectory["reward"].sum() for trajectory in trajectories])  # episode total rewards
                episode_lengths = np.array([len(trajectory["reward"]) for trajectory in trajectories])  # episode lengths
                results = self.session.run([self.losses[i], self.add_accum_grads[i], self.accum_grads], feed_dict={
                    self.states: all_state,
                    self.action_taken: all_action,
                    self.advantage: all_adv
                })
                summary = self.session.run([self.summary_op], feed_dict={
                    self.loss: results[0],
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })

                self.writers[i].add_summary(summary[0], iteration)
                self.writers[i].flush()
                print("Task:", i)
                reporter.print_iteration_stats(iteration, episode_rewards, episode_lengths, total_n_trajectories[i])

            # Apply the accumulated gradients once the gradients of all tasks have been summed
            self.session.run([self.apply_gradients])

        if self.config["save_model"]:
            if not os.path.exists(self.monitor_path):
                os.makedirs(self.monitor_path)
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
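
# Editorial note: create_accumulative_gradients_op, add_accumulative_gradients_op and
# reset_accumulative_gradients_op are imported from elsewhere in this code base and are
# not shown here. A minimal, hypothetical sketch of the accumulate/apply/reset pattern
# they are assumed to implement (function and argument names are illustrative; assumes
# `import tensorflow as tf` as in the examples above):
def create_accum_grads_sketch(variables):
    """One non-trainable buffer per variable, used to sum gradients across tasks."""
    return [tf.Variable(tf.zeros_like(v), trainable=False) for v in variables]

def add_accum_grads_sketch(variables, accum_buffers, loss):
    """Add the gradients of `loss` w.r.t. `variables` into the buffers."""
    grads = tf.gradients(loss, variables)
    return tf.group(*[buf.assign_add(g) for buf, g in zip(accum_buffers, grads)])

def reset_accum_grads_sketch(accum_buffers):
    """Zero the buffers before the next round of accumulation."""
    return tf.group(*[buf.assign(tf.zeros_like(buf)) for buf in accum_buffers])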
Example #14
class REINFORCE(Agent):
    """
    REINFORCE with baselines
    """
    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(REINFORCE, self).__init__(**usercfg)
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))
        self.env_runner = EnvRunner(self.env, self, usercfg)
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.05,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.build_network()
        self.make_trainer()

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        self.rewards = tf.placeholder("float", name="Rewards")
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_loss = tf.summary.scalar("Loss", self.summary_loss)
        summary_rewards = tf.summary.scalar("Rewards", self.rewards)
        summary_episode_lengths = tf.summary.scalar("Episode_lengths",
                                                    self.episode_lengths)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)

    def choose_action(self, state, features):
        """Choose an action."""
        action = self.session.run([self.action],
                                  feed_dict={self.states: [state]})[0]
        return {"action": action}

    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory.states for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory.rewards, config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory.actions for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                sum(trajectory.rewards) for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory.rewards) for trajectory in trajectories
            ])  # episode lengths
            # TODO: deal with RNN state
            summary, _ = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.states: all_state,
                    self.a_n: all_action,
                    self.adv_n: all_adv,
                    self.episode_lengths: np.mean(episode_lengths),
                    self.rewards: np.mean(episode_rewards)
                })
            self.writer.add_summary(summary, iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
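
# Editorial note: `discount_rewards` is imported from elsewhere in this code base and
# is not shown in these examples. A minimal sketch of the standard discounted-return
# computation it is assumed to perform, together with the time-dependent baseline used
# in the two `learn` methods above (assumes `import numpy as np`; names illustrative):
def discount_rewards_sketch(rewards, gamma):
    """Return discounted cumulative sums: G_t = r_t + gamma * G_{t+1}."""
    returns = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Usage mirroring the baseline computation above: pad every episode's returns to the
# length of the longest episode, average across episodes per timestep to get the
# baseline, then subtract it from each episode's returns to get advantages, e.g.:
#   rets = [discount_rewards_sketch(traj.rewards, 0.99) for traj in trajectories]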