Example #1
 def __init__(self, env, monitor_path, video=True, **usercfg):
     super(KarpathyCNN, self).__init__(**usercfg)
     self.env = wrappers.Monitor(env,
                                 monitor_path,
                                 force=True,
                                 video_callable=(None if video else False))
     self.nA = env.action_space.n
     self.monitor_path = monitor_path
     # Default configuration. Can be overwritten using keyword arguments.
     self.config.update(
         dict(
             # timesteps_per_batch=10000,
             # n_iter=100,
             n_hidden_units=200,
             learning_rate=1e-3,
             batch_size=10,  # Number of episodes after which to adapt gradients
             gamma=0.99,  # Discount past rewards by a percentage
             decay=0.99,  # Decay of RMSProp optimizer
             epsilon=1e-9,  # Epsilon of RMSProp optimizer
             draw_frequency=50  # Draw a plot every 50 episodes
         ))
     self.config.update(usercfg)
     self.build_network()
     if self.config["save_model"]:
         tf.add_to_collection("action", self.action)
         tf.add_to_collection("states", self.states)
         self.saver = FastSaver()
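The configuration above carries the RMSProp hyperparameters (learning_rate, decay, epsilon) that are consumed when the network and its trainer are built. A minimal sketch of how a train op could be assembled from them with the standard TF1 optimizer; the helper name make_rmsprop_trainer is illustrative and not part of the library:

import tensorflow as tf

def make_rmsprop_trainer(loss, config):
    # Hypothetical helper: turns the RMSProp settings from the config dict into a train op.
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate=config["learning_rate"],
        decay=config["decay"],      # moving-average decay of squared gradients
        epsilon=config["epsilon"])  # numerical stability constant
    return optimizer.minimize(loss)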
Example #2
    def __init__(self, envs, monitor_path, **usercfg):
        super(KnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.n_tasks = len(envs)
        self.monitor_path = monitor_path
        self.nA = envs[0].action_space.n
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=100,
                switch_at_iter=None,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.build_networks()
        self.task_runners = [
            EnvRunner(envs[i], TaskPolicy(action, self), self.config)
            for i, action in enumerate(self.action_tensors)
        ]
        if self.config["save_model"]:
            for action_tensor in self.action_tensors:
                tf.add_to_collection("action", action_tensor)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
Example #3
    def __init__(self,
                 env_id: str,
                 task_id: int,
                 cluster: tf.train.ClusterDef,
                 monitor_path: str,
                 config: dict,
                 clip_gradients: bool = True,
                 video: bool = False,
                 seed: Optional[int] = None) -> None:
        super(A3CTask, self).__init__()
        self.task_id = task_id
        self.config = config
        self.clip_gradients = clip_gradients
        self.env = make(env_id)
        self.env.seed(seed)
        if task_id == 0:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False)
            )

        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None

        worker_device = "/job:worker/task:{}/cpu:0".format(task_id)
        # Global network
        shared_device = tf.train.replica_device_setter(
            ps_tasks=1,
            worker_device=worker_device,
            cluster=cluster)
        with tf.device(shared_device):
            with tf.variable_scope("global"):
                self.global_network = self.build_networks()
                self.global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
                self._global_step = tf.get_variable(
                    "global_step",
                    [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

        # Local network
        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = self.build_networks()
                self.states = self.local_network.states
                self.actions_taken = self.local_network.actions_taken
                self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
                self.ret = tf.placeholder(tf.float32, [None], name="return")
                self.actor_loss, self.critic_loss, self.loss = self.make_loss()
                self.local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
                self.sync_net = create_sync_net_op(self.global_vars, self.local_vars)
                self.n_steps = tf.shape(self.local_network.states)[0]
                inc_step = self._global_step.assign_add(self.n_steps)

        device = shared_device if self.config["shared_optimizer"] else worker_device
        with tf.device(device):
            apply_optim_op = self.make_trainer()
            self.train_op = tf.group(apply_optim_op, inc_step)

            loss_summaries = self.create_summary_losses()
            self.reward = tf.placeholder("float", name="reward")
            tf.summary.scalar("Reward", self.reward)
            self.episode_length = tf.placeholder("float", name="episode_length")
            tf.summary.scalar("Episode_length", self.episode_length)
            self.summary_op = tf.summary.merge(loss_summaries)

        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
        saver = FastSaver(variables_to_save)
        # Write the summary of each task in a different directory
        self.writer = tf.summary.FileWriter(os.path.join(monitor_path, "task{}".format(task_id)))

        self.runner = RunnerThread(self.env, self, int(self.config["n_local_steps"]), task_id == 0 and video)

        self.server = tf.train.Server(
            cluster,
            job_name="worker",
            task_index=task_id,
            config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)
        )

        def init_fn(scaffold, sess):
            sess.run(init_all_op)

        self.report_uninit_op = tf.report_uninitialized_variables(variables_to_save)

        self.scaffold = tf.train.Scaffold(
            init_op=init_op,
            init_fn=init_fn,
            ready_for_local_init_op=self.report_uninit_op,
            saver=saver,
            ready_op=self.report_uninit_op
        )

        self.config_proto = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(task_id)])

        self.session = None
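The worker above calls create_sync_net_op(self.global_vars, self.local_vars) to copy the shared parameters into its local network before collecting experience. The helper itself is not shown in this example; a minimal sketch of what such an op usually looks like, assuming a one-to-one ordering between the two variable lists:

import tensorflow as tf

def create_sync_net_op(global_vars, local_vars):
    # Assign every global variable to its local counterpart and group the
    # assignments so a single session.run call synchronizes the whole network.
    return tf.group(*[local_var.assign(global_var)
                      for global_var, local_var in zip(global_vars, local_vars)])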
Example #4
    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = True,
                 **usercfg) -> None:
        super(REINFORCE, self).__init__(**usercfg)
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                learning_rate=0.05,
                entropy_coef=1e-3,
                n_hidden_layers=2,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.states = tf.placeholder(tf.float32, [None] +
                                     list(self.env.observation_space.shape),
                                     name="states")  # Observation
        self.actions_taken = tf.placeholder(
            tf.float32, name="actions_taken")  # Discrete action
        self.advantage = tf.placeholder(tf.float32,
                                        name="advantage")  # Advantage
        self.build_network()
        self.make_trainer()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        summary_loss = tf.summary.scalar("model/loss", self.summary_loss)
        summaries = [summary_loss]
        if hasattr(self, "entropy"):
            summary_entropy = tf.summary.scalar("model/entropy", self.entropy)
            summaries += [summary_entropy]
        self.summary_op = tf.summary.merge(summaries)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)

        self.env_runner = EnvRunner(self.env,
                                    self,
                                    usercfg,
                                    summary_writer=self.writer)
Example #5
class REINFORCE(Agent):
    """
    REINFORCE with baselines
    """
    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = True,
                 **usercfg) -> None:
        super(REINFORCE, self).__init__(**usercfg)
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                learning_rate=0.05,
                entropy_coef=1e-3,
                n_hidden_layers=2,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.states = tf.placeholder(tf.float32, [None] +
                                     list(self.env.observation_space.shape),
                                     name="states")  # Observation
        self.actions_taken = tf.placeholder(
            tf.float32, name="actions_taken")  # Discrete action
        self.advantage = tf.placeholder(tf.float32,
                                        name="advantage")  # Advantage
        self.build_network()
        self.make_trainer()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        summary_loss = tf.summary.scalar("model/loss", self.summary_loss)
        summaries = [summary_loss]
        if hasattr(self, "entropy"):
            summary_entropy = tf.summary.scalar("model/entropy", self.entropy)
            summaries += [summary_entropy]
        self.summary_op = tf.summary.merge(summaries)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)

        self.env_runner = EnvRunner(self.env,
                                    self,
                                    usercfg,
                                    summary_writer=self.writer)

    def _initialize(self) -> None:
        self.session.run(self.init_op)

    def build_network(self):
        raise NotImplementedError()

    def make_trainer(self):
        raise NotImplementedError()

    def choose_action(self, state, features) -> Dict[str, np.ndarray]:
        """Choose an action."""
        action = self.session.run([self.action],
                                  feed_dict={self.states: [state]})[0]
        return {"action": action}

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory.states for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory.rewards, config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory.actions for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                sum(trajectory.rewards) for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory.rewards) for trajectory in trajectories
            ])  # episode lengths
            # TODO: deal with RNN state
            summary, _ = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.states: all_state,
                    self.actions_taken: all_action,
                    self.advantage: all_adv
                })
            self.writer.add_summary(summary, iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
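learn() above relies on a discount_rewards helper to turn each reward sequence into discounted returns before the time-dependent baseline is subtracted. A minimal NumPy sketch of that computation, assuming the usual backwards cumulative sum; the library's own implementation may differ in detail:

import numpy as np

def discount_rewards(rewards, gamma):
    # out[t] = rewards[t] + gamma * rewards[t + 1] + gamma**2 * rewards[t + 2] + ...
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted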
Example #6
    def __init__(self, env, monitor_path: str, video: bool = True, **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(
            env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))

        self.config.update(dict(
            n_iter=100,
            gamma=0.99,
            learning_rate=0.001,
            n_hidden_units=20,
            n_hidden_layers=1,
            gradient_clip_value=0.5,
            n_local_steps=20,
            vf_coef=0.5,
            entropy_coef=0.01,
            loss_reducer="mean",
            save_model=False
        ))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step",
            [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(
            self.config["learning_rate"], name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(
            grads, self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar("model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar("model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss", tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(os.path.join(
            self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)
        return
Example #7
class A2C(Agent):
    """Advantage Actor Critic"""

    def __init__(self, env, monitor_path: str, video: bool = True, **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(
            env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))

        self.config.update(dict(
            n_iter=100,
            gamma=0.99,
            learning_rate=0.001,
            n_hidden_units=20,
            n_hidden_layers=1,
            gradient_clip_value=0.5,
            n_local_steps=20,
            vf_coef=0.5,
            entropy_coef=0.01,
            loss_reducer="mean",
            save_model=False
        ))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step",
            [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(
            self.config["learning_rate"], name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(
            grads, self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar("model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar("model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss", tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(os.path.join(
            self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)
        return

    def _initialize(self):
        self.session.run(self.init_op)

    def build_networks(self):
        raise NotImplementedError("Abstract method")

    def make_loss(self):
        raise NotImplementedError("Abstract method")

    @property
    def global_step(self):
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, features):
        return self.session.run([self.ac_net.value], feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, features) -> dict:
        action, value = self.session.run(
            [self.ac_net.action, self.ac_net.value], feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action) -> int:
        return np.argmax(action)

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        config = self.config
        for _ in range(int(config["n_iter"])):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectory = self.env_runner.get_steps(int(self.config["n_local_steps"]))
            v = 0 if trajectory.terminals[-1] else self.get_critic_value(
                np.asarray(trajectory.states)[None, -1], trajectory.features[-1])
            rewards_plus_v = np.asarray(trajectory.rewards + [v])
            vpred_t = np.asarray(trajectory.values + [v])
            delta_t = trajectory.rewards + \
                self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
            batch_r = discount_rewards(
                rewards_plus_v, self.config["gamma"])[:-1]
            batch_adv = discount_rewards(delta_t, self.config["gamma"])
            fetches = [self.loss_summary_op, self.train_op, self._global_step]
            states = np.asarray(trajectory.states)
            feed_dict = {
                self.states: states,
                self.actions_taken: np.asarray(trajectory.actions),
                self.advantage: batch_adv,
                self.ret: np.asarray(batch_r)
            }
            feature = trajectory.features[0]
            if feature != [] and feature is not None:
                feed_dict[self.ac_net.rnn_state_in] = feature
            summary, _, global_step = self.session.run(fetches, feed_dict)
            self.writer.add_summary(summary, global_step)
            self.writer.flush()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session, os.path.join(
                self.monitor_path, "model"))
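The update in learn() above bootstraps the final value with the critic, builds n-step returns from rewards_plus_v, and discounts the TD residuals delta_t. Discounting the residuals with gamma alone corresponds to generalized advantage estimation with lambda = 1, so the advantages equal the n-step returns minus the value predictions. A small self-contained sketch of the same arithmetic on dummy numbers (all names here are illustrative, not part of the library):

import numpy as np

def discounted_sum(xs, gamma):
    out, running = np.zeros(len(xs), dtype=np.float32), 0.0
    for t in reversed(range(len(xs))):
        running = xs[t] + gamma * running
        out[t] = running
    return out

gamma = 0.99
rewards = np.array([1.0, 0.0, 1.0], dtype=np.float32)   # one n-step rollout
values = np.array([0.5, 0.4, 0.6], dtype=np.float32)    # critic predictions V(s_t)
bootstrap_v = 0.7                                        # V of the state after the rollout

vpred_t = np.append(values, bootstrap_v)
batch_r = discounted_sum(np.append(rewards, bootstrap_v), gamma)[:-1]  # n-step returns
delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]                 # TD residuals
batch_adv = discounted_sum(delta_t, gamma)                             # advantages

# The sums telescope: advantage == return - value prediction.
assert np.allclose(batch_adv, batch_r - values, atol=1e-5)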
Example #8
    def __init__(self, env, monitor_path, **usercfg):
        super(DPPO, self).__init__()
        self.env = env
        self.env_name: str = env.spec.id
        self.monitor_path: str = monitor_path

        self.comm = MPI.COMM_SELF

        self.config.update(
            dict(
                n_workers=3,
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=2.5e-4,
                n_iter=10000,
                n_epochs=4,
                n_local_steps=128,
                gradient_clip_value=0.5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.1,  # Clipped surrogate objective epsilon
                learn_method="batches",
                batch_size=64,
                save_model=False))
        self.config.update(usercfg)

        self.task_type = None  # To be filled in by subclasses

        self.n_updates: int = 0

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        # Reduces by taking the mean instead of summing
        self.actor_loss = -tf.reduce_mean(
            self.make_actor_loss(self.old_network, self.new_network,
                                 self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self.n_steps = tf.shape(self.states)[0]
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)
        summary_entropy = tf.summary.scalar("model/Entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        self.model_summary_op = tf.summary.merge([
            summary_actor_loss, summary_critic_loss, summary_loss,
            summary_entropy, summary_grad_norm, summary_var_norm
        ])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "master"))

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])

        with tf.variable_scope("optimizer"):
            self.optimizer = tf.train.AdamOptimizer(
                self.config["learning_rate"], name="optim")
            apply_grads = self.optimizer.apply_gradients(
                zip(grads, self.new_network_vars))

            inc_step = self._global_step.assign_add(self.n_steps)
            self.train_op = tf.group(apply_grads, inc_step)
        optimizer_variables = [
            var for var in tf.global_variables()
            if var.name.startswith("optimizer")
        ]
        self.init_op = tf.variables_initializer(self.new_network_vars +
                                                optimizer_variables +
                                                [self._global_step])
Example #9
    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = False,
                 **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2,  # Clipped surrogate objective epsilon
                save_model=False))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        self.actor_loss = -tf.reduce_mean(
            self.make_actor_loss(self.old_network, self.new_network,
                                 self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std",
                                            tf.sqrt(adv_std))

        # TODO: get from ppo_loss function
        # ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
        # summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        # summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))

        ret_mean, ret_std = tf.nn.moments(self.ret, axes=[0])
        summary_ret_mean = tf.summary.scalar("model/return/mean", ret_mean)
        summary_ret_std = tf.summary.scalar("model/return/std",
                                            tf.sqrt(ret_std))
        summary_entropy = tf.summary.scalar("model/entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        summaries: List[tf.Tensor] = []
        # Weight summaries: not turned on right now because they take too much space
        # TODO: use config to make this optional
        #for v in tf.trainable_variables():
        #    if "new_network" in v.name:
        #        summaries.append(tf.summary.histogram(v.name, v))
        summaries += self._specific_summaries()
        summaries += [
            summary_actor_loss,
            summary_critic_loss,
            summary_loss,
            summary_adv_mean,
            summary_adv_std,
            # summary_ratio_mean, summary_ratio_std,
            summary_new_log_prob_mean,
            summary_old_log_prob_mean,
            summary_ret_mean,
            summary_ret_std,
            summary_entropy,
            summary_grad_norm,
            summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        return
Example #10
class PPO(Agent):
    """Proximal Policy Optimization agent."""
    RNN = False

    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = False,
                 **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2,  # Clipped surrogate objective epsilon
                save_model=False))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        self.actor_loss = -tf.reduce_mean(
            self.make_actor_loss(self.old_network, self.new_network,
                                 self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std",
                                            tf.sqrt(adv_std))

        # TODO: get from ppo_loss function
        # ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
        # summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        # summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))

        ret_mean, ret_std = tf.nn.moments(self.ret, axes=[0])
        summary_ret_mean = tf.summary.scalar("model/return/mean", ret_mean)
        summary_ret_std = tf.summary.scalar("model/return/std",
                                            tf.sqrt(ret_std))
        summary_entropy = tf.summary.scalar("model/entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        summaries: List[tf.Tensor] = []
        # Weight summaries: not turned on right now because they take too much space
        # TODO: use config to make this optional
        #for v in tf.trainable_variables():
        #    if "new_network" in v.name:
        #        summaries.append(tf.summary.histogram(v.name, v))
        summaries += self._specific_summaries()
        summaries += [
            summary_actor_loss,
            summary_critic_loss,
            summary_loss,
            summary_adv_mean,
            summary_adv_std,
            # summary_ratio_mean, summary_ratio_std,
            summary_new_log_prob_mean,
            summary_old_log_prob_mean,
            summary_ret_mean,
            summary_ret_std,
            summary_entropy,
            summary_grad_norm,
            summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        return

    def _initialize(self):
        self.session.run(self.init_op)

    def _specific_summaries(self) -> List[tf.Tensor]:
        """Summaries that are specific to the variant of the algorithm. None (empty list) for the base algorithm"""
        return []

    def make_actor_loss(self, old_network, new_network, advantage):
        return ppo_loss(old_network.action_log_prob,
                        new_network.action_log_prob,
                        self.config["cso_epsilon"], advantage)

    def build_networks(self):
        raise NotImplementedError

    @property
    def global_step(self):
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, *rest):
        return self.session.run([self.value], feed_dict={self.states:
                                                         state})[0].flatten()

    def choose_action(self, state, *rest):
        action, value = self.session.run([self.action, self.value],
                                         feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action):
        return np.argmax(action)

    def get_processed_trajectories(self):
        experiences = self.env_runner.get_steps(int(
            self.config["n_local_steps"]),
                                                stop_at_trajectory_end=False)
        T = experiences.steps
        v = 0 if experiences.terminals[-1] else self.get_critic_value(
            np.asarray(experiences.states)[None, -1], experiences.features[-1])
        vpred = np.asarray(experiences.values + [v])
        gamma = self.config["gamma"]
        lambda_ = self.config["gae_lambda"]
        gaelam = advantages = np.empty(T, 'float32')
        last_gaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - experiences.terminals[t]
            delta = experiences.rewards[t] + \
                gamma * vpred[t + 1] * nonterminal - vpred[t]
            gaelam[t] = last_gaelam = delta + \
                gamma * lambda_ * nonterminal * last_gaelam
        rs = advantages + experiences.values
        return experiences.states, experiences.actions, advantages, rs, experiences.features

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        config = self.config
        n_updates = 0
        for _ in range(int(config["n_iter"])):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            states, actions, advs, rs, _ = self.get_processed_trajectories()
            advs = np.array(advs)
            normalized_advs = (advs - advs.mean()) / advs.std()
            self.session.run(self.set_old_to_new)

            indices = np.arange(len(states))
            for _ in range(int(self.config["n_epochs"])):
                np.random.shuffle(indices)

                batch_size = int(self.config["batch_size"])
                for j in range(0, len(states), batch_size):
                    batch_indices = indices[j:(j + batch_size)]
                    batch_states = np.array(states)[batch_indices]
                    batch_actions = np.array(actions)[batch_indices]
                    batch_advs = np.array(normalized_advs)[batch_indices]
                    batch_rs = np.array(rs)[batch_indices]
                    fetches = [self.train_op]
                    if (n_updates % 1000) == 0:
                        fetches.append(self.model_summary_op)
                    feed_dict = {
                        self.states: batch_states,
                        self.old_network.states: batch_states,
                        self.actions_taken: batch_actions,
                        self.old_network.actions_taken: batch_actions,
                        self.advantage: batch_advs,
                        self.ret: batch_rs
                    }
                    results = self.session.run(fetches, feed_dict)
                    if (n_updates % 1000) == 0:
                        self.writer.add_summary(results[-1], n_updates)
                    n_updates += 1
                self.writer.flush()

            if self.config["save_model"]:
                self.saver.save(self.session,
                                os.path.join(self.monitor_path, "model"))
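make_actor_loss above delegates to a ppo_loss helper that takes the old and new action log-probabilities, the clipping epsilon and the advantage. That helper is not shown in this example; below is a minimal sketch of the standard clipped surrogate objective it presumably computes, written against the same TF1 API. The caller negates and averages the result (see actor_loss in the constructor), so the sketch returns the per-timestep objective to be maximized:

import tensorflow as tf

def ppo_loss(old_log_prob, new_log_prob, epsilon, advantage):
    # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log space.
    ratio = tf.exp(new_log_prob - old_log_prob)
    # Clipped surrogate objective: elementwise minimum of the unclipped
    # and clipped terms, as in the PPO paper.
    unclipped = ratio * advantage
    clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    return tf.minimum(unclipped, clipped)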
Example #11
    def __init__(self,
                 envs,
                 monitor_path,
                 learning_method="REINFORCE",
                 **usercfg):
        super(AsyncKnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.learning_method = learning_method
        self.monitor_path = monitor_path
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=200,
                switch_at_iter=None,  # None to deactivate, otherwise an iteration at which to switch
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_task_variations=3,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.stop_requested = False

        self.session = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        self.global_step = tf.get_variable("global_step", [],
                                           tf.int32,
                                           initializer=tf.constant_initializer(
                                               0, dtype=tf.int32),
                                           trainable=False)
        self.build_networks()

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.reward = tf.placeholder("float", name="reward")
        summary_rewards = tf.summary.scalar("Reward", self.reward)
        self.episode_length = tf.placeholder("float", name="episode_length")
        summary_episode_lengths = tf.summary.scalar("Episode_length",
                                                    self.episode_length)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])

        self.jobs = []
        for i, env in enumerate(self.envs):
            self.jobs.append(
                self.make_thread(
                    env,
                    i,
                    self.config["switch_at_iter"]
                    if self.config["switch_at_iter"] is not None
                    and i != len(self.envs) - 1 else self.config["n_iter"],
                    start_at_iter=(0 if self.config["switch_at_iter"] is None
                                   or i != len(self.envs) - 1 else
                                   self.config["switch_at_iter"])))

        for i, job in enumerate(self.jobs):
            only_sparse = (self.config["switch_at_iter"] is not None
                           and i == len(self.jobs) - 1)
            grads = tf.gradients(
                job.loss, (self.shared_vars if not (only_sparse) else []) +
                [job.sparse_representation])
            job.apply_grad = job.optimizer.apply_gradients(
                zip(grads, (self.shared_vars if not (only_sparse) else []) +
                    [job.sparse_representation]),
                global_step=self.global_step)

        self.session.run(tf.global_variables_initializer())

        if self.config["save_model"]:
            for job in self.jobs:
                tf.add_to_collection("action", job.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
Example #12
class AsyncKnowledgeTransfer(Agent):
    """Asynchronous learner for variations of a task."""
    def __init__(self,
                 envs,
                 monitor_path,
                 learning_method="REINFORCE",
                 **usercfg):
        super(AsyncKnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.learning_method = learning_method
        self.monitor_path = monitor_path
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=200,
                switch_at_iter=None,  # None to deactivate, otherwise an iteration at which to switch
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_task_variations=3,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.stop_requested = False

        self.session = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        self.global_step = tf.get_variable("global_step", [],
                                           tf.int32,
                                           initializer=tf.constant_initializer(
                                               0, dtype=tf.int32),
                                           trainable=False)
        self.build_networks()

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.reward = tf.placeholder("float", name="reward")
        summary_rewards = tf.summary.scalar("Reward", self.reward)
        self.episode_length = tf.placeholder("float", name="episode_length")
        summary_episode_lengths = tf.summary.scalar("Episode_length",
                                                    self.episode_length)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])

        self.jobs = []
        for i, env in enumerate(self.envs):
            self.jobs.append(
                self.make_thread(
                    env,
                    i,
                    self.config["switch_at_iter"]
                    if self.config["switch_at_iter"] is not None
                    and i != len(self.envs) - 1 else self.config["n_iter"],
                    start_at_iter=(0 if self.config["switch_at_iter"] is None
                                   or i != len(self.envs) - 1 else
                                   self.config["switch_at_iter"])))

        for i, job in enumerate(self.jobs):
            only_sparse = (self.config["switch_at_iter"] is not None
                           and i == len(self.jobs) - 1)
            grads = tf.gradients(
                job.loss, (self.shared_vars if not (only_sparse) else []) +
                [job.sparse_representation])
            job.apply_grad = job.optimizer.apply_gradients(
                zip(grads, (self.shared_vars if not (only_sparse) else []) +
                    [job.sparse_representation]),
                global_step=self.global_step)

        self.session.run(tf.global_variables_initializer())

        if self.config["save_model"]:
            for job in self.jobs:
                tf.add_to_collection("action", job.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_networks(self):
        with tf.variable_scope("shared"):
            self.states = tf.placeholder(
                tf.float32,
                [None] + list(self.envs[0].observation_space.shape),
                name="states")
            self.action_taken = tf.placeholder(tf.float32, name="action_taken")
            self.advantage = tf.placeholder(tf.float32, name="advantage")

            if self.config["feature_extraction"]:
                self.L1 = tf.contrib.layers.fully_connected(
                    inputs=self.states,
                    num_outputs=self.config["n_hidden_units"],
                    activation_fn=tf.tanh,
                    weights_initializer=tf.truncated_normal_initializer(
                        mean=0.0, stddev=0.02),
                    biases_initializer=tf.zeros_initializer(),
                    scope="L1")
            else:
                self.L1 = self.states
            self.knowledge_base = tf.Variable(tf.truncated_normal(
                [self.L1.get_shape()[-1].value, self.config["n_sparse_units"]],
                mean=0.0,
                stddev=0.02),
                                              name="knowledge_base")

            self.shared_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

    def signal_handler(self, signal, frame):
        """When a (SIGINT) signal is received, request the threads (via the master) to stop after completing an iteration."""
        logging.info("SIGINT signal received: Requesting a stop...")
        self.stop_requested = True

    def learn(self):
        signal.signal(signal.SIGINT, self.signal_handler)
        if self.config["switch_at_iter"] is None:
            idx = None
        else:
            idx = -1
        for job in self.jobs[:idx]:
            job.start()
        for job in self.jobs[:idx]:
            job.join()
        # When switching tasks, the last job only runs after all the others have finished
        if idx is not None:
            self.jobs[idx].start()
            self.jobs[idx].join()

        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))

    def make_thread(self, env, task_id, n_iter, start_at_iter=0):
        return AKTThread(self,
                         env,
                         task_id,
                         n_iter,
                         start_at_iter=start_at_iter)
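A minimal usage sketch for AsyncKnowledgeTransfer (not part of the repository): the environments, the learning_method value and the monitor path are assumptions, and the base Agent config is assumed to supply remaining defaults such as save_model.

import gym

# Stand-ins for several variations of the same task (assumption)
envs = [gym.make("CartPole-v0") for _ in range(3)]
agent = AsyncKnowledgeTransfer(envs=envs,
                               learning_method="REINFORCE",  # assumed value
                               monitor_path="/tmp/akt_monitor",  # assumed path
                               n_iter=200,
                               switch_at_iter=None,
                               save_model=False)
agent.learn()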
Beispiel #13
0
class KarpathyCNN(Agent):
    """Karpathy policy gradient learner using a convolutional neural network"""
    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(KarpathyCNN, self).__init__(**usercfg)
        self.env = wrappers.Monitor(env,
                                    monitor_path,
                                    force=True,
                                    video_callable=(None if video else False))
        self.nA = env.action_space.n
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                # timesteps_per_batch=10000,
                # n_iter=100,
                n_hidden_units=200,
                learning_rate=1e-3,
                batch_size=10,  # Number of episodes after which to adapt gradients
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.99,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                draw_frequency=50  # Draw a plot every 50 episodes
            ))
        self.config.update(usercfg)
        self.build_network()
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_network(self):
        image_size = 80
        image_depth = 1  # i.e. the number of feature maps, e.g. 3 for RGB images; 1 here because we use grayscale images

        self.states = tf.placeholder(
            tf.float32, [None, image_size, image_size, image_depth],
            name="states")

        # Convolution layer 1
        depth = 32
        patch_size = 4
        self.w1 = tf.Variable(
            tf.truncated_normal([patch_size, patch_size, image_depth, depth],
                                stddev=0.01))
        self.b1 = tf.Variable(tf.zeros([depth]))
        self.L1 = tf.nn.relu(
            tf.nn.conv2d(
                self.states, self.w1, strides=[1, 2, 2, 1], padding="SAME") +
            self.b1)
        self.L1 = tf.nn.max_pool(self.L1,
                                 ksize=[1, 2, 2, 1],
                                 strides=[1, 2, 2, 1],
                                 padding="SAME")

        # Convolution layer 2
        self.w2 = tf.Variable(
            tf.truncated_normal([patch_size, patch_size, depth, depth],
                                stddev=0.01))
        self.b2 = tf.Variable(tf.zeros([depth]))
        self.L2 = tf.nn.relu(
            tf.nn.conv2d(
                self.L1, self.w2, strides=[1, 2, 2, 1], padding="SAME") +
            self.b2)
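        # With 80x80 inputs: conv1 (stride 2) -> 40x40, max-pool (stride 2) -> 20x20,
        # conv2 (stride 2) -> 10x10 with depth 32, so the flattened vector below has
        # 10 * 10 * 32 = 3200 entries.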

        # Flatten
        shape = self.L2.get_shape().as_list()
        reshape = tf.reshape(self.L2, [-1, shape[1] * shape[2] * shape[3]
                                       ])  # -1 for the (unknown) batch size

        # Fully connected layer 1
        self.L3 = tf.contrib.layers.fully_connected(
            inputs=reshape,
            num_outputs=self.config["n_hidden_units"],
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                stddev=0.02),
            biases_initializer=tf.zeros_initializer())

        # Fully connected layer 2
        self.probs = tf.contrib.layers.fully_connected(
            inputs=self.L3,
            num_outputs=self.nA,
            activation_fn=tf.nn.softmax,
            weights_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                stddev=0.02),
            biases_initializer=tf.zeros_initializer())

        self.action = tf.squeeze(tf.multinomial(tf.log(self.probs), 1),
                                 name="action")

        # The two fully connected layers above are created through tf.contrib.layers
        # and register their own weights and biases (there are no w3/b3/w4/b4
        # attributes), so collect their trainable variables from the graph instead.
        self.vars = [self.w1, self.b1, self.w2, self.b2] + \
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                              scope="fully_connected")

        self.action_taken = tf.placeholder(tf.float32,
                                           shape=[None, self.nA],
                                           name="action_taken")
        self.feedback = tf.placeholder(tf.float32,
                                       shape=[None, self.nA],
                                       name="feedback")
        loss = tf.reduce_mean(
            tf.squared_difference(self.action_taken, self.probs) *
            self.feedback)

        self.create_accumulative_grads = create_accumulative_gradients_op(
            self.vars)
        self.accumulate_grads = add_accumulative_gradients_op(
            self.vars, self.create_accumulative_grads, loss)
        self.reset_accumulative_grads = reset_accumulative_gradients_op(
            self.vars, self.create_accumulative_grads)

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])

        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.create_accumulative_grads, self.vars))

        init = tf.global_variables_initializer()

        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        self.session.run(init)

    def choose_action(self, state):
        return self.session.run([self.action],
                                feed_dict={self.states: [state]})[0]

    def get_trajectory(self, render=False):
        """
        Run agent-environment loop for one whole episode (trajectory)
        Return dictionary of results
        Note that this function returns more than the get_trajectory in the EnvRunner class.
        """
        state = preprocess_image(self.env.reset())
        prev_state = state
        states = []
        actions = []
        rewards = []
        for _ in range(self.config["episode_max_length"]):
            delta = state - prev_state
            action = self.choose_action(delta)
            states.append(delta)
            prev_state = state
            state, rew, done, _ = self.env.step(action)
            state = preprocess_image(state)
            actions.append(action)
            rewards.append(rew)
            if done:
                break
            if render:
                self.env.render()
        return {
            "reward": np.array(rewards),
            "state": np.array(states),
            "action": np.array(actions),
        }

    def learn(self):
        reporter = Reporter()

        self.session.run([self.reset_accumulative_grads])

        iteration = 0  # number of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config["batch_size"])
        episode_rewards = np.zeros(self.config["batch_size"])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory()

            episode_rewards[episode_nr % self.config["batch_size"]] = sum(
                trajectory["reward"])
            episode_lengths[episode_nr % self.config["batch_size"]] = len(
                trajectory["reward"])
            episode_nr += 1
            # One-hot encode the actions that were taken
            action_taken = (np.arange(self.nA) ==
                            trajectory["action"][:, None]).astype(np.float32)
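            # e.g. with nA = 3 and actions [0, 2] this yields [[1., 0., 0.], [0., 0., 1.]]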

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            self.session.run(
                [self.accumulate_grads],
                feed_dict={
                    self.states: trajectory["state"],
                    self.action_taken: action_taken,
                    self.feedback: feedback
                })
            if episode_nr % self.config["batch_size"] == 0:  # batch is done
                iteration += 1
                self.session.run([self.apply_gradients])
                self.session.run([self.reset_accumulative_grads])
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config["draw_frequency"] == 0:
                    reporter.draw_rewards(mean_rewards)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
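The discount_rewards helper used above is imported from elsewhere in the repository and is not shown here. A minimal sketch of what it presumably computes, namely plain discounted returns; the actual helper may differ (e.g. Karpathy's original Pong code resets the running sum at game boundaries):

import numpy as np

def discount_rewards_sketch(rewards, gamma):
    """Discounted returns: out[t] = rewards[t] + gamma * out[t + 1] (assumed semantics)."""
    out = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out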
Beispiel #14
0
class KnowledgeTransfer(Agent):
    """Learner for variations of a task."""
    def __init__(self, envs, monitor_path, **usercfg):
        super(KnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.n_tasks = len(envs)
        self.monitor_path = monitor_path
        self.nA = envs[0].action_space.n
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=100,
                switch_at_iter=None,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.build_networks()
        self.task_runners = [
            EnvRunner(envs[i], TaskPolicy(action, self), self.config)
            for i, action in enumerate(self.action_tensors)
        ]
        if self.config["save_model"]:
            for action_tensor in self.action_tensors:
                tf.add_to_collection("action", action_tensor)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_networks(self):
        self.session = tf.Session()

        with tf.variable_scope("shared"):
            self.states = tf.placeholder(
                tf.float32,
                [None] + list(self.envs[0].observation_space.shape),
                name="states")
            self.action_taken = tf.placeholder(tf.float32, name="action_taken")
            self.advantage = tf.placeholder(tf.float32, name="advantage")

            L1 = None
            if self.config["feature_extraction"]:
                L1 = tf.contrib.layers.fully_connected(
                    inputs=self.states,
                    num_outputs=self.config["n_hidden_units"],
                    activation_fn=tf.tanh,
                    weights_initializer=tf.truncated_normal_initializer(
                        mean=0.0, stddev=0.02),
                    biases_initializer=tf.zeros_initializer(),
                    scope="L1")
            else:
                L1 = self.states

            knowledge_base = tf.Variable(tf.truncated_normal(
                [L1.get_shape()[-1].value, self.config["n_sparse_units"]],
                mean=0.0,
                stddev=0.02),
                                         name="knowledge_base")

            self.shared_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="shared")

        # Every task has its own (sparse) representation
        sparse_representations = [
            tf.Variable(tf.truncated_normal(
                [self.config["n_sparse_units"], self.nA],
                mean=0.0,
                stddev=0.02),
                        name="sparse%d" % i) for i in range(self.n_tasks)
        ]
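        # The per-task policies below are low-rank factorizations: features
        # (batch x d) times knowledge_base (d x n_sparse_units) times the
        # task-specific sparse matrix (n_sparse_units x nA), followed by a softmax.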

        self.probs_tensors = [
            tf.nn.softmax(tf.matmul(L1, tf.matmul(knowledge_base, s)))
            for s in sparse_representations
        ]
        self.action_tensors = [
            tf.squeeze(tf.multinomial(tf.log(probs), 1))
            for probs in self.probs_tensors
        ]

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])
        net_vars = self.shared_vars + sparse_representations
        self.accum_grads = create_accumulative_gradients_op(net_vars, 0)

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.rewards = tf.placeholder("float", name="Rewards")
        summary_rewards = tf.summary.scalar("Reward", self.rewards)
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_episode_lengths = tf.summary.scalar("Length",
                                                    self.episode_lengths)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])

        self.writers = []
        self.losses = []

        regularizer = tf.contrib.layers.l1_regularizer(.05)
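        # Per-task REINFORCE loss below: negative log-probability of the taken action
        # weighted by the advantage, plus an L1 penalty that keeps the task-specific
        # representation sparse.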
        for i, probabilities in enumerate(self.probs_tensors):
            good_probabilities = tf.reduce_sum(tf.multiply(
                probabilities,
                tf.one_hot(tf.cast(self.action_taken, tf.int32), self.nA)),
                                               reduction_indices=[1])
            eligibility = tf.log(good_probabilities) * self.advantage
            loss = -tf.reduce_sum(eligibility) + regularizer(
                sparse_representations[i])
            self.losses.append(loss)
            writer = tf.summary.FileWriter(
                os.path.join(self.monitor_path, "task" + str(i)),
                self.session.graph)
            self.writers.append(writer)

        # An add op for every task & its loss
        self.add_accum_grads = []
        for i, loss in enumerate(self.losses):
            # Use all variables if the switch-tasks experiment is deactivated or it's not the last task
            all_vars = self.config["switch_at_iter"] is None or i != len(
                self.losses) - 1
            self.add_accum_grads.append(
                add_accumulative_gradients_op(
                    (self.shared_vars if all_vars else []) +
                    [sparse_representations[i]],
                    ([self.accum_grads[0]] if all_vars else []) +
                    [self.accum_grads[i + 1]], loss, i))

        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.accum_grads, net_vars))
        self.reset_accum_grads = reset_accumulative_gradients_op(
            net_vars, self.accum_grads, 0)

        self.init_op = tf.global_variables_initializer()

    def _initialize(self):
        self.session.run(self.init_op)

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        reporter = Reporter()
        config = self.config
        total_n_trajectories = np.zeros(len(self.envs))
        for iteration in range(config["n_iter"]):
            self.session.run([self.reset_accum_grads])
            for i, task_runner in enumerate(self.task_runners):
                if self.config["switch_at_iter"] is not None:
                    if iteration >= self.config["switch_at_iter"] and i != (
                            len(self.task_runners) - 1):
                        continue
                    elif iteration < self.config["switch_at_iter"] and i == len(
                            self.task_runners) - 1:
                        continue
                # Collect trajectories until we get timesteps_per_batch total timesteps
                trajectories = task_runner.get_trajectories()
                total_n_trajectories[i] += len(trajectories)
                all_state = np.concatenate(
                    [trajectory["state"] for trajectory in trajectories])
                # Compute discounted sums of rewards
                rets = [
                    discount_rewards(trajectory["reward"], config["gamma"])
                    for trajectory in trajectories
                ]
                max_len = max(len(ret) for ret in rets)
                padded_rets = [
                    np.concatenate([ret, np.zeros(max_len - len(ret))])
                    for ret in rets
                ]
                # Compute time-dependent baseline
                baseline = np.mean(padded_rets, axis=0)
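                # Note: episodes shorter than max_len contribute zeros to this mean at
                # the timesteps they did not reach, pulling the baseline towards zero
                # at later timesteps.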
                # Compute advantage function
                advs = [ret - baseline[:len(ret)] for ret in rets]
                all_action = np.concatenate(
                    [trajectory["action"] for trajectory in trajectories])
                all_adv = np.concatenate(advs)
                # Do policy gradient update step
                episode_rewards = np.array([
                    trajectory["reward"].sum() for trajectory in trajectories
                ])  # episode total rewards
                episode_lengths = np.array([
                    len(trajectory["reward"]) for trajectory in trajectories
                ])  # episode lengths
                results = self.session.run(
                    [
                        self.losses[i], self.add_accum_grads[i],
                        self.accum_grads
                    ],
                    feed_dict={
                        self.states: all_state,
                        self.action_taken: all_action,
                        self.advantage: all_adv
                    })
                summary = self.session.run(
                    [self.summary_op],
                    feed_dict={
                        self.loss: results[0],
                        self.rewards: np.mean(episode_rewards),
                        self.episode_lengths: np.mean(episode_lengths)
                    })

                self.writers[i].add_summary(summary[0], iteration)
                self.writers[i].flush()
                print("Task:", i)
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths,
                                               total_n_trajectories[i])

            # Apply the accumulated gradients once the gradients of all tasks have been summed
            self.session.run([self.apply_gradients])

        if self.config["save_model"]:
            if not os.path.exists(self.monitor_path):
                os.makedirs(self.monitor_path)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
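A minimal usage sketch for KnowledgeTransfer (not part of the repository): the environments and configuration values are assumptions, and the base Agent config is assumed to supply remaining defaults such as save_model and episode_max_length.

import gym

# Stand-ins for several variations of the same task (assumption)
envs = [gym.make("CartPole-v0") for _ in range(3)]
agent = KnowledgeTransfer(envs,
                          "/tmp/kt_monitor",  # assumed monitor path
                          n_iter=50,
                          feature_extraction=True,
                          save_model=False)
agent.learn()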