Example 1
class DQN(Algorithm):
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "DQN"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
        env = gym.make(env_name)
        # TODO(ekl): replace this with RLlib preprocessors
        if "NoFrameskip" in env_name:
            env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env

        num_cpu = config["num_cpu"]
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = (
                config["prioritized_replay_beta_iters"])
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = (
                    config["schedule_max_timesteps"])
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0
        self.file_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)

    def train(self):
        config = self.config
        sample_time, learn_time = 0, 0

        for _ in range(config["timesteps_per_iteration"]):
            self.num_timesteps += 1
            dt = time.time()
            # Take action and update exploration to the newest value
            action = self.dqn_graph.act(
                self.sess,
                np.array(self.obs)[None],
                self.exploration.value(self.num_timesteps))[0]
            new_obs, rew, done, _ = self.env.step(action)
            # Store transition in the replay buffer.
            self.replay_buffer.add(self.obs, action, rew, new_obs, float(done))
            self.obs = new_obs

            self.episode_rewards[-1] += rew
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            sample_time += time.time() - dt

            if self.num_timesteps > config["learning_starts"] and \
                    self.num_timesteps % config["train_freq"] == 0:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from the replay buffer.
                if config["prioritized_replay"]:
                    experience = self.replay_buffer.sample(
                        config["batch_size"],
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, _,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = (
                        self.replay_buffer.sample(config["batch_size"]))
                    batch_idxes = None
                td_errors = self.dqn_graph.train(self.sess, obses_t, actions,
                                                 rewards, obses_tp1, dones,
                                                 np.ones_like(rewards))
                if config["prioritized_replay"]:
                    new_priorities = np.abs(td_errors) + (
                        config["prioritized_replay_eps"])
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)
                learn_time += (time.time() - dt)

            if self.num_timesteps > config["learning_starts"] and (
                    self.num_timesteps % config["target_network_update_freq"]
                    == 0):
                # Update target network periodically.
                self.dqn_graph.update_target(self.sess)

        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        num_episodes = len(self.episode_rewards)

        info = {
            "sample_time": sample_time,
            "learn_time": learn_time,
            "steps": self.num_timesteps,
            "episodes": num_episodes,
            "exploration":
            int(100 * self.exploration.value(self.num_timesteps))
        }

        logger.record_tabular("sample_time", sample_time)
        logger.record_tabular("learn_time", learn_time)
        logger.record_tabular("steps", self.num_timesteps)
        logger.record_tabular("buffer_size", len(self.replay_buffer))
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        logger.record_tabular(
            "% time spent exploring",
            int(100 * self.exploration.value(self.num_timesteps)))
        logger.dump_tabular()

        res = TrainingResult(self.experiment_id.hex, self.num_iterations,
                             mean_100ep_reward, mean_100ep_length, info)
        self.num_iterations += 1
        return res
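
As a point of reference, a minimal driver sketch for the class above. It assumes the class and its dependencies (Algorithm, models.DQNGraph, the replay buffers and schedules) are importable; the config keys and values are illustrative guesses read off the constructor and train(), not a documented schema.

# Hypothetical driver for the DQN class above; every config value here is an
# assumption for illustration only.
config = {
    "num_cpu": 4,
    "prioritized_replay": False,
    "buffer_size": 50000,
    "schedule_max_timesteps": 100000,
    "exploration_fraction": 0.1,
    "exploration_final_eps": 0.02,
    "timesteps_per_iteration": 1000,
    "learning_starts": 1000,
    "train_freq": 4,
    "batch_size": 32,
    "target_network_update_freq": 500,
}

dqn = DQN("PongNoFrameskip-v4", config)
for _ in range(100):
    result = dqn.train()
    print(result)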
Example 2
class DQNReplayEvaluator(DQNEvaluator):
    """Wraps DQNEvaluators to provide replay buffer functionality.

    This has two modes:
        If config["num_workers"] == 1:
            Samples will be collected locally.
        If config["num_workers"] > 1:
            Samples will be collected from a number of remote workers.
    """
    def __init__(self, env_creator, config, logdir):
        DQNEvaluator.__init__(self, env_creator, config, logdir)

        # Create extra workers if needed
        if self.config["num_workers"] > 1:
            remote_cls = ray.remote(num_cpus=1)(DQNEvaluator)
            self.workers = [
                remote_cls.remote(env_creator, config, logdir)
                for _ in range(self.config["num_workers"])
            ]
        else:
            self.workers = []

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None

        self.samples_to_prioritize = None

    def sample(self, no_replay=False):
        # First seed the replay buffer with a few new samples
        if self.workers:
            weights = ray.put(self.get_weights())
            for w in self.workers:
                w.set_weights.remote(weights)
            samples = ray.get([w.sample.remote() for w in self.workers])
        else:
            samples = [DQNEvaluator.sample(self)]

        for s in samples:
            for row in s.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"])

        if no_replay:
            return samples

        # Then return a batch sampled from the buffer
        if self.config["prioritized_replay"]:
            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.config["train_batch_size"],
                 beta=self.beta_schedule.value(self.global_timestep))
            self._update_priorities_if_needed()
            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })
            self.samples_to_prioritize = batch
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": np.ones_like(rewards)
            })
        return batch

    def compute_gradients(self, samples):
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        if self.config["prioritized_replay"]:
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(samples["batch_indexes"],
                                                 new_priorities)
            self.samples_to_prioritize = None
        return grad

    def _update_priorities_if_needed(self):
        """Manually updates replay buffer priorities on the last batch.

        Note that this is only needed when not computing gradients on this
        Evaluator (e.g. when using local multi-GPU). Otherwise, priorities
        can be updated more efficiently as part of computing gradients.
        """

        if not self.samples_to_prioritize:
            return

        batch = self.samples_to_prioritize
        td_errors = self.dqn_graph.compute_td_error(
            self.sess, batch["obs"], batch["actions"], batch["rewards"],
            batch["new_obs"], batch["dones"], batch["weights"])

        new_priorities = (np.abs(td_errors) +
                          self.config["prioritized_replay_eps"])
        self.replay_buffer.update_priorities(batch["batch_indexes"],
                                             new_priorities)
        self.samples_to_prioritize = None

    def stats(self):
        if self.workers:
            return ray.get([s.stats.remote() for s in self.workers])
        else:
            return DQNEvaluator.stats(self)

    def save(self):
        return [
            DQNEvaluator.save(self),
            ray.get([w.save.remote() for w in self.workers]),
            self.beta_schedule, self.replay_buffer
        ]

    def restore(self, data):
        DQNEvaluator.restore(self, data[0])
        for (w, d) in zip(self.workers, data[1]):
            w.restore.remote(d)
        self.beta_schedule = data[2]
        self.replay_buffer = data[3]
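
A sketch of how an external optimizer might drive this evaluator for one step. It assumes env_creator, config, and logdir are already defined and that apply_gradients is inherited from the DQNEvaluator base class; it is illustrative, not driver code from the project.

evaluator = DQNReplayEvaluator(env_creator, config, logdir)

# sample() collects fresh transitions (locally or from remote workers),
# pushes them into the replay buffer, and returns a batch drawn from it.
batch = evaluator.sample()

# compute_gradients() also refreshes priorities when prioritized replay
# is enabled; apply_gradients() is assumed to come from DQNEvaluator.
grad = evaluator.compute_gradients(batch)
evaluator.apply_gradients(grad)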
Example 3
File: dqn.py Project: xgong/ray
class DQN(Algorithm):
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "DQN"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
        env = gym.make(env_name)
        env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env
        model = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                                  hiddens=[256],
                                  dueling=True)
        sess = U.make_session(num_cpu=config["num_cpu"])
        sess.__enter__()

        def make_obs_ph(name):
            return U.BatchInput(env.observation_space.shape, name=name)

        self.act, self.optimize, self.update_target, self.debug = build_train(
            make_obs_ph=make_obs_ph,
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
            gamma=config["gamma"],
            grad_norm_clipping=10)
        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = (
                config["prioritized_replay_beta_iters"])
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = (
                    config["schedule_max_timesteps"])
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0

    def train(self):
        config = self.config
        sample_time, learn_time = 0, 0

        for _ in range(config["timesteps_per_iteration"]):
            self.num_timesteps += 1
            dt = time.time()
            # Take action and update exploration to the newest value
            action = self.act(
                np.array(self.obs)[None],
                update_eps=self.exploration.value(self.num_timesteps))[0]
            new_obs, rew, done, _ = self.env.step(action)
            # Store transition in the replay buffer.
            self.replay_buffer.add(self.obs, action, rew, new_obs, float(done))
            self.obs = new_obs

            self.episode_rewards[-1] += rew
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            sample_time += time.time() - dt

            if self.num_timesteps > config["learning_starts"] and \
                    self.num_timesteps % config["train_freq"] == 0:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from the replay buffer.
                if config["prioritized_replay"]:
                    experience = self.replay_buffer.sample(
                        config["batch_size"],
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, _,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        self.replay_buffer.sample(config["batch_size"])
                    batch_idxes = None
                td_errors = self.optimize(obses_t, actions, rewards, obses_tp1,
                                          dones, np.ones_like(rewards))
                if config["prioritized_replay"]:
                    new_priorities = (np.abs(td_errors) +
                                      config["prioritized_replay_eps"])
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)
                learn_time += (time.time() - dt)

            if (self.num_timesteps > config["learning_starts"]
                    and self.num_timesteps %
                    config["target_network_update_freq"] == 0):
                # Update target network periodically.
                self.update_target()

        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        num_episodes = len(self.episode_rewards)

        info = {
            "sample_time": sample_time,
            "learn_time": learn_time,
            "steps": self.num_timesteps,
            "episodes": num_episodes,
            "exploration": int(100 * self.exploration.value(t))
        }

        logger.record_tabular("sample_time", sample_time)
        logger.record_tabular("learn_time", learn_time)
        logger.record_tabular("steps", self.num_timesteps)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        logger.record_tabular("% time spent exploring",
                              int(100 * self.exploration.value(t)))
        logger.dump_tabular()

        res = TrainingResult(self.experiment_id.hex, self.num_iterations,
                             mean_100ep_reward, mean_100ep_length, info)
        self.num_iterations += 1
        return res
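
The exploration annealing above relies on LinearSchedule from the schedules module. As a reminder of the behaviour the code depends on, a minimal sketch of linear annealing (an assumption about the real class, shown only for illustration):

class LinearScheduleSketch(object):
    """Illustrative stand-in for LinearSchedule: interpolates linearly from
    initial_p to final_p over schedule_timesteps, then holds final_p."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# With schedule_timesteps=10000: value(0) == 1.0, value(5000) == 0.51, and
# value(20000) == 0.02 -- epsilon decays linearly, then stays at the floor.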
Example 4
class Actor(object):
    def __init__(self, env_name, config, logdir):
        env = gym.make(env_name)
        # TODO(ekl): replace this with RLlib preprocessors
        if "NoFrameskip" in env_name:
            env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env
        self.config = config

        num_cpu = config["num_cpu"]
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_tp1, self.dqn_graph.q_t), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)

    def step(self, cur_timestep):
        # Take action and update exploration to the newest value
        action = self.dqn_graph.act(self.sess,
                                    np.array(self.obs)[None],
                                    self.exploration.value(cur_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        return ret

    def do_steps(self, num_steps, cur_timestep):
        for _ in range(num_steps):
            obs, action, rew, new_obs, done = self.step(cur_timestep)
            self.replay_buffer.add(obs, action, rew, new_obs, done)

    def get_gradient(self, cur_timestep):
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["train_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1, dones, _,
             batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch_idxes = None
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, obses_t, actions, rewards, obses_tp1, dones,
            np.ones_like(rewards))
        if self.config["prioritized_replay"]:
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        return grad

    def apply_gradients(self, grad):
        self.dqn_graph.apply_gradients(self.sess, grad)

    def stats(self, num_timesteps):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        exploration = self.exploration.value(num_timesteps)
        return (mean_100ep_reward,
                mean_100ep_length, len(self.episode_rewards), exploration,
                len(self.replay_buffer))

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def save(self):
        return [
            self.beta_schedule, self.exploration, self.episode_rewards,
            self.episode_lengths, self.saved_mean_reward, self.obs
        ]

    def restore(self, data):
        self.beta_schedule = data[0]
        self.exploration = data[1]
        self.episode_rewards = data[2]
        self.episode_lengths = data[3]
        self.saved_mean_reward = data[4]
        self.obs = data[5]
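
For context, a hypothetical driver loop that uses the Actor above as Ray workers: the driver keeps a local Actor, broadcasts its weights, lets remote copies collect experience and compute gradients, and applies those gradients locally. The worker count, step counts, and the env_name/config/logdir variables are assumptions for illustration, not the project's actual driver.

import ray

ray.init()
RemoteActor = ray.remote(Actor)

local = Actor(env_name, config, logdir)
workers = [RemoteActor.remote(env_name, config, logdir) for _ in range(2)]

cur_timestep = 0
for _ in range(1000):
    # Broadcast the latest weights to the remote workers.
    weights = ray.put(local.get_weights())
    ray.get([w.set_weights.remote(weights) for w in workers])
    # Each worker steps its own env into its local replay buffer.
    ray.get([w.do_steps.remote(100, cur_timestep) for w in workers])
    cur_timestep += len(workers) * 100
    # Gather gradients from the workers and apply them to the local graph.
    for grad in ray.get([w.get_gradient.remote(cur_timestep)
                         for w in workers]):
        local.apply_gradients(grad)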
Example 5
class DQNEvaluator(TFMultiGPUSupport):
    """The base DQN Evaluator that does not include the replay buffer.

    TODO(rliaw): Support observation/reward filters?"""
    def __init__(self, registry, env_creator, config, logdir):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        self.global_timestep = global_timestep

    def update_target(self):
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(self.config["sample_batch_size"] +
                       self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                         actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": obs,
            "actions": actions,
            "rewards": rewards,
            "new_obs": new_obs,
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert batch.count == self.config["sample_batch_size"]
        return batch

    def compute_gradients(self, samples):
        _, grad = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        return grad

    def apply_gradients(self, grads):
        self.dqn_graph.apply_gradients(self.sess, grads)

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def tf_loss_inputs(self):
        return self.dqn_graph.loss_inputs

    def build_tf_loss(self, input_placeholders):
        return self.dqn_graph.build_loss(*input_placeholders)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(self.sess,
                                    np.array(self.obs)[None],
                                    self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        return [
            self.exploration, self.episode_rewards, self.episode_lengths,
            self.saved_mean_reward, self.obs
        ]

    def restore(self, data):
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]
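
The N-step block in sample() above delegates to adjust_nstep. A rough sketch of the kind of in-place folding such a helper is assumed to perform, based on the standard n-step return (this is an illustration, not the project's implementation):

def nstep_fold_sketch(n_step, gamma, obs, actions, rewards, new_obs, dones):
    """Folds 1-step transitions into n-step transitions, in place."""
    steps = len(rewards)
    for i in range(steps - (n_step - 1)):
        for j in range(1, n_step):
            if dones[i + j - 1]:
                break  # stop folding at an episode boundary
            rewards[i] += (gamma ** j) * rewards[i + j]
            new_obs[i] = new_obs[i + j]
            dones[i] = dones[i + j]
    # The trailing n_step - 1 transitions lack a full lookahead; dropping
    # them is why the caller subtracts n_step - 1 from local_timestep.
    for seq in (obs, actions, rewards, new_obs, dones):
        del seq[steps - (n_step - 1):]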
Example 6
class DQNEvaluator(PolicyEvaluator):
    """The DQN Evaluator.

    TODO(rliaw): Support observation/reward filters?"""

    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                0.4 ** (
                    1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(
                    config["exploration_fraction"] *
                    config["schedule_max_timesteps"]),
                initial_p=1.0,
                final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        self.global_timestep = global_timestep

    def update_target(self):
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs], "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones,
            "weights": np.ones_like(rewards)})
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch

    def compute_gradients(self, samples):
        td_err, grads = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        return grads, {"td_error": td_err}

    def apply_gradients(self, grads):
        self.dqn_graph.apply_gradients(self.sess, grads)

    def compute_apply(self, samples):
        td_error = self.dqn_graph.compute_apply(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        return {"td_error": td_error}

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        n = self.config["smoothing_num_episodes"] + 1
        mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        return [
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs,
            self.global_timestep,
            self.local_timestep]

    def restore(self, data):
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]
        self.global_timestep = data[5]
        self.local_timestep = data[6]
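
The per-worker ConstantSchedule above gives each worker a fixed exploration rate instead of an annealed one. A small illustration of the resulting epsilon ladder (Ape-X style), using the same formula as the constructor:

# eps_i = 0.4 ** (1 + 7 * i / (N - 1)) for worker i out of N workers.
num_workers = 8
for worker_index in range(num_workers):
    eps = 0.4 ** (1 + worker_index / float(num_workers - 1) * 7)
    print("worker %d: eps = %.5f" % (worker_index, eps))
# Worker 0 explores the most (eps = 0.4); the last worker is nearly greedy
# (eps = 0.4 ** 8, about 0.00066).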
Example 7
File: dqn.py Project: zcli/ray
class Actor(object):
    def __init__(self, env_creator, config, logdir):
        env = env_creator()
        env = wrap_dqn(env, config["model"])
        self.env = env
        self.config = config

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config, logdir)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.set_weights_time = RunningStat(())
        self.sample_time = RunningStat(())
        self.grad_time = RunningStat(())

        # Note that workers don't need target vars to be synced
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)

    def step(self, cur_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(self.sess,
                                    np.array(self.obs)[None],
                                    self.exploration.value(cur_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        return ret

    def do_steps(self, num_steps, cur_timestep, store):
        """Takes N steps.

        If store is True, the steps will be stored in the local replay buffer.
        Otherwise, the steps will be returned.
        """

        output = []
        for _ in range(num_steps):
            result = self.step(cur_timestep)
            if store:
                obs, action, rew, new_obs, done = result
                self.replay_buffer.add(obs, action, rew, new_obs, done)
            else:
                output.append(result)
        if not store:
            return output

    def do_multi_gpu_optimize(self, cur_timestep):
        """Performs N iters of multi-gpu SGD over the local replay buffer."""
        dt = time.time()
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["train_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1, dones, _,
             batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch_idxes = None
        replay_buffer_read_time = (time.time() - dt)
        dt = time.time()
        tuples_per_device = self.dqn_graph.multi_gpu_optimizer.load_data(
            self.sess, [
                obses_t, actions, rewards, obses_tp1, dones,
                np.ones_like(rewards)
            ])
        per_device_batch_size = (
            self.dqn_graph.multi_gpu_optimizer.per_device_batch_size)
        num_batches = (int(tuples_per_device) // int(per_device_batch_size))
        data_load_time = (time.time() - dt)
        dt = time.time()
        for _ in range(self.config["num_sgd_iter"]):
            batches = list(range(num_batches))
            np.random.shuffle(batches)
            for i in batches:
                self.dqn_graph.multi_gpu_optimizer.optimize(
                    self.sess, i * per_device_batch_size)
        sgd_time = (time.time() - dt)
        dt = time.time()
        if self.config["prioritized_replay"]:
            dt = time.time()
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obses_t, actions, rewards, obses_tp1, dones,
                np.ones_like(rewards))
            dt = time.time()
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        prioritization_time = (time.time() - dt)
        return {
            "replay_buffer_read_time": replay_buffer_read_time,
            "data_load_time": data_load_time,
            "sgd_time": sgd_time,
            "prioritization_time": prioritization_time,
        }

    def do_async_step(self, worker_id, cur_timestep, params, gradient_id):
        """Takes steps and returns grad to apply async in the driver."""
        dt = time.time()
        self.set_weights(params)
        self.set_weights_time.push(time.time() - dt)
        dt = time.time()
        self.do_steps(self.config["sample_batch_size"],
                      cur_timestep,
                      store=True)
        self.sample_time.push(time.time() - dt)
        if (cur_timestep > self.config["learning_starts"]
                and len(self.replay_buffer) > self.config["train_batch_size"]):
            dt = time.time()
            gradient = self.sample_buffer_gradient(cur_timestep)
            self.grad_time.push(time.time() - dt)
        else:
            gradient = None
        return gradient, {"id": worker_id, "gradient_id": gradient_id}

    def sample_buffer_gradient(self, cur_timestep):
        """Returns grad over a batch sampled from the local replay buffer."""
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["sgd_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1, dones, _,
             batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["sgd_batch_size"])
            batch_idxes = None
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, obses_t, actions, rewards, obses_tp1, dones,
            np.ones_like(rewards))
        if self.config["prioritized_replay"]:
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        return grad

    def apply_gradients(self, grad):
        self.dqn_graph.apply_gradients(self.sess, grad)

    # TODO(ekl) return a dictionary and use that everywhere to clean up the
    # bookkeeping of stats
    def stats(self, num_timesteps):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5)
        exploration = self.exploration.value(num_timesteps)
        return (mean_100ep_reward,
                mean_100ep_length, len(self.episode_rewards), exploration,
                len(self.replay_buffer), float(self.set_weights_time.mean),
                float(self.sample_time.mean), float(self.grad_time.mean))

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def save(self):
        return [
            self.beta_schedule, self.exploration, self.episode_rewards,
            self.episode_lengths, self.saved_mean_reward, self.obs,
            self.replay_buffer
        ]

    def restore(self, data):
        self.beta_schedule = data[0]
        self.exploration = data[1]
        self.episode_rewards = data[2]
        self.episode_lengths = data[3]
        self.saved_mean_reward = data[4]
        self.obs = data[5]
        self.replay_buffer = data[6]
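
Across these examples, prioritized replay updates follow the same pattern: after computing TD errors on a sampled batch, priorities become the absolute error plus a small constant. A toy numeric illustration (the values are made up):

import numpy as np

td_errors = np.array([0.0, -0.5, 2.0])
prioritized_replay_eps = 1e-6

# Priority is proportional to |TD error|, offset so that no transition's
# sampling probability collapses to zero.
new_priorities = np.abs(td_errors) + prioritized_replay_eps
# -> [1e-06, 0.500001, 2.000001]

# The buffer then samples transition i with probability
#     p_i ** alpha / sum_j (p_j ** alpha),
# where alpha is config["prioritized_replay_alpha"], and beta_schedule
# anneals the importance-sampling exponent beta toward 1.0.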
Example 8
class DQNEvaluator(Evaluator):
    """The DQN Evaluator.

    TODO(rliaw): Support observation/reward filters?"""

    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                0.4 ** (
                    1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(
                    config["exploration_fraction"] *
                    config["schedule_max_timesteps"]),
                initial_p=1.0,
                final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        self.global_timestep = global_timestep

    def update_target(self):
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs], "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones,
            "weights": np.ones_like(rewards)})
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch

    def compute_apply(self, samples):
        if samples is None:
            return None
        td_error = self.dqn_graph.compute_apply(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        return td_error

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        n = self.config["smoothing_num_episodes"] + 1
        mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        return [
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs,
            self.global_timestep,
            self.local_timestep]

    def restore(self, data):
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]
        self.global_timestep = data[5]
        self.local_timestep = data[6]
Example 9
class DQNEvaluator(TFMultiGPUSupport):
    """The base DQN Evaluator that does not include the replay buffer."""
    def __init__(self, env_creator, config, logdir):
        env = env_creator()
        env = wrap_dqn(env, config["model"])
        self.env = env
        self.config = config

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config, logdir)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        self.global_timestep = global_timestep

    def update_target(self):
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        output = []
        for _ in range(self.config["sample_batch_size"]):
            result = self._step(self.global_timestep)
            output.append(result)
        return output

    def compute_gradients(self, samples):
        if self.config["prioritized_replay"]:
            obses_t, actions, rewards, obses_tp1, dones, _ = samples
        else:
            obses_t, actions, rewards, obses_tp1, dones = samples
        _, grad = self.dqn_graph.compute_gradients(self.sess, obses_t, actions,
                                                   rewards, obses_tp1, dones,
                                                   np.ones_like(rewards))
        return grad

    def apply_gradients(self, grads):
        self.dqn_graph.apply_gradients(self.sess, grads)

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def tf_loss_inputs(self):
        return self.dqn_graph.loss_inputs

    def build_tf_loss(self, input_placeholders):
        return self.dqn_graph.build_loss(*input_placeholders)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(self.sess,
                                    np.array(self.obs)[None],
                                    self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        return [
            self.exploration, self.episode_rewards, self.episode_lengths,
            self.saved_mean_reward, self.obs
        ]

    def restore(self, data):
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]