Example #1
    def __init__(self, env_creator, config, logdir):
        DQNEvaluator.__init__(self, env_creator, config, logdir)

        # Create extra workers if needed
        if self.config["num_workers"] > 1:
            remote_cls = ray.remote(num_cpus=1)(DQNEvaluator)
            self.workers = [
                remote_cls.remote(env_creator, config, logdir)
                for _ in range(self.config["num_workers"])
            ]
        else:
            self.workers = []

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None

        self.samples_to_prioritize = None
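
Every variant in this listing constructs the same prioritized-replay beta schedule: beta starts at prioritized_replay_beta0 and is annealed linearly to 1.0 over prioritized_replay_beta_iters timesteps, falling back to schedule_max_timesteps when that value is None. The snippet below is a minimal sketch of that annealing behaviour, assuming the usual linear-interpolate-then-clamp semantics of a Baselines-style LinearSchedule; the class name and numbers are illustrative stand-ins, not the imports used in the examples.

class LinearScheduleSketch:
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Linear interpolation from initial_p to final_p, clamped once the
        # schedule is exhausted.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# Beta annealed from 0.4 to 1.0 over 100,000 timesteps (illustrative values).
beta_schedule = LinearScheduleSketch(100000, initial_p=0.4, final_p=1.0)
print(beta_schedule.value(0))       # 0.4
print(beta_schedule.value(50000))   # 0.7
print(beta_schedule.value(200000))  # 1.0 (clamped after the schedule ends)
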
Example #2
File: dqn.py Project: xgong/ray
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "DQN"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
        env = gym.make(env_name)
        env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env
        model = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                                  hiddens=[256],
                                  dueling=True)
        sess = U.make_session(num_cpu=config["num_cpu"])
        sess.__enter__()

        def make_obs_ph(name):
            return U.BatchInput(env.observation_space.shape, name=name)

        self.act, self.optimize, self.update_target, self.debug = build_train(
            make_obs_ph=make_obs_ph,
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
            gamma=config["gamma"],
            grad_norm_clipping=10)
        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = (
                config["prioritized_replay_beta_iters"])
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = (
                    config["schedule_max_timesteps"])
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0
Example #3
File: dqn.py Project: meikei/ray
    def _init(self):
        config = self.config
        env = gym.make(self.env_name)
        # TODO(ekl): replace this with RLlib preprocessors
        if "NoFrameskip" in self.env_name:
            env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env

        num_cpu = config["num_cpu"]
        tf_config = tf.ConfigProto(
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = (
                config["prioritized_replay_beta_iters"])
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = (
                    config["schedule_max_timesteps"])
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(
                config["exploration_fraction"] *
                config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0
        self.file_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
        self.saver = tf.train.Saver(max_to_keep=None)
Example #4
File: dqn.py Project: zcli/ray
    def __init__(self, env_creator, config, logdir):
        env = env_creator()
        env = wrap_dqn(env, config["model"])
        self.env = env
        self.config = config

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config, logdir)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.set_weights_time = RunningStat(())
        self.sample_time = RunningStat(())
        self.grad_time = RunningStat(())

        # Note that workers don't need target vars to be synced
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)
Example #5
    def __init__(self, env_creator, config, logdir):
        env = env_creator()
        # TODO(ekl): replace this with RLlib preprocessors
        if "NoFrameskip" in env.spec.id:
            env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env
        self.config = config

        num_cpu = config["num_cpu"]
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_tp1, self.dqn_graph.q_t), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)
Example #6
class DQNReplayEvaluator(DQNEvaluator):
    """Wraps DQNEvaluators to provide replay buffer functionality.

    This has two modes:
        If config["num_workers"] == 1:
            Samples will be collected locally.
        If config["num_workers"] > 1:
            Samples will be collected from a number of remote workers.
    """
    def __init__(self, env_creator, config, logdir):
        DQNEvaluator.__init__(self, env_creator, config, logdir)

        # Create extra workers if needed
        if self.config["num_workers"] > 1:
            remote_cls = ray.remote(num_cpus=1)(DQNEvaluator)
            self.workers = [
                remote_cls.remote(env_creator, config, logdir)
                for _ in range(self.config["num_workers"])
            ]
        else:
            self.workers = []

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None

        self.samples_to_prioritize = None

    def sample(self, no_replay=False):
        # First seed the replay buffer with a few new samples
        if self.workers:
            weights = ray.put(self.get_weights())
            for w in self.workers:
                w.set_weights.remote(weights)
            samples = ray.get([w.sample.remote() for w in self.workers])
        else:
            samples = [DQNEvaluator.sample(self)]

        for s in samples:
            for row in s.rows():
                self.replay_buffer.add(row["obs"], row["actions"],
                                       row["rewards"], row["new_obs"],
                                       row["dones"])

        if no_replay:
            return samples

        # Then return a batch sampled from the buffer
        if self.config["prioritized_replay"]:
            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.config["train_batch_size"],
                 beta=self.beta_schedule.value(self.global_timestep))
            self._update_priorities_if_needed()
            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })
            self.samples_to_prioritize = batch
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": np.ones_like(rewards)
            })
        return batch

    def compute_gradients(self, samples):
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        if self.config["prioritized_replay"]:
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(samples["batch_indexes"],
                                                 new_priorities)
            self.samples_to_prioritize = None
        return grad

    def _update_priorities_if_needed(self):
        """Manually updates replay buffer priorities on the last batch.

        Note that this is only needed when not computing gradients on this
        Evaluator (e.g. when using local multi-GPU). Otherwise, priorities
        can be updated more efficiently as part of computing gradients.
        """

        if not self.samples_to_prioritize:
            return

        batch = self.samples_to_prioritize
        td_errors = self.dqn_graph.compute_td_error(
            self.sess, batch["obs"], batch["actions"], batch["rewards"],
            batch["new_obs"], batch["dones"], batch["weights"])

        new_priorities = (np.abs(td_errors) +
                          self.config["prioritized_replay_eps"])
        self.replay_buffer.update_priorities(batch["batch_indexes"],
                                             new_priorities)
        self.samples_to_prioritize = None

    def stats(self):
        if self.workers:
            return ray.get([s.stats.remote() for s in self.workers])
        else:
            return DQNEvaluator.stats(self)

    def save(self):
        return [
            DQNEvaluator.save(self),
            ray.get([w.save.remote() for w in self.workers]),
            self.beta_schedule, self.replay_buffer
        ]

    def restore(self, data):
        DQNEvaluator.restore(self, data[0])
        for (w, d) in zip(self.workers, data[1]):
            w.restore.remote(d)
        self.beta_schedule = data[2]
        self.replay_buffer = data[3]
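
The sample() method above broadcasts the driver's weights with a single ray.put() so that every remote DQNEvaluator reads the same object from the object store instead of receiving its own copy. The toy actor below sketches that pattern, assuming a current Ray API; Worker is a hypothetical stand-in for DQNEvaluator, not the real class.

import ray

@ray.remote
class Worker(object):
    # Hypothetical stand-in for DQNEvaluator, only to show the sync pattern.
    def __init__(self):
        self.weights = None

    def set_weights(self, weights):
        self.weights = weights

    def sample(self):
        # Pretend "sampling" just reports which weights were used.
        return {"weight_keys": sorted(self.weights)}

ray.init()
workers = [Worker.remote() for _ in range(2)]
weights = ray.put({"fc1": [0.0] * 8, "fc2": [0.0] * 4})  # stored once
ray.get([w.set_weights.remote(weights) for w in workers])
print(ray.get([w.sample.remote() for w in workers]))
ray.shutdown()
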
Example #7
class DQN(Algorithm):
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "DQN"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
        env = gym.make(env_name)
        # TODO(ekl): replace this with RLlib preprocessors
        if "NoFrameskip" in env_name:
            env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env

        num_cpu = config["num_cpu"]
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = (
                config["prioritized_replay_beta_iters"])
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = (
                    config["schedule_max_timesteps"])
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0
        self.file_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)

    def train(self):
        config = self.config
        sample_time, learn_time = 0, 0

        for _ in range(config["timesteps_per_iteration"]):
            self.num_timesteps += 1
            dt = time.time()
            # Take action and update exploration to the newest value
            action = self.dqn_graph.act(
                self.sess,
                np.array(self.obs)[None],
                self.exploration.value(self.num_timesteps))[0]
            new_obs, rew, done, _ = self.env.step(action)
            # Store transition in the replay buffer.
            self.replay_buffer.add(self.obs, action, rew, new_obs, float(done))
            self.obs = new_obs

            self.episode_rewards[-1] += rew
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            sample_time += time.time() - dt

            if self.num_timesteps > config["learning_starts"] and \
                    self.num_timesteps % config["train_freq"] == 0:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                if config["prioritized_replay"]:
                    experience = self.replay_buffer.sample(
                        config["batch_size"],
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, _,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = (
                        self.replay_buffer.sample(config["batch_size"]))
                    batch_idxes = None
                td_errors = self.dqn_graph.train(self.sess, obses_t, actions,
                                                 rewards, obses_tp1, dones,
                                                 np.ones_like(rewards))
                if config["prioritized_replay"]:
                    new_priorities = np.abs(td_errors) + (
                        config["prioritized_replay_eps"])
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)
                learn_time += (time.time() - dt)

            if self.num_timesteps > config["learning_starts"] and (
                    self.num_timesteps % config["target_network_update_freq"]
                    == 0):
                # Update target network periodically.
                self.dqn_graph.update_target(self.sess)

        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        num_episodes = len(self.episode_rewards)

        info = {
            "sample_time": sample_time,
            "learn_time": learn_time,
            "steps": self.num_timesteps,
            "episodes": num_episodes,
            "exploration":
            int(100 * self.exploration.value(self.num_timesteps))
        }

        logger.record_tabular("sample_time", sample_time)
        logger.record_tabular("learn_time", learn_time)
        logger.record_tabular("steps", self.num_timesteps)
        logger.record_tabular("buffer_size", len(self.replay_buffer))
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        logger.record_tabular(
            "% time spent exploring",
            int(100 * self.exploration.value(self.num_timesteps)))
        logger.dump_tabular()

        res = TrainingResult(self.experiment_id.hex, self.num_iterations,
                             mean_100ep_reward, mean_100ep_length, info)
        self.num_iterations += 1
        return res
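
The training loop above gates its updates on three config values: no learning before learning_starts timesteps, a gradient step every train_freq timesteps, and a target-network sync every target_network_update_freq timesteps. A small sketch of that gating, with made-up values in place of the config defaults:

learning_starts = 1000
train_freq = 4
target_network_update_freq = 500

def should_train(t):
    # Gradient step only after warm-up, then every train_freq steps.
    return t > learning_starts and t % train_freq == 0

def should_update_target(t):
    # Target network sync is much less frequent than training.
    return t > learning_starts and t % target_network_update_freq == 0

print([t for t in range(1000, 1020) if should_train(t)])  # [1004, 1008, 1012, 1016]
print(should_update_target(1500))                         # True
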
Example #8
class Actor(object):
    def __init__(self, env_name, config, logdir):
        env = gym.make(env_name)
        # TODO(ekl): replace this with RLlib preprocessors
        if "NoFrameskip" in env_name:
            env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env
        self.config = config

        num_cpu = config["num_cpu"]
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_tp1, self.dqn_graph.q_t), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)

    def step(self, cur_timestep):
        # Take action and update exploration to the newest value
        action = self.dqn_graph.act(self.sess,
                                    np.array(self.obs)[None],
                                    self.exploration.value(cur_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        return ret

    def do_steps(self, num_steps, cur_timestep):
        for _ in range(num_steps):
            obs, action, rew, new_obs, done = self.step(cur_timestep)
            self.replay_buffer.add(obs, action, rew, new_obs, done)

    def get_gradient(self, cur_timestep):
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["train_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1, dones, _,
             batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch_idxes = None
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, obses_t, actions, rewards, obses_tp1, dones,
            np.ones_like(rewards))
        if self.config["prioritized_replay"]:
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        return grad

    def apply_gradients(self, grad):
        self.dqn_graph.apply_gradients(self.sess, grad)

    def stats(self, num_timesteps):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        exploration = self.exploration.value(num_timesteps)
        return (mean_100ep_reward,
                mean_100ep_length, len(self.episode_rewards), exploration,
                len(self.replay_buffer))

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def save(self):
        return [
            self.beta_schedule, self.exploration, self.episode_rewards,
            self.episode_lengths, self.saved_mean_reward, self.obs
        ]

    def restore(self, data):
        self.beta_schedule = data[0]
        self.exploration = data[1]
        self.episode_rewards = data[2]
        self.episode_lengths = data[3]
        self.saved_mean_reward = data[4]
        self.obs = data[5]
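
When prioritized replay is enabled, get_gradient() above turns the TD errors returned by compute_gradients into new priorities using |td_error| + prioritized_replay_eps, so even transitions with zero TD error keep a small nonzero sampling probability. A minimal numeric sketch of that rule (the epsilon value is an assumption, matching the usual Baselines default):

import numpy as np

td_errors = np.array([0.5, -1.2, 0.0, 3.4])  # dummy stand-in for the graph output
prioritized_replay_eps = 1e-6                # assumed default value
new_priorities = np.abs(td_errors) + prioritized_replay_eps
# These priorities would then be passed to
# replay_buffer.update_priorities(batch_idxes, new_priorities).
print(new_priorities)
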
Example #9
File: dqn.py Project: xgong/ray
class DQN(Algorithm):
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "DQN"})
        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)
        env = gym.make(env_name)
        env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env
        model = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                                  hiddens=[256],
                                  dueling=True)
        sess = U.make_session(num_cpu=config["num_cpu"])
        sess.__enter__()

        def make_obs_ph(name):
            return U.BatchInput(env.observation_space.shape, name=name)

        self.act, self.optimize, self.update_target, self.debug = build_train(
            make_obs_ph=make_obs_ph,
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
            gamma=config["gamma"],
            grad_norm_clipping=10)
        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = (
                config["prioritized_replay_beta_iters"])
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = (
                    config["schedule_max_timesteps"])
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0

    def train(self):
        config = self.config
        sample_time, learn_time = 0, 0

        for t in range(config["timesteps_per_iteration"]):
            self.num_timesteps += 1
            dt = time.time()
            # Take action and update exploration to the newest value
            action = self.act(np.array(self.obs)[None],
                              update_eps=self.exploration.value(t))[0]
            new_obs, rew, done, _ = self.env.step(action)
            # Store transition in the replay buffer.
            self.replay_buffer.add(self.obs, action, rew, new_obs, float(done))
            self.obs = new_obs

            self.episode_rewards[-1] += rew
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            sample_time += time.time() - dt

            if self.num_timesteps > config["learning_starts"] and \
                    self.num_timesteps % config["train_freq"] == 0:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                if config["prioritized_replay"]:
                    experience = self.replay_buffer.sample(
                        config["batch_size"], beta=self.beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, _,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        self.replay_buffer.sample(config["batch_size"])
                    batch_idxes = None
                td_errors = self.optimize(obses_t, actions, rewards, obses_tp1,
                                          dones, np.ones_like(rewards))
                if config["prioritized_replay"]:
                    new_priorities = (np.abs(td_errors) +
                                      config["prioritized_replay_eps"])
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)
                learn_time += (time.time() - dt)

            if (self.num_timesteps > config["learning_starts"]
                    and self.num_timesteps %
                    config["target_network_update_freq"] == 0):
                # Update target network periodically.
                self.update_target()

        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        num_episodes = len(self.episode_rewards)

        info = {
            "sample_time": sample_time,
            "learn_time": learn_time,
            "steps": self.num_timesteps,
            "episodes": num_episodes,
            "exploration": int(100 * self.exploration.value(t))
        }

        logger.record_tabular("sample_time", sample_time)
        logger.record_tabular("learn_time", learn_time)
        logger.record_tabular("steps", self.num_timesteps)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        logger.record_tabular("% time spent exploring",
                              int(100 * self.exploration.value(t)))
        logger.dump_tabular()

        res = TrainingResult(self.experiment_id.hex, self.num_iterations,
                             mean_100ep_reward, mean_100ep_length, info)
        self.num_iterations += 1
        return res
Example #10
File: dqn.py Project: zcli/ray
class Actor(object):
    def __init__(self, env_creator, config, logdir):
        env = env_creator()
        env = wrap_dqn(env, config["model"])
        self.env = env
        self.config = config

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config, logdir)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None
        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.set_weights_time = RunningStat(())
        self.sample_time = RunningStat(())
        self.grad_time = RunningStat(())

        # Note that workers don't need target vars to be synced
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)

    def step(self, cur_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(self.sess,
                                    np.array(self.obs)[None],
                                    self.exploration.value(cur_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        return ret

    def do_steps(self, num_steps, cur_timestep, store):
        """Takes N steps.

        If store is True, the steps will be stored in the local replay buffer.
        Otherwise, the steps will be returned.
        """

        output = []
        for _ in range(num_steps):
            result = self.step(cur_timestep)
            if store:
                obs, action, rew, new_obs, done = result
                self.replay_buffer.add(obs, action, rew, new_obs, done)
            else:
                output.append(result)
        if not store:
            return output

    def do_multi_gpu_optimize(self, cur_timestep):
        """Performs N iters of multi-gpu SGD over the local replay buffer."""
        dt = time.time()
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["train_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1, dones, _,
             batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch_idxes = None
        replay_buffer_read_time = (time.time() - dt)
        dt = time.time()
        tuples_per_device = self.dqn_graph.multi_gpu_optimizer.load_data(
            self.sess, [
                obses_t, actions, rewards, obses_tp1, dones,
                np.ones_like(rewards)
            ])
        per_device_batch_size = (
            self.dqn_graph.multi_gpu_optimizer.per_device_batch_size)
        num_batches = (int(tuples_per_device) // int(per_device_batch_size))
        data_load_time = (time.time() - dt)
        dt = time.time()
        for _ in range(self.config["num_sgd_iter"]):
            batches = list(range(num_batches))
            np.random.shuffle(batches)
            for i in batches:
                self.dqn_graph.multi_gpu_optimizer.optimize(
                    self.sess, i * per_device_batch_size)
        sgd_time = (time.time() - dt)
        dt = time.time()
        if self.config["prioritized_replay"]:
            dt = time.time()
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obses_t, actions, rewards, obses_tp1, dones,
                np.ones_like(rewards))
            dt = time.time()
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        prioritization_time = (time.time() - dt)
        return {
            "replay_buffer_read_time": replay_buffer_read_time,
            "data_load_time": data_load_time,
            "sgd_time": sgd_time,
            "prioritization_time": prioritization_time,
        }

    def do_async_step(self, worker_id, cur_timestep, params, gradient_id):
        """Takes steps and returns grad to apply async in the driver."""
        dt = time.time()
        self.set_weights(params)
        self.set_weights_time.push(time.time() - dt)
        dt = time.time()
        self.do_steps(self.config["sample_batch_size"],
                      cur_timestep,
                      store=True)
        self.sample_time.push(time.time() - dt)
        if (cur_timestep > self.config["learning_starts"]
                and len(self.replay_buffer) > self.config["train_batch_size"]):
            dt = time.time()
            gradient = self.sample_buffer_gradient(cur_timestep)
            self.grad_time.push(time.time() - dt)
        else:
            gradient = None
        return gradient, {"id": worker_id, "gradient_id": gradient_id}

    def sample_buffer_gradient(self, cur_timestep):
        """Returns grad over a batch sampled from the local replay buffer."""
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["sgd_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1, dones, _,
             batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["sgd_batch_size"])
            batch_idxes = None
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, obses_t, actions, rewards, obses_tp1, dones,
            np.ones_like(rewards))
        if self.config["prioritized_replay"]:
            new_priorities = (np.abs(td_errors) +
                              self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        return grad

    def apply_gradients(self, grad):
        self.dqn_graph.apply_gradients(self.sess, grad)

    # TODO(ekl) return a dictionary and use that everywhere to clean up the
    # bookkeeping of stats
    def stats(self, num_timesteps):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5)
        exploration = self.exploration.value(num_timesteps)
        return (mean_100ep_reward,
                mean_100ep_length, len(self.episode_rewards), exploration,
                len(self.replay_buffer), float(self.set_weights_time.mean),
                float(self.sample_time.mean), float(self.grad_time.mean))

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def save(self):
        return [
            self.beta_schedule, self.exploration, self.episode_rewards,
            self.episode_lengths, self.saved_mean_reward, self.obs,
            self.replay_buffer
        ]

    def restore(self, data):
        self.beta_schedule = data[0]
        self.exploration = data[1]
        self.episode_rewards = data[2]
        self.episode_lengths = data[3]
        self.saved_mean_reward = data[4]
        self.obs = data[5]
        self.replay_buffer = data[6]
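
In do_multi_gpu_optimize() above, the loaded tuples are split into fixed-size per-device batches and visited in a shuffled order on every SGD iteration, each batch addressed by its offset into the loaded data. The arithmetic, with made-up sizes standing in for the optimizer's real values:

import numpy as np

tuples_per_device = 1024     # illustrative; returned by load_data() in the real code
per_device_batch_size = 128  # illustrative
num_batches = int(tuples_per_device) // int(per_device_batch_size)

for sgd_iter in range(2):    # plays the role of config["num_sgd_iter"]
    batches = list(range(num_batches))
    np.random.shuffle(batches)
    # The real code passes each offset to multi_gpu_optimizer.optimize().
    offsets = [i * per_device_batch_size for i in batches]
    print(sgd_iter, offsets)
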