class DQN(Algorithm):
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "DQN"})

        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        env = gym.make(env_name)
        # TODO(ekl): replace this with RLlib preprocessors
        if "NoFrameskip" in env_name:
            env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env

        num_cpu = config["num_cpu"]
        tf_config = tf.ConfigProto(
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = (
                config["prioritized_replay_beta_iters"])
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = (
                    config["schedule_max_timesteps"])
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0
        self.file_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)

    def train(self):
        config = self.config
        sample_time, learn_time = 0, 0

        for _ in range(config["timesteps_per_iteration"]):
            self.num_timesteps += 1
            dt = time.time()

            # Take action and update exploration to the newest value
            action = self.dqn_graph.act(
                self.sess, np.array(self.obs)[None],
                self.exploration.value(self.num_timesteps))[0]
            new_obs, rew, done, _ = self.env.step(action)

            # Store transition in the replay buffer.
            self.replay_buffer.add(
                self.obs, action, rew, new_obs, float(done))
            self.obs = new_obs

            self.episode_rewards[-1] += rew
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            sample_time += time.time() - dt

            if self.num_timesteps > config["learning_starts"] and \
                    self.num_timesteps % config["train_freq"] == 0:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                if config["prioritized_replay"]:
                    experience = self.replay_buffer.sample(
                        config["batch_size"],
                        beta=self.beta_schedule.value(self.num_timesteps))
                    (obses_t, actions, rewards, obses_tp1,
                        dones, _, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = (
                        self.replay_buffer.sample(config["batch_size"]))
                    batch_idxes = None
                td_errors = self.dqn_graph.train(
                    self.sess, obses_t, actions, rewards, obses_tp1, dones,
                    np.ones_like(rewards))
                if config["prioritized_replay"]:
                    new_priorities = np.abs(td_errors) + (
                        config["prioritized_replay_eps"])
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)
                learn_time += (time.time() - dt)

            if self.num_timesteps > config["learning_starts"] and (
                    self.num_timesteps %
                    config["target_network_update_freq"] == 0):
                # Update target network periodically.
                self.dqn_graph.update_target(self.sess)

        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        num_episodes = len(self.episode_rewards)
        info = {
            "sample_time": sample_time,
            "learn_time": learn_time,
            "steps": self.num_timesteps,
            "episodes": num_episodes,
            "exploration": int(
                100 * self.exploration.value(self.num_timesteps)),
        }

        logger.record_tabular("sample_time", sample_time)
        logger.record_tabular("learn_time", learn_time)
        logger.record_tabular("steps", self.num_timesteps)
        logger.record_tabular("buffer_size", len(self.replay_buffer))
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        logger.record_tabular(
            "% time spent exploring",
            int(100 * self.exploration.value(self.num_timesteps)))
        logger.dump_tabular()

        res = TrainingResult(
            self.experiment_id.hex, self.num_iterations,
            mean_100ep_reward, mean_100ep_length, info)
        self.num_iterations += 1
        return res
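
# Hypothetical driver usage of the DQN Algorithm above. The keys mirror the
# ones read in __init__()/train(); the specific values (and the environment
# name) are illustrative placeholders only, not the library's defaults.
if __name__ == "__main__":
    config = {
        "num_cpu": 4,
        "prioritized_replay": True,
        "buffer_size": 50000,
        "prioritized_replay_alpha": 0.6,
        "prioritized_replay_beta0": 0.4,
        "prioritized_replay_beta_iters": None,
        "prioritized_replay_eps": 1e-6,
        "schedule_max_timesteps": 100000,
        "exploration_fraction": 0.1,
        "exploration_final_eps": 0.02,
        "timesteps_per_iteration": 1000,
        "learning_starts": 500,
        "train_freq": 1,
        "batch_size": 32,
        "target_network_update_freq": 500,
    }
    dqn = DQN("PongNoFrameskip-v4", config)
    for _ in range(10):
        result = dqn.train()  # one TrainingResult per iteration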
class DQNReplayEvaluator(DQNEvaluator):
    """Wraps DQNEvaluators to provide replay buffer functionality.

    This has two modes:
        If config["num_workers"] == 1:
            Samples will be collected locally.
        If config["num_workers"] > 1:
            Samples will be collected from a number of remote workers.
    """

    def __init__(self, env_creator, config, logdir):
        DQNEvaluator.__init__(self, env_creator, config, logdir)

        # Create extra workers if needed
        if self.config["num_workers"] > 1:
            remote_cls = ray.remote(num_cpus=1)(DQNEvaluator)
            self.workers = [
                remote_cls.remote(env_creator, config, logdir)
                for _ in range(self.config["num_workers"])]
        else:
            self.workers = []

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None

        self.samples_to_prioritize = None

    def sample(self, no_replay=False):
        # First seed the replay buffer with a few new samples
        if self.workers:
            weights = ray.put(self.get_weights())
            for w in self.workers:
                w.set_weights.remote(weights)
            samples = ray.get([w.sample.remote() for w in self.workers])
        else:
            samples = [DQNEvaluator.sample(self)]

        for s in samples:
            for row in s.rows():
                self.replay_buffer.add(
                    row["obs"], row["actions"], row["rewards"],
                    row["new_obs"], row["dones"])

        if no_replay:
            return samples

        # Then return a batch sampled from the buffer
        if self.config["prioritized_replay"]:
            (obses_t, actions, rewards, obses_tp1,
                dones, weights, batch_indexes) = self.replay_buffer.sample(
                    self.config["train_batch_size"],
                    beta=self.beta_schedule.value(self.global_timestep))
            self._update_priorities_if_needed()
            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes})
            self.samples_to_prioritize = batch
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": np.ones_like(rewards)})
        return batch

    def compute_gradients(self, samples):
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"],
            samples["rewards"], samples["new_obs"], samples["dones"],
            samples["weights"])
        if self.config["prioritized_replay"]:
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(
                samples["batch_indexes"], new_priorities)
            self.samples_to_prioritize = None
        return grad

    def _update_priorities_if_needed(self):
        """Manually updates replay buffer priorities on the last batch.

        Note that this is only needed when not computing gradients on this
        Evaluator (e.g. when using local multi-GPU). Otherwise, priorities
        can be updated more efficiently as part of computing gradients.
        """
        if not self.samples_to_prioritize:
            return
        batch = self.samples_to_prioritize
        td_errors = self.dqn_graph.compute_td_error(
            self.sess, batch["obs"], batch["actions"], batch["rewards"],
            batch["new_obs"], batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + self.config["prioritized_replay_eps"])
        self.replay_buffer.update_priorities(
            batch["batch_indexes"], new_priorities)
        self.samples_to_prioritize = None

    def stats(self):
        if self.workers:
            return ray.get([s.stats.remote() for s in self.workers])
        else:
            return DQNEvaluator.stats(self)

    def save(self):
        return [
            DQNEvaluator.save(self),
            ray.get([w.save.remote() for w in self.workers]),
            self.beta_schedule,
            self.replay_buffer]

    def restore(self, data):
        DQNEvaluator.restore(self, data[0])
        for (w, d) in zip(self.workers, data[1]):
            w.restore.remote(d)
        self.beta_schedule = data[2]
        self.replay_buffer = data[3]
class DQN(Algorithm):
    def __init__(self, env_name, config, upload_dir=None):
        config.update({"alg": "DQN"})

        Algorithm.__init__(self, env_name, config, upload_dir=upload_dir)

        env = gym.make(env_name)
        env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env
        model = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True)
        sess = U.make_session(num_cpu=config["num_cpu"])
        sess.__enter__()

        def make_obs_ph(name):
            return U.BatchInput(env.observation_space.shape, name=name)

        self.act, self.optimize, self.update_target, self.debug = build_train(
            make_obs_ph=make_obs_ph,
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
            gamma=config["gamma"],
            grad_norm_clipping=10)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = (
                config["prioritized_replay_beta_iters"])
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = (
                    config["schedule_max_timesteps"])
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.num_timesteps = 0
        self.num_iterations = 0

    def train(self):
        config = self.config
        sample_time, learn_time = 0, 0

        for t in range(config["timesteps_per_iteration"]):
            self.num_timesteps += 1
            dt = time.time()

            # Take action and update exploration to the newest value
            action = self.act(
                np.array(self.obs)[None],
                update_eps=self.exploration.value(t))[0]
            new_obs, rew, done, _ = self.env.step(action)

            # Store transition in the replay buffer.
            self.replay_buffer.add(
                self.obs, action, rew, new_obs, float(done))
            self.obs = new_obs

            self.episode_rewards[-1] += rew
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            sample_time += time.time() - dt

            if self.num_timesteps > config["learning_starts"] and \
                    self.num_timesteps % config["train_freq"] == 0:
                dt = time.time()
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                if config["prioritized_replay"]:
                    experience = self.replay_buffer.sample(
                        config["batch_size"],
                        beta=self.beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1,
                        dones, _, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        self.replay_buffer.sample(config["batch_size"])
                    batch_idxes = None
                td_errors = self.optimize(
                    obses_t, actions, rewards, obses_tp1, dones,
                    np.ones_like(rewards))
                if config["prioritized_replay"]:
                    new_priorities = (
                        np.abs(td_errors) + config["prioritized_replay_eps"])
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)
                learn_time += (time.time() - dt)

            if (self.num_timesteps > config["learning_starts"] and
                    self.num_timesteps %
                    config["target_network_update_freq"] == 0):
                # Update target network periodically.
                self.update_target()

        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        num_episodes = len(self.episode_rewards)
        info = {
            "sample_time": sample_time,
            "learn_time": learn_time,
            "steps": self.num_timesteps,
            "episodes": num_episodes,
            "exploration": int(100 * self.exploration.value(t)),
        }

        logger.record_tabular("sample_time", sample_time)
        logger.record_tabular("learn_time", learn_time)
        logger.record_tabular("steps", self.num_timesteps)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
        logger.record_tabular(
            "% time spent exploring",
            int(100 * self.exploration.value(t)))
        logger.dump_tabular()

        res = TrainingResult(
            self.experiment_id.hex, self.num_iterations,
            mean_100ep_reward, mean_100ep_length, info)
        self.num_iterations += 1
        return res
class Actor(object):
    def __init__(self, env_name, config, logdir):
        env = gym.make(env_name)
        # TODO(ekl): replace this with RLlib preprocessors
        if "NoFrameskip" in env_name:
            env = ScaledFloatFrame(wrap_dqn(env))
        self.env = env
        self.config = config

        num_cpu = config["num_cpu"]
        tf_config = tf.ConfigProto(
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_tp1, self.dqn_graph.q_t), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)

    def step(self, cur_timestep):
        # Take action and update exploration to the newest value
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(cur_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        return ret

    def do_steps(self, num_steps, cur_timestep):
        for _ in range(num_steps):
            obs, action, rew, new_obs, done = self.step(cur_timestep)
            self.replay_buffer.add(obs, action, rew, new_obs, done)

    def get_gradient(self, cur_timestep):
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["train_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1,
                dones, _, batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch_idxes = None
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, obses_t, actions, rewards, obses_tp1, dones,
            np.ones_like(rewards))
        if self.config["prioritized_replay"]:
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        return grad

    def apply_gradients(self, grad):
        self.dqn_graph.apply_gradients(self.sess, grad)

    def stats(self, num_timesteps):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 1)
        exploration = self.exploration.value(num_timesteps)
        return (
            mean_100ep_reward,
            mean_100ep_length,
            len(self.episode_rewards),
            exploration,
            len(self.replay_buffer))

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def save(self):
        return [
            self.beta_schedule,
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs]

    def restore(self, data):
        self.beta_schedule = data[0]
        self.exploration = data[1]
        self.episode_rewards = data[2]
        self.episode_lengths = data[3]
        self.saved_mean_reward = data[4]
        self.obs = data[5]
class DQNEvaluator(TFMultiGPUSupport):
    """The base DQN Evaluator that does not include the replay buffer.

    TODO(rliaw): Support observation/reward filters?"""

    def __init__(self, registry, env_creator, config, logdir):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        self.global_timestep = global_timestep

    def update_target(self):
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(self.config["n_step"], self.config["gamma"],
                         obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": obs,
            "actions": actions,
            "rewards": rewards,
            "new_obs": new_obs,
            "dones": dones,
            "weights": np.ones_like(rewards)})
        assert batch.count == self.config["sample_batch_size"]
        return batch

    def compute_gradients(self, samples):
        _, grad = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"],
            samples["rewards"], samples["new_obs"], samples["dones"],
            samples["weights"])
        return grad

    def apply_gradients(self, grads):
        self.dqn_graph.apply_gradients(self.sess, grads)

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def tf_loss_inputs(self):
        return self.dqn_graph.loss_inputs

    def build_tf_loss(self, input_placeholders):
        return self.dqn_graph.build_loss(*input_placeholders)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        return [
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs]

    def restore(self, data):
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]
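
# The sample() method above relies on adjust_nstep() to rewrite trajectory
# fragments for n-step Q-learning. The helper below is a hedged,
# self-contained sketch of that kind of transformation for reference only;
# it is not the library's implementation.
def nstep_fold(n_step, gamma, obs, actions, rewards, new_obs, dones):
    """Illustrative n-step adjustment over sequential transitions.

    Assumes the lists hold sample_batch_size + n_step - 1 transitions in
    time order. Each base transition absorbs the discounted rewards of up
    to n_step - 1 successors (stopping at episode boundaries) and repoints
    new_obs at the state reached n steps later; the trailing transitions
    without enough lookahead are dropped.
    """
    num_base = len(rewards) - (n_step - 1)
    for i in range(num_base):
        for k in range(1, n_step):
            if dones[i + k - 1]:
                break  # episode ended; stop folding future rewards
            rewards[i] += gamma ** k * rewards[i + k]
            new_obs[i] = new_obs[i + k]
            dones[i] = dones[i + k]
    # Truncate so the batch length matches sample_batch_size again.
    del obs[num_base:], actions[num_base:], rewards[num_base:]
    del new_obs[num_base:], dones[num_base:]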
class DQNEvaluator(PolicyEvaluator):
    """The DQN Evaluator.

    TODO(rliaw): Support observation/reward filters?"""

    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                0.4 ** (
                    1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(
                    config["exploration_fraction"] *
                    config["schedule_max_timesteps"]),
                initial_p=1.0,
                final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        self.global_timestep = global_timestep

    def update_target(self):
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs],
            "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs],
            "dones": dones,
            "weights": np.ones_like(rewards)})
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch

    def compute_gradients(self, samples):
        td_err, grads = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"],
            samples["rewards"], samples["new_obs"], samples["dones"],
            samples["weights"])
        return grads, {"td_error": td_err}

    def apply_gradients(self, grads):
        self.dqn_graph.apply_gradients(self.sess, grads)

    def compute_apply(self, samples):
        td_error = self.dqn_graph.compute_apply(
            self.sess, samples["obs"], samples["actions"],
            samples["rewards"], samples["new_obs"], samples["dones"],
            samples["weights"])
        return {"td_error": td_error}

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        n = self.config["smoothing_num_episodes"] + 1
        mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        return [
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs,
            self.global_timestep,
            self.local_timestep]

    def restore(self, data):
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]
        self.global_timestep = data[5]
        self.local_timestep = data[6]
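
# With per_worker_exploration enabled above, worker i of N keeps a constant
# epsilon of 0.4 ** (1 + i / (N - 1) * 7), the Ape-X style spread: worker 0
# explores at eps ~= 0.4 while the last worker is nearly greedy at
# eps ~= 6.6e-4. A quick illustration with a hypothetical worker count:
num_workers = 8
worker_epsilons = [
    0.4 ** (1 + i / float(num_workers - 1) * 7) for i in range(num_workers)]
# -> approximately [0.4, 0.16, 0.064, 0.026, 0.01, 0.0041, 0.0016, 0.00066]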
class Actor(object):
    def __init__(self, env_creator, config, logdir):
        env = env_creator()
        env = wrap_dqn(env, config["model"])
        self.env = env
        self.config = config

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config, logdir)

        # Create the replay buffer
        if config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                config["buffer_size"],
                alpha=config["prioritized_replay_alpha"])
            prioritized_replay_beta_iters = \
                config["prioritized_replay_beta_iters"]
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = \
                    config["schedule_max_timesteps"]
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(config["buffer_size"])
            self.beta_schedule = None

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.set_weights_time = RunningStat(())
        self.sample_time = RunningStat(())
        self.grad_time = RunningStat(())

        # Note that workers don't need target vars to be synced
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()
        self.file_writer = tf.summary.FileWriter(logdir, self.sess.graph)

    def step(self, cur_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(cur_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        return ret

    def do_steps(self, num_steps, cur_timestep, store):
        """Takes N steps.

        If store is True, the steps will be stored in the local replay
        buffer. Otherwise, the steps will be returned.
        """
        output = []
        for _ in range(num_steps):
            result = self.step(cur_timestep)
            if store:
                obs, action, rew, new_obs, done = result
                self.replay_buffer.add(obs, action, rew, new_obs, done)
            else:
                output.append(result)
        if not store:
            return output

    def do_multi_gpu_optimize(self, cur_timestep):
        """Performs N iters of multi-gpu SGD over the local replay buffer."""
        dt = time.time()
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["train_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1,
                dones, _, batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["train_batch_size"])
            batch_idxes = None
        replay_buffer_read_time = (time.time() - dt)
        dt = time.time()
        tuples_per_device = self.dqn_graph.multi_gpu_optimizer.load_data(
            self.sess,
            [obses_t, actions, rewards, obses_tp1, dones,
             np.ones_like(rewards)])
        per_device_batch_size = (
            self.dqn_graph.multi_gpu_optimizer.per_device_batch_size)
        num_batches = (int(tuples_per_device) // int(per_device_batch_size))
        data_load_time = (time.time() - dt)
        dt = time.time()
        for _ in range(self.config["num_sgd_iter"]):
            batches = list(range(num_batches))
            np.random.shuffle(batches)
            for i in batches:
                self.dqn_graph.multi_gpu_optimizer.optimize(
                    self.sess, i * per_device_batch_size)
        sgd_time = (time.time() - dt)
        dt = time.time()
        if self.config["prioritized_replay"]:
            dt = time.time()
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obses_t, actions, rewards, obses_tp1, dones,
                np.ones_like(rewards))
            dt = time.time()
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        prioritization_time = (time.time() - dt)
        return {
            "replay_buffer_read_time": replay_buffer_read_time,
            "data_load_time": data_load_time,
            "sgd_time": sgd_time,
            "prioritization_time": prioritization_time,
        }

    def do_async_step(self, worker_id, cur_timestep, params, gradient_id):
        """Takes steps and returns grad to apply async in the driver."""
        dt = time.time()
        self.set_weights(params)
        self.set_weights_time.push(time.time() - dt)
        dt = time.time()
        self.do_steps(
            self.config["sample_batch_size"], cur_timestep, store=True)
        self.sample_time.push(time.time() - dt)
        if (cur_timestep > self.config["learning_starts"] and
                len(self.replay_buffer) > self.config["train_batch_size"]):
            dt = time.time()
            gradient = self.sample_buffer_gradient(cur_timestep)
            self.grad_time.push(time.time() - dt)
        else:
            gradient = None
        return gradient, {"id": worker_id, "gradient_id": gradient_id}

    def sample_buffer_gradient(self, cur_timestep):
        """Returns grad over a batch sampled from the local replay buffer."""
        if self.config["prioritized_replay"]:
            experience = self.replay_buffer.sample(
                self.config["sgd_batch_size"],
                beta=self.beta_schedule.value(cur_timestep))
            (obses_t, actions, rewards, obses_tp1,
                dones, _, batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = \
                self.replay_buffer.sample(self.config["sgd_batch_size"])
            batch_idxes = None
        td_errors, grad = self.dqn_graph.compute_gradients(
            self.sess, obses_t, actions, rewards, obses_tp1, dones,
            np.ones_like(rewards))
        if self.config["prioritized_replay"]:
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)
        return grad

    def apply_gradients(self, grad):
        self.dqn_graph.apply_gradients(self.sess, grad)

    # TODO(ekl) return a dictionary and use that everywhere to clean up the
    # bookkeeping of stats
    def stats(self, num_timesteps):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5)
        exploration = self.exploration.value(num_timesteps)
        return (
            mean_100ep_reward,
            mean_100ep_length,
            len(self.episode_rewards),
            exploration,
            len(self.replay_buffer),
            float(self.set_weights_time.mean),
            float(self.sample_time.mean),
            float(self.grad_time.mean))

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def save(self):
        return [
            self.beta_schedule,
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs,
            self.replay_buffer]

    def restore(self, data):
        self.beta_schedule = data[0]
        self.exploration = data[1]
        self.episode_rewards = data[2]
        self.episode_lengths = data[3]
        self.saved_mean_reward = data[4]
        self.obs = data[5]
        self.replay_buffer = data[6]
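
# A hedged sketch of how a driver could use do_async_step() from the Actor
# above: remote workers sample and compute gradients locally, the driver
# applies them to its own copy and ships fresh weights back. The names
# (`local`, `workers`, `max_timesteps`) are placeholders, and the real driver
# also uses gradient_id to track gradient staleness.
def async_train_loop(local, workers, config, max_timesteps):
    cur_timestep = 0
    pending = {
        w.do_async_step.remote(i, cur_timestep, local.get_weights(), 0): i
        for i, w in enumerate(workers)}
    while cur_timestep < max_timesteps:
        [ready], _ = ray.wait(list(pending.keys()))
        worker_idx = pending.pop(ready)
        gradient, info = ray.get(ready)
        if gradient is not None:
            local.apply_gradients(gradient)
        cur_timestep += config["sample_batch_size"]
        obj = workers[worker_idx].do_async_step.remote(
            worker_idx, cur_timestep, local.get_weights(),
            info["gradient_id"] + 1)
        pending[obj] = worker_idx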
class DQNEvaluator(Evaluator):
    """The DQN Evaluator.

    TODO(rliaw): Support observation/reward filters?"""

    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                0.4 ** (
                    1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(
                    config["exploration_fraction"] *
                    config["schedule_max_timesteps"]),
                initial_p=1.0,
                final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        self.global_timestep = global_timestep

    def update_target(self):
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs],
            "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs],
            "dones": dones,
            "weights": np.ones_like(rewards)})
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch

    def compute_apply(self, samples):
        if samples is None:
            return None
        td_error = self.dqn_graph.compute_apply(
            self.sess, samples["obs"], samples["actions"],
            samples["rewards"], samples["new_obs"], samples["dones"],
            samples["weights"])
        return td_error

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        n = self.config["smoothing_num_episodes"] + 1
        mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        return [
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs,
            self.global_timestep,
            self.local_timestep]

    def restore(self, data):
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]
        self.global_timestep = data[5]
        self.local_timestep = data[6]
class DQNEvaluator(TFMultiGPUSupport):
    """The base DQN Evaluator that does not include the replay buffer."""

    def __init__(self, env_creator, config, logdir):
        env = env_creator()
        env = wrap_dqn(env, config["model"])
        self.env = env
        self.config = config

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(env, config, logdir)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        self.global_timestep = global_timestep

    def update_target(self):
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        output = []
        for _ in range(self.config["sample_batch_size"]):
            result = self._step(self.global_timestep)
            output.append(result)
        return output

    def compute_gradients(self, samples):
        if self.config["prioritized_replay"]:
            obses_t, actions, rewards, obses_tp1, dones, _ = samples
        else:
            obses_t, actions, rewards, obses_tp1, dones = samples
        _, grad = self.dqn_graph.compute_gradients(
            self.sess, obses_t, actions, rewards, obses_tp1, dones,
            np.ones_like(rewards))
        return grad

    def apply_gradients(self, grads):
        self.dqn_graph.apply_gradients(self.sess, grads)

    def get_weights(self):
        return self.variables.get_weights()

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def tf_loss_inputs(self):
        return self.dqn_graph.loss_inputs

    def build_tf_loss(self, input_placeholders):
        return self.dqn_graph.build_loss(*input_placeholders)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        return [
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs]

    def restore(self, data):
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]