def check_full_preprocessing(self):
    """
    Manual check of the full set of preprocessing steps.
    Not run as part of normal unit tests; run me with
      ./preprocessing_test.py TestPreprocessing.check_full_preprocessing
    """
    from pylab import subplot, imshow, show, tight_layout

    env = DummyEnv(dot_width=2, dot_height=2, draw_n_dots=True)
    env = NumberFrames(env)
    env_wrapped = generic_preprocess(env, max_n_noops=0)

    obs1 = env_wrapped.reset()
    obs2, _, _, _ = env_wrapped.step(0)
    obs3, _, _, _ = env_wrapped.step(0)
    obs4 = env_wrapped.reset()

    subplot(4, 1, 1)
    imshow(np.hstack(obs1), cmap='gray')
    subplot(4, 1, 2)
    imshow(np.hstack(obs2), cmap='gray')
    subplot(4, 1, 3)
    imshow(np.hstack(obs3), cmap='gray')
    subplot(4, 1, 4)
    imshow(np.hstack(obs4), cmap='gray')
    tight_layout()
    show()
def play_pong(self, wrap_fn):
    """
    Manual check of the full set of preprocessing steps for Pong.
    Not run as part of normal unit tests; run me with
      ./preprocessing_test.py TestPreprocessing.play_pong_generic_wrap
      ./preprocessing_test.py TestPreprocessing.play_pong_special_wrap
    """
    from gym.utils import play as gym_play

    env = gym.make('PongNoFrameskip-v4')
    env = NumberFrames(env)
    env = wrap_fn(env, max_n_noops=0)
    env = ConcatFrameStack(env)
    gym_play.play(env, fps=15, zoom=4)
def thunk():
    env = gym.make(env_id)
    # We calculate the env seed like this so that changing the global seed
    # completely changes the whole set of env seeds.
    env_seed = seed * n_envs + env_n
    env.seed(env_seed)
    if debug:
        env = NumberFrames(env)
    env = preprocess_wrapper(env, max_n_noops)
    if env_n == 0:
        env_log_dir = osp.join(log_dir, "env_{}".format(env_n))
    else:
        env_log_dir = None
    env = MonitorEnv(env, "Env {}".format(env_n), log_dir=env_log_dir)
    return env
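# Worked example of the env seed formula above (illustrative values only, not
# part of the original code): with n_envs = 4,
#   seed = 0 -> env seeds [0, 1, 2, 3]
#   seed = 1 -> env seeds [4, 5, 6, 7]
# so different global seeds give completely disjoint sets of env seeds, e.g.
#   for seed in (0, 1):
#       print(seed, [seed * n_envs + env_n for env_n in range(n_envs)])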
def __init__(self, sess, env_id, preprocess_wrapper, worker_n, seed, log_dir,
             max_n_noops, debug):
    env = gym.make(env_id)
    env.seed(seed)
    if debug:
        env = NumberFrames(env)
    self.env = preprocess_wrapper(env, max_n_noops)

    self.sess = sess
    worker_scope = "worker_%d" % worker_n
    self.worker_n = worker_n
    self.network = create_network(worker_scope, debug)
    self.summary_writer = tf.summary.FileWriter(log_dir, flush_secs=1)
    self.scope = worker_scope

    # From the paper, Section 4, Asynchronous RL Framework,
    # subsection Optimization:
    # "We investigated three different optimization algorithms in our
    # asynchronous framework – SGD with momentum, RMSProp without shared
    # statistics, and RMSProp with shared statistics.
    # We used the standard non-centered RMSProp update..."
    # "A comparison on a subset of Atari 2600 games showed that a variant
    # of RMSProp where statistics g are shared across threads is
    # considerably more robust than the other two methods."
    #
    # TensorFlow's RMSPropOptimizer defaults to centered=False,
    # so we're good there. For shared statistics: RMSPropOptimizer's
    # gradient statistics variables are associated with the variables
    # supplied to apply_gradients(), which happen to be in the global scope
    # (see train_ops.py). So we get shared statistics without any special
    # effort.
    #
    # In terms of hyperparameters:
    #
    # Learning rate: the paper actually runs a bunch of different learning
    # rates and presents results averaged over the three best learning
    # rates for each game. From the scatter plot of performance for
    # different learning rates, Figure 2, it looks like 7e-4 is a safe bet
    # which works across a variety of games.
    # TODO: 7e-4
    #
    # RMSprop hyperparameters: Section 8, Experimental Setup, says:
    # "All experiments used...RMSProp decay factor of α = 0.99."
    # There's no mention of the epsilon used. I see that OpenAI's
    # baselines implementation of A2C uses 1e-5 (https://git.io/vpCQt),
    # instead of TensorFlow's default of 1e-10. Remember, RMSprop divides
    # gradients by a factor based on recent gradient history. Epsilon is
    # added to that factor to prevent a division by zero. If epsilon is
    # too small, we'll get a very large update when the gradient history
    # is close to zero. So my speculation about why baselines uses a much
    # larger epsilon is: sometimes in RL the gradients can end up being
    # very small, and we want to limit the size of the update.
    policy_optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                                 decay=0.99,
                                                 epsilon=1e-5)
    value_optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                                decay=0.99,
                                                epsilon=1e-5)

    self.update_policy_gradients, self.apply_policy_gradients, \
        self.zero_policy_gradients, self.grad_bufs_policy, \
        grads_policy_norm = \
        create_train_ops(self.network.policy_loss,
                         policy_optimizer,
                         update_scope=worker_scope,
                         apply_scope='global')

    self.update_value_gradients, self.apply_value_gradients, \
        self.zero_value_gradients, self.grad_bufs_value, \
        grads_value_norm = \
        create_train_ops(self.network.value_loss,
                         value_optimizer,
                         update_scope=worker_scope,
                         apply_scope='global')

    utils.add_rmsprop_monitoring_ops(policy_optimizer, 'policy')
    utils.add_rmsprop_monitoring_ops(value_optimizer, 'value')

    tf.summary.scalar('rl/value_loss', self.network.value_loss)
    tf.summary.scalar('rl/policy_entropy',
                      tf.reduce_mean(self.network.policy_entropy))
    tf.summary.scalar('gradients/norm_policy', grads_policy_norm)
    tf.summary.scalar('gradients/norm_value', grads_value_norm)
    self.summary_ops = tf.summary.merge_all()

    self.copy_ops = utils.create_copy_ops(from_scope='global',
                                          to_scope=self.scope)

    self.steps = 0
    self.episode_rewards = []
    self.render = False
    self.episode_n = 1
    self.max_n_noops = max_n_noops
    self.value_log = deque(maxlen=100)
    self.fig = None

    self.last_o = self.env.reset()
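    # Rough illustration of the epsilon reasoning above (not part of the
    # original code; the numbers are made up). With the simplified
    # non-centered RMSprop step
    #     step = lr * grad / (sqrt(g) + epsilon)
    # and a near-zero gradient history, e.g. grad = 1e-6, g = grad**2 = 1e-12:
    #     epsilon = 1e-10  ->  step ~= lr       (nearly the full learning rate)
    #     epsilon = 1e-5   ->  step ~= lr / 11  (an order of magnitude smaller)
    # so the larger epsilon caps the update size when gradients are tiny.
    # (TensorFlow's RMSPropOptimizer puts epsilon inside the square root,
    # which doesn't change the order of magnitude here.)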