Example #1
    def check_full_preprocessing(self):
        """
        Manual check of the full set of preprocessing steps.
        Not run as part of normal unit tests; run me with
          ./preprocessing_test.py TestPreprocessing.check_full_preprocessing
        """
        import numpy as np
        from pylab import subplot, imshow, show, tight_layout
        env = DummyEnv(dot_width=2, dot_height=2, draw_n_dots=True)
        env = NumberFrames(env)
        env_wrapped = generic_preprocess(env, max_n_noops=0)

        obs1 = env_wrapped.reset()
        obs2, _, _, _ = env_wrapped.step(0)
        obs3, _, _, _ = env_wrapped.step(0)
        obs4 = env_wrapped.reset()

        subplot(4, 1, 1)
        imshow(np.hstack(obs1), cmap='gray')
        subplot(4, 1, 2)
        imshow(np.hstack(obs2), cmap='gray')
        subplot(4, 1, 3)
        imshow(np.hstack(obs3), cmap='gray')
        subplot(4, 1, 4)
        imshow(np.hstack(obs4), cmap='gray')
        tight_layout()
        show()
Example #2
    def play_pong(self, wrap_fn):
        """
        Manual check of the full set of preprocessing steps for Pong.
        Not run as part of normal unit tests; run me with
          ./preprocessing_test.py TestPreprocessing.play_pong_generic_wrap
          ./preprocessing_test.py TestPreprocessing.play_pong_special_wrap
        """
        import gym
        from gym.utils import play as gym_play
        env = gym.make('PongNoFrameskip-v4')
        env = NumberFrames(env)
        env = wrap_fn(env, max_n_noops=0)
        env = ConcatFrameStack(env)
        gym_play.play(env, fps=15, zoom=4)
Example #3
        def thunk():
            env = gym.make(env_id)
            # We calculate the env seed like this so that changing the
            # global seed completely changes the whole set of env seeds.
            env_seed = seed * n_envs + env_n
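            # Illustrative arithmetic (n_envs = 8 is an assumed value, not taken
            # from this snippet): global seed 0 gives env seeds 0..7 and global
            # seed 1 gives env seeds 8..15, so no env seed is reused between
            # global seeds.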
            env.seed(env_seed)
            if debug:
                env = NumberFrames(env)
            env = preprocess_wrapper(env, max_n_noops)

            if env_n == 0:
                env_log_dir = osp.join(log_dir, "env_{}".format(env_n))
            else:
                env_log_dir = None
            env = MonitorEnv(env, "Env {}".format(env_n), log_dir=env_log_dir)

            return env
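
        # The enclosing factory function (not shown here) presumably returns this
        # thunk, so a caller can build one environment per index. A sketch with
        # hypothetical names:
        #   env_fns = [make_env_fn(env_n) for env_n in range(n_envs)]
        #   envs = [fn() for fn in env_fns]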
Example #4
    def __init__(self, sess, env_id, preprocess_wrapper, worker_n, seed,
                 log_dir, max_n_noops, debug):
        env = gym.make(env_id)
        env.seed(seed)
        if debug:
            env = NumberFrames(env)
        self.env = preprocess_wrapper(env, max_n_noops)

        self.sess = sess

        worker_scope = "worker_%d" % worker_n
        self.worker_n = worker_n
        self.network = create_network(worker_scope, debug)
        self.summary_writer = tf.summary.FileWriter(log_dir, flush_secs=1)
        self.scope = worker_scope

        # From the paper, Section 4, Asynchronous RL Framework,
        # subsection Optimization:
        # "We investigated three different optimization algorithms in our
        #  asynchronous framework – SGD with momentum, RMSProp without shared
        #  statistics, and RMSProp with shared statistics.
        #  We used the standard non-centered RMSProp update..."
        # "A comparison on a subset of Atari 2600 games showed that a variant
        #  of RMSProp where statistics g are shared across threads is
        #  considerably more robust than the other two methods."
        #
        # TensorFlow's RMSPropOptimizer defaults to centered=False,
        # so we're good there. For shared statistics - RMSPropOptimizer's
        # gradient statistics variables are associated with the variables
        # supplied to apply_gradients(), which happen to be in the global scope
        # (see train_ops.py). So we get shared statistics without any special
        # effort.
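        #
        # As a sketch, the non-centered RMSProp update is roughly (the exact
        # placement of epsilon varies between implementations):
        #   ms    <- decay * ms + (1 - decay) * grad ** 2
        #   theta <- theta - learning_rate * grad / (sqrt(ms) + epsilon)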
        #
        # In terms of hyperparameters:
        #
        # Learning rate: the paper actually runs a bunch of
        # different learning rates and presents results averaged over the
        # three best learning rates for each game. From the scatter plot of
        # performance for different learning rates, Figure 2, it looks like
        # 7e-4 is a safe bet which works across a variety of games.
        # TODO: 7e-4
        #
        # RMSprop hyperparameters: Section 8, Experimental Setup, says:
        # "All experiments used...RMSProp decay factor of α = 0.99."
        # There's no mention of the epsilon used. I see that OpenAI's
        # baselines implementation of A2C uses 1e-5 (https://git.io/vpCQt),
        # instead of TensorFlow's default of 1e-10. Remember, RMSprop divides
        # gradients by a factor based on recent gradient history. Epsilon is
        # added to that factor to prevent a division by zero. If epsilon is
        # too small, we'll get a very large update when the gradient history is
        # close to zero. So my speculation about why baselines uses a much
        # larger epsilon is: sometimes in RL the gradients can end up being
        # very small, and we want to limit the size of the update.
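        #
        # Illustrative numbers (assumed, not from the paper): with the gradient
        # history near zero and a gradient of 1e-3, the update is roughly
        # learning_rate * 1e-3 / epsilon, i.e. about 1e7 * learning_rate with
        # epsilon = 1e-10 but only about 1e2 * learning_rate with epsilon = 1e-5.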
        policy_optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                                     decay=0.99,
                                                     epsilon=1e-5)
        value_optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                                    decay=0.99,
                                                    epsilon=1e-5)

        self.update_policy_gradients, self.apply_policy_gradients, \
        self.zero_policy_gradients, self.grad_bufs_policy, \
        grads_policy_norm = \
            create_train_ops(self.network.policy_loss,
                             policy_optimizer,
                             update_scope=worker_scope,
                             apply_scope='global')

        self.update_value_gradients, self.apply_value_gradients, \
        self.zero_value_gradients, self.grad_bufs_value, \
        grads_value_norm = \
            create_train_ops(self.network.value_loss,
                             value_optimizer,
                             update_scope=worker_scope,
                             apply_scope='global')

        utils.add_rmsprop_monitoring_ops(policy_optimizer, 'policy')
        utils.add_rmsprop_monitoring_ops(value_optimizer, 'value')

        tf.summary.scalar('rl/value_loss', self.network.value_loss)
        tf.summary.scalar('rl/policy_entropy',
                          tf.reduce_mean(self.network.policy_entropy))
        tf.summary.scalar('gradients/norm_policy', grads_policy_norm)
        tf.summary.scalar('gradients/norm_value', grads_value_norm)
        self.summary_ops = tf.summary.merge_all()

        self.copy_ops = utils.create_copy_ops(from_scope='global',
                                              to_scope=self.scope)

        self.steps = 0
        self.episode_rewards = []
        self.render = False
        self.episode_n = 1
        self.max_n_noops = max_n_noops

        self.value_log = deque(maxlen=100)
        self.fig = None

        self.last_o = self.env.reset()
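
A minimal construction sketch for the class in Example #4. The class name Worker,
the generic_preprocess wrapper, and all argument values are assumptions based on
the snippets above, not the repository's actual entry point, and a network in the
'global' scope is presumably built elsewhere before workers are created:

    import tensorflow as tf

    sess = tf.Session()
    # Hypothetical instantiation; one worker per thread in the A3C setup.
    worker = Worker(sess=sess,
                    env_id='PongNoFrameskip-v4',
                    preprocess_wrapper=generic_preprocess,
                    worker_n=0,
                    seed=0,
                    log_dir='logs/worker_0',
                    max_n_noops=30,
                    debug=False)
    sess.run(tf.global_variables_initializer())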