Example #1
    def check_full_preprocessing(self):
        """
        Manual check of the full set of preprocessing steps.
        Not run as part of normal unit tests; run me with
          ./preprocessing_test.py TestPreprocessing.check_full_preprocessing
        """
        from pylab import subplot, imshow, show, tight_layout
        env = DummyEnv(dot_width=2, dot_height=2, draw_n_dots=True)
        env = NumberFrames(env)
        env_wrapped = generic_preprocess(env, max_n_noops=0)

        obs1 = env_wrapped.reset()
        obs2, _, _, _ = env_wrapped.step(0)
        obs3, _, _, _ = env_wrapped.step(0)
        obs4 = env_wrapped.reset()

        subplot(4, 1, 1)
        imshow(np.hstack(obs1), cmap='gray')
        subplot(4, 1, 2)
        imshow(np.hstack(obs2), cmap='gray')
        subplot(4, 1, 3)
        imshow(np.hstack(obs3), cmap='gray')
        subplot(4, 1, 4)
        imshow(np.hstack(obs4), cmap='gray')
        tight_layout()
        show()
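NumberFrames is not shown in this listing. As a point of reference, here is a minimal sketch (an assumption, not the project's implementation) of what a wrapper like it can look like: it stamps the running frame index onto each observation, which is what makes the manual check above useful for verifying frame skipping and stacking by eye. The class name NumberFramesSketch and the use of OpenCV for drawing are illustrative only.

import cv2
import gym


class NumberFramesSketch(gym.Wrapper):
    """Draw the running frame index onto each observation (sketch only)."""

    def __init__(self, env):
        super().__init__(env)
        self.frame_no = 0

    def reset(self, **kwargs):
        self.frame_no = 0
        return self._annotate(self.env.reset(**kwargs))

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        return self._annotate(obs), reward, done, info

    def _annotate(self, obs):
        # Write the frame number in the bottom-left corner of the frame.
        self.frame_no += 1
        obs = obs.copy()
        cv2.putText(obs, str(self.frame_no), (2, obs.shape[0] - 2),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255))
        return obs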
Example #2
    def test_full_preprocessing_rewards(self):
        """
        Check that rewards are summed correctly by wrappers which operate
        over multiple timesteps.
        """
        env = DummyEnv()
        env_wrapped = generic_preprocess(env,
                                         max_n_noops=0,
                                         clip_rewards=False)
        env_wrapped.reset()
        _, r1, _, _ = env_wrapped.step(0)
        _, r2, _, _ = env_wrapped.step(0)
        _, r3, _, _ = env_wrapped.step(0)
        # MaxWrapper skips the first step after reset (which gives reward 2).
        # FrameStackWrapper does another 3 steps after reset, each of which
        # does 4 steps in the raw environment because of FrameSkipWrapper:
        #   Step 1: 3, 4, 5, 6
        #   Step 2: 7, 8, 9, 10
        #   Step 3: 11, 12, 13, 14
        # The first step we do should therefore get rewards 15, 16, 17 and 18,
        # summed by FrameSkipWrapper.
        self.assertEqual(r1, 66)
        # Then 19 + 20 + 21 + 22.
        self.assertEqual(r2, 82)
        # Then 23 + 24 + 25 + 26.
        self.assertEqual(r3, 98)
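The arithmetic in those comments is easier to follow with the frame-skip mechanism spelled out. Below is a minimal sketch, assuming the behaviour the comments describe (repeat the action for 4 raw steps and return the summed reward); it is illustrative, not the project's FrameSkipWrapper. This is why r1 above is 15 + 16 + 17 + 18 = 66.

import gym


class FrameSkipSketch(gym.Wrapper):
    """Repeat each action for `skip` raw steps and sum the rewards (sketch)."""

    def __init__(self, env, skip=4):
        super().__init__(env)
        self.skip = skip

    def step(self, action):
        total_reward = 0.0
        for _ in range(self.skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        # Return the last observation and the sum of the skipped-step rewards.
        return obs, total_reward, done, info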
Example #3
def main():
    args = parse_args()
    env = gym.make(args.env_id)
    env = generic_preprocess(env, max_n_noops=0)
    sess, obs_placeholder, action_probs_op = \
        get_network(args.ckpt_dir, env.observation_space.shape, env.action_space.n)
    run_agent(env, sess, obs_placeholder, action_probs_op)
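parse_args is not shown in this example. A hypothetical stand-in (only the attribute names env_id and ckpt_dir are taken from the code above) could look like:

import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("env_id", help="Gym environment ID, e.g. Pong-v0")
    parser.add_argument("ckpt_dir",
                        help="directory containing the checkpoint to restore")
    return parser.parse_args()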
Example #4
    def test_rmsprop_variables(self):
        """
        Test 1: let's look at the variables the optimizer creates to check
        there's no funny business.
        """
        sess = tf.Session()
        env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)

        optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                              decay=0.99,
                                              epsilon=1e-5)

        with tf.variable_scope('global'):
            make_inference_network(n_actions=env.action_space.n,
                                   weight_inits='glorot')

        network1 = Network(scope="worker_1",
                           n_actions=env.action_space.n,
                           entropy_bonus=0.01,
                           value_loss_coef=0.5,
                           weight_inits='glorot',
                           max_grad_norm=0.5,
                           optimizer=optimizer,
                           summaries=False,
                           debug=False)
        Worker(sess=sess, env=env, network=network1, log_dir='/tmp')

        vars1 = optimizer.variables()

        network2 = Network(scope="worker_2",
                           n_actions=env.action_space.n,
                           entropy_bonus=0.01,
                           value_loss_coef=0.5,
                           weight_inits='glorot',
                           max_grad_norm=0.5,
                           optimizer=optimizer,
                           summaries=False,
                           debug=False)
        Worker(sess=sess, env=env, network=network2, log_dir='/tmp')

        vars2 = optimizer.variables()

        self.assertNotEqual(id(vars1), id(vars2))

        # First, check that no extra variables were added when we created the
        # second network, which might indicate a second set of statistics.
        self.assertEqual(len(vars1), len(vars2))
        # Second, check that all the variables are associated with the global
        # set of parameters rather than the thread-local parameters.
        for v in vars1:
            self.assertIn('global', v.name)
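The reason all of the optimizer's slot variables sit under the 'global' scope is the way A3C applies updates: gradients are computed with respect to each worker's local copy of the parameters but applied to the shared global copy, so RMSProp only ever creates statistics for the global variables. A minimal sketch of that wiring, assuming TF1-style variable scopes (make_train_op is a hypothetical helper, not part of the code above):

import tensorflow as tf


def make_train_op(optimizer, loss, worker_scope, max_grad_norm=0.5):
    worker_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    worker_scope)
    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    # Gradients come from the worker's copy of the network...
    grads = tf.gradients(loss, worker_vars)
    grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    # ...but are applied to the shared parameters, so the RMSProp slot
    # variables are created for (and named after) the 'global' variables.
    return optimizer.apply_gradients(zip(grads, global_vars))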
Example #5
def run_weight_test(reset_rmsprop):
    tf.reset_default_graph()
    utils.set_random_seeds(0)
    sess = tf.Session()
    env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)
    env.seed(0)

    with tf.variable_scope('global'):
        make_inference_network(n_actions=env.action_space.n,
                               weight_inits='glorot')
    shared_variables = tf.global_variables()

    optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                          decay=0.99,
                                          epsilon=1e-5)

    network1 = Network(scope="worker_1",
                       n_actions=env.action_space.n,
                       entropy_bonus=0.01,
                       value_loss_coef=0.5,
                       weight_inits='glorot',
                       max_grad_norm=0.5,
                       optimizer=optimizer,
                       summaries=False,
                       debug=False)
    w1 = Worker(sess=sess, env=env, network=network1, log_dir='/tmp')

    network2 = Network(scope="worker_2",
                       n_actions=env.action_space.n,
                       entropy_bonus=0.01,
                       value_loss_coef=0.5,
                       weight_inits='glorot',
                       max_grad_norm=0.5,
                       optimizer=optimizer,
                       summaries=False,
                       debug=False)
    w2 = Worker(sess=sess, env=env, network=network2, log_dir='/tmp')

    rmsprop_init_ops = [v.initializer for v in optimizer.variables()]

    sess.run(tf.global_variables_initializer())

    vars_sum_init = sess.run(get_var_sum(shared_variables))
    w1.run_update(n_steps=1)
    vars_sum_post_w1_update = sess.run(get_var_sum(shared_variables))
    if reset_rmsprop:
        sess.run(rmsprop_init_ops)
    w2.run_update(n_steps=1)
    vars_sum_post_w2_update = sess.run(get_var_sum(shared_variables))

    return vars_sum_init, vars_sum_post_w1_update, vars_sum_post_w2_update
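A hypothetical driver for this helper (not part of the code above): running it once with and once without the reset shows whether the second worker's update is affected by the RMSProp statistics accumulated by the first worker.

if __name__ == '__main__':
    # With shared statistics, worker 2's update sees the gradient moments left
    # behind by worker 1; with the reset, it starts from fresh statistics.
    sums_shared = run_weight_test(reset_rmsprop=False)
    sums_reset = run_weight_test(reset_rmsprop=True)
    print("Shared RMSProp statistics:", sums_shared)
    print("Reset RMSProp statistics: ", sums_reset)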
Example #6
def run_agent(env_id, sess, network):
    env = gym.make(env_id)
    env = generic_preprocess(env)
    while True:
        obs = env.reset()
        episode_reward = 0
        done = False
        while not done:
            s = np.moveaxis(obs, 0, -1)
            feed_dict = {network.s: [s]}
            action_probs = sess.run(network.a_softmax, feed_dict)[0]
            action = np.random.choice(worker.ACTIONS, p=action_probs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            env.render()
            time.sleep(1/60.0)
        print("Episode reward:", episode_reward)