Example #1
 def test_simple_importance_sampling(self):
     """
     Test importance sampling when the buffer is never
     changed after the initial build-up.
     """
     np.random.seed(1337)
     buf = PrioritizedReplayBuffer(capacity=10,
                                   alpha=1.5,
                                   beta=1.3,
                                   epsilon=0.5)
     for i in range(10):
         sample = {
             'obs': 0,
             'action': 0,
             'reward': 0,
             'new_obs': 0,
             'steps': 1,
             'idx': i
         }
         buf.add_sample(sample, init_weight=i)
     weights = np.power(np.arange(10).astype('float64') + 0.5, 1.5)
     weights /= np.sum(weights)
     weights = np.power(weights * len(weights), -1.3)
     weights /= np.max(weights)
     for i in range(1000):
         samples = buf.sample(3)
         for sample in samples:
             self.assertTrue(
                 np.allclose(weights[sample['idx']], sample['weight']))
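The expected weights above follow the usual prioritized-replay arithmetic: each sample's priority is p_i = init_weight + epsilon, the sampling probability is P(i) = p_i^alpha / sum_j p_j^alpha, and the importance-sampling weight is (N * P(i))^(-beta), rescaled so the largest weight is 1. A minimal standalone sketch of that calculation (my own helper, not part of the library or the test suite):

import numpy as np

def expected_is_weights(init_weights, alpha, beta, epsilon):
    # Priorities: p_i = init_weight_i + epsilon
    priorities = np.asarray(init_weights, dtype='float64') + epsilon
    # Sampling probabilities: P(i) = p_i ** alpha / sum_j(p_j ** alpha)
    probs = np.power(priorities, alpha)
    probs /= np.sum(probs)
    # Importance-sampling weights: (N * P(i)) ** -beta, rescaled so max(w) == 1
    weights = np.power(probs * len(probs), -beta)
    return weights / np.max(weights)

# expected_is_weights(np.arange(10), alpha=1.5, beta=1.3, epsilon=0.5)
# reproduces the `weights` array checked in the assertions above.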
Example #2
 def test_uniform_sampling(self):
     """
     Test the buffer when it's configured to sample
     uniformly.
     """
     np.random.seed(1337)
     buf = PrioritizedReplayBuffer(capacity=10, alpha=0, beta=1)
     for i in range(10):
         sample = {
             'obs': 0,
             'action': 0,
             'reward': 0,
             'new_obs': 0,
             'steps': 1,
             'idx': i
         }
         buf.add_sample(sample)
     sampled_idxs = []
     for _ in range(10000):
         samples = buf.sample(3)
         sampled_idxs.extend([s['idx'] for s in samples])
         buf.update_weights(samples, [s['idx'] for s in samples])
     counts = Counter(sampled_idxs)
     for i in range(10):
         frac = counts[i] / len(sampled_idxs)
         self.assertGreater(frac, 0.09)
         self.assertLess(frac, 0.11)
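With alpha=0 every priority is raised to the zeroth power, so all ten entries share the same sampling probability 1/N; the values handed to update_weights() cannot bias the draw, which is why each index is expected to account for roughly 10% of the 30,000 sampled indices. A quick back-of-the-envelope check (hypothetical, not part of the test suite):

import numpy as np

# alpha == 0 collapses every priority p_i ** 0 to 1, so P(i) = 1 / N for all i.
priorities = np.power(np.arange(10, dtype='float64') + 1.0, 0.0)
probs = priorities / np.sum(priorities)
assert np.allclose(probs, 0.1)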
Example #3
 def test_prioritized_sampling(self):
     """
     Test the buffer in a simple prioritized setting.
     """
     np.random.seed(1337)
     buf = PrioritizedReplayBuffer(capacity=10,
                                   alpha=1.5,
                                   beta=1,
                                   epsilon=0.5)
     for i in range(10):
         sample = {
             'obs': 0,
             'action': 0,
             'reward': 0,
             'new_obs': 0,
             'steps': 1,
             'idx': i
         }
         buf.add_sample(sample, init_weight=i)
     sampled_idxs = []
     for i in range(50000):
         for sample in buf.sample(3):
             sampled_idxs.append(sample['idx'])
     counts = Counter(sampled_idxs)
     probs = np.power(np.arange(10).astype('float64') + 0.5, 1.5)
     probs /= np.sum(probs)
     for i, prob in enumerate(probs):
         frac = counts[i] / len(sampled_idxs)
         self.assertGreater(frac, prob - 0.01)
         self.assertLess(frac, prob + 0.01)
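The empirical frequencies are compared against P(i) = (i + 0.5)^1.5 / sum_j (j + 0.5)^1.5 with an absolute tolerance of 0.01. That tolerance is comfortable for this many draws; a rough sanity check (my own estimate, not part of the test suite):

import math

# 50000 iterations * 3 samples per call = 150000 draws, so the standard error of an
# empirical fraction is at most sqrt(0.25 / 150000) ~= 0.0013, far inside the 0.01 band.
assert math.sqrt(0.25 / 150000) < 0.01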
Example #4
def main():
    """Run DQN until the environment throws an exception."""
    env_name = 'MineRLNavigateDense-v0'
    base_env = [SimpleNavigateEnvWrapper(get_env(env_name)) for _ in range(1)]
    env = BatchedFrameStack(BatchedGymEnv([base_env]),
                            num_images=4,
                            concat=True)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        online, target = mine_rainbow_online_target(mine_cnn,
                                                    sess,
                                                    env.action_space.n,
                                                    gym_space_vectorizer(
                                                        env.observation_space),
                                                    min_val=-200,
                                                    max_val=200)
        dqn = DQN(online, target)
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3)
        optimize = dqn.optimize(learning_rate=1e-4)
        sess.run(tf.global_variables_initializer())

        buffer_capacity = 5000

        replay_buffer = PrioritizedReplayBuffer(buffer_capacity,
                                                0.5,
                                                0.4,
                                                epsilon=0.1)

        data_iter = non_bugged_data_arr(env_name, num_trajs=100)
        expert_player = NStepPlayer(ImitationPlayer(data_iter, 200), 3)

        for traj in expert_player.play():
            replay_buffer.add_sample(traj, init_weight=1)

        print('starting training')
        dqn.train(num_steps=200,
                  player=player,
                  replay_buffer=replay_buffer,
                  optimize_op=optimize,
                  train_interval=1,
                  target_interval=8192,
                  batch_size=32,
                  min_buffer_size=20000)

        print('starting eval')
        player._cur_states = None
        score = evaluate(player)
        print(score)
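The main() above omits its imports. Assuming the surrounding project uses the anyrl-py package (which provides PrioritizedReplayBuffer, DQN, NStepPlayer, BatchedPlayer, BatchedFrameStack, BatchedGymEnv and gym_space_vectorizer), the header would look roughly like the sketch below; the MineRL-specific helpers (SimpleNavigateEnvWrapper, get_env, mine_cnn, mine_rainbow_online_target, non_bugged_data_arr, ImitationPlayer, evaluate) are project-local, so the module name given for them is only a placeholder.

import tensorflow as tf

from anyrl.algos import DQN
from anyrl.envs import BatchedGymEnv
from anyrl.envs.wrappers import BatchedFrameStack
from anyrl.rollouts import BatchedPlayer, NStepPlayer, PrioritizedReplayBuffer
from anyrl.spaces import gym_space_vectorizer

# Project-local helpers; the module name below is an assumption, adjust to the real layout.
# from minerl_helpers import (SimpleNavigateEnvWrapper, get_env, mine_cnn,
#                             mine_rainbow_online_target, non_bugged_data_arr,
#                             ImitationPlayer, evaluate)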