Beispiel #1
0
def test_nstep_returns():
    """Check ``BasePolicy.compute_nstep_return`` against hard-coded targets.

    Adds 12 transitions (``rew = i + 1``, ``done`` set when ``i % 4 == 3``)
    to a ``ReplayBuffer(10)``, samples with ``buf.sample(0)`` and compares the
    ``returns`` entry produced for ``n_step`` = 1, 2 and 10 (``gamma=.1``)
    with precomputed values.
    """
    buf = ReplayBuffer(10)
    for i in range(12):
        buf.add(obs=0, act=0, rew=i + 1, done=i % 4 == 3)
    batch, indice = buf.sample(0)
    assert np.allclose(indice, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1])
    # Slot contents after the 12 adds, assuming the buffer wraps around so
    # that i = 10 and i = 11 land in slots 0 and 1 (the index order asserted
    # above suggests this):
    # rew:  [11, 12, 3, 4, 5, 6, 7, 8, 9, 10]
    # done: [ 0,  1, 0, 1, 0, 0, 0, 1, 0,  0]
    expected_by_n_step = (
        (1, [2.6, 4, 4.4, 5.3, 6.2, 8, 8, 8.9, 9.8, 12]),
        (2, [3.4, 4, 5.53, 6.62, 7.8, 8, 9.89, 10.98, 12.2, 12]),
        (10, [3.4, 4, 5.678, 6.78, 7.8, 8, 10.122, 11.22, 12.2, 12]),
    )
    for n_step, expected in expected_by_n_step:
        # 'returns' is popped from the result so that each iteration reads
        # only the value computed by its own call.
        returns = BasePolicy.compute_nstep_return(
            batch, buf, indice, target_q_fn, gamma=.1, n_step=n_step
        ).pop('returns')
        # The message identifies which n_step failed and what was computed.
        assert np.allclose(returns, expected), (n_step, returns)
Beispiel #2
0
 def optimized():
     """Run ``BasePolicy.compute_nstep_return`` with ``n_step=3``, ``gamma=.1``.

     ``batch``, ``buf``, ``indice`` and ``target_q_fn`` are not parameters;
     they are looked up in the surrounding scope, which is not shown here.
     """
     result = BasePolicy.compute_nstep_return(
         batch, buf, indice, target_q_fn, gamma=.1, n_step=3)
     return result
Beispiel #3
0
def test_nstep_returns(size=10000):
    """Check ``BasePolicy.compute_nstep_return`` and optionally benchmark it.

    For ``n_step`` = 1, 2 and 10 (``gamma=.1``) the computed returns are
    compared with hard-coded values, with ``compute_nstep_return_base`` and
    with the result obtained through ``target_q_fn_multidim`` (expected to
    equal the scalar result with one trailing axis added).

    When the module is run as a script, a timing comparison between
    ``compute_nstep_return_base`` and ``BasePolicy.compute_nstep_return`` is
    printed as well.

    :param size: capacity of the buffer used for the timing comparison; it
        is filled with ``int(size * 1.5)`` transitions.
    """
    buf = ReplayBuffer(10)
    for i in range(12):
        buf.add(obs=0, act=0, rew=i + 1, done=i % 4 == 3)
    batch, indice = buf.sample(0)
    assert np.allclose(indice, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1])
    # Slot contents after the 12 adds, assuming the buffer wraps around so
    # that i = 10 and i = 11 land in slots 0 and 1 (the index order asserted
    # above suggests this):
    # rew:  [11, 12, 3, 4, 5, 6, 7, 8, 9, 10]
    # done: [ 0,  1, 0, 1, 0, 0, 0, 1, 0,  0]
    expected_by_n_step = (
        (1, [2.6, 4, 4.4, 5.3, 6.2, 8, 8, 8.9, 9.8, 12]),
        (2, [3.4, 4, 5.53, 6.62, 7.8, 8, 9.89, 10.98, 12.2, 12]),
        (10, [3.4, 4, 5.678, 6.78, 7.8, 8, 10.122, 11.22, 12.2, 12]),
    )
    for n_step, expected in expected_by_n_step:
        # Scalar target: compare with the hard-coded values ...
        returns = to_numpy(BasePolicy.compute_nstep_return(
            batch, buf, indice, target_q_fn, gamma=.1, n_step=n_step
        ).pop('returns'))
        assert np.allclose(returns, expected), (n_step, returns)
        # ... and with the reference implementation.
        r_ = compute_nstep_return_base(n_step, .1, buf, indice)
        assert np.allclose(returns, r_), (n_step, r_, returns)
        # Multi-dimensional target: same numbers with one extra axis.
        returns_multidim = to_numpy(BasePolicy.compute_nstep_return(
            batch, buf, indice, target_q_fn_multidim, gamma=.1, n_step=n_step
        ).pop('returns'))
        assert np.allclose(returns_multidim, returns[:, np.newaxis]), (
            n_step, returns_multidim, returns)

    if __name__ == '__main__':
        # Timing comparison; skipped when the module is merely imported.
        buf = ReplayBuffer(size)
        for i in range(int(size * 1.5)):
            buf.add(obs=0, act=0, rew=i + 1, done=np.random.randint(3) == 0)
        batch, indice = buf.sample(256)

        def vanilla():
            return compute_nstep_return_base(3, .1, buf, indice)

        def optimized():
            return BasePolicy.compute_nstep_return(
                batch, buf, indice, target_q_fn, gamma=.1, n_step=3)

        cnt = 3000
        # Each callable is also passed as ``setup`` so that it runs once
        # before the timed loop starts.
        print('nstep vanilla', timeit(vanilla, setup=vanilla, number=cnt))
        print('nstep optim  ', timeit(optimized, setup=optimized, number=cnt))
Beispiel #4
0
def test_nstep_returns(size=10000):
    """Check ``BasePolicy.compute_nstep_return`` for ``n_step`` = 1, 2, 10.

    Each scalar-target result (flattened with ``reshape(-1)``) is compared
    with hard-coded values and with ``compute_nstep_return_base``; the result
    obtained through ``target_q_fn_multidim`` is compared with the scalar
    result plus one trailing axis.

    ``size`` is accepted but not read anywhere in this function.
    """
    buf = ReplayBuffer(10)
    for step in range(12):
        buf.add(Batch(obs=0, act=0, rew=step + 1, done=step % 4 == 3))
    batch, indices = buf.sample(0)
    assert np.allclose(indices, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1])
    # rew:  [11, 12, 3, 4, 5, 6, 7, 8, 9, 10]
    # done: [ 0,  1, 0, 1, 0, 0, 0, 1, 0, 0]

    def nstep_returns(q_fn, n_step):
        # One compute_nstep_return call on the sampled batch; hands back the
        # popped 'returns' entry.
        result = BasePolicy.compute_nstep_return(
            batch, buf, indices, q_fn, gamma=.1, n_step=n_step
        )
        return result.pop('returns')

    # test nstep = 1
    flat = to_numpy(nstep_returns(target_q_fn, 1).reshape(-1))
    assert np.allclose(flat, [2.6, 4, 4.4, 5.3, 6.2, 8, 8, 8.9, 9.8, 12])
    reference = compute_nstep_return_base(1, .1, buf, indices)
    assert np.allclose(flat, reference), (reference, flat)
    multidim = to_numpy(nstep_returns(target_q_fn_multidim, 1))
    assert np.allclose(multidim, flat[:, np.newaxis])

    # test nstep = 2
    flat = to_numpy(nstep_returns(target_q_fn, 2).reshape(-1))
    assert np.allclose(flat, [3.4, 4, 5.53, 6.62, 7.8, 8, 9.89, 10.98, 12.2, 12])
    reference = compute_nstep_return_base(2, .1, buf, indices)
    assert np.allclose(flat, reference)
    multidim = to_numpy(nstep_returns(target_q_fn_multidim, 2))
    assert np.allclose(multidim, flat[:, np.newaxis])

    # test nstep = 10
    flat = to_numpy(nstep_returns(target_q_fn, 10).reshape(-1))
    assert np.allclose(flat, [3.4, 4, 5.678, 6.78, 7.8, 8, 10.122, 11.22, 12.2, 12])
    reference = compute_nstep_return_base(10, .1, buf, indices)
    assert np.allclose(flat, reference)
    multidim = to_numpy(nstep_returns(target_q_fn_multidim, 10))
    assert np.allclose(multidim, flat[:, np.newaxis])