# Example 1
def add_episode(exp_buffer):
    """Write a three-step episode (two live steps plus a terminal step)
    into *exp_buffer* and return the steps that were appended."""
    episode = exp_buffer.create_episode()
    steps = (
        data.Step(np.random.rand(80, 80), 1, 0.0, False),
        data.Step(np.random.rand(80, 80), 0, 0.0, False),
        data.Step(np.random.rand(80, 80), 1, 0.0, True),
    )
    for step in steps:
        episode.append(step)
    episode.end()
    return steps
# Example 2
def test_buffered():
    """BufferedRolloutDataset assigns discounted advantages to buffered steps."""
    observation = np.array([0.0, 1.0, 2.0])
    running = data.Step(observation, 1, 0.0, False)
    terminal = data.Step(observation, 1, 1.0, True)

    bds = data.BufferedRolloutDataset(discount_factor=0.99)
    for step in [running] * 7 + [terminal]:
        bds.append(step.observation, step.action, step.reward, step.done)

    # first step's advantage should be close to 0.99 ** 7 (~0.932)
    assert abs(bds.rollouts[0].advantage - 0.932) <= 0.01
# Example 3
def test_epi_batching(db):
    """Steps appended with ``batch=10`` are buffered until a full batch of
    10 accumulates; ``end()`` flushes whatever partial batch remains."""
    env_config = configs.BaseConfig(
        # np.float was removed in NumPy 1.24; use the concrete dtype
        'test', data.StepCoder(data.NumpyCoder(num_axes=2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)
    episode = rollout.create_episode()

    # 9 appends: below the batch threshold, nothing is committed yet
    for _ in range(3):
        o1 = np.random.rand(80, 80)
        episode.append(data.Step(o1, 1, 1.0, False), batch=10)
        o2 = np.random.rand(80, 80)
        episode.append(data.Step(o2, 1, 1.0, False), batch=10)
        o3 = np.random.rand(80, 80)
        episode.append(data.Step(o3, 1, 1.0, False), batch=10)

    assert len(episode) == 0
    # the 10th append completes the batch and commits all buffered steps
    episode.append(data.Step(o3, 1, 1.0, False), batch=10)
    assert len(episode) == 10

    # a fresh episode with 9 buffered steps commits them all on end()
    episode = rollout.create_episode()
    for _ in range(3):
        o1 = np.random.rand(80, 80)
        episode.append(data.Step(o1, 1, 1.0, False), batch=10)
        o2 = np.random.rand(80, 80)
        episode.append(data.Step(o2, 1, 1.0, False), batch=10)
        o3 = np.random.rand(80, 80)
        episode.append(data.Step(o3, 1, 1.0, False), batch=10)

    episode.end()

    assert len(episode) == 9
# Example 4
def test_redis_write_step(db):
    """Steps written across three episodes are readable back from the
    rollout by flat index after ``finalize()``.

    Fixes a copy-paste bug in the original: the second and third episodes
    generated fresh observations (o2, o3) but appended o1 everywhere, so
    the read-back asserts never exercised distinct per-episode data.
    """
    env_config = configs.BaseConfig(
        # np.float was removed in NumPy 1.24; use the concrete dtype
        'test', data.StepCoder(data.NumpyCoder(num_axes=2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)

    # one distinct observation per episode, repeated for its three steps
    observations = []
    for _ in range(3):
        episode = rollout.create_episode()
        o = np.random.rand(80, 80)
        observations.append(o)
        for _ in range(3):
            episode.append(data.Step(o, 1, 1.0, False))
        episode.end()

    rollout.finalize()

    assert len(rollout) == 9
    for i in range(9):
        step = rollout[i]
        # steps 0-2 belong to episode 0, 3-5 to episode 1, 6-8 to episode 2
        np.testing.assert_array_equal(step.observation, observations[i // 3])
        assert step.reward == 1.0
        assert step.action == 1
# Example 5
def test_step_iter(db):
    """Iterating an ended episode yields its steps in append order."""
    env_config = configs.BaseConfig(
        # np.float was removed in NumPy 1.24; use the concrete dtype
        'test', data.StepCoder(data.NumpyCoder(num_axes=2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)

    episode = rollout.create_episode()
    observations = [np.random.rand(80, 80) for _ in range(3)]
    for o in observations:
        episode.append(data.Step(o, 1, 1.0, False))
    episode.end()

    assert len(episode) == 3

    for step, o in zip(episode, observations):
        np.testing.assert_array_equal(step.observation, o)
# Example 6
def test_encode_decode():
    """A Step survives a round trip through StepCoder encode/decode."""
    observation = np.random.rand(80, 80)
    original = data.Step(observation, 1, 1.0, False)
    coder = data.StepCoder(observation_coder=data.NumpyCoder(2, np.float64))
    restored = coder.decode(coder.encode(original))
    assert restored.action == original.action
    assert restored.reward == original.reward
    assert restored.done == original.done
    np.testing.assert_array_equal(restored.observation, original.observation)
# Example 7
def populate():
    """Return a BufferedRolloutDataset fed with one 8-step episode for
    each of two players ('player1' and 'player2')."""
    observation = np.array([0.0, 1.0, 2.0])
    running = data.Step(observation, 1, 0.0, False)
    terminal = data.Step(observation, 1, 1.0, True)

    bds = data.BufferedRolloutDataset(discount_factor=0.99)
    # same step stream is mirrored into both player episodes, player1 first
    for step in [running] * 7 + [terminal]:
        for player in ('player1', 'player2'):
            bds.append(step.observation,
                       step.action,
                       step.reward,
                       step.done,
                       episode=player)
    return bds
# Example 8
def test_epi_iter(db):
    """Iterating a finalized rollout yields episodes whose steps match
    the observations that were appended to them."""
    env_config = configs.BaseConfig(
        'test', StepCoder(observation_coder=NumpyCoder(2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)

    # remember which observations went into which episode, keyed by id
    lookup = {}
    for _ in range(3):
        episode = rollout.create_episode()
        observations = [np.random.rand(80, 80) for _ in range(3)]
        for o in observations:
            episode.append(data.Step(o, 1, 1.0, False))
        lookup[episode.id] = observations
        episode.end()

    rollout.finalize()

    assert len(rollout) == 9

    for episode in rollout:
        assert len(episode) == 3
        for step, o in zip(episode, lookup[episode.id]):
            np.testing.assert_array_equal(step.observation, o)
# Example 9
def test_epi_total_reward(db):
    """``total_reward()`` tracks the running sum of appended step rewards."""
    env_config = configs.BaseConfig(
        # np.float was removed in NumPy 1.24; use the concrete dtype
        'test', data.StepCoder(data.NumpyCoder(num_axes=2, dtype=np.float64)))
    rollout = db.create_rollout(env_config)
    episode = rollout.create_episode()

    # an empty episode has zero total reward
    assert episode.total_reward() == 0

    # each 1.0-reward step bumps the running total by one
    for expected in (1.0, 2.0, 3.0):
        episode.append(data.Step(np.random.rand(80, 80), 1, 1.0, False))
        assert episode.total_reward() == expected
# Example 10
def test_concurrency(db):
    """A SARAdvantageDataset built from a rollout can still be loaded
    after another episode is appended to the same rollout."""
    config = configs.DiscreteConfig(gym_env_string='test',
                                    features=(80, 80),
                                    features_dtype=np.float64,
                                    action_map=[0, 1])

    rollout = db.create_rollout(config)
    obs = []
    a = config.default_action

    def write_episode(target):
        # three live steps plus a terminal sentinel carrying the last frame
        frames = [np.random.rand(80, 80) for _ in range(3)]
        episode = target.create_episode()
        episode.append(data.Step(frames[0], a, 0.0, False))
        episode.append(data.Step(frames[1], a, 0.0, False))
        episode.append(data.Step(frames[2], a, 1.0, False))
        # sentinel containing last observation
        episode.append(data.Step(frames[2], a, 0.0, True))
        obs.append(frames)
        episode.end()

    write_episode(rollout)
    write_episode(rollout)

    rollout = db.latest_rollout(config)

    dataset = data.SARAdvantageDataset(
        rollout,
        discount_factor=0.99,
        state_transform=config.transform,
        action_transform=config.action_transform)

    # write a further episode after the dataset has been constructed
    write_episode(rollout)

    loader = DataLoader(dataset, batch_size=65, shuffle=True)
    for _ in loader:
        pass
# Example 11
def test_advantage(db):
    """Advantages from SARAdvantageDataset are discounted returns,
    normalized to zero mean / unit variance; the terminal sentinel step
    is excluded from the dataset."""
    config = configs.DiscreteConfig(gym_env_string='test',
                                    features=(80, 80),
                                    features_dtype=np.float64,
                                    action_map=[0, 1, 6])
    config.discount_factor = 0.99

    rollout = db.create_rollout(config)
    obs = []

    episode = rollout.create_episode()
    frames = [np.random.rand(80, 80) for _ in range(3)]
    default = config.default_action
    episode.append(data.Step(frames[0], default, 0.0, False))
    episode.append(data.Step(frames[1], 6, 0.0, False))
    episode.append(data.Step(frames[2], 1, 1.0, False))
    # sentinel containing last observation
    episode.append(data.Step(frames[2], default, 0.0, True))
    obs.append(frames)
    episode.end()

    rollout_id = rollout.id
    rollout = db.latest_rollout(config)
    assert rollout.id == rollout_id

    dataset = data.SARAdvantageDataset(
        rollout,
        discount_factor=0.99,
        state_transform=config.transform,
        action_transform=config.action_transform)

    # expected discounted returns for the three live steps, then normalize
    returns = [1.0 * config.discount_factor ** 2,
               1.0 * config.discount_factor,
               1.0]
    mu = mean(returns)
    sigma = stdev(returns)
    expected = [(r - mu) / (sigma + 1e-12) for r in returns]

    assert len(dataset) == 3

    observation, action, reward, advantage = dataset[0]
    assert reward == 0.00
    assert advantage == expected[0]
    assert action == config.default_action

    observation, action, reward, advantage = dataset[1]
    assert reward == 0.0
    assert advantage == expected[1]
    # action 6 maps to index 2 of action_map [0, 1, 6]
    assert action.item() == 2

    observation, action, reward, advantage = dataset[2]
    assert reward == 1.0
    assert advantage == expected[2]
    assert action.item() == 1
# Example 12
def add_dud_episode(exp_buffer):
    """Write a degenerate single-step episode (immediately terminal,
    zero reward) into *exp_buffer*."""
    episode = exp_buffer.create_episode()
    frame = np.random.rand(80, 80)
    episode.append(data.Step(frame, 0, 0.0, True))
    episode.end()
# Example 13
def test_advantage(db):
    """RolloutDatasetBase exposes per-step rewards and normalized
    discounted-return advantages across two identical episodes."""
    env_config = configs.BaseConfig(
        'test_data',
        StepCoder(observation_coder=NumpyCoder(2, dtype=np.float64)))

    rollout = db.create_rollout(env_config)
    obs = []

    # two episodes, each: reward 0, 0, then 1 on the final step
    for _ in range(2):
        episode = rollout.create_episode()
        frames = [np.random.rand(80, 80) for _ in range(3)]
        episode.append(data.Step(frames[0], 1, 0.0, False))
        episode.append(data.Step(frames[1], 1, 0.0, False))
        episode.append(data.Step(frames[2], 1, 1.0, False))
        obs.append(frames)
        episode.end()

    rollout.finalize()

    dataset = data.RolloutDatasetBase(env_config, rollout)

    # discounted returns per episode, concatenated, then normalized
    df = env_config.discount_factor
    returns = [1.0 * df ** 2, 1.0 * df, 1.0] * 2
    mu = mean(returns)
    sigma = stdev(returns)
    expected_adv = [(r - mu) / (sigma + 1e-12) for r in returns]
    expected_rewards = [0.0, 0.0, 1.0] * 2

    for i in range(6):
        observation, action, reward, advantage = dataset[i]
        assert reward == expected_rewards[i]
        assert advantage == expected_adv[i]