Example #1
def test_replay_buffer_with_episode(maxlen, data_size):
    env = gym.make("CartPole-v0")

    observation_shape = env.observation_space.shape
    action_size = env.action_space.n

    observations = np.random.random((data_size, *observation_shape))
    actions = np.random.randint(action_size, size=data_size, dtype=np.int32)
    rewards = np.random.random(data_size)

    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations.astype("f4"),
        actions=actions,
        rewards=rewards.astype("f4"),
    )

    buffer = ReplayBuffer(maxlen, env, episodes=[episode])

    # check episode initialization
    assert len(buffer) == data_size - 1

    # check append_episode
    buffer.append_episode(episode)
    assert len(buffer) == 2 * (data_size - 1)
Example #2
def test_initial_state_value_estimation_scorer(observation_shape, action_size,
                                               n_episodes, episode_length):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.matmul(observations, A).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0)

    total_values = []
    for episode in episodes:
        observation = episode.observations[0].reshape(1, -1)
        policy_actions = algo.predict(observation)
        values = algo.predict_value(observation, policy_actions)
        total_values.append(values)

    score = initial_state_value_estimation_scorer(algo, episodes)
    assert np.allclose(score, np.mean(total_values))
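The scorer tests in Examples #2 through #14 depend on a DummyAlgo helper that this listing never defines. The sketch below is an assumed minimal stand-in reconstructed purely from the call sites above; the signatures match those calls, but the exact value function is arbitrary:

import numpy as np


class DummyAlgo:
    """Assumed stand-in: a deterministic policy built from the projection
    matrix A, plus an arbitrary but deterministic value function. Only the
    call signatures used by the tests above are reproduced."""

    def __init__(self, A, gamma, discrete=False):
        self.A = A
        self.gamma = gamma
        self.discrete = discrete

    def predict(self, observations):
        # deterministic actions: project observations with A
        actions = np.matmul(observations, self.A)
        if self.discrete:
            return actions.argmax(axis=1)
        return actions

    def predict_value(self, observations, actions, with_std=False):
        # arbitrary per-sample value derived from the inputs
        obs_mean = observations.reshape(len(observations), -1).mean(axis=1)
        act_mean = np.asarray(actions).reshape(len(actions), -1).mean(axis=1)
        values = (obs_mean + act_mean).astype("f4")
        if with_std:
            return values, 0.1 * values
        return values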
Example #3
def test_compare_continuous_action_diff(observation_shape, action_size,
                                        n_episodes, episode_length):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size))
        rewards = np.random.random((episode_length, 1))
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    A1 = np.random.random(observation_shape + (action_size, ))
    A2 = np.random.random(observation_shape + (action_size, ))
    algo = DummyAlgo(A1, 0.0)
    base_algo = DummyAlgo(A2, 0.0)

    total_diffs = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        actions = algo.predict(batch.observations)
        base_actions = base_algo.predict(batch.observations)
        diff = ((actions - base_actions)**2).sum(axis=1).tolist()
        total_diffs += diff

    score = compare_continuous_action_diff(base_algo)(algo, episodes)
    assert np.allclose(score, -np.mean(total_diffs))
Example #4
def test_soft_opc_scorer(observation_shape, action_size, n_episodes,
                         episode_length, threshold):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.matmul(observations, A).astype('f4')
        rewards = np.random.random((episode_length, 1)).astype('f4')
        episode = Episode(observation_shape, action_size,
                          observations.astype('f4'), actions, rewards)
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0)
    success_values = []
    all_values = []
    for episode in episodes:
        is_success = episode.compute_return() >= threshold
        batch = TransitionMiniBatch(episode.transitions)
        values = algo.predict_value(batch.observations, batch.actions)
        if is_success:
            success_values += values.tolist()
        all_values += values.tolist()

    scorer = soft_opc_scorer(threshold)
    score = scorer(algo, episodes)
    assert np.allclose(score, np.mean(success_values) - np.mean(all_values))
Example #5
def test_compare_discrete_action_match(observation_shape, action_size,
                                       n_episodes, episode_length):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size))
        rewards = np.random.random((episode_length, 1))
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    A1 = np.random.random(observation_shape + (action_size, ))
    A2 = np.random.random(observation_shape + (action_size, ))
    algo = DummyAlgo(A1, 0.0, discrete=True)
    base_algo = DummyAlgo(A2, 0.0, discrete=True)

    total_matches = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        actions = algo.predict(batch.observations)
        base_actions = base_algo.predict(batch.observations)
        match = (actions == base_actions).tolist()
        total_matches += match

    score = compare_discrete_action_match(base_algo)(algo, episodes)
    assert np.allclose(score, np.mean(total_matches))
Example #6
def test_discounted_sum_of_advantage_scorer(observation_shape, action_size,
                                            n_episodes, episode_length, gamma):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        # add noise so that the dataset actions differ from the algorithm outputs
        noise = 100 * np.random.random((episode_length, action_size))
        actions = (np.matmul(observations, A) + noise).astype('f4')
        rewards = np.random.random((episode_length, 1)).astype('f4')
        episode = Episode(observation_shape, action_size,
                          observations.astype('f4'), actions, rewards)
        episodes.append(episode)

    algo = DummyAlgo(A, gamma)

    ref_sums = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        ref_sum = ref_discounted_sum_of_advantage_score(
            algo.predict_value, batch.observations, batch.actions,
            policy_actions, gamma)
        ref_sums += ref_sum

    score = discounted_sum_of_advantage_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_sums))
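Example #6 additionally relies on a ref_discounted_sum_of_advantage_score helper that is not shown. The sketch below is an assumed reconstruction based on what the scorer's name implies (a per-transition discounted sum of advantages of the logged actions over the policy's actions) and on how the test calls it; the real helper may differ:

import numpy as np


def ref_discounted_sum_of_advantage_score(predict_value, observations,
                                          dataset_actions, policy_actions,
                                          gamma):
    # advantage of the logged action over the policy action at each step
    advantages = (predict_value(observations, dataset_actions)
                  - predict_value(observations, policy_actions))
    # discounted sum of advantages over every suffix of the episode
    sums = []
    for i in range(len(advantages)):
        tail = advantages[i:]
        discounts = gamma ** np.arange(len(tail))
        sums.append(float(np.sum(discounts * tail)))
    return sums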
Example #7
def test_dynamics_prediction_variance_scorer(observation_shape, action_size,
                                             n_episodes, episode_length):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size))
        rewards = np.random.random((episode_length, 1))
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions.astype("f4"),
            rewards.astype("f4"),
        )
        episodes.append(episode)

    dynamics = DummyDynamics(np.random.random(observation_shape))

    total_variances = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        _, _, var = dynamics.predict(batch.observations, batch.actions, True)
        total_variances += var.tolist()
    score = dynamics_prediction_variance_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_variances))
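Examples #7, #8, and #17 use a DummyDynamics helper that is likewise not shown. Below is a minimal assumed sketch, again derived only from the call sites (the reward_scaler argument from Example #17 is simply stored):

import numpy as np


class DummyDynamics:
    """Assumed stand-in for a learned dynamics model; only the predict()
    signature used by these tests is reproduced."""

    def __init__(self, param, reward_scaler=None):
        self.param = param  # broadcastable over the observation shape
        self.reward_scaler = reward_scaler

    def predict(self, observations, actions, with_variance=False):
        # deterministic fake predictions derived from the inputs
        pred_observations = observations + self.param
        flat_actions = np.asarray(actions).reshape(len(actions), -1)
        pred_rewards = flat_actions.mean(axis=1, keepdims=True).astype("f4")
        if with_variance:
            variances = 0.1 * pred_rewards.reshape(-1)
            return pred_observations, pred_rewards, variances
        return pred_observations, pred_rewards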
Example #8
def test_dynamics_observation_prediction_error_scorer(observation_shape,
                                                      action_size, n_episodes,
                                                      episode_length):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size)).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    dynamics = DummyDynamics(np.random.random(observation_shape))

    total_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        pred_x, _ = dynamics.predict(batch.observations, batch.actions)
        errors = ((batch.next_observations - pred_x)**2).sum(axis=1)
        total_errors += errors.tolist()
    score = dynamics_observation_prediction_error_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_errors))
Example #9
def test_continuous_action_diff_scorer(observation_shape, action_size,
                                       n_episodes, episode_length):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size)).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0)

    total_diffs = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        diff = ((batch.actions - policy_actions)**2).sum(axis=1).tolist()
        total_diffs += diff
    score = continuous_action_diff_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(total_diffs))
Example #10
def test_transition_minibatch(data_size, observation_size, action_size, gamma):
    observations = np.random.random((data_size, observation_size))
    actions = np.random.random((data_size, action_size))
    rewards = np.random.random((data_size, 1))

    episode = Episode((observation_size, ), action_size, observations, actions,
                      rewards, gamma)

    batch = TransitionMiniBatch(episode.transitions)
    for i, t in enumerate(episode.transitions):
        assert np.all(batch.observations[i] == t.observation)
        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i] == t.reward)
        assert np.all(batch.next_observations[i] == t.next_observation)
        assert np.all(batch.next_actions[i] == t.next_action)
        assert np.all(batch.next_rewards[i] == t.next_reward)
        assert np.all(batch.terminals[i] == t.terminal)
        assert np.all(batch.returns[i] == t.returns)
        assert np.all(
            batch.consequent_observations[i] == t.consequent_observations)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
Example #11
def test_value_estimation_std_scorer(observation_shape, action_size,
                                     n_episodes, episode_length):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.matmul(observations, A).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0)

    total_stds = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        _, stds = algo.predict_value(batch.observations, policy_actions, True)
        total_stds += stds.tolist()

    score = value_estimation_std_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(total_stds))
Example #12
def test_compute_lambda_return(
    data_size, observation_shape, action_size, n_frames, gamma, lam
):
    if len(observation_shape) == 3:
        observations = np.random.randint(
            256, size=(data_size, *observation_shape), dtype=np.uint8
        )
    else:
        observations = np.random.random(
            (data_size,) + observation_shape
        ).astype("f4")
    actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random((data_size, 1)).astype("f4")

    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
    )

    class DummyAlgo:
        def predict_value(self, observations):
            batch_size = observations.shape[0]
            return np.mean(observations.reshape((batch_size, -1)), axis=1)

    algo = DummyAlgo()

    transitions = episode.transitions
    transition = transitions[3]

    # compute reference naively
    t = transition
    observations = []
    returns = []
    R = 0.0
    for i in range(data_size):
        observation = TransitionMiniBatch([t], n_frames).next_observations[0]
        observations.append(observation)
        R += (gamma ** i) * t.next_reward
        returns.append(R)
        t = t.next_transition
        if t is None:
            break
    values = algo.predict_value(np.array(observations))
    values[-1] = 0.0
    gammas = gamma ** (np.arange(len(observations)) + 1)
    returns = np.array(returns) + gammas * values

    lambdas = lam ** np.arange(len(observations))
    ref_lambda_return = (1.0 - lam) * np.sum(lambdas[:-1] * returns[:-1])
    ref_lambda_return += lambdas[-1] * returns[-1]

    # compute lambda return
    lambda_return = compute_lambda_return(
        transition, algo, gamma, lam, n_frames
    )

    assert np.allclose(ref_lambda_return, lambda_return)
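For reference, the naive loop above builds the n-step returns G(n) = r_{t+1} + gamma * r_{t+2} + ... + gamma^(n-1) * r_{t+n} + gamma^n * V(s_{t+n}) for every horizon n reachable from the chosen transition (with the bootstrap value zeroed at the terminal step), then mixes them as the truncated lambda-return (1 - lam) * sum_{n=1}^{N-1} lam^(n-1) * G(n) + lam^(N-1) * G(N). compute_lambda_return is expected to reproduce exactly this value.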
Example #13
def test_td_error_scorer(observation_shape, action_size, n_episodes,
                         episode_length, gamma):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.matmul(observations, A).astype('f4')
        rewards = np.random.random((episode_length, 1)).astype('f4')
        episode = Episode(observation_shape, action_size,
                          observations.astype('f4'), actions, rewards)
        episodes.append(episode)

    algo = DummyAlgo(A, gamma)

    ref_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        ref_error = ref_td_error_score(
            algo.predict_value, batch.observations, batch.actions,
            np.asarray(batch.next_rewards).reshape(-1),
            batch.next_observations, batch.next_actions,
            np.asarray(batch.terminals).reshape(-1), gamma)
        ref_errors += ref_error

    score = td_error_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_errors))
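Example #13 also depends on a ref_td_error_score helper that is not included in this listing. The version below is a sketch assumed from the standard TD-error definition and from the way the test calls it; the original helper may differ in detail:

import numpy as np


def ref_td_error_score(predict_value, observations, actions, rewards,
                       next_observations, next_actions, terminals, gamma):
    # squared TD error per transition:
    # (Q(s, a) - (r' + gamma * Q(s', a') * (1 - terminal)))^2
    values = predict_value(observations, actions)
    next_values = predict_value(next_observations, next_actions)
    targets = rewards + gamma * next_values * (1.0 - terminals)
    return ((values - targets) ** 2).tolist()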
Example #14
def test_discrete_action_match_scorer(observation_shape, action_size,
                                      n_episodes, episode_length):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.randint(action_size, size=episode_length)
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0, discrete=True)

    total_matches = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        match = (batch.actions.reshape(-1) == policy_actions).tolist()
        total_matches += match
    score = discrete_action_match_scorer(algo, episodes)
    assert np.allclose(score, np.mean(total_matches))
Example #15
def test_episode(data_size, observation_size, action_size):
    observations = np.random.random((data_size, observation_size)).astype("f4")
    actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random(data_size).astype("f4")

    episode = Episode(
        observation_shape=(observation_size, ),
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
    )

    # check Episode methods
    assert np.all(episode.observations == observations)
    assert np.all(episode.actions == actions)
    assert np.all(episode.rewards == rewards)
    assert episode.size() == data_size - 1
    assert episode.get_observation_shape() == (observation_size, )
    assert episode.get_action_size() == action_size
    assert episode.compute_return() == np.sum(rewards[1:])

    # check transitions exported from episode
    assert len(episode.transitions) == data_size - 1
    for i, t in enumerate(episode.transitions):
        assert isinstance(t, Transition)
        assert t.get_observation_shape() == (observation_size, )
        assert t.get_action_size() == action_size
        assert np.all(t.observation == observations[i])
        assert np.all(t.action == actions[i])
        assert np.allclose(t.reward, rewards[i])
        assert np.all(t.next_observation == observations[i + 1])
        assert np.all(t.next_action == actions[i + 1])
        assert np.allclose(t.next_reward, rewards[i + 1])
        assert t.terminal == (1.0 if (i == data_size - 2) else 0.0)

    # check forward pointers
    count = 1
    transition = episode[0]
    while transition.next_transition:
        transition = transition.next_transition
        count += 1
    assert count == data_size - 1

    # check backward pointers
    count = 1
    transition = episode[-1]
    while transition.prev_transition:
        transition = transition.prev_transition
        count += 1
    assert count == data_size - 1

    # check list-like behaviors
    assert len(episode) == data_size - 1
    assert episode[0] is episode.transitions[0]
    for i, transition in enumerate(episode):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
Example #16
def test_episode(data_size, observation_size, action_size, gamma):
    observations = np.random.random((data_size, observation_size))
    actions = np.random.random((data_size, action_size))
    rewards = np.random.random((data_size, 1))

    episode = Episode((observation_size, ), action_size, observations, actions,
                      rewards, gamma)

    # check Episode methods
    assert np.all(episode.observations == observations)
    assert np.all(episode.actions == actions)
    assert np.all(episode.rewards == rewards)
    assert episode.size() == data_size - 1
    assert episode.get_observation_shape() == (observation_size, )
    assert episode.get_action_size() == action_size
    assert episode.compute_return() == np.sum(rewards[1:])

    # check transitions exported from episode
    assert len(episode.transitions) == data_size - 1
    for i, t in enumerate(episode.transitions):
        assert isinstance(t, Transition)
        assert t.observation_shape == (observation_size, )
        assert t.action_size == action_size
        assert np.all(t.observation == observations[i])
        assert np.all(t.action == actions[i])
        assert t.reward == rewards[i]
        assert np.all(t.next_observation == observations[i + 1])
        assert np.all(t.next_action == actions[i + 1])
        assert t.next_reward == rewards[i + 1]
        assert t.terminal == (1.0 if (i == data_size - 2) else 0.0)
        assert len(t.returns) == data_size - i - 1
        assert len(t.consequent_observations) == data_size - i - 1

        # check returns
        ref_return = 0.0
        for j, ret in enumerate(t.returns):
            ref_return += (gamma**j) * rewards[i + 1 + j][0]
            assert ret == ref_return

    # check list-like behaviors
    assert len(episode) == data_size - 1
    assert episode[0] is episode.transitions[0]
    for i, transition in enumerate(episode):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
Example #17
def test_dynamics_reward_prediction_error_scorer(
    observation_shape,
    action_size,
    n_episodes,
    episode_length,
    reward_scaler,
):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size)).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    dynamics = DummyDynamics(np.random.random(observation_shape),
                             reward_scaler)

    total_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        _, pred_reward = dynamics.predict(batch.observations, batch.actions)
        if reward_scaler:
            next_rewards = reward_scaler.transform_numpy(batch.next_rewards)
        else:
            next_rewards = batch.next_rewards
        errors = ((next_rewards - pred_reward)**2).reshape(-1)
        total_errors += errors.tolist()
    score = dynamics_reward_prediction_error_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_errors))
Example #18
def test_round_iterator(
    episode_size,
    n_episodes,
    observation_size,
    action_size,
    batch_size,
    shuffle,
    set_ephemeral,
):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_size, observation_size))
        actions = np.random.random((episode_size, action_size))
        rewards = np.random.random(episode_size)
        episode = Episode((observation_size, ), action_size, observations,
                          actions, rewards)
        episodes.append(episode)

    iterator = RoundIterator(episodes, batch_size, shuffle=shuffle)

    if set_ephemeral:
        iterator.set_ephemeral_transitions(episodes[0].transitions)

    count = 0
    for batch in iterator:
        assert batch.observations.shape == (batch_size, observation_size)
        assert batch.actions.shape == (batch_size, action_size)
        assert batch.rewards.shape == (batch_size, 1)
        count += 1

    if set_ephemeral:
        assert count == episode_size * (n_episodes + 1) // batch_size
        assert len(iterator) == episode_size * (n_episodes + 1) // batch_size
    else:
        assert count == episode_size * n_episodes // batch_size
        assert len(iterator) == episode_size * n_episodes // batch_size
Example #19
def test_transition_minibatch(data_size, observation_shape, action_size,
                              n_frames, discrete_action):
    if len(observation_shape) == 3:
        observations = np.random.randint(256,
                                         size=(data_size, *observation_shape),
                                         dtype=np.uint8)
    else:
        observations = np.random.random((data_size, ) +
                                        observation_shape).astype('f4')
    if discrete_action:
        actions = np.random.randint(action_size, size=data_size)
    else:
        actions = np.random.random((data_size, action_size)).astype('f4')
    rewards = np.random.random((data_size, 1)).astype('f4')

    episode = Episode(observation_shape=observation_shape,
                      action_size=action_size,
                      observations=observations,
                      actions=actions,
                      rewards=rewards)

    if len(observation_shape) == 3:
        n_channels = n_frames * observation_shape[0]
        image_size = observation_shape[1:]
        batched_observation_shape = (data_size - 1, n_channels, *image_size)
    else:
        batched_observation_shape = (data_size - 1, *observation_shape)

    # create padded observations to check frame stacking
    padding = np.zeros((n_frames - 1, *observation_shape), dtype=np.uint8)
    padded_observations = np.vstack([padding, observations])

    batch = TransitionMiniBatch(episode.transitions, n_frames)
    assert batch.observations.shape == batched_observation_shape
    assert batch.next_observations.shape == batched_observation_shape
    for i, t in enumerate(episode.transitions):
        observation = batch.observations[i]
        next_observation = batch.next_observations[i]

        if n_frames > 1 and len(observation_shape) == 3:
            # check frame stacking
            head_index = i
            tail_index = head_index + n_frames
            window = padded_observations[head_index:tail_index]
            next_window = padded_observations[head_index + 1:tail_index + 1]
            ref_observation = np.vstack(window)
            ref_next_observation = np.vstack(next_window)
            assert observation.shape == ref_observation.shape
            assert next_observation.shape == ref_next_observation.shape
            assert np.all(observation == ref_observation)
            assert np.all(next_observation == ref_next_observation)
        else:
            assert np.allclose(observation, t.observation)
            assert np.allclose(next_observation, t.next_observation)

        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i][0] == t.reward)
        assert np.all(batch.next_actions[i] == t.next_action)
        assert np.all(batch.next_rewards[i][0] == t.next_reward)
        assert np.all(batch.terminals[i][0] == t.terminal)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
Example #20
def test_transition_minibatch(
    data_size,
    observation_shape,
    action_size,
    n_frames,
    n_steps,
    gamma,
    discrete_action,
    create_mask,
    mask_size,
):
    if len(observation_shape) == 3:
        observations = np.random.randint(256,
                                         size=(data_size, *observation_shape),
                                         dtype=np.uint8)
    else:
        observations = np.random.random((data_size, ) +
                                        observation_shape).astype("f4")
    if discrete_action:
        actions = np.random.randint(action_size, size=data_size)
    else:
        actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random((data_size, 1)).astype("f4")

    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
        create_mask=create_mask,
        mask_size=mask_size,
    )

    if len(observation_shape) == 3:
        n_channels = n_frames * observation_shape[0]
        image_size = observation_shape[1:]
        batched_observation_shape = (data_size - 1, n_channels, *image_size)
    else:
        batched_observation_shape = (data_size - 1, *observation_shape)

    batch = TransitionMiniBatch(episode.transitions, n_frames, n_steps, gamma)
    assert batch.observations.shape == batched_observation_shape
    assert batch.next_observations.shape == batched_observation_shape

    for i, t in enumerate(episode.transitions):
        observation = batch.observations[i]
        next_observation = batch.next_observations[i]
        n = int(batch.n_steps[i][0])

        assert n == min(data_size - i - 1, n_steps)

        if n_frames > 1 and len(observation_shape) == 3:
            # create padded observations to check frame stacking
            pad = ((n_frames - 1, 0), (0, 0), (0, 0), (0, 0))
            padded_observations = np.pad(observations, pad, "edge")

            # check frame stacking
            head_index = i
            tail_index = head_index + n_frames
            window = padded_observations[head_index:tail_index]
            next_window = padded_observations[head_index + n:tail_index + n]
            ref_observation = np.vstack(window)
            ref_next_observation = np.vstack(next_window)
            assert observation.shape == ref_observation.shape
            assert next_observation.shape == ref_next_observation.shape
            assert np.all(observation == ref_observation)
            assert np.all(next_observation == ref_next_observation)
        else:
            next_t = t
            for _ in range(n - 1):
                next_t = next_t.next_transition
            assert np.allclose(observation, t.observation)
            assert np.allclose(next_observation, next_t.next_observation)

        next_reward = 0.0
        next_action = 0.0
        terminal = 0.0
        next_t = t
        for j in range(n):
            next_reward += next_t.next_reward * gamma**j
            next_action = next_t.next_action
            terminal = next_t.terminal
            next_t = next_t.next_transition

        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i][0] == t.reward)
        assert np.all(batch.next_actions[i] == next_action)
        assert np.allclose(batch.next_rewards[i][0], next_reward)
        assert np.all(batch.terminals[i][0] == terminal)

    # check mask
    if create_mask:
        assert batch.masks.shape == (mask_size, data_size - 1, 1)
    else:
        assert batch.masks is None

    # check additional data
    value = np.random.random(100)
    batch.add_additional_data("test", value)
    assert np.all(batch.get_additional_data("test") == value)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
Example #21
def test_random_iterator(
    episode_size,
    n_steps_per_epoch,
    n_episodes,
    observation_size,
    action_size,
    batch_size,
    real_ratio,
    generated_maxlen,
):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_size, observation_size))
        actions = np.random.random((episode_size, action_size))
        rewards = np.random.random(episode_size)
        episode = Episode(
            (observation_size, ),
            action_size,
            observations,
            actions,
            rewards,
            terminal=False,
        )
        episodes.append(episode)

    iterator = RandomIterator(
        episodes,
        n_steps_per_epoch,
        batch_size,
        real_ratio=real_ratio,
        generated_maxlen=generated_maxlen,
    )

    # check without generated transitions
    count = 0
    for batch in iterator:
        assert batch.observations.shape == (batch_size, observation_size)
        assert batch.actions.shape == (batch_size, action_size)
        assert batch.rewards.shape == (batch_size, 1)
        count += 1
    assert count == n_steps_per_epoch
    assert len(iterator) == n_steps_per_epoch

    # check adding generated transitions
    transitions = []
    for _ in range(episode_size):
        transition = Transition(
            (observation_size, ),
            action_size,
            np.random.random(observation_size),
            np.random.random(action_size),
            np.random.random(),
            np.random.random(observation_size),
            np.random.random(action_size),
            np.random.random(),
            terminal=True,
        )
        transitions.append(transition)
    iterator.add_generated_transitions(transitions)
    assert len(iterator.generated_transitions) == generated_maxlen

    # check with generated transitions
    count = 0
    for batch in iterator:
        assert batch.observations.shape == (batch_size, observation_size)
        assert batch.actions.shape == (batch_size, action_size)
        assert batch.rewards.shape == (batch_size, 1)
        assert batch.terminals.sum() == int(batch_size * (1 - real_ratio))
        count += 1
    assert count == n_steps_per_epoch
    assert len(iterator) == n_steps_per_epoch