Example #1
def test_compare_discrete_action_match(observation_shape, action_size,
                                       n_episodes, episode_length):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size))
        rewards = np.random.random((episode_length, 1))
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    A1 = np.random.random(observation_shape + (action_size, ))
    A2 = np.random.random(observation_shape + (action_size, ))
    algo = DummyAlgo(A1, 0.0, discrete=True)
    base_algo = DummyAlgo(A2, 0.0, discrete=True)

    total_matches = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        actions = algo.predict(batch.observations)
        base_actions = base_algo.predict(batch.observations)
        match = (actions == base_actions).tolist()
        total_matches += match

    score = compare_discrete_action_match(base_algo)(algo, episodes)
    assert np.allclose(score, np.mean(total_matches))
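
These tests exercise a DummyAlgo test double that is not included in this excerpt. A minimal sketch of what it presumably provides, inferred from how the tests call it (the linear projection through A, the stored gamma, the discrete argmax, and the value heuristic are assumptions, not the test suite's actual helper):

import numpy as np

class DummyAlgo:
    """Deterministic stand-in: actions are a fixed linear map of observations."""

    def __init__(self, A, gamma, discrete=False):
        self.A = A              # projection matrix, observation_shape + (action_size,)
        self.gamma = gamma
        self.discrete = discrete

    def predict(self, x):
        y = np.matmul(x, self.A)
        # discrete algorithms return greedy action indices
        return y.argmax(axis=1) if self.discrete else y

    def predict_value(self, x, action, with_std=False):
        # any deterministic function of (x, action) is enough for the scorer tests
        values = np.mean(x.reshape(x.shape[0], -1), axis=1) + np.mean(
            np.reshape(action, (x.shape[0], -1)), axis=1)
        if with_std:
            return values, values + 0.1
        return values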
Example #2
def test_compare_continuous_action_diff(observation_shape, action_size,
                                        n_episodes, episode_length):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size))
        rewards = np.random.random((episode_length, 1))
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    A1 = np.random.random(observation_shape + (action_size, ))
    A2 = np.random.random(observation_shape + (action_size, ))
    algo = DummyAlgo(A1, 0.0)
    base_algo = DummyAlgo(A2, 0.0)

    total_diffs = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        actions = algo.predict(batch.observations)
        base_actions = base_algo.predict(batch.observations)
        diff = ((actions - base_actions)**2).sum(axis=1).tolist()
        total_diffs += diff

    score = compare_continuous_action_diff(base_algo)(algo, episodes)
    assert np.allclose(score, -np.mean(total_diffs))
Example #3
def test_value_estimation_std_scorer(observation_shape, action_size,
                                     n_episodes, episode_length):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.matmul(observations, A).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0)

    total_stds = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        _, stds = algo.predict_value(batch.observations, policy_actions, True)
        total_stds += stds.tolist()

    score = value_estimation_std_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(total_stds))
Example #4
def test_soft_opc_scorer(observation_shape, action_size, n_episodes,
                         episode_length, threshold):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.matmul(observations, A).astype('f4')
        rewards = np.random.random((episode_length, 1)).astype('f4')
        episode = Episode(observation_shape, action_size,
                          observations.astype('f4'), actions, rewards)
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0)
    success_values = []
    all_values = []
    for episode in episodes:
        is_success = episode.compute_return() >= threshold
        batch = TransitionMiniBatch(episode.transitions)
        values = algo.predict_value(batch.observations, batch.actions)
        if is_success:
            success_values += values.tolist()
        all_values += values.tolist()

    scorer = soft_opc_scorer(threshold)
    score = scorer(algo, episodes)
    assert np.allclose(score, np.mean(success_values) - np.mean(all_values))
Example #5
def test_discounted_sum_of_advantage_scorer(observation_shape, action_size,
                                            n_episodes, episode_length, gamma):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        # make difference between algorithm outputs and dataset
        noise = 100 * np.random.random((episode_length, action_size))
        actions = (np.matmul(observations, A) + noise).astype('f4')
        rewards = np.random.random((episode_length, 1)).astype('f4')
        episode = Episode(observation_shape, action_size,
                          observations.astype('f4'), actions, rewards)
        episodes.append(episode)

    algo = DummyAlgo(A, gamma)

    ref_sums = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        ref_sum = ref_discounted_sum_of_advantage_score(
            algo.predict_value, batch.observations, batch.actions,
            policy_actions, gamma)
        ref_sums += ref_sum

    score = discounted_sum_of_advantage_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_sums))
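
The reference helper ref_discounted_sum_of_advantage_score is not shown in this excerpt. A plausible sketch of the quantity the scorer is compared against, matching the arguments used above (an assumption, not the suite's verbatim helper):

import numpy as np

def ref_discounted_sum_of_advantage_score(predict_value, observations,
                                          dataset_actions, policy_actions,
                                          gamma):
    # advantage of the dataset action over the greedy policy action
    dataset_values = np.asarray(predict_value(observations, dataset_actions))
    policy_values = np.asarray(predict_value(observations, policy_actions))
    advantages = (dataset_values - policy_values).reshape(-1)

    # discounted sum of advantages from each step to the end of the episode
    sums = []
    for i in range(len(advantages)):
        total = 0.0
        for j, advantage in enumerate(advantages[i:]):
            total += (gamma ** j) * advantage
        sums.append(total)
    return sums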
Example #6
def test_continuous_action_diff_scorer(observation_shape, action_size,
                                       n_episodes, episode_length):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size)).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0)

    total_diffs = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        diff = ((batch.actions - policy_actions)**2).sum(axis=1).tolist()
        total_diffs += diff
    score = continuous_action_diff_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(total_diffs))
Example #7
def test_dynamics_prediction_variance_scorer(observation_shape, action_size,
                                             n_episodes, episode_length):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size))
        rewards = np.random.random((episode_length, 1))
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions.astype("f4"),
            rewards.astype("f4"),
        )
        episodes.append(episode)

    dynamics = DummyDynamics(np.random.random(observation_shape))

    total_variances = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        _, _, var = dynamics.predict(batch.observations, batch.actions, True)
        total_variances += var.tolist()
    score = dynamics_prediction_variance_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_variances))
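
DummyDynamics is another test double that does not appear in this excerpt. A minimal sketch consistent with the two call signatures used here and in Examples #8 and #15 (the constant observation offset, the reward heuristic, and the variance heuristic are assumptions):

import numpy as np

class DummyDynamics:
    """Deterministic stand-in for a dynamics model."""

    def __init__(self, param, reward_scaler=None):
        self.param = param                  # offset with the shape of one observation
        # stored so scorers that rescale dataset rewards can read it
        self.reward_scaler = reward_scaler

    def predict(self, x, action, with_variance=False):
        pred_x = x + self.param             # predicted next observation
        flat = x.reshape(x.shape[0], -1)
        pred_reward = np.mean(flat, axis=1, keepdims=True)  # predicted reward
        if with_variance:
            return pred_x, pred_reward, np.var(flat, axis=1)
        return pred_x, pred_reward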
Example #8
def test_dynamics_observation_prediction_error_scorer(observation_shape,
                                                      action_size, n_episodes,
                                                      episode_length):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size)).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    dynamics = DummyDynamics(np.random.random(observation_shape))

    total_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        pred_x, _ = dynamics.predict(batch.observations, batch.actions)
        errors = ((batch.next_observations - pred_x)**2).sum(axis=1)
        total_errors += errors.tolist()
    score = dynamics_observation_prediction_error_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_errors))
Example #9
def base_update_tester(model, observation_shape, action_size, discrete=False):
    # make mini-batch
    transitions = []
    for _ in range(model.batch_size):
        observation = np.random.random(observation_shape)
        next_observation = np.random.random(observation_shape)
        reward = np.random.random()
        next_reward = np.random.random()
        terminal = np.random.randint(2)
        returns = np.random.random(100)
        consequent_observations = np.random.random((100, *observation_shape))
        if discrete:
            action = np.random.randint(action_size)
            next_action = np.random.randint(action_size)
        else:
            action = np.random.random(action_size)
            next_action = np.random.random(action_size)
        transition = Transition(observation_shape, action_size, observation,
                                action, reward, next_observation, next_action,
                                next_reward, terminal, returns,
                                consequent_observations)
        transitions.append(transition)

    batch = TransitionMiniBatch(transitions)

    # check if update runs without errors
    model.create_impl(observation_shape, action_size)
    loss = model.update(0, 0, batch)

    assert len(loss) == len(model._get_loss_labels())

    return transitions
Example #10
def test_discrete_action_match_scorer(observation_shape, action_size,
                                      n_episodes, episode_length):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.randint(action_size, size=episode_length)
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    algo = DummyAlgo(A, 0.0, discrete=True)

    total_matches = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        match = (batch.actions.reshape(-1) == policy_actions).tolist()
        total_matches += match
    score = discrete_action_match_scorer(algo, episodes)
    assert np.allclose(score, np.mean(total_matches))
Example #11
def test_td_error_scorer(observation_shape, action_size, n_episodes,
                         episode_length, gamma):
    # projection matrix for deterministic action
    A = np.random.random(observation_shape + (action_size, ))
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.matmul(observations, A).astype('f4')
        rewards = np.random.random((episode_length, 1)).astype('f4')
        episode = Episode(observation_shape, action_size,
                          observations.astype('f4'), actions, rewards)
        episodes.append(episode)

    algo = DummyAlgo(A, gamma)

    ref_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        ref_error = ref_td_error_score(
            algo.predict_value, batch.observations, batch.actions,
            np.asarray(batch.next_rewards).reshape(-1),
            batch.next_observations, batch.next_actions,
            np.asarray(batch.terminals).reshape(-1), gamma)
        ref_errors += ref_error

    score = td_error_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_errors))
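
ref_td_error_score is likewise a reference helper omitted from the excerpt. A sketch of the squared one-step TD error it presumably computes, matching the arguments passed above (an assumption, not the suite's exact code):

import numpy as np

def ref_td_error_score(predict_value, observations, actions, next_rewards,
                       next_observations, next_actions, terminals, gamma):
    values = np.asarray(predict_value(observations, actions)).reshape(-1)
    next_values = np.asarray(
        predict_value(next_observations, next_actions)).reshape(-1)
    # TD target: r' + gamma * Q(s', a'), with bootstrapping masked at terminals
    y = next_rewards + gamma * next_values * (1.0 - terminals)
    return ((y - values) ** 2).tolist()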
Example #12
def test_transition_minibatch(data_size, observation_size, action_size, gamma):
    observations = np.random.random((data_size, observation_size))
    actions = np.random.random((data_size, action_size))
    rewards = np.random.random((data_size, 1))

    episode = Episode((observation_size, ), action_size, observations, actions,
                      rewards, gamma)

    batch = TransitionMiniBatch(episode.transitions)
    for i, t in enumerate(episode.transitions):
        assert np.all(batch.observations[i] == t.observation)
        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i] == t.reward)
        assert np.all(batch.next_observations[i] == t.next_observation)
        assert np.all(batch.next_actions[i] == t.next_action)
        assert np.all(batch.next_rewards[i] == t.next_reward)
        assert np.all(batch.terminals[i] == t.terminal)
        assert np.all(batch.returns[i] == t.returns)
        assert np.all(
            batch.consequent_observations[i] == t.consequent_observations)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
Example #13
def test_compute_lambda_return(
    data_size, observation_shape, action_size, n_frames, gamma, lam
):
    if len(observation_shape) == 3:
        observations = np.random.randint(
            256, size=(data_size, *observation_shape), dtype=np.uint8
        )
    else:
        observations = np.random.random(
            (data_size,) + observation_shape
        ).astype("f4")
    actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random((data_size, 1)).astype("f4")

    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
    )

    class DummyAlgo:
        def predict_value(self, observations):
            batch_size = observations.shape[0]
            return np.mean(observations.reshape((batch_size, -1)), axis=1)

    algo = DummyAlgo()

    transitions = episode.transitions
    transition = transitions[3]

    # compute reference naively
    t = transition
    observations = []
    returns = []
    R = 0.0
    for i in range(data_size):
        observation = TransitionMiniBatch([t], n_frames).next_observations[0]
        observations.append(observation)
        R += (gamma ** i) * t.next_reward
        returns.append(R)
        t = t.next_transition
        if t is None:
            break
    values = algo.predict_value(np.array(observations))
    values[-1] = 0.0
    gammas = gamma ** (np.arange(len(observations)) + 1)
    returns += gammas * values

    lambdas = lam ** np.arange(len(observations))
    ref_lambda_return = (1.0 - lam) * np.sum(lambdas[:-1] * returns[:-1])
    ref_lambda_return += lambdas[-1] * returns[-1]

    # compute lambda return
    lambda_return = compute_lambda_return(
        transition, algo, gamma, lam, n_frames
    )

    assert np.allclose(ref_lambda_return, lambda_return)
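
For reference, the quantity Example #13 rebuilds naively is the lambda-return: a mixture of n-step returns weighted by powers of lam, with the longest return taking the remaining weight. A compact restatement of the same reference computation, given n-step returns that have already been bootstrapped with value estimates as in the loop above:

import numpy as np

def ref_lambda_return(n_step_returns, lam):
    # R_lambda = (1 - lam) * sum_{n < N} lam**(n-1) * R_n  +  lam**(N-1) * R_N
    n_step_returns = np.asarray(n_step_returns, dtype=np.float64)
    lambdas = lam ** np.arange(len(n_step_returns))
    head = (1.0 - lam) * np.sum(lambdas[:-1] * n_step_returns[:-1])
    return head + lambdas[-1] * n_step_returns[-1]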
Example #14
def base_update_tester(model, observation_shape, action_size, discrete=False):
    # make mini-batch
    transitions = []
    prev_transition = None
    for i in range(model.batch_size):
        if len(observation_shape) == 3:
            observation = np.random.randint(256,
                                            size=observation_shape,
                                            dtype=np.uint8)
            next_observation = np.random.randint(256,
                                                 size=observation_shape,
                                                 dtype=np.uint8)
        else:
            observation = np.random.random(observation_shape).astype("f4")
            next_observation = np.random.random(observation_shape).astype("f4")
        reward = np.random.random()
        next_reward = np.random.random()
        terminal = np.random.randint(2)
        if discrete:
            action = np.random.randint(action_size)
            next_action = np.random.randint(action_size)
        else:
            action = np.random.random(action_size).astype("f4")
            next_action = np.random.random(action_size).astype("f4")

        transition = Transition(
            observation_shape=observation_shape,
            action_size=action_size,
            observation=observation,
            action=action,
            reward=reward,
            next_observation=next_observation,
            next_action=next_action,
            next_reward=next_reward,
            terminal=terminal,
            prev_transition=prev_transition,
        )

        # set transition to the next pointer
        if prev_transition:
            prev_transition.next_transition = transition

        prev_transition = transition

        transitions.append(transition)

    batch = TransitionMiniBatch(transitions)

    # check if update runs without errors
    model.create_impl(observation_shape, action_size)
    loss = model.update(0, 0, batch)

    assert len(loss.items()) > 0

    return transitions
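
A hypothetical call site for base_update_tester (the DQN import, the batch size, and the shapes are illustrative assumptions; in the suite the tester is driven by whichever algorithm fixture is under test):

from d3rlpy.algos import DQN

# run one update step on a randomly generated mini-batch
algo = DQN(batch_size=32)
transitions = base_update_tester(algo, observation_shape=(100,),
                                 action_size=2, discrete=True)
assert len(transitions) == 32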
Example #15
def test_dynamics_reward_prediction_error_scorer(
    observation_shape,
    action_size,
    n_episodes,
    episode_length,
    reward_scaler,
):
    episodes = []
    for _ in range(n_episodes):
        observations = np.random.random((episode_length, ) + observation_shape)
        actions = np.random.random((episode_length, action_size)).astype("f4")
        rewards = np.random.random((episode_length, 1)).astype("f4")
        episode = Episode(
            observation_shape,
            action_size,
            observations.astype("f4"),
            actions,
            rewards,
        )
        episodes.append(episode)

    dynamics = DummyDynamics(np.random.random(observation_shape),
                             reward_scaler)

    total_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        _, pred_reward = dynamics.predict(batch.observations, batch.actions)
        if reward_scaler:
            next_rewards = reward_scaler.transform_numpy(batch.next_rewards)
        else:
            next_rewards = batch.next_rewards
        errors = ((next_rewards - pred_reward)**2).reshape(-1)
        total_errors += errors.tolist()
    score = dynamics_reward_prediction_error_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_errors))
Example #16
def test_torch_api_with_batch(
    batch_size,
    observation_shape,
    action_size,
    use_scaler,
    use_action_scaler,
    use_reward_scaler,
):
    obs_shape = (batch_size,) + observation_shape
    transitions = []
    for _ in range(batch_size):
        transition = Transition(
            observation_shape=observation_shape,
            action_size=action_size,
            observation=np.random.random(obs_shape),
            action=np.random.random(action_size),
            reward=np.random.random(),
            next_observation=np.random.random(obs_shape),
            next_action=np.random.random(action_size),
            next_reward=np.random.random(),
            terminal=0.0,
        )
        transitions.append(transition)

    if use_scaler:

        class DummyScaler:
            def transform(self, x):
                return x + 0.1

        scaler = DummyScaler()
    else:
        scaler = None

    if use_action_scaler:

        class DummyActionScaler:
            def transform(self, x):
                return x + 0.2

        action_scaler = DummyActionScaler()
    else:
        action_scaler = None

    if use_reward_scaler:

        class DummyRewardScaler:
            def transform(self, x):
                return x + 0.2

        reward_scaler = DummyRewardScaler()
    else:
        reward_scaler = None

    batch = TransitionMiniBatch(transitions)

    impl = DummyImpl()
    impl._scaler = scaler
    impl._action_scaler = action_scaler
    impl._reward_scaler = reward_scaler

    torch_batch = impl.torch_api_func_with_batch(batch)

    if use_scaler:
        assert np.all(
            torch_batch.observations.numpy() == batch.observations + 0.1
        )
        assert np.all(
            torch_batch.next_observations.numpy()
            == batch.next_observations + 0.1
        )
    else:
        assert np.all(torch_batch.observations.numpy() == batch.observations)
        assert np.all(
            torch_batch.next_observations.numpy() == batch.next_observations
        )

    if use_action_scaler:
        assert np.all(torch_batch.actions.numpy() == batch.actions + 0.2)
        assert np.all(
            torch_batch.next_actions.numpy() == batch.next_actions + 0.2
        )
    else:
        assert np.all(torch_batch.actions.numpy() == batch.actions)
        assert np.all(torch_batch.next_actions.numpy() == batch.next_actions)

    if use_reward_scaler:
        assert np.all(torch_batch.rewards.numpy() == batch.rewards + 0.2)
        assert np.all(
            torch_batch.next_rewards.numpy() == batch.next_rewards + 0.2
        )
    else:
        assert np.all(torch_batch.rewards.numpy() == batch.rewards)
        assert np.all(torch_batch.next_rewards.numpy() == batch.next_rewards)

    assert np.all(torch_batch.terminals.numpy() == batch.terminals)
    assert np.all(torch_batch.n_steps.numpy() == batch.n_steps)
Example #17
def test_transition_minibatch(data_size, observation_shape, action_size,
                              n_frames, discrete_action):
    if len(observation_shape) == 3:
        observations = np.random.randint(256,
                                         size=(data_size, *observation_shape),
                                         dtype=np.uint8)
    else:
        observations = np.random.random((data_size, ) +
                                        observation_shape).astype('f4')
    if discrete_action:
        actions = np.random.randint(action_size, size=data_size)
    else:
        actions = np.random.random((data_size, action_size)).astype('f4')
    rewards = np.random.random((data_size, 1)).astype('f4')

    episode = Episode(observation_shape=observation_shape,
                      action_size=action_size,
                      observations=observations,
                      actions=actions,
                      rewards=rewards)

    if len(observation_shape) == 3:
        n_channels = n_frames * observation_shape[0]
        image_size = observation_shape[1:]
        batched_observation_shape = (data_size - 1, n_channels, *image_size)
    else:
        batched_observation_shape = (data_size - 1, *observation_shape)

    # create padded observations for check stacking
    padding = np.zeros((n_frames - 1, *observation_shape), dtype=np.uint8)
    padded_observations = np.vstack([padding, observations])

    batch = TransitionMiniBatch(episode.transitions, n_frames)
    assert batch.observations.shape == batched_observation_shape
    assert batch.next_observations.shape == batched_observation_shape
    for i, t in enumerate(episode.transitions):
        observation = batch.observations[i]
        next_observation = batch.next_observations[i]

        if n_frames > 1 and len(observation_shape) == 3:
            # check frame stacking
            head_index = i
            tail_index = head_index + n_frames
            window = padded_observations[head_index:tail_index]
            next_window = padded_observations[head_index + 1:tail_index + 1]
            ref_observation = np.vstack(window)
            ref_next_observation = np.vstack(next_window)
            assert observation.shape == ref_observation.shape
            assert next_observation.shape == ref_next_observation.shape
            assert np.all(observation == ref_observation)
            assert np.all(next_observation == ref_next_observation)
        else:
            assert np.allclose(observation, t.observation)
            assert np.allclose(next_observation, t.next_observation)

        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i][0] == t.reward)
        assert np.all(batch.next_actions[i] == t.next_action)
        assert np.all(batch.next_rewards[i][0] == t.next_reward)
        assert np.all(batch.terminals[i][0] == t.terminal)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
Example #18
def test_transition_minibatch(
    data_size,
    observation_shape,
    action_size,
    n_frames,
    n_steps,
    gamma,
    discrete_action,
    create_mask,
    mask_size,
):
    if len(observation_shape) == 3:
        observations = np.random.randint(256,
                                         size=(data_size, *observation_shape),
                                         dtype=np.uint8)
    else:
        observations = np.random.random((data_size, ) +
                                        observation_shape).astype("f4")
    if discrete_action:
        actions = np.random.randint(action_size, size=data_size)
    else:
        actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random((data_size, 1)).astype("f4")

    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
        create_mask=create_mask,
        mask_size=mask_size,
    )

    if len(observation_shape) == 3:
        n_channels = n_frames * observation_shape[0]
        image_size = observation_shape[1:]
        batched_observation_shape = (data_size - 1, n_channels, *image_size)
    else:
        batched_observation_shape = (data_size - 1, *observation_shape)

    batch = TransitionMiniBatch(episode.transitions, n_frames, n_steps, gamma)
    assert batch.observations.shape == batched_observation_shape
    assert batch.next_observations.shape == batched_observation_shape

    for i, t in enumerate(episode.transitions):
        observation = batch.observations[i]
        next_observation = batch.next_observations[i]
        n = int(batch.n_steps[i][0])

        assert n == min(data_size - i - 1, n_steps)

        if n_frames > 1 and len(observation_shape) == 3:
            # create padded observations for check stacking
            pad = ((n_frames - 1, 0), (0, 0), (0, 0), (0, 0))
            padded_observations = np.pad(observations, pad, "edge")

            # check frame stacking
            head_index = i
            tail_index = head_index + n_frames
            window = padded_observations[head_index:tail_index]
            next_window = padded_observations[head_index + n:tail_index + n]
            ref_observation = np.vstack(window)
            ref_next_observation = np.vstack(next_window)
            assert observation.shape == ref_observation.shape
            assert next_observation.shape == ref_next_observation.shape
            assert np.all(observation == ref_observation)
            assert np.all(next_observation == ref_next_observation)
        else:
            next_t = t
            for _ in range(n - 1):
                next_t = next_t.next_transition
            assert np.allclose(observation, t.observation)
            assert np.allclose(next_observation, next_t.next_observation)

        next_reward = 0.0
        next_action = 0.0
        terminal = 0.0
        next_t = t
        for j in range(n):
            next_reward += next_t.next_reward * gamma**j
            next_action = next_t.next_action
            terminal = next_t.terminal
            next_t = next_t.next_transition

        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i][0] == t.reward)
        assert np.all(batch.next_actions[i] == next_action)
        assert np.allclose(batch.next_rewards[i][0], next_reward)
        assert np.all(batch.terminals[i][0] == terminal)

    # check mask
    if create_mask:
        assert batch.masks.shape == (mask_size, data_size - 1, 1)
    else:
        assert batch.masks is None

    # check additional data
    value = np.random.random(100)
    batch.add_additional_data("test", value)
    assert np.all(batch.get_additional_data("test") == value)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
Example #19
def test_torch_mini_batch(
    batch_size, observation_shape, action_size, use_scaler, use_action_scaler
):
    obs_shape = (batch_size,) + observation_shape
    transitions = []
    for _ in range(batch_size):
        transition = Transition(
            observation_shape=observation_shape,
            action_size=action_size,
            observation=np.random.random(obs_shape),
            action=np.random.random(action_size),
            reward=np.random.random(),
            next_observation=np.random.random(obs_shape),
            next_action=np.random.random(action_size),
            next_reward=np.random.random(),
            terminal=0.0,
        )
        transitions.append(transition)

    if use_scaler:

        class DummyScaler:
            def transform(self, x):
                return x + 0.1

        scaler = DummyScaler()
    else:
        scaler = None

    if use_action_scaler:

        class DummyActionScaler:
            def transform(self, x):
                return x + 0.2

        action_scaler = DummyActionScaler()
    else:
        action_scaler = None

    batch = TransitionMiniBatch(transitions)

    torch_batch = TorchMiniBatch(
        batch=batch, device="cpu:0", scaler=scaler, action_scaler=action_scaler
    )

    if use_scaler:
        assert np.all(
            torch_batch.observations.numpy() == batch.observations + 0.1
        )
        assert np.all(
            torch_batch.next_observations.numpy()
            == batch.next_observations + 0.1
        )
    else:
        assert np.all(torch_batch.observations.numpy() == batch.observations)
        assert np.all(
            torch_batch.next_observations.numpy() == batch.next_observations
        )

    if use_action_scaler:
        assert np.all(torch_batch.actions.numpy() == batch.actions + 0.2)
        assert np.all(
            torch_batch.next_actions.numpy() == batch.next_actions + 0.2
        )
    else:
        assert np.all(torch_batch.actions.numpy() == batch.actions)
        assert np.all(torch_batch.next_actions.numpy() == batch.next_actions)

    assert np.all(torch_batch.rewards.numpy() == batch.rewards)
    assert np.all(torch_batch.next_rewards.numpy() == batch.next_rewards)
    assert np.all(torch_batch.terminals.numpy() == batch.terminals)
    assert np.all(torch_batch.n_steps.numpy() == batch.n_steps)