def test_replay_buffer_with_episode(maxlen, data_size):
    """ReplayBuffer built from episodes should expose their transitions."""
    env = gym.make("CartPole-v0")
    obs_shape = env.observation_space.shape
    n_actions = env.action_space.n

    # synthesize one random episode compatible with the environment
    obs = np.random.random((data_size, *obs_shape)).astype("f4")
    acts = np.random.randint(n_actions, size=data_size, dtype=np.int32)
    rews = np.random.random(data_size).astype("f4")
    episode = Episode(
        observation_shape=obs_shape,
        action_size=n_actions,
        observations=obs,
        actions=acts,
        rewards=rews,
    )

    buffer = ReplayBuffer(maxlen, env, episodes=[episode])

    # an episode of N steps yields N - 1 transitions
    assert len(buffer) == data_size - 1

    # appending the same episode again doubles the transition count
    buffer.append_episode(episode)
    assert len(buffer) == 2 * (data_size - 1)
def test_initial_state_value_estimation_scorer(
    observation_shape, action_size, n_episodes, episode_length
):
    """Score should equal the mean value estimate at each episode's first state."""
    # projection matrix producing deterministic actions from observations
    A = np.random.random(observation_shape + (action_size,))

    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.matmul(obs, A).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    algo = DummyAlgo(A, 0.0)

    # reference: value of the greedy action at each initial observation
    initial_values = []
    for episode in episodes:
        first_obs = episode.observations[0].reshape(1, -1)
        greedy_action = algo.predict(first_obs)
        initial_values.append(algo.predict_value(first_obs, greedy_action))

    score = initial_state_value_estimation_scorer(algo, episodes)
    assert np.allclose(score, np.mean(initial_values))
def test_compare_continuous_action_diff(
    observation_shape, action_size, n_episodes, episode_length
):
    """Comparison score should be minus the mean squared action difference."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size))
        rews = np.random.random((episode_length, 1))
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    # two distinct deterministic policies defined by projection matrices
    A1 = np.random.random(observation_shape + (action_size,))
    A2 = np.random.random(observation_shape + (action_size,))
    algo = DummyAlgo(A1, 0.0)
    base_algo = DummyAlgo(A2, 0.0)

    # reference: per-transition squared L2 distance between the two policies
    ref_diffs = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        gap = algo.predict(batch.observations) - base_algo.predict(batch.observations)
        ref_diffs += (gap ** 2).sum(axis=1).tolist()

    score = compare_continuous_action_diff(base_algo)(algo, episodes)
    assert np.allclose(score, -np.mean(ref_diffs))
def test_soft_opc_scorer(
    observation_shape, action_size, n_episodes, episode_length, threshold
):
    """Soft-OPC should equal E[Q | success episodes] - E[Q | all episodes]."""
    # projection matrix producing deterministic actions
    A = np.random.random(observation_shape + (action_size,))

    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.matmul(obs, A).astype('f4')
        rews = np.random.random((episode_length, 1)).astype('f4')
        episodes.append(
            Episode(observation_shape, action_size, obs.astype('f4'), acts, rews)
        )

    algo = DummyAlgo(A, 0.0)

    # reference: collect Q-values, splitting out episodes whose return
    # reaches the success threshold
    success_values = []
    all_values = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        values = algo.predict_value(batch.observations, batch.actions).tolist()
        if episode.compute_return() >= threshold:
            success_values += values
        all_values += values

    score = soft_opc_scorer(threshold)(algo, episodes)
    assert np.allclose(score, np.mean(success_values) - np.mean(all_values))
def test_compare_discrete_action_diff(
    observation_shape, action_size, n_episodes, episode_length
):
    """Comparison score should be the mean agreement of two greedy policies."""
    # NOTE(review): the name says "diff" but the scorer under test is
    # compare_discrete_action_match; kept as-is to preserve the test id.
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size))
        rews = np.random.random((episode_length, 1))
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    # two distinct policies defined by projection matrices
    A1 = np.random.random(observation_shape + (action_size,))
    A2 = np.random.random(observation_shape + (action_size,))
    algo = DummyAlgo(A1, 0.0, discrete=True)
    base_algo = DummyAlgo(A2, 0.0, discrete=True)

    # reference: fraction of states where both greedy policies agree
    ref_matches = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        same = algo.predict(batch.observations) == base_algo.predict(batch.observations)
        ref_matches += same.tolist()

    score = compare_discrete_action_match(base_algo)(algo, episodes)
    assert np.allclose(score, np.mean(ref_matches))
def test_discounted_sum_of_advantage_scorer(
    observation_shape, action_size, n_episodes, episode_length, gamma
):
    """Score should be minus the mean reference discounted advantage sum."""
    # projection matrix producing deterministic policy actions
    A = np.random.random(observation_shape + (action_size,))

    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        # large noise makes dataset actions differ from the policy's actions
        noise = 100 * np.random.random((episode_length, action_size))
        acts = (np.matmul(obs, A) + noise).astype('f4')
        rews = np.random.random((episode_length, 1)).astype('f4')
        episodes.append(
            Episode(observation_shape, action_size, obs.astype('f4'), acts, rews)
        )

    algo = DummyAlgo(A, gamma)

    # reference advantage sums computed with the naive helper
    ref_sums = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        greedy_actions = algo.predict(batch.observations)
        ref_sums += ref_discounted_sum_of_advantage_score(
            algo.predict_value,
            batch.observations,
            batch.actions,
            greedy_actions,
            gamma,
        )

    score = discounted_sum_of_advantage_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_sums))
def test_dynamics_prediction_variance_scorer(
    observation_shape, action_size, n_episodes, episode_length
):
    """Score should be minus the mean predicted variance of the dynamics model."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size))
        rews = np.random.random((episode_length, 1))
        episodes.append(
            Episode(
                observation_shape,
                action_size,
                obs.astype("f4"),
                acts.astype("f4"),
                rews.astype("f4"),
            )
        )

    dynamics = DummyDynamics(np.random.random(observation_shape))

    # reference: variances returned by predict(..., with_variance=True)
    ref_variances = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        _, _, variance = dynamics.predict(batch.observations, batch.actions, True)
        ref_variances += variance.tolist()

    score = dynamics_prediction_variance_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(ref_variances))
def test_dynamics_observation_prediction_error_scorer(
    observation_shape, action_size, n_episodes, episode_length
):
    """Score should be minus the mean squared next-observation error."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size)).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    dynamics = DummyDynamics(np.random.random(observation_shape))

    # reference: per-transition squared error against the true next observation
    ref_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        predicted_obs, _ = dynamics.predict(batch.observations, batch.actions)
        squared = ((batch.next_observations - predicted_obs) ** 2).sum(axis=1)
        ref_errors += squared.tolist()

    score = dynamics_observation_prediction_error_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(ref_errors))
def test_continuous_action_diff_scorer(
    observation_shape, action_size, n_episodes, episode_length
):
    """Score should be minus the mean squared dataset-vs-policy action gap."""
    # projection matrix defining the dummy deterministic policy
    A = np.random.random(observation_shape + (action_size,))

    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size)).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    algo = DummyAlgo(A, 0.0)

    # reference: squared L2 distance between dataset and greedy actions
    ref_diffs = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        greedy_actions = algo.predict(batch.observations)
        ref_diffs += ((batch.actions - greedy_actions) ** 2).sum(axis=1).tolist()

    score = continuous_action_diff_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_diffs))
def test_transition_minibatch(data_size, observation_size, action_size, gamma):
    """TransitionMiniBatch should mirror transition fields and act like a list."""
    # NOTE(review): this module defines test_transition_minibatch more than
    # once; in one module later definitions shadow earlier ones.
    obs = np.random.random((data_size, observation_size))
    acts = np.random.random((data_size, action_size))
    rews = np.random.random((data_size, 1))
    episode = Episode((observation_size,), action_size, obs, acts, rews, gamma)

    batch = TransitionMiniBatch(episode.transitions)

    # every batched field must match its source transition element-wise
    for i, source in enumerate(episode.transitions):
        assert np.all(batch.observations[i] == source.observation)
        assert np.all(batch.actions[i] == source.action)
        assert np.all(batch.rewards[i] == source.reward)
        assert np.all(batch.next_observations[i] == source.next_observation)
        assert np.all(batch.next_actions[i] == source.next_action)
        assert np.all(batch.next_rewards[i] == source.next_reward)
        assert np.all(batch.terminals[i] == source.terminal)
        assert np.all(batch.returns[i] == source.returns)
        assert np.all(
            batch.consequent_observations[i] == source.consequent_observations
        )

    # list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
def test_value_estimation_std_scorer(
    observation_shape, action_size, n_episodes, episode_length
):
    """Score should be minus the mean std of Q-value estimates."""
    # projection matrix producing deterministic actions
    A = np.random.random(observation_shape + (action_size,))

    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.matmul(obs, A).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    algo = DummyAlgo(A, 0.0)

    # reference: stds returned by predict_value(..., with_std=True)
    ref_stds = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        greedy_actions = algo.predict(batch.observations)
        _, stds = algo.predict_value(batch.observations, greedy_actions, True)
        ref_stds += stds.tolist()

    score = value_estimation_std_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_stds))
def test_compute_lambda_return(
    data_size, observation_shape, action_size, n_frames, gamma, lam
):
    """compute_lambda_return should match a naive reference implementation.

    The reference builds the i-step returns
    ``R_i = sum_{j<=i} gamma^j r_{t+1+j} + gamma^(i+1) V(s_{t+1+i})``
    and mixes them with ``(1 - lam) * lam^i`` weights, giving the last
    (complete) return the residual weight ``lam^(k-1)``.
    """
    if len(observation_shape) == 3:
        # image observations are uint8 pixels
        observations = np.random.randint(
            256, size=(data_size, *observation_shape), dtype=np.uint8
        )
    else:
        observations = np.random.random(
            (data_size,) + observation_shape
        ).astype("f4")
    actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random((data_size, 1)).astype("f4")
    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
    )

    class DummyAlgo:
        # deterministic stand-in value function: mean of flattened observation
        def predict_value(self, observations):
            batch_size = observations.shape[0]
            return np.mean(observations.reshape((batch_size, -1)), axis=1)

    algo = DummyAlgo()

    # start from a mid-episode transition so frame stacking matters
    transition = episode.transitions[3]

    # compute reference naively: walk the transition chain, accumulating
    # discounted reward sums and the frame-stacked next observations
    t = transition
    stacked_observations = []
    reward_sums = []
    running_sum = 0.0
    for i in range(data_size):
        stacked = TransitionMiniBatch([t], n_frames).next_observations[0]
        stacked_observations.append(stacked)
        running_sum += (gamma ** i) * t.next_reward
        reward_sums.append(running_sum)
        t = t.next_transition
        if t is None:
            break

    values = algo.predict_value(np.array(stacked_observations))
    values[-1] = 0.0  # terminal state bootstraps with zero value
    gammas = gamma ** (np.arange(len(stacked_observations)) + 1)
    # BUG FIX: the original did `returns += gammas * values` on a Python
    # list; list.__iadd__ with an ndarray EXTENDS the list with the array's
    # elements instead of adding element-wise, corrupting the reference.
    # Element-wise ndarray addition is what the formula requires.
    returns = np.asarray(reward_sums) + gammas * values

    lambdas = lam ** np.arange(len(stacked_observations))
    ref_lambda_return = (1.0 - lam) * np.sum(lambdas[:-1] * returns[:-1])
    ref_lambda_return += lambdas[-1] * returns[-1]

    # compute lambda return with the implementation under test
    lambda_return = compute_lambda_return(transition, algo, gamma, lam, n_frames)

    assert np.allclose(ref_lambda_return, lambda_return)
def test_td_error_scorer(
    observation_shape, action_size, n_episodes, episode_length, gamma
):
    """TD-error score should be minus the mean reference TD error."""
    # projection matrix producing deterministic actions
    A = np.random.random(observation_shape + (action_size,))

    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.matmul(obs, A).astype('f4')
        rews = np.random.random((episode_length, 1)).astype('f4')
        episodes.append(
            Episode(observation_shape, action_size, obs.astype('f4'), acts, rews)
        )

    algo = DummyAlgo(A, gamma)

    # reference TD errors computed with the naive helper
    ref_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        ref_errors += ref_td_error_score(
            algo.predict_value,
            batch.observations,
            batch.actions,
            np.asarray(batch.next_rewards).reshape(-1),
            batch.next_observations,
            batch.next_actions,
            np.asarray(batch.terminals).reshape(-1),
            gamma,
        )

    score = td_error_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_errors))
def test_discrete_action_math_scorer(
    observation_shape, action_size, n_episodes, episode_length
):
    """Score should be the fraction of dataset actions matching the policy."""
    # NOTE(review): "math" in the name looks like a typo for "match"; kept
    # unchanged so the collected test id stays stable.
    # projection matrix defining the dummy greedy policy
    A = np.random.random(observation_shape + (action_size,))

    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.randint(action_size, size=episode_length)
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    algo = DummyAlgo(A, 0.0, discrete=True)

    # reference: fraction of dataset actions equal to the greedy action
    ref_matches = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        greedy_actions = algo.predict(batch.observations)
        ref_matches += (batch.actions.reshape(-1) == greedy_actions).tolist()

    score = discrete_action_match_scorer(algo, episodes)
    assert np.allclose(score, np.mean(ref_matches))
def test_episode(data_size, observation_size, action_size):
    """Episode should expose its arrays, derived transitions, and links."""
    # NOTE(review): this module defines test_episode twice; in one module
    # the later definition shadows this one under pytest collection.
    observations = np.random.random((data_size, observation_size)).astype("f4")
    actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random(data_size).astype("f4")
    episode = Episode(
        observation_shape=(observation_size,),
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
    )

    # basic accessors
    assert np.all(episode.observations == observations)
    assert np.all(episode.actions == actions)
    assert np.all(episode.rewards == rewards)
    assert episode.size() == data_size - 1
    assert episode.get_observation_shape() == (observation_size,)
    assert episode.get_action_size() == action_size
    # the first reward precedes every transition, so it is excluded
    assert episode.compute_return() == np.sum(rewards[1:])

    # transitions exported from the episode
    assert len(episode.transitions) == data_size - 1
    for i, transition in enumerate(episode.transitions):
        assert isinstance(transition, Transition)
        assert transition.get_observation_shape() == (observation_size,)
        assert transition.get_action_size() == action_size
        assert np.all(transition.observation == observations[i])
        assert np.all(transition.action == actions[i])
        assert np.allclose(transition.reward, rewards[i])
        assert np.all(transition.next_observation == observations[i + 1])
        assert np.all(transition.next_action == actions[i + 1])
        assert np.allclose(transition.next_reward, rewards[i + 1])
        # only the last transition is terminal
        assert transition.terminal == (1.0 if i == data_size - 2 else 0.0)

    # forward pointers chain through every transition
    count = 1
    node = episode[0]
    while node.next_transition:
        node = node.next_transition
        count += 1
    assert count == data_size - 1

    # backward pointers chain through every transition
    count = 1
    node = episode[-1]
    while node.prev_transition:
        node = node.prev_transition
        count += 1
    assert count == data_size - 1

    # list-like behaviors
    assert len(episode) == data_size - 1
    assert episode[0] is episode.transitions[0]
    for i, transition in enumerate(episode):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
def test_episode(data_size, observation_size, action_size, gamma):
    """Episode should expose arrays, transitions, per-step returns and links.

    NOTE(review): this module defines test_episode twice; in one module the
    later definition shadows the earlier one — consider unique names.
    """
    observations = np.random.random((data_size, observation_size))
    actions = np.random.random((data_size, action_size))
    rewards = np.random.random((data_size, 1))
    episode = Episode(
        (observation_size,), action_size, observations, actions, rewards, gamma
    )

    # check Episode methods
    assert np.all(episode.observations == observations)
    assert np.all(episode.actions == actions)
    assert np.all(episode.rewards == rewards)
    assert episode.size() == data_size - 1
    assert episode.get_observation_shape() == (observation_size,)
    assert episode.get_action_size() == action_size
    # the first reward precedes every transition, so it is excluded
    assert episode.compute_return() == np.sum(rewards[1:])

    # check transitions exported from episode
    assert len(episode.transitions) == data_size - 1
    for i, t in enumerate(episode.transitions):
        assert isinstance(t, Transition)
        assert t.observation_shape == (observation_size,)
        assert t.action_size == action_size
        assert np.all(t.observation == observations[i])
        assert np.all(t.action == actions[i])
        assert t.reward == rewards[i]
        assert np.all(t.next_observation == observations[i + 1])
        assert np.all(t.next_action == actions[i + 1])
        assert t.next_reward == rewards[i + 1]
        # only the last transition is terminal
        assert t.terminal == (1.0 if (i == data_size - 2) else 0.0)
        assert len(t.returns) == data_size - i - 1
        assert len(t.consequent_observations) == data_size - i - 1

        # check returns: discounted cumulative sums of future rewards.
        # (BUG FIX: removed stray debugging `print(t.returns)` left inside
        # this loop.)
        ref_return = 0.0
        for j, ret in enumerate(t.returns):
            ref_return += (gamma ** j) * rewards[i + 1 + j][0]
            assert ret == ref_return

    # check list-like behaviors
    assert len(episode) == data_size - 1
    assert episode[0] is episode.transitions[0]
    for i, transition in enumerate(episode):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
def test_dynamics_reward_prediction_error_scorer(
    observation_shape,
    action_size,
    n_episodes,
    episode_length,
    reward_scaler,
):
    """Score should be minus the mean squared reward-prediction error."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size)).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    dynamics = DummyDynamics(np.random.random(observation_shape), reward_scaler)

    ref_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        _, predicted_rewards = dynamics.predict(batch.observations, batch.actions)
        # compare in the same (possibly scaled) space the model predicts in
        if reward_scaler:
            target_rewards = reward_scaler.transform_numpy(batch.next_rewards)
        else:
            target_rewards = batch.next_rewards
        squared = ((target_rewards - predicted_rewards) ** 2).reshape(-1)
        ref_errors += squared.tolist()

    score = dynamics_reward_prediction_error_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(ref_errors))
def test_round_iterator(
    episode_size,
    n_episodes,
    observation_size,
    action_size,
    batch_size,
    shuffle,
    set_ephemeral,
):
    """RoundIterator should yield full batches covering the dataset once."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_size, observation_size))
        acts = np.random.random((episode_size, action_size))
        rews = np.random.random(episode_size)
        episodes.append(Episode((observation_size,), action_size, obs, acts, rews))

    iterator = RoundIterator(episodes, batch_size, shuffle=shuffle)

    # ephemeral transitions are added on top of the dataset for iteration
    if set_ephemeral:
        iterator.set_ephemeral_transitions(episodes[0].transitions)

    n_batches = 0
    for batch in iterator:
        assert batch.observations.shape == (batch_size, observation_size)
        assert batch.actions.shape == (batch_size, action_size)
        assert batch.rewards.shape == (batch_size, 1)
        n_batches += 1

    # ephemeral transitions contribute one extra episode's worth of data
    effective_episodes = n_episodes + 1 if set_ephemeral else n_episodes
    expected = episode_size * effective_episodes // batch_size
    assert n_batches == expected
    assert len(iterator) == expected
def test_transition_minibatch(data_size, observation_shape, action_size,
                              n_frames, discrete_action):
    """TransitionMiniBatch should batch transitions and stack image frames.

    For 3-D (image) observations with n_frames > 1, observations are
    expected to be stacked along the channel axis with zero-padding before
    the episode start; otherwise batched fields mirror each transition.
    """
    # image observations are uint8 pixels; vector observations are float32
    if len(observation_shape) == 3:
        observations = np.random.randint(256,
                                         size=(data_size, *observation_shape),
                                         dtype=np.uint8)
    else:
        observations = np.random.random(
            (data_size, ) + observation_shape).astype('f4')
    if discrete_action:
        actions = np.random.randint(action_size, size=data_size)
    else:
        actions = np.random.random((data_size, action_size)).astype('f4')
    rewards = np.random.random((data_size, 1)).astype('f4')
    episode = Episode(observation_shape=observation_shape,
                      action_size=action_size,
                      observations=observations,
                      actions=actions,
                      rewards=rewards)

    # expected batched shape: frames collapse into the channel dimension
    if len(observation_shape) == 3:
        n_channels = n_frames * observation_shape[0]
        image_size = observation_shape[1:]
        batched_observation_shape = (data_size - 1, n_channels, *image_size)
    else:
        batched_observation_shape = (data_size - 1, *observation_shape)

    # create padded observations for check stacking
    # (n_frames - 1 zero frames precede the episode so every window is full)
    padding = np.zeros((n_frames - 1, *observation_shape), dtype=np.uint8)
    padded_observations = np.vstack([padding, observations])

    batch = TransitionMiniBatch(episode.transitions, n_frames)

    assert batch.observations.shape == batched_observation_shape
    assert batch.next_observations.shape == batched_observation_shape

    for i, t in enumerate(episode.transitions):
        observation = batch.observations[i]
        next_observation = batch.next_observations[i]

        if n_frames > 1 and len(observation_shape) == 3:
            # check frame stacking: window i..i+n_frames over the padded
            # sequence, shifted by one for the next observation
            head_index = i
            tail_index = head_index + n_frames
            window = padded_observations[head_index:tail_index]
            next_window = padded_observations[head_index + 1:tail_index + 1]
            # stack frames along the channel axis
            ref_observation = np.vstack(window)
            ref_next_observation = np.vstack(next_window)
            assert observation.shape == ref_observation.shape
            assert next_observation.shape == ref_next_observation.shape
            assert np.all(observation == ref_observation)
            assert np.all(next_observation == ref_next_observation)
        else:
            # no stacking: batched entries mirror the transition directly
            assert np.allclose(observation, t.observation)
            assert np.allclose(next_observation, t.next_observation)

        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i][0] == t.reward)
        assert np.all(batch.next_actions[i] == t.next_action)
        assert np.all(batch.next_rewards[i][0] == t.next_reward)
        assert np.all(batch.terminals[i][0] == t.terminal)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
def test_transition_minibatch(
    data_size,
    observation_shape,
    action_size,
    n_frames,
    n_steps,
    gamma,
    discrete_action,
    create_mask,
    mask_size,
):
    """TransitionMiniBatch should support frame stacking, n-step targets,
    bootstrap masks, and additional-data attachments.

    The n-step reward for transition i is sum_j gamma^j * r_{i+1+j} over
    n = min(remaining, n_steps) steps, with next observation/action/terminal
    taken from the n-th following transition.
    """
    # image observations are uint8 pixels; vector observations are float32
    if len(observation_shape) == 3:
        observations = np.random.randint(256,
                                         size=(data_size, *observation_shape),
                                         dtype=np.uint8)
    else:
        observations = np.random.random(
            (data_size, ) + observation_shape).astype("f4")
    if discrete_action:
        actions = np.random.randint(action_size, size=data_size)
    else:
        actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random((data_size, 1)).astype("f4")
    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
        create_mask=create_mask,
        mask_size=mask_size,
    )

    # expected batched shape: frames collapse into the channel dimension
    if len(observation_shape) == 3:
        n_channels = n_frames * observation_shape[0]
        image_size = observation_shape[1:]
        batched_observation_shape = (data_size - 1, n_channels, *image_size)
    else:
        batched_observation_shape = (data_size - 1, *observation_shape)

    batch = TransitionMiniBatch(episode.transitions, n_frames, n_steps, gamma)

    assert batch.observations.shape == batched_observation_shape
    assert batch.next_observations.shape == batched_observation_shape

    for i, t in enumerate(episode.transitions):
        observation = batch.observations[i]
        next_observation = batch.next_observations[i]

        # effective number of steps shrinks near the episode end
        n = int(batch.n_steps[i][0])
        assert n == min(data_size - i - 1, n_steps)

        if n_frames > 1 and len(observation_shape) == 3:
            # create padded observations for check stacking
            # ("edge" padding repeats the first frame before episode start)
            pad = ((n_frames - 1, 0), (0, 0), (0, 0), (0, 0))
            padded_observations = np.pad(observations, pad, "edge")
            # check frame stacking: window i..i+n_frames, next window shifted
            # by the effective n steps
            head_index = i
            tail_index = head_index + n_frames
            window = padded_observations[head_index:tail_index]
            next_window = padded_observations[head_index + n:tail_index + n]
            # stack frames along the channel axis
            ref_observation = np.vstack(window)
            ref_next_observation = np.vstack(next_window)
            assert observation.shape == ref_observation.shape
            assert next_observation.shape == ref_next_observation.shape
            assert np.all(observation == ref_observation)
            assert np.all(next_observation == ref_next_observation)
        else:
            # walk n - 1 links forward to find the n-step next observation
            next_t = t
            for _ in range(n - 1):
                next_t = next_t.next_transition
            assert np.allclose(observation, t.observation)
            assert np.allclose(next_observation, next_t.next_observation)

        # accumulate the discounted n-step reward and pick up the n-step
        # next action / terminal flag along the way
        next_reward = 0.0
        next_action = 0.0
        terminal = 0.0
        next_t = t
        for j in range(n):
            next_reward += next_t.next_reward * gamma**j
            next_action = next_t.next_action
            terminal = next_t.terminal
            next_t = next_t.next_transition

        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i][0] == t.reward)
        assert np.all(batch.next_actions[i] == next_action)
        assert np.allclose(batch.next_rewards[i][0], next_reward)
        assert np.all(batch.terminals[i][0] == terminal)

    # check mask
    if create_mask:
        assert batch.masks.shape == (mask_size, data_size - 1, 1)
    else:
        assert batch.masks is None

    # check additional data
    value = np.random.random(100)
    batch.add_additional_data("test", value)
    assert np.all(batch.get_additional_data("test") == value)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
def test_random_iterator(
    episode_size,
    n_steps_per_epoch,
    n_episodes,
    observation_size,
    action_size,
    batch_size,
    real_ratio,
    generated_maxlen,
):
    """RandomIterator should yield n_steps_per_epoch batches and mix
    generated transitions according to real_ratio."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_size, observation_size))
        acts = np.random.random((episode_size, action_size))
        rews = np.random.random(episode_size)
        episodes.append(
            Episode(
                (observation_size,),
                action_size,
                obs,
                acts,
                rews,
                terminal=False,
            )
        )

    iterator = RandomIterator(
        episodes,
        n_steps_per_epoch,
        batch_size,
        real_ratio=real_ratio,
        generated_maxlen=generated_maxlen,
    )

    # check without generated transitions
    n_batches = 0
    for batch in iterator:
        assert batch.observations.shape == (batch_size, observation_size)
        assert batch.actions.shape == (batch_size, action_size)
        assert batch.rewards.shape == (batch_size, 1)
        n_batches += 1
    assert n_batches == n_steps_per_epoch
    assert len(iterator) == n_steps_per_epoch

    # generated transitions are flagged terminal (real ones are not) so
    # their share of each batch can be counted below
    generated = []
    for _ in range(episode_size):
        generated.append(
            Transition(
                (observation_size,),
                action_size,
                np.random.random(observation_size),
                np.random.random(action_size),
                np.random.random(),
                np.random.random(observation_size),
                np.random.random(action_size),
                np.random.random(),
                terminal=True,
            )
        )
    iterator.add_generated_transitions(generated)
    assert len(iterator.generated_transitions) == generated_maxlen

    # check with generated transitions mixed in
    n_batches = 0
    for batch in iterator:
        assert batch.observations.shape == (batch_size, observation_size)
        assert batch.actions.shape == (batch_size, action_size)
        assert batch.rewards.shape == (batch_size, 1)
        assert batch.terminals.sum() == int(batch_size * (1 - real_ratio))
        n_batches += 1
    assert n_batches == n_steps_per_epoch
    assert len(iterator) == n_steps_per_epoch