def test_evaluator_ground_truth(self):
    environment = Gridworld()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, _ = environment.generate_samples(100000, 1.0)
    true_values = environment.true_values_for_sample(states, actions, False)
    # Hijack the reward timeline to insert the ground truth
    reward_timelines = []
    for tv in true_values:
        reward_timelines.append({0: tv})
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    for tdp in tdps:
        trainer.stream_tdp(tdp, evaluator)

    self.assertLess(evaluator.td_loss[-1], 0.05)
    self.assertLess(evaluator.mc_loss[-1], 0.05)
def test_trainer_sarsa(self):
    environment = Gridworld()
    samples = environment.generate_samples(100000, 1.0)
    evaluator = GridworldEvaluator(environment, False)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)

    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.15)

    for tdp in tdps:
        trainer.train_numpy(tdp, None)

    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.05)
    self.assertGreater(
        evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
    )
def test_reward_boost(self):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_sarsa_trainer_reward_boost(environment, reward_boost)
    predictor = trainer.predictor()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    rewards_update = []
    for action, reward in zip(actions, rewards):
        rewards_update.append(reward - reward_boost[action])
    evaluator = GridworldEvaluator(environment, False)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards_update,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )

    self.assertGreater(evaluator.evaluate(predictor), 0.15)

    for tdp in tdps:
        trainer.train_numpy(tdp, None)

    self.assertLess(evaluator.evaluate(predictor), 0.05)
def test_doubly_robust(self):
    """Both the logged and model policies are epsilon-greedy policies where
    greedy = optimal, but the epsilon values are different. We test a variety
    of epsilon pairs to check the estimator's ability to evaluate model
    policies that are very different from the logged policies that generated
    the data. By computing the true values associated with both epsilon
    policies, we can see the performance and compute a percentage error.
    """
    environment = Gridworld()
    dr = DoublyRobustEstimator()
    epsilon_test_pairs = [
        [1.0, 0.05],
        [0.8, 0.2],
        [0.6, 0.4],
        [0.5, 0.5],
        [0.4, 0.6],
        [0.2, 0.8],
        [0.05, 1.0],
    ]
    for epsilon_pair in epsilon_test_pairs:
        epsilon_logged = epsilon_pair[0]
        epsilon_model = epsilon_pair[1]

        samples_logged = environment.generate_samples(
            10000, epsilon_logged, DISCOUNT
        )
        edp = self.create_edp(environment, samples_logged, epsilon_model)
        cpe_drs = dr.estimate(edp)

        true_logged_value = environment.true_q_epsilon_values(
            DISCOUNT, epsilon_logged
        )
        true_model_value = environment.true_q_epsilon_values(
            DISCOUNT, epsilon_model
        )
        ratio = true_model_value[0] / true_logged_value[0]

        cpe_drs_names = [
            "One-step direct method",
            "One-step inverse propensity",
            "One-step doubly robust",
        ]
        for i in range(len(cpe_drs)):
            percent_err = (cpe_drs[i].normalized - ratio) / ratio * 100
            logger.info(
                cpe_drs_names[i]
                + ": epsilon_pair = ("
                + str(epsilon_logged)
                + ", "
                + str(epsilon_model)
                + ");\n"
                + "true ratio = "
                + str(ratio)
                + ", computed ratio = "
                + str(cpe_drs[i].normalized)
                + ", percent error = "
                + str(percent_err)
                + "."
            )
            self.assertLessEqual(np.absolute(percent_err), 1000)
            self.assertLessEqual(
                cpe_drs[i].normalized_std_error, cpe_drs[i].normalized
            )
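# Worked sketch of the percent-error check in test_doubly_robust above, using
# hypothetical hand-picked numbers rather than values produced by the estimator:
# if the true model/logged value ratio is 1.25 and the estimator's normalized
# output is 1.30, the relative error is +4%, well inside the tolerance asserted above.
def _example_percent_error():
    true_ratio = 1.25       # hypothetical true_model_value[0] / true_logged_value[0]
    computed_ratio = 1.30   # hypothetical cpe_drs[i].normalized
    percent_err = (computed_ratio - true_ratio) / true_ratio * 100
    assert abs(percent_err - 4.0) < 1e-9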
def test_predictor_export(self):
    """Verify that q-values before model export equal q-values after model export.
    Meant to catch issues with export logic."""
    environment = Gridworld()
    trainer = self.get_sarsa_trainer(environment, False)
    samples = Samples(
        mdp_ids=["0"],
        sequence_numbers=[0],
        states=[{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 15: 1.0, 24: 1.0}],
        actions=["D"],
        action_probabilities=[0.5],
        rewards=[0],
        possible_actions=[["R", "D"]],
        next_states=[{5: 1.0}],
        next_actions=["U"],
        terminals=[False],
        possible_next_actions=[["R", "U", "D"]],
    )
    tdps = environment.preprocess_samples(samples, 1)

    pre_export_q_values = trainer.q_network(tdps[0].states).detach().numpy()

    predictor = trainer.predictor()
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)

    post_export_q_values = new_predictor.predict([samples.states[0]])

    for i, action in enumerate(environment.ACTIONS):
        self.assertAlmostEqual(
            pre_export_q_values[0][i], post_export_q_values[0][action], places=4
        )
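# Minimal sketch of the pre/post-export comparison in test_predictor_export above,
# with hypothetical q-values and a hypothetical action set: the in-memory network
# returns an array indexed by action position, while the exported predictor returns
# a dict keyed by action name, and the two must agree per action to 4 decimal places.
def _example_export_comparison():
    ACTIONS = ["L", "R", "U", "D"]                                     # hypothetical
    pre_export_q_values = [[1.0, 2.0, 3.0, 4.0]]                       # array-style output
    post_export_q_values = [{"L": 1.0, "R": 2.0, "U": 3.0, "D": 4.0}]  # dict-style output
    for i, action in enumerate(ACTIONS):
        assert abs(pre_export_q_values[0][i] - post_export_q_values[0][action]) < 1e-4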
def test_knn_dqn_trainer(self):
    environment = Gridworld()
    samples = environment.generate_samples(200000, 1.0)
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    parameters = self.get_parameters(environment)
    trainer = KNNDQNTrainer(parameters, environment.normalization)
    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, one_hot_action=False
    )

    predictor = trainer.predictor(environment.ACTIONS)
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    pre_train_loss = evaluator.mc_loss[-1]

    for tdp in tdps:
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp)

    predictor = trainer.predictor(environment.ACTIONS)
    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], pre_train_loss)
def test_trainer_sarsa(self):
    environment = Gridworld()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    evaluator = GridworldEvaluator(environment, False)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )

    self.assertGreater(evaluator.evaluate(predictor), 0.15)

    for tdp in tdps:
        trainer.stream_tdp(tdp, None)
    evaluator.evaluate(predictor)

    self.assertLess(evaluator.evaluate(predictor), 0.05)
def test_reward_boost(self):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_sarsa_trainer_reward_boost(environment, reward_boost)
    predictor = trainer.predictor()
    samples = environment.generate_samples(150000, 1.0)
    rewards_update = []
    for action, reward in zip(samples.actions, samples.rewards):
        rewards_update.append(reward - reward_boost[action])
    samples.rewards = rewards_update
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)

    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )

    for _ in range(2):
        for tdp in tdps:
            trainer.train_numpy(tdp, None)

    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
def test_gridworld_generate_samples(self):
    env = Gridworld()
    num_samples = 1000
    num_steps = 5
    samples = env.generate_samples(
        num_samples, epsilon=1.0, discount_factor=0.9, multi_steps=num_steps
    )
    self._check_samples(samples, num_samples, num_steps, False)
def _test_evaluator_ground_truth_no_dueling(
    self, use_gpu=False, use_all_avail_gpus=False
):
    environment = Gridworld()
    trainer = self.get_sarsa_trainer(
        environment, False, use_gpu=use_gpu, use_all_avail_gpus=use_all_avail_gpus
    )
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False)
    self.evaluate_gridworld(environment, evaluator, trainer, trainer, use_gpu)
def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer, exporter = self.get_modular_sarsa_trainer_exporter(
        environment, reward_boost, False, use_gpu, use_all_avail_gpus
    )
    evaluator = GridworldEvaluator(
        env=environment, assume_optimal_policy=False, gamma=DISCOUNT
    )
    self.evaluate_gridworld(environment, evaluator, trainer, exporter, use_gpu)
def test_evaluator_timeline(self):
    environment = Gridworld()
    samples = environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(1)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for tdp in tdps:
        trainer.train_numpy(tdp, evaluator)

    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def test_magic(self):
    """Both the logged and model policies are epsilon-greedy policies where
    greedy = optimal, but the epsilon values are different. We test a variety
    of epsilon pairs to check the estimator's ability to evaluate model
    policies that are very different from the logged policies that generated
    the data. By computing the true values associated with both epsilon
    policies, we can see the performance and compute a percentage error.
    """
    environment = Gridworld()
    weighted_sequential_dr = WeightedSequentialDoublyRobustEstimator(DISCOUNT)
    epsilon_test_pairs = [
        [1.0, 0.05],
        [0.8, 0.2],
        [0.6, 0.4],
        [0.5, 0.5],
        [0.4, 0.6],
        [0.2, 0.8],
        [0.05, 1.0],
    ]
    for epsilon_pair in epsilon_test_pairs:
        epsilon_logged = epsilon_pair[0]
        epsilon_model = epsilon_pair[1]

        samples_logged = environment.generate_samples(
            10000, epsilon_logged, DISCOUNT
        )
        edp = self.create_edp(environment, samples_logged, epsilon_model)
        cpe_magic = weighted_sequential_dr.estimate(
            edp, TestGridworldCPE.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR, True
        )

        true_logged_value = environment.true_q_epsilon_values(
            DISCOUNT, epsilon_logged
        )
        true_model_value = environment.true_q_epsilon_values(
            DISCOUNT, epsilon_model
        )
        ratio = true_model_value[0] / true_logged_value[0]
        percent_err = (cpe_magic.normalized - ratio) / ratio * 100
        logger.info(
            "Magic: epsilon_pair = ("
            + str(epsilon_logged)
            + ", "
            + str(epsilon_model)
            + ");\n"
            + "true ratio = "
            + str(ratio)
            + ", computed ratio = "
            + str(cpe_magic.normalized)
            + ", percent error = "
            + str(percent_err)
            + "."
        )
        self.assertLessEqual(np.absolute(percent_err), 100)
        self.assertLessEqual(cpe_magic.normalized_std_error, cpe_magic.normalized)
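# Toy sketch (not the library's implementation) of the self-normalized importance
# weights that the weighted sequential DR / MAGIC estimator in test_magic builds on:
# per-step target-policy propensities divided by logged-policy propensities, then
# normalized so the weights sum to 1. All numbers below are hypothetical.
def _example_self_normalized_weights():
    import numpy as np

    target_propensities = np.array([0.9, 0.1, 0.7])  # hypothetical pi_model(a_t | s_t)
    logged_propensities = np.array([0.5, 0.5, 0.5])  # hypothetical pi_logged(a_t | s_t)
    ratios = target_propensities / logged_propensities
    weights = ratios / ratios.sum()
    assert np.isclose(weights.sum(), 1.0)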
def test_predictor_torch_export(self):
    """Verify that q-values before model export equal q-values after model export.
    Meant to catch issues with export logic."""
    environment = Gridworld()
    samples = Samples(
        mdp_ids=["0"],
        sequence_numbers=[0],
        sequence_number_ordinals=[1],
        states=[{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 15: 1.0, 24: 1.0}],
        actions=["D"],
        action_probabilities=[0.5],
        rewards=[0],
        possible_actions=[["R", "D"]],
        next_states=[{5: 1.0}],
        next_actions=["U"],
        terminals=[False],
        possible_next_actions=[["R", "U", "D"]],
    )
    tdps = environment.preprocess_samples(samples, 1)
    assert len(tdps) == 1, "Invalid number of data pages"

    trainer, exporter = self.get_modular_sarsa_trainer_exporter(
        environment, {}, False
    )
    input = rlt.PreprocessedState.from_tensor(tdps[0].states)

    pre_export_q_values = trainer.q_network(input).q_values.detach().numpy()

    preprocessor = Preprocessor(environment.normalization, False)
    cpu_q_network = trainer.q_network.cpu_model()
    cpu_q_network.eval()
    dqn_with_preprocessor = DiscreteDqnWithPreprocessor(cpu_q_network, preprocessor)
    serving_module = DiscreteDqnPredictorWrapper(
        dqn_with_preprocessor, action_names=environment.ACTIONS
    )

    with tempfile.TemporaryDirectory() as tmpdirname:
        buf = export_module_to_buffer(serving_module)
        tmp_path = os.path.join(tmpdirname, "model")
        with open(tmp_path, "wb") as f:
            f.write(buf.getvalue())
        predictor = DiscreteDqnTorchPredictor(torch.jit.load(tmp_path))
        post_export_q_values = predictor.predict([samples.states[0]])

    for i, action in enumerate(environment.ACTIONS):
        self.assertAlmostEqual(
            float(pre_export_q_values[0][i]),
            float(post_export_q_values[0][action]),
            places=4,
        )
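# Minimal sketch of the export round-trip property that test_predictor_torch_export
# verifies, using a toy torch.nn.Linear in place of the real q-network (an assumption
# for illustration): a scripted module saved to a buffer and reloaded must produce
# the same outputs as the in-memory module.
def _example_torchscript_round_trip():
    import io

    import torch

    net = torch.nn.Linear(4, 2)
    net.eval()
    scripted = torch.jit.script(net)
    buf = io.BytesIO()
    torch.jit.save(scripted, buf)
    buf.seek(0)
    reloaded = torch.jit.load(buf)
    x = torch.randn(1, 4)
    assert torch.allclose(net(x), reloaded(x), atol=1e-4)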
def test_gridworld_generate_samples(self):
    env = Gridworld()
    num_samples = 1000
    num_steps = 5
    samples = env.generate_samples(
        num_samples,
        epsilon=1.0,
        discount_factor=0.9,
        multi_steps=num_steps,
        include_shorter_samples_at_start=True,
        include_shorter_samples_at_end=True,
    )
    self._check_samples(samples, num_samples, num_steps, False)
def _test_evaluator_ground_truth(
    self,
    dueling=False,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    environment = Gridworld()
    evaluator = GridworldEvaluator(environment, False, DISCOUNT)
    trainer, exporter = self.get_modular_sarsa_trainer_exporter(
        environment, {}, dueling, use_gpu, use_all_avail_gpus, clip_grad_norm
    )
    self.evaluate_gridworld(environment, evaluator, trainer, exporter, use_gpu)
def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=["linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer="ADAM",
        ),
    )
    # Construct a new trainer that uses maxq learning
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters, environment.normalization
    )

    samples = environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator = GridworldEvaluator(environment, True)

    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.3)

    for _ in range(5):
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)

    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
    self.assertGreater(
        evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
    )
def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_sarsa_trainer_reward_boost(
        environment,
        reward_boost,
        False,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    predictor = trainer.predictor()
    samples = environment.generate_samples(100000, 1.0, DISCOUNT)
    rewards_update = []
    for action, reward in zip(samples.actions, samples.rewards):
        rewards_update.append(reward - reward_boost[action])
    samples.rewards = rewards_update
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, use_gpu=use_gpu
    )

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
        evaluator.evaluate(new_predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )

    for tdp in tdps:
        trainer.train(tdp, None)

    predictor = trainer.predictor()
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
        evaluator.evaluate(new_predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
def test_evaluator_ground_truth_no_dueling(self):
    environment = Gridworld()
    samples = environment.generate_samples(500000, 1.0, DISCOUNT)
    true_values = environment.true_values_for_sample(
        samples.states, samples.actions, False
    )
    # Hijack the episode values to insert the ground truth
    samples.episode_values = true_values
    trainer = self.get_sarsa_trainer(environment, False)
    evaluator = Evaluator(environment.ACTIONS, 10, DISCOUNT, None, None)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for tdp in tdps:
        trainer.train(tdp, evaluator)

    self.assertLess(evaluator.mc_loss[-1], 0.1)
def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_trainer(
        environment,
        reward_boost,
        dueling=False,
        categorical=False,
        quantile=False,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    evaluator = GridworldEvaluator(
        env=environment, assume_optimal_policy=False, gamma=DISCOUNT
    )
    self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)
def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=["linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer="ADAM",
        ),
    )
    # Construct a new trainer that uses maxq learning
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters, environment.normalization
    )
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    evaluator = GridworldEvaluator(environment, True)

    print("Pre-Training eval", evaluator.evaluate(predictor))
    self.assertGreater(evaluator.evaluate(predictor), 0.3)

    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.stream_tdp(tdp, None)
    evaluator.evaluate(predictor)

    print("Post-Training eval", evaluator.evaluate(predictor))
    self.assertLess(evaluator.evaluate(predictor), 0.1)
def test_evaluator_ground_truth(self):
    environment = Gridworld()
    samples = environment.generate_samples(200000, 1.0)
    true_values = environment.true_values_for_sample(
        samples.states, samples.actions, False
    )
    # Hijack the reward timeline to insert the ground truth
    samples.reward_timelines = []
    for tv in true_values:
        samples.reward_timelines.append({0: tv})
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(environment.ACTIONS, 10, DISCOUNT, None, None)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for _ in range(2):
        for tdp in tdps:
            trainer.train_numpy(tdp, evaluator)

    self.assertLess(evaluator.mc_loss[-1], 0.1)
def test_sarsa_layer_validation(self):
    env = Gridworld()
    invalid_sarsa_params = DiscreteActionModelParameters(
        actions=env.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=False,
        ),
        training=TrainingParameters(
            layers=[-1, 3],
            activations=["linear"],
            minibatch_size=32,
            learning_rate=0.1,
            optimizer="SGD",
        ),
    )
    with self.assertRaises(Exception):
        # layers[-1] should be 1
        DiscreteActionTrainer(invalid_sarsa_params, env.normalization)
def _test_evaluator_ground_truth(
    self,
    dueling=False,
    categorical=False,
    quantile=False,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    environment = Gridworld()
    evaluator = GridworldEvaluator(environment, False, DISCOUNT)
    trainer = self.get_trainer(
        environment,
        {},
        dueling=dueling,
        categorical=categorical,
        quantile=quantile,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
        clip_grad_norm=clip_grad_norm,
    )
    self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)
def test_evaluator_timeline(self):
    environment = Gridworld()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(trainer, DISCOUNT)

    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    for tdp in tdps:
        trainer.stream_tdp(tdp, evaluator)

    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def envs():
    return [(Gridworld(),), (GridworldEnum(),)]
def envs_and_evaluators():
    return [
        (Gridworld(), GridworldEvaluator),
        (GridworldEnum(), GridworldEnumEvaluator),
    ]
def test_gridworld_generate_samples(self):
    env = Gridworld()
    num_samples = 1000
    num_steps = 5
    samples = env.generate_samples(
        num_samples, epsilon=1.0, discount_factor=0.9, multi_steps=num_steps
    )

    for i in range(num_samples):
        if samples.terminals[i][0]:
            break
        if i < num_samples - 1:
            self.assertEqual(samples.mdp_ids[i], samples.mdp_ids[i + 1])
            self.assertEqual(
                samples.sequence_numbers[i] + 1, samples.sequence_numbers[i + 1]
            )
        for j in range(len(samples.terminals[i])):
            self.assertEqual(samples.rewards[i][j], samples.rewards[i + j][0])
            self.assertDictEqual(
                samples.next_states[i][j], samples.next_states[i + j][0]
            )
            self.assertEqual(
                samples.next_actions[i][j], samples.next_actions[i + j][0]
            )
            self.assertEqual(samples.terminals[i][j], samples.terminals[i + j][0])
            self.assertListEqual(
                samples.possible_next_actions[i][j],
                samples.possible_next_actions[i + j][0],
            )
            if samples.terminals[i][j]:
                continue
            self.assertDictEqual(
                samples.next_states[i][j], samples.states[i + j + 1]
            )
            self.assertEqual(samples.next_actions[i][j], samples.actions[i + j + 1])
            self.assertListEqual(
                samples.possible_next_actions[i][j],
                samples.possible_actions[i + j + 1],
            )

    single_step_samples = samples.to_single_step()
    for i in range(num_samples):
        if single_step_samples.terminals[i] is True:
            break
        self.assertEqual(single_step_samples.mdp_ids[i], samples.mdp_ids[i])
        self.assertEqual(
            single_step_samples.sequence_numbers[i], samples.sequence_numbers[i]
        )
        self.assertDictEqual(single_step_samples.states[i], samples.states[i])
        self.assertEqual(single_step_samples.actions[i], samples.actions[i])
        self.assertEqual(
            single_step_samples.action_probabilities[i],
            samples.action_probabilities[i],
        )
        self.assertEqual(single_step_samples.rewards[i], samples.rewards[i][0])
        self.assertListEqual(
            single_step_samples.possible_actions[i], samples.possible_actions[i]
        )
        self.assertDictEqual(
            single_step_samples.next_states[i], samples.next_states[i][0]
        )
        self.assertEqual(
            single_step_samples.next_actions[i], samples.next_actions[i][0]
        )
        self.assertEqual(single_step_samples.terminals[i], samples.terminals[i][0])
        self.assertListEqual(
            single_step_samples.possible_next_actions[i],
            samples.possible_next_actions[i][0],
        )