def _test_sac_trainer(self, use_2_q_functions=False, use_gpu=False):
    environment = GridworldContinuous()
    trainer = self.get_sac_trainer(
        environment, self.get_sac_parameters(use_2_q_functions), use_gpu
    )
    evaluator = GridworldContinuousEvaluator(
        environment,
        assume_optimal_policy=False,
        gamma=DISCOUNT,
        use_int_features=False,
    )
    exporter = self.get_critic_exporter(trainer, environment)

    self.tolerance_threshold = 0.2
    if use_gpu:
        self.run_pre_training_eval = False
    self.evaluate_gridworld(environment, evaluator, trainer, exporter, use_gpu)

    # Make sure actor predictor works
    actor_predictor = self.get_actor_predictor(trainer, environment)
    # Just test that it doesn't blow up
    preds = actor_predictor.predict(evaluator.logged_states, None)
    self._test_save_load_actor(preds, actor_predictor, evaluator.logged_states)

def test_trainer_single_batch_maxq(self, environment):
    rl_parameters = self.get_sarsa_parameters()
    new_rl_parameters = ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=rl_parameters.training,
        knn=rl_parameters.knn,
    )
    maxq_trainer = ContinuousActionDQNTrainer(
        environment.normalization,
        environment.normalization_action,
        new_rl_parameters,
    )

    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    ) = environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdp = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    )
    evaluator = GridworldContinuousEvaluator(environment, True)
    self.assertGreater(evaluator.evaluate(predictor), 0.4)

    for _ in range(2):
        maxq_trainer.stream_tdp(tdp)
        evaluator.evaluate(predictor)

    self.assertLess(evaluator.evaluate(predictor), 0.1)

def test_trainer_maxq(self):
    environment = GridworldContinuous()
    rl_parameters = self.get_sarsa_parameters()
    new_rl_parameters = ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=rl_parameters.training,
        knn=rl_parameters.knn,
    )
    maxq_trainer = ContinuousActionDQNTrainer(
        new_rl_parameters,
        environment.normalization,
        environment.normalization_action,
    )

    samples = environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator = GridworldContinuousEvaluator(environment, True)
    self.assertGreater(evaluator.evaluate(predictor), 0.2)

    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)
        evaluator.evaluate(predictor)

    self.assertLess(evaluator.evaluate(predictor), 0.15)

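# The two max-Q tests above build their parameters the same way: copy the
# SARSA configuration and flip maxq_learning on. A hypothetical helper (not
# part of the test base class) could factor that construction out:
def _to_maxq_parameters(sarsa_parameters):
    return ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=sarsa_parameters.training,
        knn=sarsa_parameters.knn,
    )
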
def _test_trainer_sarsa(
    self, use_gpu=False, use_all_avail_gpus=False, modular=False
):
    environment = GridworldContinuous()
    evaluator = GridworldContinuousEvaluator(
        environment,
        assume_optimal_policy=False,
        gamma=DISCOUNT,
        use_int_features=False,
    )
    if modular:
        # FIXME: the exporter should make a copy of the model instead of
        # moving it to CPU in place
        if use_gpu:
            self.run_pre_training_eval = False
        if use_all_avail_gpus:
            self.tolerance_threshold = 0.11
        trainer, exporter = self.get_modular_sarsa_trainer_exporter(
            environment, None, use_gpu, use_all_avail_gpus
        )
    else:
        trainer, exporter = self.get_sarsa_trainer_exporter(
            environment, None, use_gpu, use_all_avail_gpus
        )
    self.evaluate_gridworld(environment, evaluator, trainer, exporter, use_gpu)

def test_trainer_sarsa_enum(self):
    environment = GridworldContinuousEnum()
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    ) = environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(environment, False)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    self.assertGreater(evaluator.evaluate(predictor), 0.15)

    for tdp in tdps:
        trainer.train_numpy(tdp, None)
    evaluator.evaluate(predictor)

    self.assertLess(evaluator.evaluate(predictor), 0.05)

def _test_trainer_sarsa(self, use_gpu=False, use_all_avail_gpus=False):
    environment = GridworldContinuous()
    evaluator = GridworldContinuousEvaluator(
        environment, assume_optimal_policy=False, gamma=DISCOUNT
    )
    trainer = self.get_trainer(environment, None, use_gpu, use_all_avail_gpus)
    self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)

def _test_trainer_sarsa_factorized(self, use_gpu=False, use_all_avail_gpus=False):
    self.check_tolerance = False
    self.tolerance_threshold = 0.15
    environment = GridworldContinuous()
    trainer, exporter = self.get_sarsa_trainer_exporter(
        environment,
        self.get_sarsa_parameters_factorized(),
        use_gpu,
        use_all_avail_gpus,
    )
    evaluator = GridworldContinuousEvaluator(environment, False, DISCOUNT)
    self.evaluate_gridworld(environment, evaluator, trainer, exporter, use_gpu)

def _test_trainer_sarsa(self, use_gpu=False, use_all_avail_gpus=False):
    environment = GridworldContinuous()
    evaluator = GridworldContinuousEvaluator(
        environment, assume_optimal_policy=False, gamma=DISCOUNT
    )
    if use_all_avail_gpus:
        self.tolerance_threshold = 0.11
    trainer, exporter = self.get_modular_sarsa_trainer_exporter(
        environment, None, use_gpu, use_all_avail_gpus
    )
    self.evaluate_gridworld(environment, evaluator, trainer, exporter, use_gpu)

def test_trainer_sarsa_enum(self):
    environment = GridworldContinuousEnum()
    samples = environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(
        environment, False, DISCOUNT, False, samples
    )
    tdps = environment.preprocess_samples(samples, self.minibatch_size)

    for tdp in tdps:
        trainer.train_numpy(tdp, None)
    evaluator.evaluate(predictor)

    self.assertLess(evaluator.evaluate(predictor), 0.15)

def _test_sac_trainer(self, use_gpu=False, **kwargs):
    environment = GridworldContinuous()
    trainer = self.get_sac_trainer(environment, use_gpu, **kwargs)
    evaluator = GridworldContinuousEvaluator(
        environment, assume_optimal_policy=False, gamma=DISCOUNT
    )
    self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)

    # Make sure actor predictor works
    actor_predictor = self.get_actor_predictor(trainer, environment)
    # Just test that it doesn't blow up
    preds = actor_predictor.predict(evaluator.logged_states)
    self._test_save_load_actor(preds, actor_predictor, evaluator.logged_states)

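# For context, a hypothetical sketch of the round trip that
# _test_save_load_actor is expected to exercise; the save/load signatures
# below are assumptions for illustration, not the actual helper API.
import os
import tempfile

def _example_save_load_actor(preds, actor_predictor, states):
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "actor_predictor.db")
        actor_predictor.save(path, "minidb")  # assumed signature
        loaded = type(actor_predictor).load(path, "minidb")  # assumed signature
        new_preds = loaded.predict(states)
        assert len(new_preds) == len(preds)
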
def test_trainer_sarsa_enum_factorized(self):
    environment = GridworldContinuousEnum()
    samples = environment.generate_samples(500000, 1.0, DISCOUNT)
    trainer = self.get_sarsa_trainer(
        environment, self.get_sarsa_parameters_factorized()
    )
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(
        environment, False, DISCOUNT, False, samples
    )
    tdps = environment.preprocess_samples(samples, self.minibatch_size)

    for tdp in tdps:
        trainer.train(tdp)
    predictor = trainer.predictor()
    evaluator.evaluate(predictor)

    self.assertLess(evaluator.evaluate(predictor), 0.15)

def test_trainer_sarsa_enum(self):
    environment = GridworldContinuousEnum()
    samples = environment.generate_samples(150000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(
        environment, False, DISCOUNT, False, samples
    )
    tdps = environment.preprocess_samples(samples, self.minibatch_size)

    for tdp in tdps:
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp)
    predictor = trainer.predictor()
    evaluator.evaluate(predictor)

    self.assertLess(evaluator.evaluate(predictor), 0.15)

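# A minimal illustration of the shape fix-up above, assuming preprocessing
# emits (batch_size, 1) column vectors while this trainer's train() expects
# flat (batch_size,) arrays:
#
#     >>> import numpy as np
#     >>> np.zeros((16, 1)).flatten().shape
#     (16,)
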
def test_trainer_single_batch_sarsa(self, environment):
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    ) = environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(environment, False)
    tdp = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    )
    self.assertGreater(evaluator.evaluate(predictor), 0.15)

    trainer.stream_tdp(tdp)
    evaluator.evaluate(predictor)

    self.assertLess(evaluator.evaluate(predictor), 0.05)

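# Note the contrast with the minibatch tests above: stream_tdp consumes the
# whole preprocessed dataset as a single training data page, whereas
# preprocess_samples(samples, self.minibatch_size) returns a list of pages
# that are fed to train()/train_numpy() one page at a time.
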
def _test_td3_trainer(self, use_gpu=False, **kwargs):
    environment = GridworldContinuous()
    trainer = self.get_td3_trainer(
        environment, self.get_td3_parameters(**kwargs), use_gpu
    )
    evaluator = GridworldContinuousEvaluator(
        environment, assume_optimal_policy=False, gamma=DISCOUNT
    )

    # Evaluate both critics when TD3 is configured with twin Q-networks
    self.current_predictor_network = trainer.q1_network
    self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)
    if trainer.q2_network is not None:
        self.current_predictor_network = trainer.q2_network
        self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)

    # Make sure actor predictor works
    actor_predictor = self.get_actor_predictor(trainer, environment)
    preds = actor_predictor.predict(evaluator.logged_states)
    self._test_save_load_actor(preds, actor_predictor, evaluator.logged_states)

def _test_td3_trainer(self, use_gpu=False, **kwargs):
    environment = GridworldContinuous()
    trainer = self.get_td3_trainer(
        environment, self.get_td3_parameters(**kwargs), use_gpu
    )
    evaluator = GridworldContinuousEvaluator(
        environment, assume_optimal_policy=False, gamma=DISCOUNT
    )
    exporter1, exporter2 = self.get_critic_exporter(trainer, environment)

    self.evaluate_gridworld(environment, evaluator, trainer, exporter1, use_gpu)
    if exporter2:
        self.evaluate_gridworld(environment, evaluator, trainer, exporter2, use_gpu)

    # Make sure actor predictor works
    actor_predictor = self.get_actor_predictor(trainer, environment)
    preds = actor_predictor.predict(evaluator.logged_states)
    self._test_save_load_actor(preds, actor_predictor, evaluator.logged_states)
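
# Sketch of a possible generalization (hypothetical helper, not part of the
# test base class): evaluate however many critic exporters exist instead of
# special-casing the optional second one.
def _evaluate_critic_exporters(
    self, environment, evaluator, trainer, exporters, use_gpu
):
    for exporter in exporters:
        if exporter is not None:
            self.evaluate_gridworld(
                environment, evaluator, trainer, exporter, use_gpu
            )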