def test_reward_boost(self):
    environment = Gridworld()
    reward_boost = {'L': 100, 'R': 200, 'U': 300, 'D': 400}
    trainer = self.get_sarsa_trainer_reward_boost(environment, reward_boost)
    predictor = trainer.predictor()
    states, actions, rewards, next_states, next_actions, is_terminal,\
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    # Subtract the per-action boost so the evaluator scores against the
    # true (unboosted) rewards.
    rewards_update = []
    for action, reward in zip(actions, rewards):
        rewards_update.append(reward - reward_boost[action])
    evaluator = GridworldEvaluator(environment, False)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards_update,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    self.assertGreater(evaluator.evaluate(predictor), 0.15)
    for tdp in tdps:
        trainer.train_numpy(tdp, None)
    self.assertLess(evaluator.evaluate(predictor), 0.05)

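# Several reward-boost tests in this file undo the per-action boost with the
# same loop. A small helper along these lines could factor that out; note that
# `unboost_rewards` is a hypothetical name, not part of the test suite.
def unboost_rewards(actions, rewards, reward_boost):
    """Subtract the per-action boost to recover the true environment rewards."""
    return [reward - reward_boost[action] for action, reward in zip(actions, rewards)]
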
def _test_evaluator_ground_truth_no_dueling(
    self, use_gpu=False, use_all_avail_gpus=False
):
    environment = Gridworld()
    trainer = self.get_sarsa_trainer(
        environment, False, use_gpu=use_gpu, use_all_avail_gpus=use_all_avail_gpus
    )
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False)
    self.evaluate_gridworld(environment, evaluator, trainer, trainer, use_gpu)

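# The modular tests delegate to a shared `evaluate_gridworld` helper that is
# not shown in this section. Based on the explicit loops in the other tests
# here (generate samples, preprocess, pre-training eval, train, post-training
# eval), a minimal sketch of what it plausibly does follows. The name, the
# signature, and the sample count are assumptions, not the real implementation.
def evaluate_gridworld_sketch(environment, evaluator, trainer, use_gpu, minibatch_size=1024):
    samples = environment.generate_samples(100000, 1.0, DISCOUNT)
    tdps = environment.preprocess_samples(samples, minibatch_size, use_gpu=use_gpu)
    # Score the untrained predictor to establish a baseline.
    evaluator.evaluate(trainer.predictor())
    pre_train_loss = evaluator.mc_loss[-1]
    for tdp in tdps:
        trainer.train(tdp)
    # Training should reduce the Monte Carlo loss below the baseline.
    evaluator.evaluate(trainer.predictor())
    assert evaluator.mc_loss[-1] < pre_train_loss
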
def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer, exporter = self.get_modular_sarsa_trainer_exporter(
        environment, reward_boost, False, use_gpu, use_all_avail_gpus
    )
    evaluator = GridworldEvaluator(
        env=environment, assume_optimal_policy=False, gamma=DISCOUNT
    )
    self.evaluate_gridworld(environment, evaluator, trainer, exporter, use_gpu)

def test_trainer_sarsa(self):
    environment = Gridworld()
    states, actions, rewards, next_states, next_actions, is_terminal,\
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    evaluator = GridworldEvaluator(environment, False)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    self.assertGreater(evaluator.evaluate(predictor), 0.15)
    for tdp in tdps:
        trainer.stream_tdp(tdp, None)
    evaluator.evaluate(predictor)
    self.assertLess(evaluator.evaluate(predictor), 0.05)

def test_trainer_sarsa_enum(self):
    environment = GridworldEnum()
    samples = environment.generate_samples(500000, 1.0)
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.12)
    for tdp in tdps:
        # train() expects flat 1-D reward / terminal arrays.
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp)
    predictor = trainer.predictor()
    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)

def _test_trainer_sarsa_enum(self, use_gpu=False, use_all_avail_gpus=False):
    environment = GridworldEnum()
    samples = environment.generate_samples(100000, 1.0, DISCOUNT)
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    trainer = self.get_sarsa_trainer(
        environment, False, use_gpu=use_gpu, use_all_avail_gpus=use_all_avail_gpus
    )
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, use_gpu=use_gpu
    )
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.12)
    for tdp in tdps:
        trainer.train(tdp)
    predictor = trainer.predictor()
    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)

def test_reward_boost(self):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_sarsa_trainer_reward_boost(environment, reward_boost)
    predictor = trainer.predictor()
    samples = environment.generate_samples(150000, 1.0)
    # Remove the boost from the logged rewards so the evaluator scores
    # against the true (unboosted) reward signal.
    rewards_update = []
    for action, reward in zip(samples.actions, samples.rewards):
        rewards_update.append(reward - reward_boost[action])
    samples.rewards = rewards_update
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    for _ in range(2):
        for tdp in tdps:
            trainer.train_numpy(tdp, None)
    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)

def test_trainer_sarsa(self):
    environment = Gridworld()
    samples = environment.generate_samples(150000, 1.0)
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    for _ in range(2):
        for tdp in tdps:
            trainer.train_numpy(tdp, None)
    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)

def test_knn_dqn_trainer(self):
    environment = Gridworld()
    samples = environment.generate_samples(200000, 1.0)
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    parameters = self.get_parameters(environment)
    trainer = KNNDQNTrainer(parameters, environment.normalization)
    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, one_hot_action=False
    )
    predictor = trainer.predictor(environment.ACTIONS)
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    # Record the baseline loss; the test only asserts that training improves it.
    pre_train_loss = evaluator.mc_loss[-1]
    for tdp in tdps:
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp)
    predictor = trainer.predictor(environment.ACTIONS)
    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], pre_train_loss)

def test_trainer_sarsa_enum(self):
    environment = GridworldEnum()
    samples = environment.generate_samples(100000, 1.0)
    evaluator = GridworldEvaluator(environment, False)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.15)
    for tdp in tdps:
        trainer.train_numpy(tdp, None)
    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.05)
    self.assertGreater(
        evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
    )

def _test_evaluator_ground_truth(
    self,
    dueling=False,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    environment = Gridworld()
    evaluator = GridworldEvaluator(environment, False, DISCOUNT)
    trainer, exporter = self.get_modular_sarsa_trainer_exporter(
        environment, {}, dueling, use_gpu, use_all_avail_gpus, clip_grad_norm
    )
    self.evaluate_gridworld(environment, evaluator, trainer, exporter, use_gpu)

def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer='ADAM',
        ),
    )
    # Construct a new trainer that uses maxq (Q-learning) instead of SARSA.
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters,
        environment.normalization,
    )
    states, actions, rewards, next_states, next_actions, is_terminal,\
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    evaluator = GridworldEvaluator(environment, True)
    print("Pre-Training eval", evaluator.evaluate(predictor))
    self.assertGreater(evaluator.evaluate(predictor), 0.3)
    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.stream_tdp(tdp, None)
        evaluator.evaluate(predictor)
    print("Post-Training eval", evaluator.evaluate(predictor))
    self.assertLess(evaluator.evaluate(predictor), 0.1)

def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_trainer(
        environment,
        reward_boost,
        dueling=False,
        categorical=False,
        quantile=False,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    evaluator = GridworldEvaluator(
        env=environment, assume_optimal_policy=False, gamma=DISCOUNT
    )
    self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)

def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=["linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer="ADAM",
        ),
    )
    # Construct a new trainer that uses maxq (Q-learning) instead of SARSA.
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters, environment.normalization
    )
    samples = environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator = GridworldEvaluator(environment, True)
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.3)
    for _ in range(5):
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)
        evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
    self.assertGreater(
        evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
    )

def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_sarsa_trainer_reward_boost(
        environment,
        reward_boost,
        False,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    predictor = trainer.predictor()
    samples = environment.generate_samples(100000, 1.0, DISCOUNT)
    # Remove the boost from the logged rewards so the evaluator scores
    # against the true (unboosted) reward signal.
    rewards_update = []
    for action, reward in zip(samples.actions, samples.rewards):
        rewards_update.append(reward - reward_boost[action])
    samples.rewards = rewards_update
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, use_gpu=use_gpu
    )
    # Round-trip the predictor through save/load to exercise serialization.
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
    evaluator.evaluate(new_predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    for tdp in tdps:
        trainer.train(tdp, None)
    predictor = trainer.predictor()
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
    evaluator.evaluate(new_predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)

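# _test_reward_boost above repeats the save/load round-trip verbatim before
# each evaluation. A helper like the following sketch could factor it out;
# `save_load_round_trip` is a hypothetical name, and it assumes the same
# tempfile/os/DQNPredictor imports the test already relies on.
def save_load_round_trip(predictor):
    """Serialize a predictor to a temporary minidb store and load it back."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        return DQNPredictor.load(tmp_path, "minidb", False)
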
def test_trainer_many_batch_sarsa(self):
    environment = Gridworld()
    states, actions, rewards, next_states, next_actions, is_terminal,\
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    evaluator = GridworldEvaluator(environment, False)
    tdp = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    )
    print("Pre-Training eval", evaluator.evaluate(predictor))
    self.assertGreater(evaluator.evaluate(predictor), 0.15)
    # Stream the single large TDP to the trainer in pages of 100 transitions.
    for i in range(0, tdp.size(), 100):
        trainer.stream_tdp(tdp.get_sub_page(i, i + 100), None)
    print("Post-Training eval", evaluator.evaluate(predictor))
    self.assertLess(evaluator.evaluate(predictor), 0.05)

def _test_evaluator_ground_truth(
    self,
    dueling=False,
    categorical=False,
    quantile=False,
    use_gpu=False,
    use_all_avail_gpus=False,
    clip_grad_norm=None,
):
    environment = Gridworld()
    evaluator = GridworldEvaluator(environment, False, DISCOUNT)
    trainer = self.get_trainer(
        environment,
        {},
        dueling=dueling,
        categorical=categorical,
        quantile=quantile,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
        clip_grad_norm=clip_grad_norm,
    )
    self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)