def test_reward_boost(self):
        """Train a SARSA model with per-action reward boosts, then verify
        that—after un-boosting the sampled rewards—the predictor's
        evaluation error drops from >0.15 (untrained) to <0.05."""
        environment = Gridworld()
        reward_boost = {'L': 100, 'R': 200, 'U': 300, 'D': 400}
        trainer = self.get_sarsa_trainer_reward_boost(environment,
                                                      reward_boost)
        predictor = trainer.predictor()
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        # Undo the per-action boost so evaluation sees the true rewards.
        rewards_update = [
            reward - reward_boost[action]
            for action, reward in zip(actions, rewards)
        ]
        evaluator = GridworldEvaluator(environment, False)

        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards_update,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )

        # Untrained model should evaluate poorly...
        self.assertGreater(evaluator.evaluate(predictor), 0.15)
        for tdp in tdps:
            trainer.train_numpy(tdp, None)

        # ...and well after training.
        self.assertLess(evaluator.evaluate(predictor), 0.05)
 def _test_evaluator_ground_truth_no_dueling(
     self, use_gpu=False, use_all_avail_gpus=False
 ):
     """Run the ground-truth gridworld evaluation with a non-dueling
     SARSA trainer (the trainer also serves as its own exporter)."""
     env = Gridworld()
     sarsa_trainer = self.get_sarsa_trainer(
         env, False, use_gpu=use_gpu, use_all_avail_gpus=use_all_avail_gpus
     )
     gw_evaluator = GridworldEvaluator(env, False, DISCOUNT, False)
     self.evaluate_gridworld(env, gw_evaluator, sarsa_trainer, sarsa_trainer, use_gpu)
Example #3
0
 def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
     """Run the gridworld evaluation for a modular SARSA trainer that is
     configured with per-action reward boosts."""
     env = Gridworld()
     boosts = {"L": 100, "R": 200, "U": 300, "D": 400}
     trainer, exporter = self.get_modular_sarsa_trainer_exporter(
         env, boosts, False, use_gpu, use_all_avail_gpus
     )
     gw_evaluator = GridworldEvaluator(
         env=env, assume_optimal_policy=False, gamma=DISCOUNT
     )
     self.evaluate_gridworld(env, gw_evaluator, trainer, exporter, use_gpu)
Example #4
0
    def test_trainer_sarsa(self):
        """End-to-end SARSA training on Gridworld samples: evaluation error
        must drop from >0.15 (untrained) to <0.05 after streaming all
        preprocessed batches through the trainer."""
        environment = Gridworld()
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        evaluator = GridworldEvaluator(environment, False)
        trainer = self.get_sarsa_trainer(environment)
        predictor = trainer.predictor()
        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )

        self.assertGreater(evaluator.evaluate(predictor), 0.15)

        for tdp in tdps:
            trainer.stream_tdp(tdp, None)
        # A redundant evaluate() call whose result was discarded has been
        # removed here; the assertion below re-evaluates the predictor.

        self.assertLess(evaluator.evaluate(predictor), 0.05)
Example #5
0
    def test_trainer_sarsa_enum(self):
        """SARSA training on the enum-featurized Gridworld: MC loss should
        fall from >0.12 before training to <0.1 after one pass."""
        environment = GridworldEnum()
        samples = environment.generate_samples(500000, 1.0)
        evaluator = GridworldEvaluator(environment, False, DISCOUNT, False,
                                       samples)
        trainer = self.get_sarsa_trainer(environment)
        predictor = trainer.predictor()
        tdps = environment.preprocess_samples(samples, self.minibatch_size)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertGreater(evaluator.mc_loss[-1], 0.12)

        for tdp in tdps:
            # trainer.train appears to expect 1-D reward/terminal arrays;
            # flatten the (presumably column-shaped) tensors first — TODO
            # confirm against preprocess_samples' output shape.
            tdp.rewards = tdp.rewards.flatten()
            tdp.not_terminals = tdp.not_terminals.flatten()
            trainer.train(tdp)

        # Rebuild the predictor so evaluation sees the trained parameters.
        predictor = trainer.predictor()
        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)
    def _test_trainer_sarsa_enum(self,
                                 use_gpu=False,
                                 use_all_avail_gpus=False):
        """SARSA training on the enum Gridworld, optionally on GPU; checks
        that the MC loss drops from >0.12 to below 0.1 after one pass."""
        env = GridworldEnum()
        samples = env.generate_samples(100000, 1.0, DISCOUNT)
        evaluator = GridworldEvaluator(env, False, DISCOUNT, False, samples)
        trainer = self.get_sarsa_trainer(
            env, False, use_gpu=use_gpu, use_all_avail_gpus=use_all_avail_gpus
        )
        predictor = trainer.predictor()
        batches = env.preprocess_samples(
            samples, self.minibatch_size, use_gpu=use_gpu
        )

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertGreater(evaluator.mc_loss[-1], 0.12)

        for batch in batches:
            trainer.train(batch)

        # Refresh the predictor before the post-training evaluation.
        predictor = trainer.predictor()
        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)
Example #7
0
    def test_reward_boost(self):
        """Train with per-action reward boosts, then evaluate against the
        un-boosted rewards; post-training MC loss must be below 0.1."""
        environment = Gridworld()
        reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
        trainer = self.get_sarsa_trainer_reward_boost(environment,
                                                      reward_boost)
        predictor = trainer.predictor()
        samples = environment.generate_samples(150000, 1.0)
        # Undo the per-action boost so the evaluator sees true rewards.
        samples.rewards = [
            reward - reward_boost[action]
            for action, reward in zip(samples.actions, samples.rewards)
        ]
        evaluator = GridworldEvaluator(environment, False, DISCOUNT, False,
                                       samples)

        tdps = environment.preprocess_samples(samples, self.minibatch_size)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )

        # Two epochs over the preprocessed batches.
        for _ in range(2):
            for tdp in tdps:
                trainer.train_numpy(tdp, None)

        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)
Example #8
0
    def test_trainer_sarsa(self):
        """SARSA training on Gridworld samples; after two epochs the
        evaluator's MC loss must be below 0.1."""
        environment = Gridworld()
        samples = environment.generate_samples(150000, 1.0)
        evaluator = GridworldEvaluator(environment, False, DISCOUNT, False,
                                       samples)
        trainer = self.get_sarsa_trainer(environment)
        predictor = trainer.predictor()
        tdps = environment.preprocess_samples(samples, self.minibatch_size)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )

        # Two epochs; evaluating after each epoch appends to the
        # evaluator's loss history so per-epoch progress is recorded.
        for _ in range(2):
            for tdp in tdps:
                trainer.train_numpy(tdp, None)
            evaluator.evaluate(predictor)

        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)
Example #9
0
    def test_knn_dqn_trainer(self):
        """Train a KNN-DQN on Gridworld and require the post-training MC
        loss to improve on the pre-training loss."""
        environment = Gridworld()
        samples = environment.generate_samples(200000, 1.0)
        evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)

        parameters = self.get_parameters(environment)
        trainer = KNNDQNTrainer(parameters, environment.normalization)

        # KNN-DQN consumes raw actions, hence one_hot_action=False.
        tdps = environment.preprocess_samples(
            samples, self.minibatch_size, one_hot_action=False
        )

        predictor = trainer.predictor(environment.ACTIONS)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        # Baseline loss to compare against after training.
        pre_train_loss = evaluator.mc_loss[-1]

        for tdp in tdps:
            # trainer.train appears to expect 1-D reward/terminal arrays —
            # TODO confirm against preprocess_samples' output shape.
            tdp.rewards = tdp.rewards.flatten()
            tdp.not_terminals = tdp.not_terminals.flatten()
            trainer.train(tdp)

        # Rebuild the predictor so it reflects the trained parameters.
        predictor = trainer.predictor(environment.ACTIONS)
        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], pre_train_loss)
Example #10
0
    def test_trainer_sarsa_enum(self):
        """SARSA on the enum Gridworld: MC loss must drop from >0.15 to
        <0.05, and the doubly-robust reward estimate must improve."""
        environment = GridworldEnum()
        samples = environment.generate_samples(100000, 1.0)
        evaluator = GridworldEvaluator(environment, False)
        trainer = self.get_sarsa_trainer(environment)
        predictor = trainer.predictor()
        tdps = environment.preprocess_samples(samples, self.minibatch_size)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertGreater(evaluator.mc_loss[-1], 0.15)

        for tdp in tdps:
            trainer.train_numpy(tdp, None)

        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.05)

        # [-1] is the post-training evaluation, [-2] the pre-training one;
        # training should improve the doubly-robust estimate.
        self.assertGreater(
            evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
        )
Example #11
0
 def _test_evaluator_ground_truth(
     self,
     dueling=False,
     use_gpu=False,
     use_all_avail_gpus=False,
     clip_grad_norm=None,
 ):
     """Ground-truth gridworld evaluation with a modular SARSA trainer
     and exporter (no reward boosts)."""
     env = Gridworld()
     gw_evaluator = GridworldEvaluator(env, False, DISCOUNT)
     trainer, exporter = self.get_modular_sarsa_trainer_exporter(
         env, {}, dueling, use_gpu, use_all_avail_gpus, clip_grad_norm
     )
     self.evaluate_gridworld(env, gw_evaluator, trainer, exporter, use_gpu)
Example #12
0
    def test_trainer_maxq(self):
        """Train a max-Q (Q-learning) DiscreteActionTrainer on Gridworld and
        check the evaluation error falls from >0.3 to <0.1."""
        environment = Gridworld()
        maxq_sarsa_parameters = DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=RLParameters(gamma=DISCOUNT,
                            target_update_rate=0.5,
                            reward_burnin=10,
                            maxq_learning=True),
            training=TrainingParameters(
                layers=[-1, 1],
                activations=['linear'],
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer='ADAM',
            ))
        # Construct a trainer that uses max-Q learning.
        maxq_trainer = DiscreteActionTrainer(
            maxq_sarsa_parameters,
            environment.normalization,
        )
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )
        evaluator = GridworldEvaluator(environment, True)
        # Evaluate once and reuse the value: the original evaluated twice
        # (once for the print, once for the assert), duplicating the work.
        pre_train_eval = evaluator.evaluate(predictor)
        print("Pre-Training eval", pre_train_eval)
        self.assertGreater(pre_train_eval, 0.3)

        for _ in range(2):
            for tdp in tdps:
                maxq_trainer.stream_tdp(tdp, None)
            evaluator.evaluate(predictor)

        post_train_eval = evaluator.evaluate(predictor)
        print("Post-Training eval", post_train_eval)
        self.assertLess(post_train_eval, 0.1)
 def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
     """Gridworld evaluation for a plain (non-dueling, non-distributional)
     trainer configured with per-action reward boosts."""
     env = Gridworld()
     boosts = {"L": 100, "R": 200, "U": 300, "D": 400}
     trainer = self.get_trainer(
         env,
         boosts,
         dueling=False,
         categorical=False,
         quantile=False,
         use_gpu=use_gpu,
         use_all_avail_gpus=use_all_avail_gpus,
     )
     gw_evaluator = GridworldEvaluator(
         env=env, assume_optimal_policy=False, gamma=DISCOUNT
     )
     self.evaluate_gridworld(env, gw_evaluator, trainer, use_gpu)
Example #14
0
    def test_trainer_maxq(self):
        """Train a max-Q (Q-learning) DiscreteActionTrainer for five epochs;
        MC loss must fall from >0.3 to <0.1 and the doubly-robust reward
        estimate must improve."""
        environment = Gridworld()
        maxq_sarsa_parameters = DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=RLParameters(
                gamma=DISCOUNT,
                target_update_rate=0.5,
                reward_burnin=10,
                maxq_learning=True,
            ),
            training=TrainingParameters(
                layers=[-1, 1],
                activations=["linear"],
                minibatch_size=self.minibatch_size,
                learning_rate=0.01,
                optimizer="ADAM",
            ),
        )
        # Construct a trainer that uses max-Q learning.
        maxq_trainer = DiscreteActionTrainer(
            maxq_sarsa_parameters, environment.normalization
        )

        samples = environment.generate_samples(100000, 1.0)
        predictor = maxq_trainer.predictor()
        tdps = environment.preprocess_samples(samples, self.minibatch_size)
        evaluator = GridworldEvaluator(environment, True)

        evaluator.evaluate(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertGreater(evaluator.mc_loss[-1], 0.3)

        # Five epochs over the preprocessed batches.
        for _ in range(5):
            for tdp in tdps:
                maxq_trainer.train_numpy(tdp, None)

        evaluator.evaluate(predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.reward_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)

        # [-1] is the post-training evaluation, [-2] the pre-training one;
        # training should improve the doubly-robust estimate.
        self.assertGreater(
            evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
        )
    def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
        """Train with per-action reward boosts, round-tripping the predictor
        through save/load before each evaluation; post-training MC loss
        must be below 0.1."""

        def _round_trip_eval(pred):
            # Save/load through minidb to exercise predictor serialization,
            # then evaluate the freshly-loaded copy.  (Extracted from two
            # identical inline copies of this logic.)
            with tempfile.TemporaryDirectory() as tmpdirname:
                tmp_path = os.path.join(tmpdirname, "model")
                pred.save(tmp_path, "minidb")
                new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
                evaluator.evaluate(new_predictor)

        environment = Gridworld()
        reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
        trainer = self.get_sarsa_trainer_reward_boost(
            environment,
            reward_boost,
            False,
            use_gpu=use_gpu,
            use_all_avail_gpus=use_all_avail_gpus,
        )
        predictor = trainer.predictor()
        samples = environment.generate_samples(100000, 1.0, DISCOUNT)
        # Undo the per-action boost so evaluation sees true rewards.
        samples.rewards = [
            reward - reward_boost[action]
            for action, reward in zip(samples.actions, samples.rewards)
        ]
        evaluator = GridworldEvaluator(environment, False, DISCOUNT, False,
                                       samples)

        tdps = environment.preprocess_samples(samples,
                                              self.minibatch_size,
                                              use_gpu=use_gpu)

        _round_trip_eval(predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )

        for tdp in tdps:
            trainer.train(tdp, None)

        _round_trip_eval(trainer.predictor())
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)
    def test_trainer_many_batch_sarsa(self, environment):
        """Train SARSA by streaming 100-row sub-pages of one large TDP;
        evaluation error must drop from >0.15 to <0.05.

        NOTE(review): unlike sibling tests this takes `environment` as a
        parameter, so it is presumably invoked by a caller/fixture rather
        than directly by the test runner — confirm.
        """
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        trainer = self.get_sarsa_trainer(environment)
        predictor = trainer.predictor()
        evaluator = GridworldEvaluator(environment, False)
        tdp = environment.preprocess_samples(states, actions, rewards,
                                             next_states, next_actions,
                                             is_terminal,
                                             possible_next_actions,
                                             reward_timelines)

        print("Pre-Training eval", evaluator.evaluate(predictor))
        self.assertGreater(evaluator.evaluate(predictor), 0.15)

        # Stream the data in 100-row pages rather than one big batch.
        for i in range(0, tdp.size(), 100):
            trainer.stream_tdp(tdp.get_sub_page(i, i + 100), None)

        print("Post-Training eval", evaluator.evaluate(predictor))
        # NOTE(review): this bare evaluate() discards its result and the
        # assertion below evaluates again — looks redundant; confirm intent.
        evaluator.evaluate(predictor)

        self.assertLess(evaluator.evaluate(predictor), 0.05)
 def _test_evaluator_ground_truth(
     self,
     dueling=False,
     categorical=False,
     quantile=False,
     use_gpu=False,
     use_all_avail_gpus=False,
     clip_grad_norm=None,
 ):
     """Ground-truth gridworld evaluation for a trainer built with the
     given architecture/distributional flags (no reward boosts)."""
     env = Gridworld()
     gw_evaluator = GridworldEvaluator(env, False, DISCOUNT)
     trainer = self.get_trainer(
         env,
         {},
         dueling=dueling,
         categorical=categorical,
         quantile=quantile,
         use_gpu=use_gpu,
         use_all_avail_gpus=use_all_avail_gpus,
         clip_grad_norm=clip_grad_norm,
     )
     self.evaluate_gridworld(env, gw_evaluator, trainer, use_gpu)