Esempio n. 1
0
    def test_trainer_sarsa(self):
        """Train the trainer from ``get_sarsa_trainer`` and bound the final ``mc_loss``.

        The predictor is evaluated once before training and once after each of
        two passes over the preprocessed minibatches. The latest ``mc_loss``
        and ``value_doubly_robust`` entries are printed before and after
        training, and the final ``mc_loss`` entry must be below 0.1.
        """
        num_passes = 2
        env = Gridworld()
        logged = env.generate_samples(150000, 1.0)
        gw_evaluator = GridworldEvaluator(env, False, DISCOUNT, False, logged)
        sarsa_trainer = self.get_sarsa_trainer(env)
        sarsa_predictor = sarsa_trainer.predictor()
        batches = env.preprocess_samples(logged, self.minibatch_size)

        def report(label):
            # Print the most recent entries recorded on the evaluator.
            print(
                label,
                gw_evaluator.mc_loss[-1],
                gw_evaluator.value_doubly_robust[-1],
            )

        gw_evaluator.evaluate(sarsa_predictor)
        report("Pre-Training eval: ")

        for _pass in range(num_passes):
            for batch in batches:
                sarsa_trainer.train_numpy(batch, None)
            # Re-evaluate at the end of every pass over the data.
            gw_evaluator.evaluate(sarsa_predictor)

        report("Post-Training eval: ")
        self.assertLess(gw_evaluator.mc_loss[-1], 0.1)
Esempio n. 2
0
    def test_evaluator_ground_truth(self):
        """Stream batches labelled with true values and bound the evaluator losses.

        The generated reward timelines are discarded and replaced by
        ``{0: true_value}`` entries built from ``true_values_for_sample``.
        After streaming every batch, the last ``td_loss`` and ``mc_loss``
        entries must both be below 0.05.
        """
        env = Gridworld()
        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            _,
        ) = env.generate_samples(100000, 1.0)
        ground_truth = env.true_values_for_sample(states, actions, False)
        # Substitute the ground-truth value for each sample's reward timeline.
        truth_timelines = [{0: value} for value in ground_truth]
        sarsa_trainer = self.get_sarsa_trainer(env)
        loss_evaluator = Evaluator(sarsa_trainer, DISCOUNT)
        batches = env.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            truth_timelines,
            self.minibatch_size,
        )

        for batch in batches:
            sarsa_trainer.stream_tdp(batch, loss_evaluator)

        final_td_loss = loss_evaluator.td_loss[-1]
        self.assertLess(final_td_loss, 0.05)
        final_mc_loss = loss_evaluator.mc_loss[-1]
        self.assertLess(final_mc_loss, 0.05)
Esempio n. 3
0
    def test_trainer_sarsa(self):
        """Check that streaming the batches lowers the evaluator's result.

        The value returned by ``GridworldEvaluator.evaluate`` must exceed 0.15
        before any training and drop below 0.05 after every batch has been
        streamed through the trainer once.
        """
        env = Gridworld()
        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
        ) = env.generate_samples(100000, 1.0)
        gw_evaluator = GridworldEvaluator(env, False)
        sarsa_trainer = self.get_sarsa_trainer(env)
        sarsa_predictor = sarsa_trainer.predictor()
        batches = env.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )

        score_before = gw_evaluator.evaluate(sarsa_predictor)
        self.assertGreater(score_before, 0.15)

        for batch in batches:
            sarsa_trainer.stream_tdp(batch, None)
        # One evaluation whose result is discarded precedes the asserted one.
        gw_evaluator.evaluate(sarsa_predictor)

        score_after = gw_evaluator.evaluate(sarsa_predictor)
        self.assertLess(score_after, 0.05)
    def test_reward_boost(self):
        """Train a reward-boost trainer on rewards with the boost subtracted.

        Each logged reward has the boost configured for its action subtracted
        before preprocessing. The evaluator's result must exceed 0.15 before
        training and fall below 0.05 after one pass over the batches.
        """
        env = Gridworld()
        boost_by_action = {'L': 100, 'R': 200, 'U': 300, 'D': 400}
        boosted_trainer = self.get_sarsa_trainer_reward_boost(env, boost_by_action)
        boosted_predictor = boosted_trainer.predictor()
        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
        ) = env.generate_samples(100000, 1.0)
        # Remove the per-action boost from every logged reward.
        shifted_rewards = [
            reward - boost_by_action[action]
            for action, reward in zip(actions, rewards)
        ]
        gw_evaluator = GridworldEvaluator(env, False)

        batches = env.preprocess_samples(
            states,
            actions,
            shifted_rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )

        score_before = gw_evaluator.evaluate(boosted_predictor)
        self.assertGreater(score_before, 0.15)
        for batch in batches:
            boosted_trainer.train_numpy(batch, None)

        score_after = gw_evaluator.evaluate(boosted_predictor)
        self.assertLess(score_after, 0.05)
Esempio n. 5
0
 def generate_samples(
     self, num_transitions, epsilon, with_possible=True
 ) -> Samples:
     """Generate Gridworld samples with states rewritten as ``{0: float(key)}``.

     Delegates to ``Gridworld.generate_samples`` and then replaces every
     state and next-state dict with a single-entry dict whose value is the
     float of that dict's first key. All other fields are copied unchanged
     into a new ``Samples``.
     """
     base = Gridworld.generate_samples(
         self, num_transitions, epsilon, with_possible
     )

     def to_enum(state_dicts):
         # Map each state dict to {0: <its first key as a float>}.
         return [{0: float(list(entry.keys())[0])} for entry in state_dicts]

     converted_states = to_enum(base.states)
     converted_next_states = to_enum(base.next_states)
     return Samples(
         mdp_ids=base.mdp_ids,
         sequence_numbers=base.sequence_numbers,
         states=converted_states,
         actions=base.actions,
         propensities=base.propensities,
         rewards=base.rewards,
         next_states=converted_next_states,
         next_actions=base.next_actions,
         terminals=base.terminals,
         possible_next_actions=base.possible_next_actions,
         reward_timelines=base.reward_timelines,
     )
Esempio n. 6
0
    def test_reward_boost(self):
        """Train a reward-boost trainer on rewards with the boost subtracted.

        The rewards on the generated samples are overwritten with the logged
        reward minus the boost of the logged action. The predictor is
        evaluated before training and after two passes over the batches, and
        the final ``mc_loss`` entry must be below 0.1.
        """
        env = Gridworld()
        boost_by_action = {"L": 100, "R": 200, "U": 300, "D": 400}
        boosted_trainer = self.get_sarsa_trainer_reward_boost(env, boost_by_action)
        boosted_predictor = boosted_trainer.predictor()
        logged = env.generate_samples(150000, 1.0)
        # Remove the per-action boost from every logged reward.
        logged.rewards = [
            reward - boost_by_action[action]
            for action, reward in zip(logged.actions, logged.rewards)
        ]
        gw_evaluator = GridworldEvaluator(env, False, DISCOUNT, False, logged)

        batches = env.preprocess_samples(logged, self.minibatch_size)

        def report(label):
            # Print the most recent entries recorded on the evaluator.
            print(
                label,
                gw_evaluator.mc_loss[-1],
                gw_evaluator.value_doubly_robust[-1],
            )

        gw_evaluator.evaluate(boosted_predictor)
        report("Pre-Training eval: ")

        for _pass in range(2):
            for batch in batches:
                boosted_trainer.train_numpy(batch, None)

        gw_evaluator.evaluate(boosted_predictor)
        report("Post-Training eval: ")
        self.assertLess(gw_evaluator.mc_loss[-1], 0.1)
Esempio n. 7
0
 def test_gridworld_generate_samples(self):
     """Generate 1000 five-step Gridworld samples and hand them to ``_check_samples``."""
     sample_count, step_count = 1000, 5
     gridworld = Gridworld()
     multi_step_samples = gridworld.generate_samples(
         sample_count,
         epsilon=1.0,
         discount_factor=0.9,
         multi_steps=step_count,
     )
     self._check_samples(multi_step_samples, sample_count, step_count, False)
Esempio n. 8
0
    def test_evaluator_timeline(self):
        """Train with an attached ``Evaluator`` and bound its final losses.

        Every preprocessed batch is passed to ``train_numpy`` together with
        the evaluator. Afterwards the last ``td_loss`` and ``mc_loss`` entries
        must both be below 0.2.
        """
        env = Gridworld()
        logged = env.generate_samples(100000, 1.0)
        sarsa_trainer = self.get_sarsa_trainer(env)
        loss_evaluator = Evaluator(1)

        for batch in env.preprocess_samples(logged, self.minibatch_size):
            sarsa_trainer.train_numpy(batch, loss_evaluator)

        final_td_loss = loss_evaluator.td_loss[-1]
        self.assertLess(final_td_loss, 0.2)
        final_mc_loss = loss_evaluator.mc_loss[-1]
        self.assertLess(final_mc_loss, 0.2)
Esempio n. 9
0
 def test_sequential_doubly_robust(self):
     """Check the sequential doubly-robust estimate across epsilon pairs.

     The logged and the model policy are both epsilon-greedy with the greedy
     action being optimal; only their epsilon values differ. For each
     (logged, model) epsilon pair the estimator's normalized value is
     compared with the ratio of the true values of the two policies, the
     percentage error is logged, and two bounds are asserted: the absolute
     percentage error is at most 100, and the normalized standard error does
     not exceed the normalized estimate.
     """
     env = Gridworld()
     estimator = SequentialDoublyRobustEstimator(DISCOUNT)
     # (epsilon of the logged policy, epsilon of the model policy)
     epsilon_pairs = [
         (1.0, 0.05),
         (0.8, 0.2),
         (0.6, 0.4),
         (0.5, 0.5),
         (0.4, 0.6),
         (0.2, 0.8),
         (0.05, 1.0),
     ]
     message_template = (
         "Sequential DR: epsilon_pair = ({!s}, {!s});\n"
         "true ratio = {!s}, computed ratio = {!s}, percent error = {!s}."
     )
     for logged_eps, model_eps in epsilon_pairs:
         logged_samples = env.generate_samples(10000, logged_eps, DISCOUNT)
         edp = self.create_edp(env, logged_samples, model_eps)
         estimate = estimator.estimate(edp)
         logged_truth = env.true_q_epsilon_values(DISCOUNT, logged_eps)
         model_truth = env.true_q_epsilon_values(DISCOUNT, model_eps)
         # Ratio of the first true value under each policy.
         true_ratio = model_truth[0] / logged_truth[0]
         error_pct = (estimate.normalized - true_ratio) / true_ratio * 100
         logger.info(
             message_template.format(
                 logged_eps,
                 model_eps,
                 true_ratio,
                 estimate.normalized,
                 error_pct,
             )
         )
         self.assertLessEqual(np.absolute(error_pct), 100)
         self.assertLessEqual(estimate.normalized_std_error, estimate.normalized)
Esempio n. 10
0
 def test_gridworld_generate_samples(self):
     """Generate five-step samples with both shorter-sample flags on and check them.

     The samples are produced with ``include_shorter_samples_at_start`` and
     ``include_shorter_samples_at_end`` set to ``True`` and then passed to
     ``_check_samples``.
     """
     sample_count, step_count = 1000, 5
     gridworld = Gridworld()
     generation_options = {
         "epsilon": 1.0,
         "discount_factor": 0.9,
         "multi_steps": step_count,
         "include_shorter_samples_at_start": True,
         "include_shorter_samples_at_end": True,
     }
     multi_step_samples = gridworld.generate_samples(
         sample_count, **generation_options
     )
     self._check_samples(multi_step_samples, sample_count, step_count, False)
Esempio n. 11
0
    def test_trainer_maxq(self):
        """Train a ``DiscreteActionTrainer`` with ``maxq_learning=True`` on Gridworld.

        The latest ``mc_loss`` entry must exceed 0.3 before training and be
        below 0.1 after five passes over the batches. The last
        ``reward_doubly_robust`` entry must also be greater than the one
        before it.
        """
        env = Gridworld()
        action_names = env.ACTIONS
        rl_settings = RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        )
        training_settings = TrainingParameters(
            layers=[-1, 1],
            activations=["linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer="ADAM",
        )
        model_settings = DiscreteActionModelParameters(
            actions=action_names, rl=rl_settings, training=training_settings
        )
        # Build the trainer from the max-Q configuration above.
        maxq_trainer = DiscreteActionTrainer(model_settings, env.normalization)

        logged = env.generate_samples(100000, 1.0)
        maxq_predictor = maxq_trainer.predictor()
        batches = env.preprocess_samples(logged, self.minibatch_size)
        gw_evaluator = GridworldEvaluator(env, True)

        def report(label):
            # Print the most recent entries recorded on the evaluator.
            print(
                label,
                gw_evaluator.mc_loss[-1],
                gw_evaluator.reward_doubly_robust[-1],
            )

        gw_evaluator.evaluate(maxq_predictor)
        report("Pre-Training eval: ")
        self.assertGreater(gw_evaluator.mc_loss[-1], 0.3)

        for _pass in range(5):
            for batch in batches:
                maxq_trainer.train_numpy(batch, None)

        gw_evaluator.evaluate(maxq_predictor)
        report("Post-Training eval: ")
        self.assertLess(gw_evaluator.mc_loss[-1], 0.1)

        # Compare the two most recent reward_doubly_robust entries.
        latest_dr = gw_evaluator.reward_doubly_robust[-1]
        previous_dr = gw_evaluator.reward_doubly_robust[-2]
        self.assertGreater(latest_dr, previous_dr)
    def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
        """Train a reward-boost trainer and evaluate save/load round-tripped predictors.

        Rewards on the generated samples are overwritten with the logged
        reward minus the boost of the logged action. Before and after one
        pass of training, the predictor is saved to a temporary directory,
        reloaded with ``DQNPredictor.load`` and evaluated. The final
        ``mc_loss`` entry must be below 0.1.

        Args:
            use_gpu: forwarded to the trainer factory and ``preprocess_samples``.
            use_all_avail_gpus: forwarded to the trainer factory.
        """
        env = Gridworld()
        boost_by_action = {"L": 100, "R": 200, "U": 300, "D": 400}
        boosted_trainer = self.get_sarsa_trainer_reward_boost(
            env,
            boost_by_action,
            False,
            use_gpu=use_gpu,
            use_all_avail_gpus=use_all_avail_gpus,
        )
        current_predictor = boosted_trainer.predictor()
        logged = env.generate_samples(100000, 1.0, DISCOUNT)
        # Remove the per-action boost from every logged reward.
        logged.rewards = [
            reward - boost_by_action[action]
            for action, reward in zip(logged.actions, logged.rewards)
        ]
        gw_evaluator = GridworldEvaluator(env, False, DISCOUNT, False, logged)

        batches = env.preprocess_samples(
            logged, self.minibatch_size, use_gpu=use_gpu
        )

        def evaluate_reloaded(model, label):
            # Save, reload and evaluate while the temporary directory exists,
            # then print the most recent evaluator entries.
            with tempfile.TemporaryDirectory() as scratch_dir:
                model_path = os.path.join(scratch_dir, "model")
                model.save(model_path, "minidb")
                reloaded = DQNPredictor.load(model_path, "minidb", False)
                gw_evaluator.evaluate(reloaded)
            print(
                label,
                gw_evaluator.mc_loss[-1],
                gw_evaluator.value_doubly_robust[-1],
            )

        evaluate_reloaded(current_predictor, "Pre-Training eval: ")

        for batch in batches:
            boosted_trainer.train(batch, None)

        # Request a fresh predictor from the trainer after training.
        current_predictor = boosted_trainer.predictor()
        evaluate_reloaded(current_predictor, "Post-Training eval: ")
        self.assertLess(gw_evaluator.mc_loss[-1], 0.1)
Esempio n. 13
0
    def test_evaluator_ground_truth_no_dueling(self):
        """Train against true values stored as episode values and bound ``mc_loss``.

        ``episode_values`` on the generated samples is overwritten with the
        output of ``true_values_for_sample``. After one pass of training with
        the evaluator attached, the last ``mc_loss`` entry must be below 0.1.
        """
        env = Gridworld()
        logged = env.generate_samples(500000, 1.0, DISCOUNT)
        # Overwrite the episode values with the environment's true values.
        logged.episode_values = env.true_values_for_sample(
            logged.states, logged.actions, False
        )
        sarsa_trainer = self.get_sarsa_trainer(env, False)
        loss_evaluator = Evaluator(env.ACTIONS, 10, DISCOUNT, None, None)

        for batch in env.preprocess_samples(logged, self.minibatch_size):
            sarsa_trainer.train(batch, loss_evaluator)

        final_mc_loss = loss_evaluator.mc_loss[-1]
        self.assertLess(final_mc_loss, 0.1)
Esempio n. 14
0
    def test_trainer_maxq(self):
        """Stream Gridworld batches through a ``maxq_learning=True`` trainer.

        The evaluator's result must exceed 0.3 before training and fall below
        0.1 after two passes over the batches. The result is also printed
        before and after training.
        """
        env = Gridworld()
        action_names = env.ACTIONS
        rl_settings = RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        )
        training_settings = TrainingParameters(
            layers=[-1, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer='ADAM',
        )
        model_settings = DiscreteActionModelParameters(
            actions=action_names, rl=rl_settings, training=training_settings
        )
        # Build the trainer from the max-Q configuration above.
        maxq_trainer = DiscreteActionTrainer(model_settings, env.normalization)
        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
        ) = env.generate_samples(100000, 1.0)
        maxq_predictor = maxq_trainer.predictor()
        batches = env.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )
        gw_evaluator = GridworldEvaluator(env, True)
        # The printed and the asserted values come from separate evaluations.
        print("Pre-Training eval", gw_evaluator.evaluate(maxq_predictor))
        score_before = gw_evaluator.evaluate(maxq_predictor)
        self.assertGreater(score_before, 0.3)

        for _pass in range(2):
            for batch in batches:
                maxq_trainer.stream_tdp(batch, None)
            # Evaluate at the end of each pass; the result is not used.
            gw_evaluator.evaluate(maxq_predictor)

        print("Post-Training eval", gw_evaluator.evaluate(maxq_predictor))
        score_after = gw_evaluator.evaluate(maxq_predictor)
        self.assertLess(score_after, 0.1)
Esempio n. 15
0
    def test_evaluator_ground_truth(self):
        """Train against true values stored as reward timelines and bound ``mc_loss``.

        ``reward_timelines`` on the generated samples is replaced by one
        ``{0: true_value}`` dict per sample. After two passes of training with
        the evaluator attached, the last ``mc_loss`` entry must be below 0.1.
        """
        env = Gridworld()
        logged = env.generate_samples(200000, 1.0)
        ground_truth = env.true_values_for_sample(
            logged.states, logged.actions, False
        )
        # Substitute the ground-truth value for each sample's reward timeline.
        logged.reward_timelines = [{0: value} for value in ground_truth]
        sarsa_trainer = self.get_sarsa_trainer(env)
        loss_evaluator = Evaluator(env.ACTIONS, 10, DISCOUNT, None, None)
        batches = env.preprocess_samples(logged, self.minibatch_size)

        for _pass in range(2):
            for batch in batches:
                sarsa_trainer.train_numpy(batch, loss_evaluator)

        final_mc_loss = loss_evaluator.mc_loss[-1]
        self.assertLess(final_mc_loss, 0.1)
Esempio n. 16
0
 def generate_samples(
     self, num_transitions, epsilon, with_possible=True
 ) -> Tuple[
     List[Dict[int, float]],
     List[str],
     List[float],
     List[Dict[int, float]],
     List[str],
     List[bool],
     List[List[str]],
     List[Dict[int, float]],
 ]:
     """Generate Gridworld samples with states rewritten as ``{0: float(key)}``.

     Delegates to ``Gridworld.generate_samples`` and returns the same
     eight-element tuple, except that every state and next-state dict is
     replaced by a single-entry dict whose value is the float of that dict's
     first key.
     """
     (
         states,
         actions,
         rewards,
         next_states,
         next_actions,
         is_terminals,
         possible_next_actions,
         reward_timelines,
     ) = Gridworld.generate_samples(self, num_transitions, epsilon, with_possible)

     def to_enum(state_dicts):
         # Map each state dict to {0: <its first key as a float>}.
         return [{0: float(list(entry.keys())[0])} for entry in state_dicts]

     converted_states = to_enum(states)
     converted_next_states = to_enum(next_states)
     return (
         converted_states,
         actions,
         rewards,
         converted_next_states,
         next_actions,
         is_terminals,
         possible_next_actions,
         reward_timelines,
     )
Esempio n. 17
0
    def test_evaluator_timeline(self):
        """Stream batches with an attached ``Evaluator`` and bound its final losses.

        Every preprocessed batch is passed to ``stream_tdp`` together with the
        evaluator. Afterwards the last ``td_loss`` and ``mc_loss`` entries
        must both be below 0.2.
        """
        env = Gridworld()
        (
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
        ) = env.generate_samples(100000, 1.0)
        sarsa_trainer = self.get_sarsa_trainer(env)
        loss_evaluator = Evaluator(sarsa_trainer, DISCOUNT)

        batches = env.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )
        for batch in batches:
            sarsa_trainer.stream_tdp(batch, loss_evaluator)

        final_td_loss = loss_evaluator.td_loss[-1]
        self.assertLess(final_td_loss, 0.2)
        final_mc_loss = loss_evaluator.mc_loss[-1]
        self.assertLess(final_mc_loss, 0.2)
Esempio n. 18
0
    def test_knn_dqn_trainer(self):
        """Check that one pass of ``KNNDQNTrainer`` training lowers ``mc_loss``.

        The predictor is evaluated before training, every batch is trained on
        after its ``rewards`` and ``not_terminals`` have been flattened, and a
        fresh predictor is evaluated afterwards. The final ``mc_loss`` entry
        must be lower than the pre-training one.
        """
        env = Gridworld()
        logged = env.generate_samples(200000, 1.0, DISCOUNT)
        gw_evaluator = GridworldEvaluator(env, False, DISCOUNT, False, logged)

        knn_parameters = self.get_parameters(env)
        knn_trainer = KNNDQNTrainer(knn_parameters, env.normalization)

        batches = env.preprocess_samples(
            logged, self.minibatch_size, one_hot_action=False
        )

        knn_predictor = knn_trainer.predictor(env.ACTIONS)

        def report(label):
            # Print the most recent entries recorded on the evaluator.
            print(
                label,
                gw_evaluator.mc_loss[-1],
                gw_evaluator.value_doubly_robust[-1],
            )

        gw_evaluator.evaluate(knn_predictor)
        report("Pre-Training eval: ")
        loss_before_training = gw_evaluator.mc_loss[-1]

        for batch in batches:
            # Flatten both arrays in place on the batch before training on it.
            batch.rewards = batch.rewards.flatten()
            batch.not_terminals = batch.not_terminals.flatten()
            knn_trainer.train(batch)

        # Request a fresh predictor from the trainer after training.
        knn_predictor = knn_trainer.predictor(env.ACTIONS)
        gw_evaluator.evaluate(knn_predictor)
        report("Post-Training eval: ")
        self.assertLess(gw_evaluator.mc_loss[-1], loss_before_training)
Esempio n. 19
0
 def generate_samples(
     self, num_transitions, epsilon, discount_factor
 ) -> Samples:
     """Generate Gridworld samples with states rewritten as ``{0: float(key)}``.

     Delegates to ``Gridworld.generate_samples`` and then replaces every
     state and next-state dict with a single-entry dict whose value is the
     float of that dict's first key. All other fields are copied unchanged
     into a new ``Samples``.
     """
     base = Gridworld.generate_samples(
         self, num_transitions, epsilon, discount_factor
     )

     def to_enum(state_dicts):
         # Map each state dict to {0: <its first key as a float>}.
         return [{0: float(list(entry.keys())[0])} for entry in state_dicts]

     converted_states = to_enum(base.states)
     converted_next_states = to_enum(base.next_states)
     return Samples(
         mdp_ids=base.mdp_ids,
         sequence_numbers=base.sequence_numbers,
         states=converted_states,
         actions=base.actions,
         propensities=base.propensities,
         rewards=base.rewards,
         possible_actions=base.possible_actions,
         next_states=converted_next_states,
         next_actions=base.next_actions,
         terminals=base.terminals,
         possible_next_actions=base.possible_next_actions,
         episode_values=base.episode_values,
     )
Esempio n. 20
0
    def test_gridworld_generate_samples(self):
        """Check multi-step Gridworld samples and their single-step conversion.

        Generates 1000 samples with ``multi_steps=5``. For every sample before
        the first one whose first-step terminal flag is set, it asserts that
        the per-step entries agree with the first-step entries of the
        following samples and, for non-terminal steps, with the current
        fields of the sample that follows. It then converts the data with
        ``to_single_step()`` and compares each field with the first step of
        the multi-step data.
        """
        env = Gridworld()
        num_samples = 1000
        num_steps = 5
        samples = env.generate_samples(num_samples,
                                       epsilon=1.0,
                                       discount_factor=0.9,
                                       multi_steps=num_steps)
        # Walk the samples in order and stop at the first sample whose first
        # step is flagged terminal; later samples are not inspected here.
        for i in range(num_samples):
            if samples.terminals[i][0]:
                break
            # Adjacent samples must share an mdp_id and have consecutive
            # sequence numbers.
            if i < num_samples - 1:
                self.assertEqual(samples.mdp_ids[i], samples.mdp_ids[i + 1])
                self.assertEqual(samples.sequence_numbers[i] + 1,
                                 samples.sequence_numbers[i + 1])
            # Step j of sample i must equal step 0 of sample i + j.
            # NOTE(review): indices i + j and i + j + 1 are not bounds-checked;
            # presumably the first terminal occurs well before the end of the
            # generated samples -- confirm.
            for j in range(len(samples.terminals[i])):
                self.assertEqual(samples.rewards[i][j],
                                 samples.rewards[i + j][0])
                self.assertDictEqual(samples.next_states[i][j],
                                     samples.next_states[i + j][0])
                self.assertEqual(samples.next_actions[i][j],
                                 samples.next_actions[i + j][0])
                self.assertEqual(samples.terminals[i][j],
                                 samples.terminals[i + j][0])
                self.assertListEqual(
                    samples.possible_next_actions[i][j],
                    samples.possible_next_actions[i + j][0],
                )
                # Skip the successor checks below when step j is terminal.
                if samples.terminals[i][j]:
                    continue
                # The step-j "next" fields must match the current fields of
                # sample i + j + 1.
                self.assertDictEqual(samples.next_states[i][j],
                                     samples.states[i + j + 1])
                self.assertEqual(samples.next_actions[i][j],
                                 samples.actions[i + j + 1])
                self.assertListEqual(
                    samples.possible_next_actions[i][j],
                    samples.possible_actions[i + j + 1],
                )

        # Compare the single-step conversion with step 0 of the multi-step data.
        single_step_samples = samples.to_single_step()
        for i in range(num_samples):
            # NOTE(review): `is True` is an identity check, so this only breaks
            # when the element is the `True` singleton -- confirm the element
            # type of `terminals`.
            if single_step_samples.terminals[i] is True:
                break
            # Fields without a step dimension are compared directly.
            self.assertEqual(single_step_samples.mdp_ids[i],
                             samples.mdp_ids[i])
            self.assertEqual(single_step_samples.sequence_numbers[i],
                             samples.sequence_numbers[i])
            self.assertDictEqual(single_step_samples.states[i],
                                 samples.states[i])
            self.assertEqual(single_step_samples.actions[i],
                             samples.actions[i])
            self.assertEqual(
                single_step_samples.action_probabilities[i],
                samples.action_probabilities[i],
            )
            # Per-step fields are compared with index 0 of the multi-step entry.
            self.assertEqual(single_step_samples.rewards[i],
                             samples.rewards[i][0])
            self.assertListEqual(single_step_samples.possible_actions[i],
                                 samples.possible_actions[i])
            self.assertDictEqual(single_step_samples.next_states[i],
                                 samples.next_states[i][0])
            self.assertEqual(single_step_samples.next_actions[i],
                             samples.next_actions[i][0])
            self.assertEqual(single_step_samples.terminals[i],
                             samples.terminals[i][0])
            self.assertListEqual(
                single_step_samples.possible_next_actions[i],
                samples.possible_next_actions[i][0],
            )