    def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
        environment = Gridworld()
        reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
        trainer = self.get_sarsa_trainer_reward_boost(
            environment,
            reward_boost,
            False,
            use_gpu=use_gpu,
            use_all_avail_gpus=use_all_avail_gpus,
        )
        predictor = trainer.predictor()
        samples = environment.generate_samples(100000, 1.0, DISCOUNT)
        # Subtract the per-action reward boost from each logged reward; the
        # trainer above is configured to re-apply the same boost during training.
        rewards_update = []
        for action, reward in zip(samples.actions, samples.rewards):
            rewards_update.append(reward - reward_boost[action])
        samples.rewards = rewards_update
        evaluator = GridworldEvaluator(environment, False, DISCOUNT, False,
                                       samples)

        tdps = environment.preprocess_samples(samples,
                                              self.minibatch_size,
                                              use_gpu=use_gpu)

        with tempfile.TemporaryDirectory() as tmpdirname:
            tmp_path = os.path.join(tmpdirname, "model")
            predictor.save(tmp_path, "minidb")
            new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
            evaluator.evaluate(new_predictor)
        print(
            "Pre-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )

        for tdp in tdps:
            trainer.train(tdp, None)

        predictor = trainer.predictor()
        with tempfile.TemporaryDirectory() as tmpdirname:
            tmp_path = os.path.join(tmpdirname, "model")
            predictor.save(tmp_path, "minidb")
            new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
            evaluator.evaluate(new_predictor)
        print(
            "Post-Training eval: ",
            evaluator.mc_loss[-1],
            evaluator.value_doubly_robust[-1],
        )
        self.assertLess(evaluator.mc_loss[-1], 0.1)

    def test_predictor_export(self):
        """Verify that q-values before model export equal q-values after
        model export. Meant to catch issues with export logic."""
        environment = Gridworld()
        trainer = self.get_sarsa_trainer(environment, False)

        samples = Samples(
            mdp_ids=["0"],
            sequence_numbers=[0],
            states=[{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 15: 1.0, 24: 1.0}],
            actions=["D"],
            action_probabilities=[0.5],
            rewards=[0],
            possible_actions=[["R", "D"]],
            next_states=[{5: 1.0}],
            next_actions=["U"],
            terminals=[False],
            possible_next_actions=[["R", "U", "D"]],
        )
        tdps = environment.preprocess_samples(samples, 1)

        pre_export_q_values = trainer.q_network(tdps[0].states).detach().numpy()

        predictor = trainer.predictor()
        with tempfile.TemporaryDirectory() as tmpdirname:
            tmp_path = os.path.join(tmpdirname, "model")
            predictor.save(tmp_path, "minidb")
            new_predictor = DQNPredictor.load(tmp_path, "minidb", False)

        post_export_q_values = new_predictor.predict([samples.states[0]])

        for i, action in enumerate(environment.ACTIONS):
            self.assertAlmostEqual(
                pre_export_q_values[0][i], post_export_q_values[0][action], places=4
            )
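
Both tests above exercise the same export round trip: save the trained predictor to a temporary minidb file, reload it with DQNPredictor.load, and query the reloaded copy. A minimal sketch of that pattern as a standalone helper, assuming DQNPredictor is importable as in the surrounding test module (the helper name is hypothetical):

import os
import tempfile


def roundtrip_predictor(predictor):
    """Hypothetical helper: save a predictor to a temporary minidb file and
    load it back, mirroring the export step used in the tests above."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        # int_features=False, matching the load calls in the tests above
        return DQNPredictor.load(tmp_path, "minidb", False)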
Example 3
    def _test_dqn_workflow(self, use_gpu=False, use_all_avail_gpus=False):
        """Run the DQN workflow to ensure it does not crash; algorithm
        correctness is not tested here."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            lockfile = os.path.join(tmpdirname, "multiprocess_lock")
            Path(lockfile).touch()
            params = {
                "training_data_path": os.path.join(
                    curr_dir, "test_data/discrete_action/cartpole_training.json.bz2"
                ),
                "eval_data_path": os.path.join(
                    curr_dir, "test_data/discrete_action/cartpole_eval.json.bz2"
                ),
                "state_norm_data_path": os.path.join(
                    curr_dir, "test_data/discrete_action/cartpole_norm.json"
                ),
                "model_output_path": tmpdirname,
                "use_gpu": use_gpu,
                "use_all_avail_gpus": use_all_avail_gpus,
                "init_method": "file://" + lockfile,
                "num_nodes": 1,
                "node_index": 0,
                "actions": ["0", "1"],
                "epochs": 1,
                "rl": {},
                "rainbow": {"double_q_learning": False, "dueling_architecture": False},
                "training": {"minibatch_size": 128},
            }
            dqn_workflow.main(params)
            predictor_files = glob.glob(tmpdirname + "/predictor_*.c2")
            assert len(predictor_files) == 1, "Expected exactly one predictor file!"
            predictor = DQNPredictor.load(predictor_files[0], "minidb")
            test_float_state_features = [{"0": 1.0, "1": 1.0, "2": 1.0, "3": 1.0}]
            q_values = predictor.predict(test_float_state_features)
        assert len(q_values[0].keys()) == 2
Example 4
    def test_read_c2_model_from_file(self):
        """Test reading an output caffe2 model from a file and using it for inference."""
        path = os.path.join(curr_dir, "test_data/discrete_action/example_predictor.c2")
        predictor = DQNPredictor.load(path, "minidb")
        test_float_state_features = [{"0": 1.0, "1": 1.0, "2": 1.0, "3": 1.0}]
        q_values = predictor.predict(test_float_state_features)
        assert len(q_values[0].keys()) == 2
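
The assertion above also pins down the predictor's I/O contract: predict takes a list of feature dicts keyed by feature-name strings and returns, for each input state, a dict mapping action names to Q-values. A short usage sketch under that assumption, reusing the predictor loaded in the test:

# Assumes `predictor` was loaded as in test_read_c2_model_from_file above.
q_values = predictor.predict([{"0": 1.0, "1": 1.0, "2": 1.0, "3": 1.0}])
for action, value in q_values[0].items():
    print(action, value)  # two entries, one Q-value per action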
Example 5
def main(model_path):
    # Load a previously exported predictor and evaluate it in the gym environment.
    predictor = DQNPredictor.load(model_path, "minidb", int_features=False)

    env = OpenAIGymEnvironment(gymenv=ENV)

    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(
        AVG_OVER_NUM_EPS, predictor, test=True
    )

    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS
        )
    )
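
The original snippet does not show how main is invoked; a minimal, hypothetical entry point, assuming the exported minidb model path arrives as the first command-line argument:

if __name__ == "__main__":
    import sys

    # Hypothetical wiring: sys.argv[1] is the path to the exported minidb model.
    main(sys.argv[1])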