def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_sarsa_trainer_reward_boost(
        environment,
        reward_boost,
        False,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    predictor = trainer.predictor()
    samples = environment.generate_samples(100000, 1.0, DISCOUNT)
    # Undo the boost on the logged rewards; the trainer is configured to add
    # it back during training, so evaluation stays on the original scale.
    rewards_update = []
    for action, reward in zip(samples.actions, samples.rewards):
        rewards_update.append(reward - reward_boost[action])
    samples.rewards = rewards_update
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, use_gpu=use_gpu
    )

    # Export the untrained model and evaluate it as a baseline.
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
        evaluator.evaluate(new_predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )

    for tdp in tdps:
        trainer.train(tdp, None)

    # Export the trained model and evaluate it again.
    predictor = trainer.predictor()
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
        evaluator.evaluate(new_predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
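# Public unittest entry point for the private helper above: unittest only
# collects methods named test_*, so a thin wrapper is needed. A minimal
# sketch; the wrapper name is illustrative and not taken from the original
# file.
def test_reward_boost(self):
    self._test_reward_boost()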
def test_predictor_export(self):
    """Verify that q-values before model export equal q-values after model
    export. Meant to catch issues with export logic."""
    environment = Gridworld()
    trainer = self.get_sarsa_trainer(environment, False)
    samples = Samples(
        mdp_ids=["0"],
        sequence_numbers=[0],
        states=[{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 15: 1.0, 24: 1.0}],
        actions=["D"],
        action_probabilities=[0.5],
        rewards=[0],
        possible_actions=[["R", "D"]],
        next_states=[{5: 1.0}],
        next_actions=["U"],
        terminals=[False],
        possible_next_actions=[["R", "U", "D"]],
    )
    tdps = environment.preprocess_samples(samples, 1)

    # Q-values straight from the PyTorch network, before export.
    pre_export_q_values = trainer.q_network(tdps[0].states).detach().numpy()

    # Export to a Caffe2 predictor, reload it, and query the same state.
    predictor = trainer.predictor()
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
    post_export_q_values = new_predictor.predict([samples.states[0]])

    for i, action in enumerate(environment.ACTIONS):
        self.assertAlmostEqual(
            pre_export_q_values[0][i], post_export_q_values[0][action], places=4
        )
def _test_dqn_workflow(self, use_gpu=False, use_all_avail_gpus=False):
    """Run DQN workflow to ensure no crashes, algorithm correctness
    not tested here."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        lockfile = os.path.join(tmpdirname, "multiprocess_lock")
        Path(lockfile).touch()
        params = {
            "training_data_path": os.path.join(
                curr_dir, "test_data/discrete_action/cartpole_training.json.bz2"
            ),
            "eval_data_path": os.path.join(
                curr_dir, "test_data/discrete_action/cartpole_eval.json.bz2"
            ),
            "state_norm_data_path": os.path.join(
                curr_dir, "test_data/discrete_action/cartpole_norm.json"
            ),
            "model_output_path": tmpdirname,
            "use_gpu": use_gpu,
            "use_all_avail_gpus": use_all_avail_gpus,
            "init_method": "file://" + lockfile,
            "num_nodes": 1,
            "node_index": 0,
            "actions": ["0", "1"],
            "epochs": 1,
            "rl": {},
            "rainbow": {"double_q_learning": False, "dueling_architecture": False},
            "training": {"minibatch_size": 128},
        }
        dqn_workflow.main(params)

        # The workflow should emit exactly one exported predictor.
        predictor_files = glob.glob(tmpdirname + "/predictor_*.c2")
        assert len(predictor_files) == 1, "Expected exactly one predictor file!"
        predictor = DQNPredictor.load(predictor_files[0], "minidb")

        # Smoke-test inference: two actions configured, so two q-values.
        test_float_state_features = [{"0": 1.0, "1": 1.0, "2": 1.0, "3": 1.0}]
        q_values = predictor.predict(test_float_state_features)
        assert len(q_values[0].keys()) == 2
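# Same pattern as for the reward-boost test: public test_* wrappers so that
# unittest collects the workflow check. A sketch only; the GPU gate assumes
# torch is imported in this module, and the method names are illustrative.
def test_dqn_workflow(self):
    self._test_dqn_workflow()

def test_dqn_workflow_gpu(self):
    if not torch.cuda.is_available():
        return
    self._test_dqn_workflow(use_gpu=True)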
def test_read_c2_model_from_file(self):
    """Test reading output caffe2 model from file and using it for inference."""
    path = os.path.join(curr_dir, "test_data/discrete_action/example_predictor.c2")
    predictor = DQNPredictor.load(path, "minidb")
    test_float_state_features = [{"0": 1.0, "1": 1.0, "2": 1.0, "3": 1.0}]
    q_values = predictor.predict(test_float_state_features)
    assert len(q_values[0].keys()) == 2
def main(model_path):
    # Load the exported predictor and measure its average episode reward.
    predictor = DQNPredictor.load(model_path, "minidb", int_features=False)
    env = OpenAIGymEnvironment(gymenv=ENV)
    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(
        AVG_OVER_NUM_EPS, predictor, test=True
    )
    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS
        )
    )
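# A minimal command-line entry point for main() above, shown as a sketch.
# ENV, AVG_OVER_NUM_EPS, and logger are assumed to be defined earlier in
# this module; the "--model_path" flag name is hypothetical, not taken
# from the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Evaluate a saved DQN predictor on the configured gym environment."
    )
    parser.add_argument(
        "--model_path", required=True, help="Path to the exported minidb predictor."
    )
    args = parser.parse_args()
    main(args.model_path)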