def evaluate_gym(
    env: str,
    model,
    eval_temperature: float,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    predictor = DiscreteDqnTorchPredictor(model)
    predictor.softmax_temperature = eval_temperature
    env = EnvFactory.make(env)
    policy = TorchPredictorPolicy(predictor)
    agent = Agent(policy=policy, action_extractor=lambda x: x.item())

    rewards = []
    for _ in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        rewards.append(ep_reward)

    avg_reward = np.mean(rewards)
    # Log the result first, then enforce the score bar; the message no longer
    # claims the bar was passed before the check actually runs.
    logger.info(
        f"Average reward over {num_eval_episodes} episodes is {avg_reward}.\n"
        f"List of rewards: {rewards}"
    )
    assert (
        avg_reward >= passing_score_bar
    ), f"Average reward {avg_reward} is below the bar of {passing_score_bar}"
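
# A minimal usage sketch (assumed, not from the source): load a TorchScript
# discrete-DQN module and check it against a score bar on CartPole. The model
# path, score bar, and episode counts below are hypothetical placeholders.
def _example_evaluate_cartpole(model_path: str = "outputs/model_cartpole.torchscript"):
    model = torch.jit.load(model_path)  # hypothetical artifact path
    evaluate_gym(
        env="CartPole-v0",
        model=model,
        eval_temperature=0.01,  # near-greedy evaluation
        num_eval_episodes=20,
        passing_score_bar=100.0,  # assumed bar, for illustration only
        max_steps=200,
    )
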
def _test_dqn_workflow(self, use_gpu=False, use_all_avail_gpus=False):
    """Run DQN workflow to ensure no crashes; algorithm correctness is not tested here."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        lockfile = os.path.join(tmpdirname, "multiprocess_lock")
        Path(lockfile).touch()
        params = {
            "training_data_path": os.path.join(
                curr_dir, "test_data/discrete_action/cartpole_training.json.bz2"
            ),
            "eval_data_path": os.path.join(
                curr_dir, "test_data/discrete_action/cartpole_eval.json.bz2"
            ),
            "state_norm_data_path": os.path.join(
                curr_dir, "test_data/discrete_action/cartpole_norm.json"
            ),
            "model_output_path": tmpdirname,
            "use_gpu": use_gpu,
            "use_all_avail_gpus": use_all_avail_gpus,
            "init_method": "file://" + lockfile,
            "num_nodes": 1,
            "node_index": 0,
            "actions": ["0", "1"],
            "epochs": 1,
            "rl": {},
            "rainbow": {"double_q_learning": False, "dueling_architecture": False},
            "training": {"minibatch_size": 128},
        }
        dqn_workflow.main(params)
        predictor_files = glob.glob(tmpdirname + "/model_*.torchscript")
        assert len(predictor_files) == 1, "Expected exactly one predictor file"
        predictor = DiscreteDqnTorchPredictor(torch.jit.load(predictor_files[0]))
        # One CartPole state: feature name -> value, one q-value per action expected.
        test_float_state_features = [{"0": 1.0, "1": 1.0, "2": 1.0, "3": 1.0}]
        q_values = predictor.predict(test_float_state_features)
        assert len(q_values[0].keys()) == 2
def test_predictor_torch_export(self):
    """Verify that q-values before model export equal q-values after model export.
    Meant to catch issues with export logic."""
    environment = Gridworld()
    samples = Samples(
        mdp_ids=["0"],
        sequence_numbers=[0],
        sequence_number_ordinals=[1],
        states=[{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 15: 1.0, 24: 1.0}],
        actions=["D"],
        action_probabilities=[0.5],
        rewards=[0],
        possible_actions=[["R", "D"]],
        next_states=[{5: 1.0}],
        next_actions=["U"],
        terminals=[False],
        possible_next_actions=[["R", "U", "D"]],
    )
    tdps = environment.preprocess_samples(samples, 1)
    assert len(tdps) == 1, "Invalid number of data pages"

    trainer = self.get_trainer(environment, {}, False, False, False)
    input = rlt.FeatureData(tdps[0].states)
    pre_export_q_values = trainer.q_network(input).detach().numpy()

    preprocessor = Preprocessor(environment.normalization, False)
    cpu_q_network = trainer.q_network.cpu_model()
    cpu_q_network.eval()
    dqn_with_preprocessor = DiscreteDqnWithPreprocessor(cpu_q_network, preprocessor)
    serving_module = DiscreteDqnPredictorWrapper(
        dqn_with_preprocessor, action_names=environment.ACTIONS
    )

    with tempfile.TemporaryDirectory() as tmpdirname:
        buf = export_module_to_buffer(serving_module)
        tmp_path = os.path.join(tmpdirname, "model")
        with open(tmp_path, "wb") as f:
            f.write(buf.getvalue())
        predictor = DiscreteDqnTorchPredictor(torch.jit.load(tmp_path))
        post_export_q_values = predictor.predict([samples.states[0]])

    for i, action in enumerate(environment.ACTIONS):
        self.assertAlmostEqual(
            float(pre_export_q_values[0][i]),
            float(post_export_q_values[0][action]),
            places=4,
        )
def create_predictor_policy_from_model(env: gym.Env, model: torch.nn.Module, **kwargs):
    if isinstance(env.action_space, gym.spaces.Discrete):
        assert "eval_temperature" in kwargs
        predictor = DiscreteDqnTorchPredictor(model)
        predictor.softmax_temperature = kwargs["eval_temperature"]
        return DiscreteDqnTorchPredictorPolicy(predictor)
    elif isinstance(env.action_space, gym.spaces.Box):
        assert len(env.action_space.shape) == 1
        predictor = ActorTorchPredictor(
            model, action_feature_ids=list(range(env.action_space.shape[0]))
        )
        return ActorTorchPredictorPolicy(predictor)
    else:
        raise NotImplementedError(f"{env.action_space} not supported")
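
# A minimal sketch (assumed, not from the source) of calling the factory above
# for a discrete-action environment. The TorchScript model path is a
# hypothetical placeholder.
def _example_make_policy():
    env = gym.make("CartPole-v0")
    model = torch.jit.load("outputs/model_cartpole.torchscript")  # hypothetical path
    # Discrete action space, so eval_temperature is required by the factory.
    policy = create_predictor_policy_from_model(env, model, eval_temperature=0.01)
    return policy
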
def main(model_path, temperature):
    model_path = glob.glob(model_path)[0]
    predictor = DiscreteDqnTorchPredictor(torch.jit.load(model_path))
    predictor.softmax_temperature = temperature
    env = OpenAIGymEnvironment(gymenv=ENV)

    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(
        AVG_OVER_NUM_EPS, predictor, test=True
    )
    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS
        )
    )
def get_predictor(self, trainer, environment):
    state_preprocessor = Preprocessor(environment.normalization, False)
    q_network = trainer.q_network
    if isinstance(trainer, QRDQNTrainer):

        class _Mean(torch.nn.Module):
            def forward(self, input):
                assert input.ndim == 3
                return input.mean(dim=2)

        # QR-DQN outputs a quantile distribution per action; average over the
        # quantile dimension to obtain scalar q-values for serving.
        q_network = models.Sequential(q_network, _Mean())

    dqn_with_preprocessor = DiscreteDqnWithPreprocessor(
        q_network.cpu_model().eval(), state_preprocessor
    )
    serving_module = DiscreteDqnPredictorWrapper(
        dqn_with_preprocessor=dqn_with_preprocessor,
        action_names=environment.ACTIONS,
    )
    predictor = DiscreteDqnTorchPredictor(serving_module)
    return predictor
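
# A minimal sketch (assumed, not from the source) of how a test might use
# get_predictor: build a trainer for Gridworld, wrap it into a serving
# predictor, and query q-values for one sparse state. self.get_trainer and its
# positional flags mirror the export test above and are assumptions here.
def _example_query_predictor(self):
    environment = Gridworld()
    trainer = self.get_trainer(environment, {}, False, False, False)
    predictor = self.get_predictor(trainer, environment)
    # One sparse state: feature id -> value, matching predict()'s input format.
    q_values = predictor.predict([{0: 1.0, 5: 1.0}])
    assert len(q_values[0]) == len(environment.ACTIONS)
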