Code Example #1
File: gym_batch_rl.py  Project: jayhsieh/ReAgent
def evaluate_gym(
    env: str,
    model,
    eval_temperature: float,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    predictor = DiscreteDqnTorchPredictor(model)
    predictor.softmax_temperature = eval_temperature

    env = EnvFactory.make(env)
    policy = TorchPredictorPolicy(predictor)
    agent = Agent(policy=policy, action_extractor=lambda x: x.item())

    rewards = []
    for _ in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        rewards.append(ep_reward)

    avg_reward = np.mean(rewards)
    assert avg_reward >= passing_score_bar, (
        f"Average reward {avg_reward} is below the bar of {passing_score_bar}."
    )
    logger.info(f"Average reward over {num_eval_episodes} episodes is {avg_reward}, "
                f"which passes the bar of {passing_score_bar}!\n"
                f"List of rewards: {rewards}")
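For context, here is a minimal sketch of the same evaluate-and-assert pattern, assuming the classic `gym` API (`reset` returning an observation, `step` returning a 4-tuple) and using a random policy as a stand-in for the trained predictor:

import gym
import numpy as np

def evaluate_random_policy(env_name, num_eval_episodes, passing_score_bar):
    env = gym.make(env_name)
    rewards = []
    for _ in range(num_eval_episodes):
        obs = env.reset()
        done, ep_reward = False, 0.0
        while not done:
            # A random action stands in for the trained policy here.
            obs, reward, done, _ = env.step(env.action_space.sample())
            ep_reward += reward
        rewards.append(ep_reward)
    avg_reward = np.mean(rewards)
    assert avg_reward >= passing_score_bar, f"{avg_reward} < {passing_score_bar}"
    return avg_reward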
Code Example #2
File: test_oss_workflows.py  Project: zwcdp/ReAgent
    def _test_dqn_workflow(self, use_gpu=False, use_all_avail_gpus=False):
        """Run the DQN workflow end to end to ensure it does not crash;
        algorithm correctness is not tested here."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            lockfile = os.path.join(tmpdirname, "multiprocess_lock")
            Path(lockfile).touch()
            params = {
                "training_data_path": os.path.join(
                    curr_dir, "test_data/discrete_action/cartpole_training.json.bz2"
                ),
                "eval_data_path": os.path.join(
                    curr_dir, "test_data/discrete_action/cartpole_eval.json.bz2"
                ),
                "state_norm_data_path": os.path.join(
                    curr_dir, "test_data/discrete_action/cartpole_norm.json"
                ),
                "model_output_path": tmpdirname,
                "use_gpu": use_gpu,
                "use_all_avail_gpus": use_all_avail_gpus,
                "init_method": "file://" + lockfile,
                "num_nodes": 1,
                "node_index": 0,
                "actions": ["0", "1"],
                "epochs": 1,
                "rl": {},
                "rainbow": {"double_q_learning": False, "dueling_architecture": False},
                "training": {"minibatch_size": 128},
            }
            dqn_workflow.main(params)
            predictor_files = glob.glob(tmpdirname + "/model_*.torchscript")
            assert len(predictor_files) == 1, "Expected exactly one predictor file"
            predictor = DiscreteDqnTorchPredictor(torch.jit.load(predictor_files[0]))
            test_float_state_features = [{"0": 1.0, "1": 1.0, "2": 1.0, "3": 1.0}]
            q_values = predictor.predict(test_float_state_features)
            assert len(q_values[0].keys()) == 2
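The final assertion relies on the predictor's output format: `predict` returns one dict per input state, keyed by action name. A small illustration with made-up Q-values (not real model output):

q_values = [{"0": 0.73, "1": 0.41}]  # illustrative values only
assert len(q_values[0].keys()) == 2
best_action = max(q_values[0], key=q_values[0].get)  # -> "0"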
Code Example #3
    def test_predictor_torch_export(self):
        """Verify that q-values before model export equal q-values after
        model export. Meant to catch issues with export logic."""
        environment = Gridworld()
        samples = Samples(
            mdp_ids=["0"],
            sequence_numbers=[0],
            sequence_number_ordinals=[1],
            states=[{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 15: 1.0, 24: 1.0}],
            actions=["D"],
            action_probabilities=[0.5],
            rewards=[0],
            possible_actions=[["R", "D"]],
            next_states=[{5: 1.0}],
            next_actions=["U"],
            terminals=[False],
            possible_next_actions=[["R", "U", "D"]],
        )
        tdps = environment.preprocess_samples(samples, 1)
        assert len(tdps) == 1, "Invalid number of data pages"

        trainer = self.get_trainer(environment, {}, False, False, False)
        state_input = rlt.FeatureData(tdps[0].states)

        pre_export_q_values = trainer.q_network(state_input).detach().numpy()

        preprocessor = Preprocessor(environment.normalization, False)
        cpu_q_network = trainer.q_network.cpu_model()
        cpu_q_network.eval()
        dqn_with_preprocessor = DiscreteDqnWithPreprocessor(cpu_q_network, preprocessor)
        serving_module = DiscreteDqnPredictorWrapper(
            dqn_with_preprocessor, action_names=environment.ACTIONS
        )

        with tempfile.TemporaryDirectory() as tmpdirname:
            buf = export_module_to_buffer(serving_module)
            tmp_path = os.path.join(tmpdirname, "model")
            with open(tmp_path, "wb") as f:
                f.write(buf.getvalue())
            predictor = DiscreteDqnTorchPredictor(torch.jit.load(tmp_path))

        post_export_q_values = predictor.predict([samples.states[0]])

        for i, action in enumerate(environment.ACTIONS):
            self.assertAlmostEqual(
                float(pre_export_q_values[0][i]),
                float(post_export_q_values[0][action]),
                places=4,
            )
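The invariant under test, reduced to its essentials: serializing a TorchScript module and loading it back must not change its outputs. A self-contained sketch with a toy module in place of the DQN serving module (assuming a reasonably recent PyTorch for `torch.testing.assert_close`):

import io
import torch

net = torch.nn.Linear(4, 2)
scripted = torch.jit.script(net)
buf = io.BytesIO()
torch.jit.save(scripted, buf)  # export to an in-memory buffer
buf.seek(0)
reloaded = torch.jit.load(buf)
x = torch.randn(1, 4)
torch.testing.assert_close(scripted(x), reloaded(x))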
Code Example #4
def create_predictor_policy_from_model(env: gym.Env, model: torch.nn.Module,
                                       **kwargs):
    if isinstance(env.action_space, gym.spaces.Discrete):
        assert "eval_temperature" in kwargs
        predictor = DiscreteDqnTorchPredictor(model)
        predictor.softmax_temperature = kwargs["eval_temperature"]
        return DiscreteDqnTorchPredictorPolicy(predictor)
    elif isinstance(env.action_space, gym.spaces.Box):
        assert len(env.action_space.shape) == 1
        predictor = ActorTorchPredictor(
            model, action_feature_ids=list(range(env.action_space.shape[0]))
        )
        return ActorTorchPredictorPolicy(predictor)
    else:
        raise NotImplementedError(f"{env.action_space} not supported")
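A toy re-creation of the dispatch above, runnable with `gym` alone; the returned strings merely name which ReAgent policy class each branch would construct:

import gym

def policy_class_for(env):
    if isinstance(env.action_space, gym.spaces.Discrete):
        return "DiscreteDqnTorchPredictorPolicy"
    elif isinstance(env.action_space, gym.spaces.Box):
        assert len(env.action_space.shape) == 1
        return "ActorTorchPredictorPolicy"
    raise NotImplementedError(f"{env.action_space} not supported")

print(policy_class_for(gym.make("CartPole-v0")))  # discrete actions
print(policy_class_for(gym.make("Pendulum-v0")))  # continuous actions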
Code Example #5
File: eval_cartpole.py  Project: zwcdp/ReAgent
def main(model_path, temperature):
    model_paths = glob.glob(model_path)
    assert model_paths, f"No model found matching {model_path}"
    predictor = DiscreteDqnTorchPredictor(torch.jit.load(model_paths[0]))
    predictor.softmax_temperature = temperature

    env = OpenAIGymEnvironment(gymenv=ENV)

    avg_rewards, avg_discounted_rewards = env.run_ep_n_times(
        AVG_OVER_NUM_EPS, predictor, test=True
    )

    logger.info(
        "Achieved an average reward score of {} over {} evaluations.".format(
            avg_rewards, AVG_OVER_NUM_EPS
        )
    )
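Both this example and Example #1 set `softmax_temperature`, which controls how greedily the predictor samples actions from its Q-values. A quick illustration of the effect:

import torch

q = torch.tensor([1.0, 2.0])
for t in (0.5, 1.0, 5.0):
    probs = torch.softmax(q / t, dim=0)
    # Lower temperature -> near-greedy; higher -> near-uniform.
    print(t, probs.tolist())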
Code Example #6
    def get_predictor(self, trainer, environment):
        state_preprocessor = Preprocessor(environment.normalization, False)
        q_network = trainer.q_network
        if isinstance(trainer, QRDQNTrainer):
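            # QR-DQN emits a quantile distribution per action; averaging over
            # the quantile dimension yields scalar Q-values for serving.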

            class _Mean(torch.nn.Module):
                def forward(self, input):
                    assert input.ndim == 3
                    return input.mean(dim=2)

            q_network = models.Sequential(q_network, _Mean())

        dqn_with_preprocessor = DiscreteDqnWithPreprocessor(
            q_network.cpu_model().eval(), state_preprocessor
        )
        serving_module = DiscreteDqnPredictorWrapper(
            dqn_with_preprocessor=dqn_with_preprocessor,
            action_names=environment.ACTIONS,
        )
        predictor = DiscreteDqnTorchPredictor(serving_module)
        return predictor
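What the `_Mean` wrapper does, in isolation: a QR-DQN head outputs a tensor of shape (batch, num_actions, num_quantiles), and averaging over the last dimension recovers one scalar Q-value per action. A self-contained check (the quantile count of 51 is illustrative):

import torch

quantile_q = torch.randn(2, 3, 51)  # batch=2, actions=3, quantiles=51
scalar_q = quantile_q.mean(dim=2)
assert scalar_q.shape == (2, 3)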