Example #1
    def make_model(self, env):
        hidden_size = 50

        obs_size = env.observation_space.low.size
        v = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        )

        if self.discrete:
            n_actions = env.action_space.n
            pi = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.Tanh(),
                nn.Linear(hidden_size, n_actions),
                SoftmaxCategoricalHead(),
            )
        else:
            action_size = env.action_space.low.size
            pi = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.Tanh(),
                nn.Linear(hidden_size, action_size),
                GaussianHeadWithStateIndependentCovariance(
                    action_size=action_size,
                    var_type="diagonal",
                var_func=lambda x: torch.exp(2 * x),  # Parameterize log std
                var_param_init=0,  # log std = 0 => std = 1
                ),
            )

        return Branched(pi, v)
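`Branched` here is `pfrl.nn.Branched`: it feeds the same input to every child module and returns a tuple of their outputs, so one forward pass yields both the action distribution and the state value. Below is a minimal standalone sketch of exercising such a model; the observation and action sizes (4 and 2) are hypothetical stand-ins for what `env` would provide:

    import torch
    import torch.nn as nn
    from pfrl.nn import Branched
    from pfrl.policies import GaussianHeadWithStateIndependentCovariance

    obs_size, action_size, hidden_size = 4, 2, 50
    pi = nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.Tanh(),
        nn.Linear(hidden_size, action_size),
        GaussianHeadWithStateIndependentCovariance(
            action_size=action_size,
            var_type="diagonal",
            var_func=lambda x: torch.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )
    v = nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.Tanh(),
        nn.Linear(hidden_size, 1),
    )
    model = Branched(pi, v)

    obs = torch.zeros(1, obs_size)  # dummy batch of one observation
    dist, value = model(obs)  # policy distribution and state value
    action = dist.sample()  # shape: (1, action_size)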
Example #2
    def _test_load_ppo(self, gpu):
        obs_size = 11
        action_size = 3
        from pfrl.policies import GaussianHeadWithStateIndependentCovariance

        policy = torch.nn.Sequential(
            nn.Linear(obs_size, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_size),
            GaussianHeadWithStateIndependentCovariance(
                action_size=action_size,
                var_type="diagonal",
                var_func=lambda x: torch.exp(2 * x),  # Parameterize log std
                var_param_init=0,  # log std = 0 => std = 1
            ),
        )

        vf = torch.nn.Sequential(
            nn.Linear(obs_size, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1),
        )

        model = pnn.Branched(policy, vf)
        opt = torch.optim.Adam(model.parameters(), lr=3e-4, eps=1e-5)

        agent = agents.PPO(
            model,
            opt,
            obs_normalizer=None,
            gpu=gpu,
            update_interval=2048,
            minibatch_size=64,
            epochs=10,
            clip_eps_vf=None,
            entropy_coef=0,
            standardize_advantages=True,
            gamma=0.995,
            lambd=0.97,
        )

        downloaded_model, exists = download_model("PPO",
                                                  "Hopper-v2",
                                                  model_type="final")
        agent.load(downloaded_model)
        if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
            assert exists
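Once `agent.load(downloaded_model)` returns, the pretrained weights are in place and the agent can be rolled out directly. A hedged sketch of one evaluation episode, assuming a Gym `Hopper-v2` environment is available (it requires MuJoCo); `eval_mode()` is the PFRL context manager that switches the agent to evaluation behavior:

    import gym

    env = gym.make("Hopper-v2")
    with agent.eval_mode():
        obs = env.reset()
        done = False
        while not done:
            action = agent.act(obs)
            obs, reward, done, info = env.step(action)
            agent.observe(obs, reward, done, reset=False)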
Example #3
    def make_model(self, env):
        hidden_size = 20
        obs_size = env.observation_space.low.size

        def weight_scale(layer, scale):
            with torch.no_grad():
                layer.weight.mul_(scale)
            return layer

        if self.recurrent:
            v = RecurrentSequential(
                nn.LSTM(num_layers=1,
                        input_size=obs_size,
                        hidden_size=hidden_size),
                weight_scale(nn.Linear(hidden_size, 1), 1e-1),
            )
            if self.discrete:
                n_actions = env.action_space.n
                pi = RecurrentSequential(
                    nn.LSTM(num_layers=1,
                            input_size=obs_size,
                            hidden_size=hidden_size),
                    weight_scale(nn.Linear(hidden_size, n_actions), 1e-1),
                    SoftmaxCategoricalHead(),
                )
            else:
                action_size = env.action_space.low.size
                pi = RecurrentSequential(
                    nn.LSTM(num_layers=1,
                            input_size=obs_size,
                            hidden_size=hidden_size),
                    weight_scale(nn.Linear(hidden_size, action_size), 1e-1),
                    GaussianHeadWithStateIndependentCovariance(
                        action_size=action_size,
                        var_type="diagonal",
                        var_func=lambda x: torch.exp(2 * x),  # Parameterize log std
                        var_param_init=0,  # log std = 0 => std = 1
                    ),
                )
            return RecurrentBranched(pi, v)
        else:
            v = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.Tanh(),
                weight_scale(nn.Linear(hidden_size, 1), 1e-1),
            )
            if self.discrete:
                n_actions = env.action_space.n
                pi = nn.Sequential(
                    nn.Linear(obs_size, hidden_size),
                    nn.Tanh(),
                    weight_scale(nn.Linear(hidden_size, n_actions), 1e-1),
                    SoftmaxCategoricalHead(),
                )
            else:
                action_size = env.action_space.low.size
                pi = nn.Sequential(
                    nn.Linear(obs_size, hidden_size),
                    nn.Tanh(),
                    weight_scale(nn.Linear(hidden_size, action_size), 1e-1),
                    GaussianHeadWithStateIndependentCovariance(
                        action_size=action_size,
                        var_type="diagonal",
                        var_func=lambda x: torch.exp(2 * x),  # Parameterize log std
                        var_param_init=0,  # log std = 0 => std = 1
                    ),
                )
            return pfrl.nn.Branched(pi, v)
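The `weight_scale` helper above multiplies a layer's initial weights in place; scaling the output layers by 1e-1 keeps the initial policy means and value estimates near zero, which tends to make early policy-gradient updates gentler. A quick self-contained check of the in-place scaling (the layer sizes here are arbitrary):

    import torch
    import torch.nn as nn

    layer = nn.Linear(4, 2)
    w_before = layer.weight.detach().clone()
    with torch.no_grad():
        layer.weight.mul_(1e-1)  # scale weights in place, as weight_scale does
    assert torch.allclose(layer.weight, w_before * 1e-1)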
Example #4
    def _test_abc(
        self, use_lstm, discrete=True, steps=1000000, require_success=True, gpu=-1
    ):
        def make_env(process_idx, test):
            size = 2
            return ABC(
                size=size,
                discrete=discrete,
                episodic=True,
                partially_observable=use_lstm,
                deterministic=test,
            )

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        hidden_size = 20
        obs_size = obs_space.low.size
        if discrete:
            output_size = action_space.n
            head = SoftmaxCategoricalHead()
        else:
            output_size = action_space.low.size
            head = GaussianHeadWithStateIndependentCovariance(
                output_size, var_type="diagonal"
            )
        if use_lstm:
            model = pfrl.nn.RecurrentSequential(
                nn.LSTM(
                    num_layers=1,
                    input_size=obs_size,
                    hidden_size=hidden_size,
                ),
                nn.Linear(hidden_size, hidden_size),
                nn.LeakyReLU(),
                nn.Linear(hidden_size, output_size),
                head,
            )
        else:
            model = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                nn.Linear(hidden_size, output_size),
                head,
            )
        opt = torch.optim.Adam(model.parameters())
        beta = 1e-2
        agent = pfrl.agents.REINFORCE(
            model,
            opt,
            gpu=gpu,
            beta=beta,
            batchsize=self.batchsize,
            backward_separately=self.backward_separately,
            act_deterministically=True,
            recurrent=use_lstm,
        )

        pfrl.experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(0, False),
            eval_env=make_env(0, True),
            outdir=self.outdir,
            steps=steps,
            train_max_episode_len=2,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
        )

        # Test
        env = make_env(0, True)
        n_test_runs = 5
        eval_returns, _ = run_evaluation_episodes(
            env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
        )
        if require_success:
            successful_return = 1
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            assert n_succeeded == n_test_runs
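Both policy heads used in these examples convert the final linear layer's output into a `torch.distributions` object that REINFORCE samples from and scores. A small sketch for the discrete case, under the assumption that `SoftmaxCategoricalHead` simply wraps the incoming logits in a `Categorical` distribution:

    import torch
    from pfrl.policies import SoftmaxCategoricalHead

    head = SoftmaxCategoricalHead()
    logits = torch.zeros(1, 3)  # uniform logits over 3 actions
    dist = head(logits)  # a torch.distributions.Categorical
    action = dist.sample()  # e.g. tensor([1])
    log_prob = dist.log_prob(action)  # the quantity REINFORCE weights by return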