def _test_abc(
    self,
    t_max,
    recurrent,
    discrete=True,
    episodic=True,
    steps=100000,
    require_success=True,
):
    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(
            size=size,
            discrete=discrete,
            episodic=episodic or test,
            partially_observable=self.recurrent,
            deterministic=test,
        )

    # Model and optimizer used by the asynchronous actor processes.
    env = make_env(0, False)
    model = self.make_model(env)

    from pfrl.optimizers import SharedRMSpropEpsInsideSqrt

    opt = SharedRMSpropEpsInsideSqrt(model.parameters())
    gamma = 0.8
    beta = 1e-2
    agent = a3c.A3C(
        model,
        opt,
        t_max=t_max,
        gamma=gamma,
        beta=beta,
        act_deterministically=True,
        max_grad_norm=1.0,
        recurrent=recurrent,
    )

    max_episode_len = None if episodic else 2

    # Train asynchronously and make sure no warnings are raised.
    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir,
            processes=nproc,
            make_env=make_env,
            agent=agent,
            steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
        )
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    if require_success:
        agent.load(os.path.join(self.outdir, "successful"))

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    eval_returns = run_evaluation_episodes(
        env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
        max_episode_len=max_episode_len,
    )
    successful_return = 1
    if require_success:
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        assert n_succeeded == n_test_runs
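
# The helper above relies on self.make_model(env), which is defined elsewhere
# in the test class. As a rough illustration of what A3C expects from that
# model in the discrete, non-recurrent case, here is a minimal sketch: a
# module mapping an observation batch to a (policy distribution, state value)
# pair. The function name and layer sizes are illustrative assumptions, not
# the test suite's actual implementation; torch.nn as nn, pfrl, and
# SoftmaxCategoricalHead are assumed to be imported at module level, as
# elsewhere in these tests.
def _make_discrete_a3c_model_sketch(env, hidden_size=50):
    obs_size = env.observation_space.low.size
    n_actions = env.action_space.n
    return nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.Tanh(),
        pfrl.nn.Branched(
            # Policy branch: action logits -> categorical distribution.
            nn.Sequential(
                nn.Linear(hidden_size, n_actions),
                SoftmaxCategoricalHead(),
            ),
            # Value branch: scalar state-value estimate.
            nn.Linear(hidden_size, 1),
        ),
    )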

def _test_abc(
    self,
    t_max,
    use_lstm,
    discrete=True,
    episodic=True,
    steps=100000,
    require_success=True,
):
    nproc = 8

    def make_env(process_idx, test):
        size = 2
        return ABC(
            size=size,
            discrete=discrete,
            episodic=episodic or test,
            partially_observable=self.use_lstm,
            deterministic=test,
        )

    # Use a sample env only to read the observation and action spaces.
    sample_env = make_env(0, False)
    action_space = sample_env.action_space
    obs_space = sample_env.observation_space

    replay_buffer = EpisodicReplayBuffer(10**4)
    obs_size = obs_space.low.size
    hidden_size = 20
    # Policy/Q heads for discrete actions, or policy/V/advantage heads for
    # continuous actions.
    if discrete:
        n_actions = action_space.n
        head = acer.ACERDiscreteActionHead(
            pi=nn.Sequential(
                nn.Linear(hidden_size, n_actions),
                SoftmaxCategoricalHead(),
            ),
            q=nn.Sequential(
                nn.Linear(hidden_size, n_actions),
                DiscreteActionValueHead(),
            ),
        )
    else:
        action_size = action_space.low.size
        head = acer.ACERContinuousActionHead(
            pi=nn.Sequential(
                nn.Linear(hidden_size, action_size * 2),
                GaussianHeadWithDiagonalCovariance(),
            ),
            v=nn.Sequential(
                nn.Linear(hidden_size, 1),
            ),
            adv=nn.Sequential(
                ConcatObsAndAction(),
                nn.Linear(hidden_size + action_size, 1),
            ),
        )
    # Feature extractor, optionally recurrent, followed by the ACER head.
    if use_lstm:
        model = pfrl.nn.RecurrentSequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(),
            nn.LSTM(num_layers=1, input_size=hidden_size, hidden_size=hidden_size),
            head,
        )
    else:
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(),
            head,
        )
    eps = 1e-8
    opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(
        model.parameters(), lr=1e-3, eps=eps, alpha=0.99
    )
    gamma = 0.5
    beta = 1e-5
    if self.n_times_replay == 0 and self.disable_online_update:
        # At least one of online updates and experience replay must be enabled.
        pytest.skip()
    agent = acer.ACER(
        model,
        opt,
        replay_buffer=replay_buffer,
        t_max=t_max,
        gamma=gamma,
        beta=beta,
        n_times_replay=self.n_times_replay,
        act_deterministically=True,
        disable_online_update=self.disable_online_update,
        replay_start_size=100,
        use_trust_region=self.use_trust_region,
        recurrent=use_lstm,
    )

    max_episode_len = None if episodic else 2

    # Train asynchronously and make sure no warnings are raised.
    with warnings.catch_warnings(record=True) as warns:
        train_agent_async(
            outdir=self.outdir,
            processes=nproc,
            make_env=make_env,
            agent=agent,
            steps=steps,
            max_episode_len=max_episode_len,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
        )
        assert len(warns) == 0, warns[0]

    # The agent returned by train_agent_async is not guaranteed to be
    # successful because parameters could be modified by other processes
    # after success. Thus here the successful model is loaded explicitly.
    if require_success:
        agent.load(os.path.join(self.outdir, "successful"))

    # Test
    env = make_env(0, True)
    n_test_runs = 5
    eval_returns = run_evaluation_episodes(
        env,
        agent,
        n_steps=None,
        n_episodes=n_test_runs,
        max_episode_len=max_episode_len,
    )
    successful_return = 1
    if require_success:
        n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
        assert n_succeeded == n_test_runs
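
# A rough sketch of what a single evaluation episode in the tests above
# amounts to, assuming the old-style Gym step API used by the ABC test env
# and pfrl's standard Agent interface (act/observe/eval_mode). The helper
# name _rollout_once_sketch is an illustrative assumption, and truncation
# handling is simplified compared to run_evaluation_episodes.
def _rollout_once_sketch(env, agent, max_episode_len=None):
    obs = env.reset()
    done = False
    episode_return = 0.0
    t = 0
    with agent.eval_mode():
        while not done:
            action = agent.act(obs)
            obs, reward, done, _ = env.step(action)
            # In eval mode the agent only updates its internal bookkeeping
            # (e.g. recurrent state); no training updates are performed.
            agent.observe(obs, reward, done, reset=False)
            episode_return += reward
            t += 1
            if max_episode_len is not None and t >= max_episode_len:
                break
    return episode_return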