def run(config, seed, device):
    set_global_seeds(seed)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    # Build the vectorized environment with optional standardization wrappers
    env = make_env(config, seed)
    env = VecMonitor(env)
    if config['env.standardize_obs']:
        env = VecStandardizeObservation(env, clip=5.)
    if config['env.standardize_reward']:
        env = VecStandardizeReward(env, clip=10., gamma=config['agent.gamma'])

    agent = Agent(config, env, device)
    runner = EpisodeRunner(reset_on_call=False)
    engine = Engine(config, agent=agent, env=env, runner=runner)

    train_logs = []
    for i in count():
        if agent.total_timestep >= config['train.timestep']:
            break
        train_logger = engine.train(i)
        train_logs.append(train_logger.logs)
        if i == 0 or (i + 1) % config['log.freq'] == 0:
            train_logger.dump(keys=None, index=0, indent=0, border='-'*50)
        if i == 0 or (i + 1) % config['checkpoint.freq'] == 0:
            agent.checkpoint(logdir, i + 1)
    agent.checkpoint(logdir, i + 1)  # final checkpoint once training finishes
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    return None
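# A hypothetical driver for run(): every config key below is read somewhere in
# run() itself, but the concrete values (and whatever extra keys make_env needs,
# e.g. an environment ID) are illustrative assumptions, not values from the source.
import torch

config = {
    'log.dir': 'logs',
    'ID': 'demo',                    # experiment identifier used in the log path
    'env.standardize_obs': True,     # enable VecStandardizeObservation
    'env.standardize_reward': True,  # enable VecStandardizeReward
    'agent.gamma': 0.99,             # discount factor, reused by reward standardization
    'train.timestep': 100_000,       # total environment steps before stopping
    'log.freq': 10,                  # dump a log every 10 training iterations
    'checkpoint.freq': 100,          # checkpoint every 100 training iterations
}
run(config, seed=0, device=torch.device('cpu'))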
def test_episode_runner(env_id, num_env, init_seed, T):
    if env_id == 'Sanity':
        make_env = lambda: TimeLimit(SanityEnv())
    else:
        make_env = lambda: gym.make(env_id)
    env = make_vec_env(make_env, num_env, init_seed)
    env = VecStepInfo(env)
    agent = RandomAgent(None, env, None)
    runner = EpisodeRunner()

    if num_env > 1:  # EpisodeRunner only supports a single environment
        with pytest.raises(AssertionError):
            D = runner(agent, env, T)
    else:
        with pytest.raises(AssertionError):
            runner(agent, env.env, T)  # must be VecStepInfo
        D = runner(agent, env, T)
        for traj in D:
            assert isinstance(traj, Trajectory)
            assert len(traj) <= env.spec.max_episode_steps
            # one more observation than transitions (initial observation included)
            assert traj.numpy_observations.shape == (len(traj) + 1, *env.observation_space.shape)
            if isinstance(env.action_space, gym.spaces.Discrete):
                assert traj.numpy_actions.shape == (len(traj),)
            else:
                assert traj.numpy_actions.shape == (len(traj), *env.action_space.shape)
            assert traj.numpy_rewards.shape == (len(traj),)
            assert traj.numpy_dones.shape == (len(traj),)
            assert traj.numpy_masks.shape == (len(traj),)
            assert len(traj.step_infos) == len(traj)
            if traj.completed:
                assert np.allclose(traj.observations[-1], traj.step_infos[-1]['last_observation'])
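# A plausible way to drive the test above with pytest's stock parametrize
# decorators, stacked over the function definition. The grids are illustrative
# assumptions; only the parameter names are taken from the test signature.
import pytest

@pytest.mark.parametrize('env_id', ['Sanity', 'CartPole-v1'])
@pytest.mark.parametrize('num_env', [1, 3])
@pytest.mark.parametrize('init_seed', [0, 1])
@pytest.mark.parametrize('T', [1, 100, 500])
def test_episode_runner(env_id, num_env, init_seed, T):
    ...  # body as above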
def eval(self, n):
    self.agent.eval()
    start_time = time()

    if self.config['env.standardize']:
        # Standardize evaluation observations with the *frozen* running statistics
        # from the training environment; rewards are left untouched since no
        # training happens here.
        eval_env = VecStandardize(venv=self.eval_env,
                                  use_obs=True,
                                  use_reward=False,  # do not process rewards, no training
                                  clip_obs=self.runner.env.clip_obs,
                                  clip_reward=self.runner.env.clip_reward,
                                  gamma=self.runner.env.gamma,
                                  eps=self.runner.env.eps,
                                  constant_obs_mean=self.runner.env.obs_runningavg.mu,
                                  constant_obs_std=self.runner.env.obs_runningavg.sigma)
    else:
        eval_env = self.eval_env  # otherwise eval_env would be unbound below
    eval_runner = EpisodeRunner(self.config, self.agent, eval_env)
    T = eval_env.T
    D = eval_runner(T)

    eval_output = {}
    eval_output['D'] = D
    eval_output['n'] = n
    eval_output['T'] = T
    eval_output['num_sec'] = time() - start_time
    return eval_output
def eval(self, n):
    self.agent.eval()

    eval_runner = EpisodeRunner(self.config, self.agent, self.eval_env)
    T = self.eval_env.T
    D = eval_runner(T)

    eval_output = {}
    eval_output['D'] = D
    eval_output['n'] = n
    eval_output['T'] = T
    return eval_output
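# Downstream code presumably reduces eval_output to summary statistics. A sketch,
# assuming the batched numpy_rewards layout used by the ES objective further
# below (one row per episode, zero-padded after termination); `engine` is a
# hypothetical instance of the class defining eval() above.
eval_output = engine.eval(n=0)
D = eval_output['D']
mean_return = D.numpy_rewards.sum(-1).mean()  # sum over time, average over episodes
print(f"eval {eval_output['n']}: mean return {mean_return:.2f} over horizon T={eval_output['T']}")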
def test_episode_runner(vec_env, env_id):
    env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0)
    env_spec = EnvSpec(env)

    if env_id == 'CartPole-v1':
        sticky_action = 1
    elif env_id == 'Pendulum-v0':
        sticky_action = [0.1]

    T = 30
    agent = StickyAgent(None, env_spec, sticky_action)
    runner = EpisodeRunner(None, agent, env)
    D = runner(T)
    assert D.N == 3
    assert D.maxT == max(D.Ts)

    # Replay the same seeds in raw gym environments and check that the
    # recorded batch D matches step for step.
    seeder = Seeder(0)
    seed1, seed2, seed3 = seeder(3)
    env1 = make_gym_env(env_id, seed1)
    env2 = make_gym_env(env_id, seed2)
    env3 = make_gym_env(env_id, seed3)
    for n, ev in enumerate([env1, env2, env3]):
        obs = ev.reset()
        assert np.allclose(obs, D.observations[n][0])
        assert np.allclose(obs, D.numpy_observations[n, 0, ...])
        for t in range(T):
            obs, reward, done, info = ev.step(sticky_action)
            assert np.allclose(reward, D.rewards[n][t])
            assert np.allclose(reward, D.numpy_rewards[n, t])
            assert np.allclose(done, D.dones[n][t])
            assert done == D.numpy_dones[n, t]
            assert int(not done) == D.masks[n][t]
            assert int(not done) == D.numpy_masks[n, t]
            if done:
                assert np.allclose(obs, D.infos[n][t]['terminal_observation'])
                assert D.completes[n]
                # everything after termination must be zero-padded
                # (observations have one extra slot for the initial observation)
                assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:, ...])
                assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...])
                assert np.allclose(0.0, D.numpy_rewards[n, t + 1:])
                assert np.allclose(True, D.numpy_dones[n, t + 1:])
                assert np.allclose(0.0, D.numpy_masks[n, t + 1:])
                break
            else:
                assert np.allclose(obs, D.observations[n][t + 1])
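# StickyAgent is referenced but not shown. The test only needs it to feed the
# constant sticky_action into every step, so the replay above stays in lockstep.
# A minimal sketch under that assumption; the choose_action name and dict return
# follow the legacy lagom agent convention and are assumptions here.
class StickyAgent(object):
    """Always emits the same fixed action for every sub-environment."""
    def __init__(self, config, env_spec, sticky_action):
        self.config = config
        self.env_spec = env_spec
        self.sticky_action = sticky_action

    def choose_action(self, obs, info={}):
        # one identical action per observation in the vectorized batch
        return {'action': [self.sticky_action] * len(obs)}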
def f(self, config, solution):
    if self.agent is None:
        self._prepare(config)

    solution = torch.from_numpy(np.asarray(solution)).float().to(self.device)
    assert solution.numel() == self.agent.num_params
    # Load solution params to agent
    self.agent.from_vec(solution)

    runner = EpisodeRunner(config, self.agent, self.env)
    with torch.no_grad():
        D = runner(self.env_spec.T)
    mean_return = D.numpy_rewards.sum(-1).mean()
    # ES does minimization, so use negative returns
    function_value = -mean_return
    return function_value
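# This objective plugs into any ask/tell-style evolution strategy. A sketch with
# a hypothetical optimizer `es` (the ask/tell names follow the common ES
# convention, e.g. cma.CMAEvolutionStrategy, and are assumptions, not this
# code base's API); `problem` is an instance of the class defining f() above.
for generation in range(100):
    solutions = es.ask()                                 # candidate parameter vectors
    fitness = [problem.f(config, s) for s in solutions]  # negated mean returns
    es.tell(solutions, fitness)                          # ES minimizes fitness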
def __call__(self, config, seed, device):
    set_global_seeds(seed)
    logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

    if config['env.time_aware_obs']:
        kwargs = {'extra_wrapper': [TimeAwareObservation]}
    else:
        kwargs = {}
    env = make_vec_env(SerialVecEnv, make_gym_env, config['env.id'],
                       config['train.N'], seed, monitor=True, **kwargs)
    if config['eval.independent']:
        eval_env = make_vec_env(SerialVecEnv, make_gym_env, config['env.id'],
                                config['eval.N'], seed)
    if config['env.clip_action']:
        env = VecClipAction(env)
        if config['eval.independent']:
            eval_env = VecClipAction(eval_env)
    if config['env.standardize']:  # running averages of observation and reward
        env = VecStandardize(venv=env,
                             use_obs=True,
                             use_reward=True,
                             clip_obs=10.,
                             clip_reward=10.,
                             gamma=0.99,
                             eps=1e-8)
    env_spec = EnvSpec(env)

    agent = Agent(config, env_spec, device)
    runner = EpisodeRunner(config, agent, env)
    if config['eval.independent']:
        engine = Engine(agent, runner, config, eval_env=eval_env)
    else:
        engine = Engine(agent, runner, config)

    train_logs = []
    eval_logs = []
    for i in count():
        if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
            break
        elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
            break
        train_output = engine.train(i)
        if i == 0 or (i + 1) % config['log.interval'] == 0:
            train_log = engine.log_train(train_output)
            train_logs.append(train_log)
            if config['eval.independent']:
                with torch.no_grad():  # disable grad, save memory
                    eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)
                eval_logs.append(eval_log)
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')
    return None
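# Since __call__ takes (config, seed, device), sweeping one configuration over
# several seeds is a small loop. A sketch; `Algorithm` stands in for the class
# defining __call__ above, and the seed list is an illustrative assumption.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
algorithm = Algorithm()
for seed in [0, 1, 2]:  # logs land in <log.dir>/<ID>/<seed>
    algorithm(config, seed, device)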