def eval(self, n=None, **kwargs):
    start_time = perf_counter()
    returns = []
    horizons = []
    for _ in range(self.config['eval.num_episode']):
        observation = self.eval_env.reset()
        for _ in range(self.eval_env.spec.max_episode_steps):
            with torch.no_grad():
                action = self.agent.choose_action(observation, mode='eval')['action']
            next_observation, reward, done, info = self.eval_env.step(action)
            if done[0]:  # [0]: single sub-environment in the vectorized eval env
                returns.append(info[0]['episode']['return'])
                horizons.append(info[0]['episode']['horizon'])
                break
            observation = next_observation
    logger = Logger()
    logger('num_seconds', round(perf_counter() - start_time, 1))
    logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
    logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
    logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
    # VecMonitor keeps running queues of recent episode returns/horizons.
    monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
    logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
    return logger.logs
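# The loop above indexes done[0]/info[0] because the evaluation env is a
# vectorized env holding a single sub-environment: reset() and step() return
# one entry per sub-environment. A minimal stand-alone sketch of that
# convention, reusing the helpers from test_get_wrapper below; the
# 'CartPole-v1' id and the demo function itself are illustrative, not from
# the original source:
def _vec_step_convention_demo():
    env = VecMonitor(make_vec_env(lambda: gym.make('CartPole-v1'), 1, 0))
    observations = env.reset()  # batch of observations, length 1
    observations, rewards, dones, infos = env.step([0])  # one action per sub-env
    if dones[0]:  # single sub-environment, hence index 0
        print(infos[0]['episode']['return'])  # episode stats injected by VecMonitor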
def checkpoint(self, logdir, num_iter):
    self.save(logdir / f'agent_{num_iter}.pth')
    obs_env = get_wrapper(self.env, 'VecStandardizeObservation')
    if obs_env is not None:
        # Save the running observation moments so a reloaded agent can
        # normalize observations consistently with training.
        pickle_dump(obj=(obs_env.mean, obs_env.var), f=logdir / f'obs_moments_{num_iter}', ext='.pth')
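# A hedged sketch (not part of the source) of restoring such a checkpoint.
# Assumptions: a pickle_load counterpart to pickle_dump exists, and the agent
# exposes a load() mirroring the save() used above.
def restore_checkpoint(agent, env, logdir, num_iter):
    agent.load(logdir / f'agent_{num_iter}.pth')  # assumed inverse of save()
    obs_env = get_wrapper(env, 'VecStandardizeObservation')
    if obs_env is not None:
        # Reinstate the running observation moments saved by checkpoint().
        obs_env.mean, obs_env.var = pickle_load(logdir / f'obs_moments_{num_iter}.pth')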
def test_get_wrapper(env_id):
    def make_env():
        return gym.make(env_id)

    env = make_env()
    env = ClipReward(env, 0.1, 0.5)
    env = FlattenObservation(env)
    env = FrameStack(env, 4)
    assert get_wrapper(env, 'ClipReward').__class__.__name__ == 'ClipReward'
    assert get_wrapper(env, 'FlattenObservation').__class__.__name__ == 'FlattenObservation'
    assert get_wrapper(env, 'Env') is None
    del env

    # vec_env
    env = make_vec_env(make_env, 3, 0)
    env = VecMonitor(env)
    assert get_wrapper(env, 'VecMonitor').__class__.__name__ == 'VecMonitor'
    assert get_wrapper(env, 'ClipReward') is None
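# For reference, one plausible implementation of get_wrapper that satisfies
# the assertions above (a sketch, not the library's actual code): walk the
# wrapper chain one layer at a time, matching on class name, and unwrap via
# .env for gym wrappers or .venv for vectorized wrappers.
def get_wrapper_sketch(env, name):
    while True:
        if env.__class__.__name__ == name:
            return env
        if hasattr(env, 'env'):     # gym.Wrapper chain
            env = env.env
        elif hasattr(env, 'venv'):  # vectorized wrapper chain
            env = env.venv
        else:
            return None  # reached the innermost env without a match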
def train(self, n=None, **kwargs):
    self.agent.train()
    start_time = perf_counter()
    # Collect one iteration's worth of trajectories and update the agent.
    D = self.runner(self.agent, self.env, self.config['train.timestep_per_iter'])
    out_agent = self.agent.learn(D)
    logger = Logger()
    logger('train_iteration', n + 1)
    logger('num_seconds', round(perf_counter() - start_time, 1))
    for key, value in out_agent.items():
        logger(key, value)
    logger('num_trajectories', len(D))
    logger('num_timesteps', sum([len(traj) for traj in D]))
    logger('accumulated_trained_timesteps', self.agent.total_timestep)
    G = [traj.numpy_rewards.sum() for traj in D]
    logger('return', describe(G, axis=-1, repr_indent=1, repr_prefix='\n'))
    # Episode statistics are attached to step infos by VecMonitor.
    infos = [info for info in chain.from_iterable([traj.infos for traj in D])
             if 'episode' in info]
    online_returns = [info['episode']['return'] for info in infos]
    online_horizons = [info['episode']['horizon'] for info in infos]
    logger('online_return', describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('online_horizon', describe(online_horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
    monitor_env = get_wrapper(self.env, 'VecMonitor')
    logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
    return logger
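# A hypothetical driver (not from the source) showing how train(), eval() and
# checkpoint() might be composed. The config keys 'train.num_iter',
# 'eval.freq' and 'checkpoint.freq', and the episode counter, are assumptions.
def run_sketch(engine, config, logdir):
    num_episodes = 0
    for n in range(config['train.num_iter']):
        train_logger = engine.train(n)
        num_episodes += train_logger.logs['num_trajectories']  # assumes dict-like logs
        if (n + 1) % config['eval.freq'] == 0:
            engine.eval(n,
                        accumulated_trained_timesteps=engine.agent.total_timestep,
                        accumulated_trained_episodes=num_episodes)
        if (n + 1) % config['checkpoint.freq'] == 0:
            engine.checkpoint(logdir, n + 1)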
def evaluator(config, logdir, seed, make_env, learner_agent):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    eval_logs = []
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, torch.device('cpu'))
    runner = EpisodeRunner(reset_on_call=True)
    evaluated_steps = config['eval.freq']
    while learner_agent.total_timestep < config['train.timestep']:
        if learner_agent.total_timestep < evaluated_steps:
            time.sleep(1.0)  # wait until the learner reaches the next eval milestone
        else:
            t0 = time.perf_counter()
            agent.load_state_dict(learner_agent.state_dict())  # copy to CPU by default
            with torch.no_grad():
                D = []
                for _ in range(config['eval.num_episode']):
                    D += runner(agent, env, env.spec.max_episode_steps)
            logger = Logger()
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('num_trajectories', len(D))
            logger('num_timesteps', sum([len(traj) for traj in D]))
            logger('accumulated_trained_timesteps', learner_agent.total_timestep)
            infos = [info for info in chain.from_iterable([traj.infos for traj in D])
                     if 'episode' in info]
            online_returns = [info['episode']['return'] for info in infos]
            online_horizons = [info['episode']['horizon'] for info in infos]
            logger('online_return', describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('online_horizon', describe(online_horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
            monitor_env = get_wrapper(env, 'VecMonitor')
            logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger.dump(keys=None, index=0, indent=0, border=color_str('+' * 50, color='green'))
            eval_logs.append(logger.logs)
            evaluated_steps += config['eval.freq']
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')
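# A hedged sketch (not from the source) of launching evaluator() alongside a
# learner process. Assumptions: the learner agent's tensors are moved to
# shared memory via share_memory(), and total_timestep is readable across
# processes (e.g. backed by a shared value); torch.multiprocessing drives the
# child process.
import torch.multiprocessing as mp

def launch_evaluator_sketch(config, logdir, seed, make_env, learner_agent):
    learner_agent.share_memory()  # share parameters so state_dict() stays current
    p = mp.Process(target=evaluator, args=(config, logdir, seed, make_env, learner_agent))
    p.start()
    return p  # caller should p.join() once training completes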