def evaluate(dt: float, epoch: int, env: Env, agent: Agent, eval_gap: float,  # noqa: C901
             time_limit: Optional[float] = None, eval_return: bool = False,
             progress_bar: bool = False, video: bool = False, no_log: bool = False,
             test: bool = False, eval_policy: bool = True) -> Optional[float]:
    """Evaluate agent in environment.

    :args dt: time discretization
    :args epoch: index of the current epoch
    :args env: environment
    :args agent: interacting agent
    :args eval_gap: number of normalized epochs (epochs divided by dt)
        between training steps
    :args time_limit: maximal physical time (number of steps divided by dt)
        spent in the environment
    :args eval_return: do we only perform specific evaluation?
    :args progress_bar: use a progress bar?
    :args video: log a video of the interaction?
    :args no_log: do we log results?
    :args test: log to a different test summary
    :args eval_policy: if the exploitation policy is noisy, remove the noise
        before evaluating
    :return: the evaluated return, or None if no return is evaluated
    """
    log_gap = int(eval_gap / dt)
    agent.eval()
    if not eval_policy and isinstance(agent, OnlineAgent):
        agent.noisy_eval()
    agent.reset()

    R = None
    if eval_return:
        rewards, dones = [], []
        imgs = []
        time_limit = time_limit if time_limit else 10
        nb_steps = int(time_limit / dt)
        info(f"eval> evaluating on a physical time {time_limit}"
             f" ({nb_steps} steps in total)")
        obs = env.reset()
        iter_range = tqdm(range(nb_steps)) if progress_bar else range(nb_steps)
        for _ in iter_range:
            obs, reward, done = interact(env, agent, obs)
            rewards.append(reward)
            dones.append(done)
            if video:
                imgs.append(env.render(mode='rgb_array'))
        R = compute_return(np.stack(rewards, axis=0),
                           np.stack(dones, axis=0))
        tag = "noisy" if not eval_policy else ""
        info(f"eval> At epoch {epoch}, {tag} return: {R}")
        if not no_log:
            if not eval_policy:
                log("Return_noisy", R, epoch)
            elif not video:  # don't log when outputting video
                if not test:
                    log("Return", R, epoch)
                else:
                    log("Return_test", R, epoch)
        if video:
            log_video("demo", epoch, np.stack(imgs, axis=0))

    if not no_log:
        specific_evaluation(epoch, log_gap, dt, env, agent)
    return R
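
# `interact`, `compute_return`, `log`, `log_video`, `info`, and
# `specific_evaluation` are helpers defined elsewhere in this codebase.
# The function below is only a minimal, hypothetical sketch of what a
# `compute_return`-style helper could look like (an undiscounted sum of
# rewards up to and including the first terminal flag); it is an assumption
# for illustration, not the repository's actual implementation.
import numpy as np


def compute_return_sketch(rewards: np.ndarray, dones: np.ndarray) -> float:
    """Sum rewards over time, stopping after the first done flag.

    rewards, dones: arrays of shape (T,); dones are 0/1 (or boolean) flags.
    """
    # Mask is 1 up to and including the step where done first becomes 1,
    # and 0 for every step after that.
    alive = np.cumprod(1.0 - np.asarray(dones, dtype=np.float64), axis=0)
    alive = np.concatenate([np.ones_like(alive[:1]), alive[:-1]], axis=0)
    return float(np.sum(np.asarray(rewards, dtype=np.float64) * alive))
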
def main(args):
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)
    args.__dict__ = update_env_kwargs(args.__dict__)  # Update env_kwargs
    symbolic = args.env_kwargs['observation_mode'] != 'cam_rgb'
    args.encoder_type = 'identity' if symbolic else 'pixel'

    env = Env(args.env_name, symbolic, args.seed, 200, 1, 8,
              args.pre_transform_image_size, env_kwargs=args.env_kwargs,
              normalize_observation=False, scale_reward=args.scale_reward,
              clip_obs=args.clip_obs)
    env.seed(args.seed)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    args.work_dir = logger.get_dir()
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape
    if args.encoder_type == 'pixel':
        obs_shape = (3, args.image_size, args.image_size)
        pre_aug_obs_shape = (3, args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb, chester_logger=logger)

    episode, episode_reward, done, ep_info = 0, 0, True, []
    start_time = time.time()
    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video_dir, args.num_eval_episodes, L, step, args)
            if args.save_model and (step % (args.eval_freq * 5) == 0):
                agent.save(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    for key, val in get_info_stats([ep_info]).items():
                        L.log('train/info_' + key, val, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            ep_info = []
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, info = env.step(action)
        ep_info.append(info)

        # allow infinite bootstrap: a termination caused only by the horizon
        # (time limit) is stored as non-terminal in the replay buffer
        done_bool = 0 if episode_step + 1 == env.horizon else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
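
# `utils.eval_mode` is a helper from this codebase used above when sampling
# actions. The class below is only a minimal sketch of what such a context
# manager typically does (an assumption, not the actual implementation): it
# switches the given models to eval mode for the duration of the block and
# restores the previous training flags on exit. It assumes each model exposes
# a `.training` attribute and a `.train()` method, as `torch.nn.Module` does.
class eval_mode_sketch:
    def __init__(self, *models):
        self.models = models

    def __enter__(self):
        # Remember the current training flags, then switch to eval mode.
        self.prev_states = [model.training for model in self.models]
        for model in self.models:
            model.train(False)

    def __exit__(self, *exc):
        # Restore the flags captured on entry; do not suppress exceptions.
        for model, state in zip(self.models, self.prev_states):
            model.train(state)
        return False
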
else:
    args.device = torch.device('cpu')
metrics = {'steps': [], 'episodes': [], 'train_rewards': [], 'test_episodes': [],
           'test_rewards': [], 'observation_loss': [], 'reward_loss': [], 'kl_loss': []}

# Initialise training environment and experience replay memory
env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length,
          args.action_repeat, args.bit_depth)
if args.experience_replay != '' and os.path.exists(args.experience_replay):
    D = torch.load(args.experience_replay)
    metrics['steps'], metrics['episodes'] = [D.steps] * D.episodes, list(range(1, D.episodes + 1))
elif not args.test:
    D = ExperienceReplay(args.experience_size, args.symbolic_env, env.observation_size,
                         env.action_size, args.bit_depth, args.device)
    # Initialise dataset D with S random seed episodes
    for s in range(1, args.seed_episodes + 1):
        observation, done, t = env.reset(), False, 0
        while not done:
            action = env.sample_random_action()
            next_observation, reward, done = env.step(action)
            D.append(observation, action, reward, done)
            observation = next_observation
            t += 1
        # Track cumulative environment steps (accounting for action repeat) and episodes
        metrics['steps'].append(t * args.action_repeat
                                + (0 if len(metrics['steps']) == 0 else metrics['steps'][-1]))
        metrics['episodes'].append(s)

# Initialise model parameters randomly
transition_model = TransitionModel(args.belief_size, args.state_size, env.action_size,
                                   args.hidden_size, args.embedding_size,
                                   args.activation_function).to(device=args.device)
observation_model = ObservationModel(args.symbolic_env, env.observation_size, args.belief_size,
                                     args.state_size, args.embedding_size,
                                     args.activation_function).to(device=args.device)
reward_model = RewardModel(args.belief_size, args.state_size, args.hidden_size,
                           args.activation_function).to(device=args.device)
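
# `ExperienceReplay` is defined elsewhere in this codebase. The class below is
# only a rough, hypothetical sketch of the append-side interface that the seed
# episode loop above relies on (a fixed-size ring buffer over transitions); it
# is an assumption for illustration, not the actual implementation, and omits
# sampling, pixel preprocessing, and device handling.
import numpy as np


class ExperienceReplaySketch:
    """Fixed-size ring buffer for (observation, action, reward, done) tuples."""

    def __init__(self, size: int, observation_size: int, action_size: int):
        self.size = size
        self.observations = np.empty((size, observation_size), dtype=np.float32)
        self.actions = np.empty((size, action_size), dtype=np.float32)
        self.rewards = np.empty((size,), dtype=np.float32)
        self.nonterminals = np.empty((size, 1), dtype=np.float32)
        self.idx, self.full = 0, False
        self.steps, self.episodes = 0, 0  # bookkeeping mirrored by `metrics` above

    def append(self, observation, action, reward, done):
        # Write one transition and advance the ring-buffer index, wrapping
        # around and overwriting the oldest data once the buffer is full.
        self.observations[self.idx] = observation
        self.actions[self.idx] = action
        self.rewards[self.idx] = reward
        self.nonterminals[self.idx] = not done
        self.idx = (self.idx + 1) % self.size
        self.full = self.full or self.idx == 0
        self.steps += 1
        self.episodes += int(done)
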