def evaluate(dt: float, epoch: int, env: Env, agent: Agent, eval_gap: float,  # noqa: C901
             time_limit: Optional[float] = None, eval_return: bool = False,
             progress_bar: bool = False, video: bool = False,
             no_log: bool = False, test: bool = False,
             eval_policy: bool = True) -> Optional[float]:
    """Evaluate agent in environment.

    :args dt: time discretization
    :args epoch: index of the current epoch
    :args env: environment
    :args agent: interacting agent
    :args eval_gap: number of normalized epochs (epochs divided by dt)
        between training steps
    :args time_limit: maximal physical time (number of steps divided by dt)
        spent in the environment
    :args eval_return: do we only perform specific evaluation?
    :args progress_bar: use a progress bar?
    :args video: log a video of the interaction?
    :args no_log: do we log results?
    :args test: log to a different test summary
    :args eval_policy: if the exploitation policy is noisy, remove the noise
        before evaluating
    :return: the evaluated return, or None if no return is evaluated
    """
    log_gap = int(eval_gap / dt)
    agent.eval()
    if not eval_policy and isinstance(agent, OnlineAgent):
        agent.noisy_eval()
    agent.reset()

    R = None
    if eval_return:
        rewards, dones = [], []
        imgs = []
        time_limit = time_limit if time_limit else 10
        nb_steps = int(time_limit / dt)
        info(f"eval> evaluating on a physical time {time_limit}"
             f" ({nb_steps} steps in total)")
        obs = env.reset()
        iter_range = tqdm(range(nb_steps)) if progress_bar else range(nb_steps)
        for _ in iter_range:
            obs, reward, done = interact(env, agent, obs)
            rewards.append(reward)
            dones.append(done)
            if video:
                imgs.append(env.render(mode='rgb_array'))
        R = compute_return(np.stack(rewards, axis=0), np.stack(dones, axis=0))
        tag = "noisy" if not eval_policy else ""
        info(f"eval> At epoch {epoch}, {tag} return: {R}")
        if not no_log:
            if not eval_policy:
                log("Return_noisy", R, epoch)
            elif not video:  # don't log when outputting video
                if not test:
                    log("Return", R, epoch)
                else:
                    log("Return_test", R, epoch)
        if video:
            log_video("demo", epoch, np.stack(imgs, axis=0))
    if not no_log:
        specific_evaluation(epoch, log_gap, dt, env, agent)
    return R
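# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hedged example of how `evaluate` above might be called. It
# assumes an Env and an Agent have already been constructed elsewhere with
# time discretization dt; the epoch, eval_gap and time_limit values below are
# arbitrary illustrative choices, not defaults taken from the original code.
def _example_evaluation_run(env: Env, agent: Agent, dt: float = 0.01) -> None:
    """Evaluate `agent` on `env` for 10 physical time units and print the return."""
    ret = evaluate(dt=dt, epoch=0, env=env, agent=agent, eval_gap=1.,
                   time_limit=10., eval_return=True, progress_bar=True)
    print(f"Return at epoch 0: {ret}")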
belief, posterior_state, action = (torch.zeros(1, args.belief_size, device=args.device),
                                   torch.zeros(1, args.state_size, device=args.device),
                                   torch.zeros(1, env.action_size, device=args.device))
pbar = tqdm(range(args.max_episode_length // args.action_repeat))
for t in pbar:
    belief, posterior_state, action, observation, reward, done = update_belief_and_act(
        args, env, planner, transition_model, encoder, belief, posterior_state, action,
        observation.to(device=args.device))
    total_reward += reward
    if args.render:
        env.render()
    if done:
        pbar.close()
        break
print('Average Reward:', total_reward / args.test_episodes)
env.close()
quit()


# Training (and testing)
for episode in tqdm(range(metrics['episodes'][-1] + 1, args.episodes + 1),
                    total=args.episodes, initial=metrics['episodes'][-1] + 1):
    # Model fitting
    losses = []
    for s in tqdm(range(args.collect_interval)):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
        observations, actions, rewards, nonterminals = D.sample(args.batch_size,