def test_model(self, episode=None):  # no exploration noise here
    if episode is None:
        episode = self.tested_episodes

    # Set models to eval mode
    self.transition_model.eval()
    self.observation_model.eval()
    self.reward_model.eval()
    self.encoder.eval()

    # Initialise parallelised test environments
    test_envs = EnvBatcher(ControlSuiteEnv,
                           (self.parms.env_name, self.parms.seed, self.parms.max_episode_length, self.parms.bit_depth),
                           {}, self.parms.test_episodes)

    total_steps = self.parms.max_episode_length // test_envs.action_repeat
    rewards = np.zeros(self.parms.test_episodes)
    real_rew = torch.zeros([total_steps, self.parms.test_episodes])
    predicted_rew = torch.zeros([total_steps, self.parms.test_episodes])

    with torch.no_grad():
        observation, total_rewards, video_frames = test_envs.reset(), np.zeros((self.parms.test_episodes, )), []
        belief = torch.zeros(self.parms.test_episodes, self.parms.belief_size, device=self.parms.device)
        posterior_state = torch.zeros(self.parms.test_episodes, self.parms.state_size, device=self.parms.device)
        action = torch.zeros(self.parms.test_episodes, self.env.action_size, device=self.parms.device)
        tqdm.write("Testing model.")

        for t in range(total_steps):
            belief, posterior_state, action, next_observation, rewards, done, pred_next_rew = self.update_belief_and_act(
                test_envs, belief, posterior_state, action, observation.to(device=self.parms.device),
                list(rewards), self.env.action_range[0], self.env.action_range[1])
            total_rewards += rewards.numpy()
            real_rew[t] = rewards
            predicted_rew[t] = pred_next_rew
            observation = self.env.get_original_frame().unsqueeze(dim=0)
            video_frames.append(make_grid(torch.cat([observation, self.observation_model(belief, posterior_state).cpu()],
                                                    dim=3) + 0.5, nrow=5).numpy())  # Decentre
            observation = next_observation
            if done.sum().item() == self.parms.test_episodes:
                break

    real_rew = torch.transpose(real_rew, 0, 1)
    predicted_rew = torch.transpose(predicted_rew, 0, 1)

    # Save and plot metrics
    self.tested_episodes += 1
    self.metrics['test_episodes'].append(episode)
    self.metrics['test_rewards'].append(total_rewards.tolist())
    lineplot(self.metrics['test_episodes'], self.metrics['test_rewards'], 'test_rewards', self.statistics_path)
    write_video(video_frames, 'test_episode_%s' % str(episode), self.video_path)  # Lossy compression

    # Set models back to train mode
    self.transition_model.train()
    self.observation_model.train()
    self.reward_model.train()
    self.encoder.train()

    # Close test environments
    test_envs.close()
    return self.metrics
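# Note: update_belief_and_act is called above but not defined in this snippet. The
# sketch below shows the usual PlaNet-style pattern (filter the latent state with the
# new observation, plan an action, step the environment). The transition_model output
# order, the planner call, the keyword names and the return values are assumptions for
# illustration, not the original implementation.
import torch

def update_belief_and_act(transition_model, encoder, planner, env,
                          belief, posterior_state, action, observation,
                          min_action=-1.0, max_action=1.0,
                          explore=False, action_noise=0.3):
    # Filter: advance belief/posterior one step from the previous action and the
    # encoded current observation (unsqueeze adds the time dimension the RSSM expects).
    belief, _, _, _, posterior_state, _, _ = transition_model(
        posterior_state, action.unsqueeze(dim=0), belief,
        encoder(observation).unsqueeze(dim=0))
    belief, posterior_state = belief.squeeze(dim=0), posterior_state.squeeze(dim=0)
    # Plan: choose the next action from the current latent state (CEM planner or policy).
    action = planner(belief, posterior_state)
    if explore:
        action = action + action_noise * torch.randn_like(action)
    action = action.clamp(min=min_action, max=max_action)
    # Act: execute the planned action batch in the (batched) test environment.
    next_observation, reward, done = env.step(action.cpu())
    return belief, posterior_state, action, next_observation, reward, done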
metrics['episodes'].append(episode)
metrics['train_rewards'].append(total_reward)
lineplot(metrics['episodes'][-len(metrics['train_rewards']):], metrics['train_rewards'], 'train_rewards', results_dir)

print("Testing!")
# Test model
if episode % args.test_interval == 0:
    # Set models to eval mode
    transition_model.eval()
    observation_model.eval()
    reward_model.eval()
    encoder.eval()

    # Initialise parallelised test environments
    test_envs = EnvBatcher(
        Env,
        (args.env, args.symbolic_env, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth),
        {}, args.test_episodes)

    with torch.no_grad():
        observation, total_rewards, video_frames = test_envs.reset(), np.zeros((args.test_episodes, )), []
        belief = torch.zeros(args.test_episodes, args.belief_size, device=args.device)
        posterior_state = torch.zeros(args.test_episodes, args.state_size, device=args.device)
        action = torch.zeros(args.test_episodes, env.action_size, device=args.device)

        pbar = tqdm(range(args.max_episode_length // args.action_repeat))
        for t in pbar:
            belief, posterior_state, action, next_observation, reward, done = update_belief_and_act(
setup_my_seed(args)
device = get_my_device(args)

# Recorder
results_dir = os.path.join('results', '{}_{}'.format(args.env, args.id))
os.makedirs(results_dir, exist_ok=True)
writer = SummaryWriter(results_dir + "/{}_{}_log".format(args.env, args.id))
metrics = {
    'steps': [],
    'episodes': [],
    'train_rewards': [],
    'test_episodes': [],
    'test_rewards': [],
    'observation_loss': [],
    'reward_loss': [],
    'kl_loss': [],
    'actor_loss': [],
    'value_loss': []
}

# Init training env
env = Env(args.env, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth)
# Initialise parallelised test environments
test_envs = EnvBatcher(Env,
                       (args.env, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth),
                       {}, args.test_episodes)

# Init replay buffer
if args.experience_replay != '' and os.path.exists(args.experience_replay):
    D = torch.load(args.experience_replay)
    metrics['steps'] = [D.steps] * D.episodes
    metrics['episodes'] = list(range(1, D.episodes + 1))
elif not args.test:
    D = ExperienceReplay(args.experience_size, env.observation_size, env.action_size, args.bit_depth, device)
    # Init dataset D with S random seed episodes
    for s in range(1, args.seed_episodes + 1):
        observation, done, t = env.reset(), False, 0
        while not done:
            action = env.sample_random_action()
            next_observation, reward, done = env.step(action)
            D.append(observation, action, reward, done)
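# EnvBatcher is used throughout these snippets but not defined here. A minimal sketch
# of such a wrapper is given below; the method names, the done-masking behaviour and
# the tensor shapes are assumptions modelled on common PlaNet-style implementations,
# not this repository's exact class.
import torch

class EnvBatcher:
    def __init__(self, env_class, env_args, env_kwargs, n):
        self.n = n
        self.envs = [env_class(*env_args, **env_kwargs) for _ in range(n)]
        self.dones = [True] * n

    def reset(self):
        # Reset every environment and stack the observations into one batch tensor.
        observations = [env.reset() for env in self.envs]
        self.dones = [False] * self.n
        return torch.cat(observations)

    def step(self, actions):
        # Step every environment; episodes that already finished are masked to zero
        # observation/reward and stay done, so the batch dimensions remain aligned.
        done_mask = torch.tensor(self.dones)
        observations, rewards, dones = zip(*[env.step(action) for env, action in zip(self.envs, actions)])
        dones = [d or prev_d for d, prev_d in zip(dones, self.dones)]
        self.dones = dones
        observations = torch.cat(list(observations))
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.uint8)
        observations[done_mask] = 0
        rewards[done_mask] = 0
        return observations, rewards, dones

    def close(self):
        for env in self.envs:
            env.close()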
def test(self, episode):
    print("Test model")
    # Set models to eval mode
    self.transition_model.eval()
    self.observation_model.eval()
    self.reward_model.eval()
    self.encoder.eval()
    self.algorithms.train_to_eval()
    # self.actor_model_g.eval()
    # self.value_model_g.eval()

    # Initialise parallelised test environments
    test_envs = EnvBatcher(
        Env,
        (args.env, args.symbolic_env, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth),
        {}, args.test_episodes)

    with torch.no_grad():
        observation, total_rewards, video_frames = test_envs.reset(), np.zeros((args.test_episodes, )), []
        belief = torch.zeros(args.test_episodes, args.belief_size, device=args.device)
        posterior_state = torch.zeros(args.test_episodes, args.state_size, device=args.device)
        action = torch.zeros(args.test_episodes, self.env.action_size, device=args.device)

        pbar = tqdm(range(args.max_episode_length // args.action_repeat))
        for t in pbar:
            belief, posterior_state, action, next_observation, reward, done = self.update_belief_and_act(
                args, test_envs, belief, posterior_state, action, observation.to(device=args.device))
            total_rewards += reward.numpy()
            if not args.symbolic_env:
                # Collect real vs. predicted frames for video
                video_frames.append(
                    make_grid(torch.cat([
                        observation,
                        self.observation_model(belief, posterior_state).cpu()
                    ], dim=3) + 0.5, nrow=5).numpy())  # Decentre
            observation = next_observation
            if done.sum().item() == args.test_episodes:
                pbar.close()
                break

    # Update and plot reward metrics (and write video if applicable) and save metrics
    self.metrics['test_episodes'].append(episode)
    self.metrics['test_rewards'].append(total_rewards.tolist())
    Save_Txt(self.metrics['test_episodes'][-1], self.metrics['test_rewards'][-1], 'test_rewards', args.results_dir)
    # Save_Txt(np.asarray(metrics['steps'])[np.asarray(metrics['test_episodes']) - 1], metrics['test_rewards'], 'test_rewards_steps', results_dir, xaxis='step')
    # lineplot(metrics['test_episodes'], metrics['test_rewards'], 'test_rewards', results_dir)
    # lineplot(np.asarray(metrics['steps'])[np.asarray(metrics['test_episodes']) - 1], metrics['test_rewards'], 'test_rewards_steps', results_dir, xaxis='step')

    if not args.symbolic_env:
        episode_str = str(episode).zfill(len(str(args.episodes)))
        write_video(video_frames, 'test_episode_%s' % episode_str, args.results_dir)  # Lossy compression
        save_image(torch.as_tensor(video_frames[-1]),
                   os.path.join(args.results_dir, 'test_episode_%s.png' % episode_str))
    torch.save(self.metrics, os.path.join(args.results_dir, 'metrics.pth'))

    # Set models back to train mode
    self.transition_model.train()
    self.observation_model.train()
    self.reward_model.train()
    self.encoder.train()
    # self.actor_model_g.train()
    # self.value_model_g.train()
    self.algorithms.eval_to_train()

    # Close test environments
    test_envs.close()
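# Design note: each variant above repeats the same "switch to eval, run the test
# episodes, switch back to train" boilerplate around the model modules. One way to
# factor that out (not part of the original code, just a sketch) is a small context
# manager:
from contextlib import contextmanager

@contextmanager
def evaluation_mode(*modules):
    # Assumes the modules are in train mode on entry; puts them in eval mode for the
    # duration of the block and restores train mode afterwards.
    try:
        for m in modules:
            m.eval()
        yield
    finally:
        for m in modules:
            m.train()

# Usage sketch:
# with evaluation_mode(transition_model, observation_model, reward_model, encoder), torch.no_grad():
#     ...run the batched test episodes...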
lineplot(metrics['episodes'][-len(metrics['train_rewards']):], metrics['train_rewards'], 'train_rewards', results_dir)

# Test model
if episode % args.test_interval == 0:
    # Set models to eval mode
    agent.transition_model.eval()
    agent.observation_model.eval()
    agent.reward_model.eval()
    agent.encoder.eval()
    agent.actor_model.eval()
    agent.value_model.eval()

    # Initialise parallelised test environments
    test_envs = EnvBatcher(
        Env,
        (args.env, args.symbolic, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth),
        {}, args.test_episodes)

    with torch.no_grad():
        observation = test_envs.reset()
        total_rewards = np.zeros((args.test_episodes, ))
        video_frames = []
        belief = torch.zeros(args.test_episodes, args.belief_size, device=args.device)
        posterior_state = torch.zeros(args.test_episodes, args.state_size, device=args.device)
        action = torch.zeros(args.test_episodes, env.action_size, device=args.device)