def eval(self, checkpoint_path):
    r"""Evaluates a single checkpoint.

    Args:
        checkpoint_path: path of the checkpoint to evaluate

    Returns:
        None
    """
    self.device = (
        torch.device("cuda", self.config.TORCH_GPU_ID)
        if torch.cuda.is_available()
        else torch.device("cpu")
    )

    # Map location CPU is almost always better than mapping to a CUDA device.
    ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

    if self.config.EVAL.USE_CKPT_CONFIG:
        config = self._setup_eval_config(ckpt_dict["config"])
    else:
        config = self.config.clone()

    ppo_cfg = config.RL.PPO

    config.defrost()
    config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
    config.freeze()

    if len(self.config.VIDEO_OPTION) > 0:
        config.defrost()
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS")
        config.freeze()

    self.env = construct_envs(config, get_env_class(config.ENV_NAME))
    self._setup_actor_critic_agent(ppo_cfg)

    self.agent.load_state_dict(ckpt_dict["state_dict"])
    self.actor_critic = self.agent.actor_critic

    # get name of performance metric, e.g. "spl"
    metric_name = self.config.TASK_CONFIG.TASK.MEASUREMENTS[0]
    metric_cfg = getattr(self.config.TASK_CONFIG.TASK, metric_name)
    measure_type = baseline_registry.get_measure(metric_cfg.TYPE)
    assert measure_type is not None, "invalid measurement type {}".format(
        metric_cfg.TYPE
    )
    self.metric_uuid = measure_type(
        sim=None, task=None, config=None
    )._get_uuid()

    observations = self.env.reset()
    batch = batch_obs(observations, self.device)

    current_episode_reward = torch.zeros(
        self.env.num_envs, 1, device=self.device
    )
    test_recurrent_hidden_states = torch.zeros(
        self.actor_critic.net.num_recurrent_layers,
        self.config.NUM_PROCESSES,
        ppo_cfg.hidden_size,
        device=self.device,
    )
    prev_actions = torch.zeros(
        self.config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long
    )
    not_done_masks = torch.zeros(
        self.config.NUM_PROCESSES, 1, device=self.device
    )
    stats_episodes = dict()  # dict of dicts that stores stats per episode

    rgb_frames = [
        [] for _ in range(self.config.NUM_PROCESSES)
    ]  # type: List[List[np.ndarray]]

    if len(self.config.VIDEO_OPTION) > 0:
        os.makedirs(self.config.VIDEO_DIR, exist_ok=True)

    self.actor_critic.eval()
    while (
        len(stats_episodes) < self.config.TEST_EPISODE_COUNT
        and self.env.num_envs > 0
    ):
        current_episodes = self.env.current_episodes()

        with torch.no_grad():
            (
                _,
                actions,
                _,
                test_recurrent_hidden_states,
            ) = self.actor_critic.act(
                batch,
                test_recurrent_hidden_states,
                prev_actions,
                not_done_masks,
                deterministic=False,
            )

            prev_actions.copy_(actions)

        outputs = self.env.step([a[0].item() for a in actions])

        observations, rewards, dones, infos = [
            list(x) for x in zip(*outputs)
        ]
        batch = batch_obs(observations, self.device)

        not_done_masks = torch.tensor(
            [[0.0] if done else [1.0] for done in dones],
            dtype=torch.float,
            device=self.device,
        )

        rewards = torch.tensor(
            rewards, dtype=torch.float, device=self.device
        ).unsqueeze(1)
        current_episode_reward += rewards
        next_episodes = self.env.current_episodes()
        envs_to_pause = []
        n_envs = self.env.num_envs
        for i in range(n_envs):
            if (
                next_episodes[i].scene_id,
                next_episodes[i].episode_id,
            ) in stats_episodes:
                envs_to_pause.append(i)

            # episode ended
            if not_done_masks[i].item() == 0:
                episode_stats = dict()
                episode_stats[self.metric_uuid] = infos[i][
                    self.metric_uuid
                ]
                episode_stats["success"] = int(
                    infos[i][self.metric_uuid] > 0
                )
                episode_stats["reward"] = current_episode_reward[i].item()
                current_episode_reward[i] = 0
                # use scene_id + episode_id as unique id for storing stats
                stats_episodes[
                    (
                        current_episodes[i].scene_id,
                        current_episodes[i].episode_id,
                    )
                ] = episode_stats

                if len(self.config.VIDEO_OPTION) > 0:
                    generate_video(
                        video_option=self.config.VIDEO_OPTION,
                        video_dir=self.config.VIDEO_DIR,
                        images=rgb_frames[i],
                        episode_id=current_episodes[i].episode_id,
                        checkpoint_idx=0,
                        metric_name=self.metric_uuid,
                        metric_value=infos[i][self.metric_uuid],
                    )
                    rgb_frames[i] = []

            # episode continues
            elif len(self.config.VIDEO_OPTION) > 0:
                frame = observations_to_image(observations[i], infos[i])
                rgb_frames[i].append(frame)

        (
            self.env,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        ) = self._pause_envs(
            envs_to_pause,
            self.env,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        )

    aggregated_stats = dict()
    for stat_key in next(iter(stats_episodes.values())).keys():
        aggregated_stats[stat_key] = sum(
            [v[stat_key] for v in stats_episodes.values()]
        )
    num_episodes = len(stats_episodes)

    episode_reward_mean = aggregated_stats["reward"] / num_episodes
    episode_metric_mean = aggregated_stats[self.metric_uuid] / num_episodes
    episode_success_mean = aggregated_stats["success"] / num_episodes

    print(f"Average episode reward: {episode_reward_mean:.6f}")
    print(f"Average episode success: {episode_success_mean:.6f}")
    print(f"Average episode {self.metric_uuid}: {episode_metric_mean:.6f}")

    if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
        step_id = ckpt_dict["extra_state"]["step"]

    print("eval_reward", {"average reward": episode_reward_mean})
    print(
        f"eval_{self.metric_uuid}",
        {f"average {self.metric_uuid}": episode_metric_mean},
    )
    print("eval_success", {"average success": episode_success_mean})

    self.env.close()
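# A minimal sketch of invoking the evaluation above (hypothetical names: the
# enclosing trainer class and the config path are placeholders, not defined
# in this file):
#
#   config = get_config("configs/ppo_pointnav.yaml")
#   trainer = PPOTrainer(config)
#   trainer.eval("checkpoints/ckpt.0.pth")
#
# Note that eval() constructs its own envs via construct_envs, so no env
# needs to be passed in.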
def _collect_rollout_step(
    self,
    rollouts,
    current_episode_reward,
    episode_rewards,
    episode_counts,
    episode_dist,
):
    pth_time = 0.0
    env_time = 0.0

    t_sample_action = time.time()
    # sample actions
    with torch.no_grad():
        step_observation = {
            k: v[rollouts.step] for k, v in rollouts.observations.items()
        }

        (
            values,
            actions,
            actions_log_probs,
            recurrent_hidden_states,
        ) = self.actor_critic.act(
            step_observation,
            rollouts.recurrent_hidden_states[rollouts.step],
            rollouts.prev_actions[rollouts.step],
            rollouts.masks[rollouts.step],
        )

    pth_time += time.time() - t_sample_action

    t_step_env = time.time()

    outputs = self.env.step([a[0].item() for a in actions])

    # dones = [False]
    # if outputs['pointgoal_with_gps_compass'][0] < 0.2:
    #     dones = [True]
    # rewards = -1 * (outputs['pointgoal_with_gps_compass'][0])
    # observations = [outputs]
    observations, rewards, dones, infos = [list(x) for x in zip(*outputs)]

    env_time += time.time() - t_step_env

    t_update_stats = time.time()
    batch = batch_obs(observations)
    rewards = torch.tensor(
        rewards, dtype=torch.float, device=episode_rewards.device
    )
    rewards = rewards.unsqueeze(1)

    masks = torch.tensor(
        [[0.0] if done else [1.0] for done in dones],
        dtype=torch.float,
        device=episode_rewards.device,
    )

    current_episode_reward += rewards
    episode_rewards += (1 - masks) * current_episode_reward
    episode_counts += 1 - masks
    current_episode_reward *= masks

    episode_dist = observations[0]['pointgoal_with_gps_compass'][0]

    if self._static_encoder:
        with torch.no_grad():
            batch["visual_features"] = self._encoder(batch)

    rollouts.insert(
        batch,
        recurrent_hidden_states,
        actions,
        actions_log_probs,
        values,
        rewards,
        masks,
    )

    pth_time += time.time() - t_update_stats

    return pth_time, env_time, 1, episode_dist
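# The (1 - masks) bookkeeping in _collect_rollout_step accumulates per-episode
# totals without branching. A worked example with assumed values (not real
# data), for two parallel envs:
#
#   masks = torch.tensor([[0.0], [1.0]])  # env 0 just finished; env 1 did not
#   episode_rewards += (1 - masks) * current_episode_reward  # bank env 0's total
#   episode_counts += 1 - masks                              # count env 0's episode
#   current_episode_reward *= masks                          # reset env 0's accumulator
#
# For env 1 the mask is 1.0, so its running reward is left untouched.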
def train(self, env) -> None:
    r"""Main method for training PPO.

    Returns:
        None
    """
    self.env = env
    ppo_cfg = self.config.RL.PPO
    self.device = (
        torch.device("cuda", self.config.TORCH_GPU_ID)
        if torch.cuda.is_available()
        else torch.device("cpu")
    )

    if not os.path.isdir(self.config.CHECKPOINT_FOLDER):
        os.makedirs(self.config.CHECKPOINT_FOLDER)

    self._setup_actor_critic_agent(ppo_cfg)

    rollouts = RolloutStorage(
        ppo_cfg.num_steps,
        self.env.num_envs,
        self.env.observation_spaces[0],
        self.env.action_spaces[0],
        ppo_cfg.hidden_size,
    )
    rollouts.to(self.device)

    observations = self.env.reset()
    batch = batch_obs(observations)

    for sensor in rollouts.observations:
        rollouts.observations[sensor][0].copy_(batch[sensor])

    # batch and observations may contain shared PyTorch CUDA
    # tensors. We must explicitly clear them here otherwise
    # they will be kept in memory for the entire duration of training!
    batch = None
    observations = None

    episode_rewards = torch.zeros(self.env.num_envs, 1)
    episode_counts = torch.zeros(self.env.num_envs, 1)
    episode_dist = torch.zeros(self.env.num_envs, 1)
    current_episode_reward = torch.zeros(self.env.num_envs, 1)
    # window_episode_reward = deque(maxlen=ppo_cfg.reward_window_size)
    # window_episode_counts = deque(maxlen=ppo_cfg.reward_window_size)
    window_episode_reward = deque(maxlen=self.config.NUM_UPDATES)
    window_episode_counts = deque(maxlen=self.config.NUM_UPDATES)
    dist_val = deque(maxlen=self.config.NUM_UPDATES)

    t_start = time.time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0

    lr_scheduler = LambdaLR(
        optimizer=self.agent.optimizer,
        lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES),
    )

    for update in range(self.config.NUM_UPDATES):
        if ppo_cfg.use_linear_lr_decay:
            lr_scheduler.step()

        if ppo_cfg.use_linear_clip_decay:
            self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
                update, self.config.NUM_UPDATES
            )

        for step in range(ppo_cfg.num_steps):
            (
                delta_pth_time,
                delta_env_time,
                delta_steps,
                tmp_dist,
            ) = self._collect_rollout_step(
                rollouts,
                current_episode_reward,
                episode_rewards,
                episode_counts,
                episode_dist,
            )
            pth_time += delta_pth_time
            env_time += delta_env_time
            count_steps += delta_steps
            episode_dist = tmp_dist

        (
            delta_pth_time,
            value_loss,
            action_loss,
            dist_entropy,
        ) = self._update_agent(ppo_cfg, rollouts)
        pth_time += delta_pth_time

        window_episode_reward.append(episode_rewards.clone())
        window_episode_counts.append(episode_counts.clone())
        dist_val.append(episode_dist)

        losses = [value_loss, action_loss]
        stats = zip(
            ["count", "reward"],
            [window_episode_counts, window_episode_reward],
        )
        deltas = {
            k: (
                (v[-1] - v[0]).sum().item()
                if len(v) > 1
                else v[0].sum().item()
            )
            for k, v in stats
        }
        deltas["count"] = max(deltas["count"], 1.0)

        print("reward", deltas["reward"] / deltas["count"], count_steps)
        print(
            "losses",
            {k: l for l, k in zip(losses, ["value", "policy"])},
            count_steps,
        )

        # log stats
        if update > 0 and update % self.config.LOG_INTERVAL == 0:
            print(
                "update: {}\tfps: {:.3f}\t".format(
                    update, count_steps / (time.time() - t_start)
                )
            )
            print(
                "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
                "frames: {}".format(update, env_time, pth_time, count_steps)
            )

            window_rewards = (
                window_episode_reward[-1] - window_episode_reward[0]
            ).sum()
            window_counts = (
                window_episode_counts[-1] - window_episode_counts[0]
            ).sum()

            if window_counts > 0:
                print(
                    "Average window size {} reward: {:3f}".format(
                        len(window_episode_reward),
                        (window_rewards / window_counts).item(),
                    )
                )
            else:
                print("No episodes finish in current window")

        # checkpoint model
        if update % self.config.CHECKPOINT_INTERVAL == 0:
            self.save_checkpoint(
                f"ckpt.{count_checkpoints}.pth", dict(step=count_steps)
            )
            count_checkpoints += 1

    # each deque element is a (num_envs, 1) tensor; concatenate along dim=1
    # so savetxt receives a 2-D array of shape (num_envs, num_updates)
    np.savetxt(
        "window_episode_reward_ppo.csv",
        torch.cat(list(window_episode_reward), dim=1).cpu().numpy(),
        delimiter=",",
    )
    np.savetxt(
        "window_episode_counts_ppo.csv",
        torch.cat(list(window_episode_counts), dim=1).cpu().numpy(),
        delimiter=",",
    )
    # save the per-update distances collected in dist_val (episode_dist is a
    # single scalar from the last rollout step, which savetxt cannot write)
    np.savetxt("episode_dist_ppo.csv", np.array(dist_val), delimiter=",")

    self.env.close()
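# `linear_decay(x, N)` is assumed here to return 1 - x / N, so the LambdaLR
# in train() anneals the learning rate toward 0 over NUM_UPDATES; a sketch
# under that assumption:
#
#   def linear_decay(epoch: int, total_num_updates: int) -> float:
#       return 1 - (epoch / float(total_num_updates))
#
# A minimal training driver (hypothetical trainer class name):
#
#   envs = construct_envs(config, get_env_class(config.ENV_NAME))
#   trainer = PPOTrainer(config)
#   trainer.train(envs)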
def run(config, env, max_steps):
    r"""Runs a random-action baseline agent for `max_steps` updates.

    Returns:
        None
    """
    observations = env.reset()
    batch = batch_obs(observations)

    # batch and observations may contain shared CUDA tensors; clear them
    batch = None
    observations = None

    episode_rewards = torch.zeros(env.num_envs, 1)
    episode_counts = torch.zeros(env.num_envs, 1)
    episode_dist = torch.zeros(env.num_envs, 1)
    current_episode_reward = torch.zeros(env.num_envs, 1)
    window_episode_reward = deque(maxlen=max_steps)
    window_episode_counts = deque(maxlen=max_steps)
    dist_val = deque(maxlen=max_steps)

    t_start = time.time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0

    for update in range(max_steps):
        print(update)
        reward_sum = 0
        dist_sum = 0
        step_count = 0
        rgb_frames = []

        if len(config.VIDEO_OPTION) > 0:
            os.makedirs(config.VIDEO_DIR, exist_ok=True)

        # get name of performance metric, e.g. "spl"
        metric_name = config.TASK_CONFIG.TASK.MEASUREMENTS[0]
        metric_cfg = getattr(config.TASK_CONFIG.TASK, metric_name)
        measure_type = baseline_registry.get_measure(metric_cfg.TYPE)

        for step in range(500):
            dones = [False]
            while not dones[0]:
                outputs = env.step([env.action_spaces[0].sample()])
                observations, rewards, dones, infos = [
                    list(x) for x in zip(*outputs)
                ]
                reward_sum += rewards[0]
                dist_sum += observations[0]['pointgoal_with_gps_compass'][0]
                step_count += 1
                frame = observations_to_image(observations[0], [])
                rgb_frames.append(frame)
            observations = env.reset()

        window_episode_reward.append(reward_sum / step_count)
        window_episode_counts.append(step_count)
        dist_val.append(dist_sum / step_count)

        generate_video(
            video_option=config.VIDEO_OPTION,
            video_dir=config.VIDEO_DIR,
            images=np.array(rgb_frames),
            episode_id=update,
            checkpoint_idx=0,
            metric_name="spl",
            metric_value=1.0,
        )
        rgb_frames = []

    np.savetxt(
        "window_episode_reward_ppo.csv",
        window_episode_reward,
        delimiter=",",
    )
    np.savetxt(
        "window_episode_counts_ppo.csv",
        window_episode_counts,
        delimiter=",",
    )
    # save the collected per-update average distances (dist_val);
    # episode_dist is a never-updated zeros tensor
    np.savetxt("episode_dist_ppo.csv", np.array(dist_val), delimiter=",")

    env.close()
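# Example invocation of the random-action baseline above (hypothetical config
# path; `max_steps` is the number of outer updates, each covering 500 episodes):
#
#   config = get_config("configs/ppo_pointnav.yaml")
#   envs = construct_envs(config, get_env_class(config.ENV_NAME))
#   run(config, envs, max_steps=10)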