def act(self, observations: Observations) -> Dict[str, int]:
    """Choose the next action for a single observation.

    The observation is wrapped into a batch of one, run through the
    configured observation transforms and handed to the policy together
    with the recurrent state, previous action and mask kept on the agent.

    Args:
        observations: observation for the current step.

    Returns:
        A dict whose ``"action"`` entry is element ``[0][0]`` of the action
        tensor returned by the policy, converted with ``.item()``.
    """
    obs_batch = batch_obs([observations], device=self.device)
    obs_batch = apply_obs_transforms_batch(obs_batch, self.obs_transforms)

    with torch.no_grad():
        policy_output = self.actor_critic.act(
            obs_batch,
            self.test_recurrent_hidden_states,
            self.prev_actions,
            self.not_done_masks,
            deterministic=False,
        )
        _, actions, _, next_hidden_states = policy_output
        self.test_recurrent_hidden_states = next_hidden_states

        # Keep the mask at "not done" until reset is called at episode end.
        self.not_done_masks.fill_(True)
        self.prev_actions.copy_(actions)  # type: ignore

    chosen_action = actions[0][0].item()
    return {"action": chosen_action}
def act(self, observations):
    """Choose the next action for a single observation.

    The observation is wrapped into a batch of one and handed to the
    policy together with the recurrent state, previous action and mask
    kept on the agent.

    Returns:
        A dict whose ``"action"`` entry is element ``[0][0]`` of the action
        tensor returned by the policy, converted with ``.item()``.
    """
    obs_batch = batch_obs([observations], device=self.device)

    with torch.no_grad():
        policy_output = self.actor_critic.act(
            obs_batch,
            self.test_recurrent_hidden_states,
            self.prev_actions,
            self.not_done_masks,
            deterministic=False,
        )
        _, actions, _, next_hidden_states = policy_output
        self.test_recurrent_hidden_states = next_hidden_states

        # Keep the mask at "not done" until reset is called at episode end.
        self.not_done_masks = torch.ones(1, 1, device=self.device)
        self.prev_actions.copy_(actions)

    chosen_action = actions[0][0].item()
    return {"action": chosen_action}
def test_cubemap_stiching(test_cfg_path: str, mode: str, camera: str, sensor_type: str):
    """Exercise cubemap-based observation transforms.

    Six copies of the ``{sensor_type}_SENSOR`` config are registered on
    agent 0, one per cube face. For ``camera`` in ``"equirect"`` /
    ``"fisheye"`` the experiment is simply executed with those sensors.
    For ``"cubemap"`` the faces are converted cubemap -> equirect -> cubemap
    and the round-tripped faces are compared against the inputs.

    Raises:
        ValueError: if ``camera`` is not one of the three known names.
    """
    meta_config = get_config(config_paths=test_cfg_path)
    meta_config.defrost()
    config = meta_config.TASK_CONFIG

    # One orientation per cube face; the index is the camera id.
    face_orientations = [
        [0, math.pi, 0],  # Back
        [-math.pi / 2, 0, 0],  # Down
        [0, 0, 0],  # Front
        [0, math.pi / 2, 0],  # Right
        [0, 3 / 2 * math.pi, 0],  # Left
        [math.pi / 2, 0, 0],  # Up
    ]

    base_sensor_name = f"{sensor_type}_SENSOR"
    agent_sensors = config.SIMULATOR.AGENT_0.SENSORS
    if base_sensor_name not in agent_sensors:
        agent_sensors.append(base_sensor_name)
    base_sensor_cfg = getattr(config.SIMULATOR, base_sensor_name)

    sensor_uuids = []
    for camera_id, orientation in enumerate(face_orientations):
        face_name = f"{sensor_type}_{camera_id}"
        face_cfg = deepcopy(base_sensor_cfg)
        face_cfg.ORIENTATION = orientation
        face_cfg.UUID = face_name.lower()
        sensor_uuids.append(face_cfg.UUID)
        setattr(config.SIMULATOR, face_name, face_cfg)
        config.SIMULATOR.AGENT_0.SENSORS.append(face_name)

    meta_config.TASK_CONFIG = config
    meta_config.SENSORS = config.SIMULATOR.AGENT_0.SENSORS
    obs_transforms_cfg = meta_config.RL.POLICY.OBS_TRANSFORMS
    if camera == "equirect":
        obs_transforms_cfg.CUBE2EQ.SENSOR_UUIDS = tuple(sensor_uuids)
    elif camera == "fisheye":
        obs_transforms_cfg.CUBE2FISH.SENSOR_UUIDS = tuple(sensor_uuids)
    meta_config.freeze()

    if camera in ("equirect", "fisheye"):
        execute_exp(meta_config, mode)
        # Deinit processes group
        if torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()
        return

    if camera != "cubemap":
        raise ValueError(f"Unknown camera name: {camera}")

    # 1) Generate an equirect image from cubemap images.
    # 2) Generate cubemap images from the equirect image.
    # 3) Compare the input and output cubemap
    env_fn_args = []
    for split_name in ["train", "val"]:
        split_config = config.clone()
        split_config.defrost()
        split_config.DATASET["SPLIT"] = split_name
        split_config.freeze()
        env_fn_args.append((split_config, None))

    with VectorEnv(env_fn_args=env_fn_args) as envs:
        observations = envs.reset()
    batch = batch_obs(observations)
    # The transforms below receive `batch`; keep an untouched copy to compare.
    orig_batch = deepcopy(batch)

    # ProjectionTransformer
    cube2equirect_cls = baseline_registry.get_obs_transformer("CubeMap2Equirect")
    cube2equirect = cube2equirect_cls(sensor_uuids, (256, 512))
    equirect2cube_cls = baseline_registry.get_obs_transformer("Equirect2CubeMap")
    equirect2cube = equirect2cube_cls(cube2equirect.target_uuids, (256, 256))

    # Cubemap to Equirect to Cubemap
    batch_cube = equirect2cube(cube2equirect(batch))

    # Extract input and output cubemap
    output_cube = batch_cube[cube2equirect.target_uuids[0]]
    input_faces = [orig_batch[uuid] for uuid in sensor_uuids]
    input_cube = torch.flatten(torch.stack(input_faces, axis=1), end_dim=1)

    # Apply blur to absorb difference (blur, etc.) caused by conversion
    if sensor_type == "RGB":
        output_cube = output_cube.float() / 255
        input_cube = input_cube.float() / 255
    nhwc_to_nchw = (0, 3, 1, 2)
    output_cube = output_cube.permute(nhwc_to_nchw)  # NHWC => NCHW
    input_cube = input_cube.permute(nhwc_to_nchw)  # NHWC => NCHW
    apply_blur = torch.nn.AvgPool2d(5, 3, 2)
    output_cube = apply_blur(output_cube)
    input_cube = apply_blur(input_cube)

    # Calculate the difference
    diff = torch.abs(output_cube - input_cube)
    assert diff.mean().item() < 0.01
def _eval_checkpoint(
    self,
    checkpoint_path: str,
    writer: TensorboardWriter,
    checkpoint_index: int = 0,
) -> None:
    r"""Evaluates a single checkpoint.

    Loads the checkpoint, builds the evaluation environments, rolls the
    policy out until the requested number of distinct episodes has been
    collected, then logs the per-episode averages to the logger and to
    ``writer``.

    Args:
        checkpoint_path: path of checkpoint
        writer: tensorboard writer object for logging to tensorboard
        checkpoint_index: index of cur checkpoint for logging; used as the
            logging step unless the checkpoint stores
            ``extra_state["step"]``

    Returns:
        None

    Raises:
        RuntimeError: if the trainer is in distributed mode.
    """
    if self._is_distributed:
        raise RuntimeError("Evaluation does not support distributed mode")

    # Map location CPU is almost always better than mapping to a CUDA device.
    ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

    if self.config.EVAL.USE_CKPT_CONFIG:
        config = self._setup_eval_config(ckpt_dict["config"])
    else:
        config = self.config.clone()

    ppo_cfg = config.RL.PPO

    config.defrost()
    config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
    config.freeze()

    if len(self.config.VIDEO_OPTION) > 0:
        # Extra measurements are requested whenever videos are generated.
        config.defrost()
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS")
        config.freeze()

    if config.VERBOSE:
        logger.info(f"env config: {config}")

    self._init_envs(config)
    self._setup_actor_critic_agent(ppo_cfg)

    self.agent.load_state_dict(ckpt_dict["state_dict"])
    self.actor_critic = self.agent.actor_critic

    observations = self.envs.reset()
    batch = batch_obs(
        observations, device=self.device, cache=self._obs_batching_cache
    )
    batch = apply_obs_transforms_batch(batch, self.obs_transforms)

    current_episode_reward = torch.zeros(
        self.envs.num_envs, 1, device="cpu"
    )

    test_recurrent_hidden_states = torch.zeros(
        self.config.NUM_ENVIRONMENTS,
        self.actor_critic.net.num_recurrent_layers,
        ppo_cfg.hidden_size,
        device=self.device,
    )
    prev_actions = torch.zeros(
        self.config.NUM_ENVIRONMENTS,
        1,
        device=self.device,
        dtype=torch.long,
    )
    not_done_masks = torch.zeros(
        self.config.NUM_ENVIRONMENTS,
        1,
        device=self.device,
        dtype=torch.bool,
    )
    stats_episodes: Dict[
        Any, Any
    ] = {}  # dict of dicts that stores stats per episode

    rgb_frames = [
        [] for _ in range(self.config.NUM_ENVIRONMENTS)
    ]  # type: List[List[np.ndarray]]
    if len(self.config.VIDEO_OPTION) > 0:
        os.makedirs(self.config.VIDEO_DIR, exist_ok=True)

    # -1 means "evaluate on every episode the environments provide".
    number_of_eval_episodes = self.config.TEST_EPISODE_COUNT
    if number_of_eval_episodes == -1:
        number_of_eval_episodes = sum(self.envs.number_of_episodes)
    else:
        total_num_eps = sum(self.envs.number_of_episodes)
        if total_num_eps < number_of_eval_episodes:
            # Both literals need the f prefix: implicit concatenation does
            # not propagate it, so the count was previously logged as the
            # raw text "{total_num_eps}".
            logger.warn(
                f"Config specified {number_of_eval_episodes} eval episodes"
                f", dataset only has {total_num_eps}."
            )
            logger.warn(f"Evaluating with {total_num_eps} instead.")
            number_of_eval_episodes = total_num_eps

    pbar = tqdm.tqdm(total=number_of_eval_episodes)
    self.actor_critic.eval()
    while (
        len(stats_episodes) < number_of_eval_episodes
        and self.envs.num_envs > 0
    ):
        # Episodes that are running *before* the step; stats of an episode
        # that ends on this step are stored under these ids.
        current_episodes = self.envs.current_episodes()

        with torch.no_grad():
            (
                _,
                actions,
                _,
                test_recurrent_hidden_states,
            ) = self.actor_critic.act(
                batch,
                test_recurrent_hidden_states,
                prev_actions,
                not_done_masks,
                deterministic=False,
            )

            prev_actions.copy_(actions)  # type: ignore

        # NB: Move actions to CPU.  If CUDA tensors are
        # sent in to env.step(), that will create CUDA contexts
        # in the subprocesses.
        # For backwards compatibility, we also call .item() to convert to
        # an int
        step_data = [a.item() for a in actions.to(device="cpu")]

        outputs = self.envs.step(step_data)

        observations, rewards_l, dones, infos = [
            list(x) for x in zip(*outputs)
        ]
        batch = batch_obs(
            observations,
            device=self.device,
            cache=self._obs_batching_cache,
        )
        batch = apply_obs_transforms_batch(batch, self.obs_transforms)

        not_done_masks = torch.tensor(
            [[not done] for done in dones],
            dtype=torch.bool,
            device="cpu",
        )

        rewards = torch.tensor(
            rewards_l, dtype=torch.float, device="cpu"
        ).unsqueeze(1)
        current_episode_reward += rewards
        next_episodes = self.envs.current_episodes()
        envs_to_pause = []
        n_envs = self.envs.num_envs
        for i in range(n_envs):
            # An env whose next episode was already evaluated is paused so
            # no episode is counted twice.
            if (
                next_episodes[i].scene_id,
                next_episodes[i].episode_id,
            ) in stats_episodes:
                envs_to_pause.append(i)

            # episode ended
            if not not_done_masks[i].item():
                pbar.update()
                episode_stats = {}
                episode_stats["reward"] = current_episode_reward[i].item()
                episode_stats.update(
                    self._extract_scalars_from_info(infos[i])
                )
                current_episode_reward[i] = 0
                # use scene_id + episode_id as unique id for storing stats
                stats_episodes[
                    (
                        current_episodes[i].scene_id,
                        current_episodes[i].episode_id,
                    )
                ] = episode_stats

                if len(self.config.VIDEO_OPTION) > 0:
                    generate_video(
                        video_option=self.config.VIDEO_OPTION,
                        video_dir=self.config.VIDEO_DIR,
                        images=rgb_frames[i],
                        episode_id=current_episodes[i].episode_id,
                        checkpoint_idx=checkpoint_index,
                        metrics=self._extract_scalars_from_info(infos[i]),
                        tb_writer=writer,
                    )

                    rgb_frames[i] = []

            # episode continues
            elif len(self.config.VIDEO_OPTION) > 0:
                # TODO move normalization / channel changing out of the policy and undo it here
                frame = observations_to_image(
                    {k: v[i] for k, v in batch.items()}, infos[i]
                )
                rgb_frames[i].append(frame)

        # Masks were built on CPU for the per-env checks above; the policy
        # consumes them on the trainer device.
        not_done_masks = not_done_masks.to(device=self.device)
        (
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        ) = self._pause_envs(
            envs_to_pause,
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        )

    num_episodes = len(stats_episodes)
    aggregated_stats = {}
    # The keys of the first stored episode define which stats are averaged.
    for stat_key in next(iter(stats_episodes.values())).keys():
        aggregated_stats[stat_key] = (
            sum(v[stat_key] for v in stats_episodes.values())
            / num_episodes
        )

    for k, v in aggregated_stats.items():
        logger.info(f"Average episode {k}: {v:.4f}")

    step_id = checkpoint_index
    if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
        step_id = ckpt_dict["extra_state"]["step"]

    writer.add_scalars(
        "eval_reward",
        {"average reward": aggregated_stats["reward"]},
        step_id,
    )

    metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"}
    if len(metrics) > 0:
        writer.add_scalars("eval_metrics", metrics, step_id)

    self.envs.close()
def _collect_environment_result(self, buffer_index: int = 0):
    """Wait for one buffer's environments and store their step results.

    The environments are split into ``self._nbuffers`` contiguous groups;
    this call handles the group selected by ``buffer_index``. It updates
    the running episode statistics and inserts the new observations,
    rewards and masks into ``self.rollouts``.

    Args:
        buffer_index: which group of environments to collect from.

    Returns:
        The number of environments in the handled group.
    """
    total_envs = self.envs.num_envs
    first_env = int(buffer_index * total_envs / self._nbuffers)
    last_env = int((buffer_index + 1) * total_envs / self._nbuffers)
    env_slice = slice(first_env, last_env)

    env_wait_start = time.time()
    outputs = []
    for env_index in range(env_slice.start, env_slice.stop):
        outputs.append(self.envs.wait_step_at(env_index))

    observations, rewards_l, dones, infos = [
        list(column) for column in zip(*outputs)
    ]
    self.env_time += time.time() - env_wait_start

    stats_start = time.time()
    batch = apply_obs_transforms_batch(
        batch_obs(
            observations, device=self.device, cache=self._obs_batching_cache
        ),
        self.obs_transforms,
    )

    # Statistics tensors live on the same device as the reward accumulator.
    stats_device = self.current_episode_reward.device
    rewards = torch.tensor(
        rewards_l, dtype=torch.float, device=stats_device
    ).unsqueeze(1)
    not_done_masks = torch.tensor(
        [[not done] for done in dones],
        dtype=torch.bool,
        device=stats_device,
    )
    done_masks = torch.logical_not(not_done_masks)

    self.current_episode_reward[env_slice] += rewards
    current_ep_reward = self.current_episode_reward[env_slice]

    # Only environments whose episode just finished contribute to the stats.
    finished_reward = current_ep_reward.where(
        done_masks, current_ep_reward.new_zeros(())
    )
    self.running_episode_stats["reward"][env_slice] += finished_reward  # type: ignore
    self.running_episode_stats["count"][env_slice] += done_masks.float()  # type: ignore

    for name, values in self._extract_scalars_from_infos(infos).items():
        value_column = torch.tensor(
            values, dtype=torch.float, device=stats_device
        ).unsqueeze(1)
        if name not in self.running_episode_stats:
            self.running_episode_stats[name] = torch.zeros_like(
                self.running_episode_stats["count"]
            )
        finished_value = value_column.where(
            done_masks, value_column.new_zeros(())
        )
        self.running_episode_stats[name][env_slice] += finished_value  # type: ignore

    # Reset the accumulator for environments that start a new episode.
    self.current_episode_reward[env_slice].masked_fill_(done_masks, 0.0)

    if self._static_encoder:
        with torch.no_grad():
            batch["visual_features"] = self._encoder(batch)

    self.rollouts.insert(
        next_observations=batch,
        rewards=rewards,
        next_masks=not_done_masks,
        buffer_index=buffer_index,
    )
    self.rollouts.advance_rollout(buffer_index)

    self.pth_time += time.time() - stats_start

    return env_slice.stop - env_slice.start
def _init_train(self):
    """Prepare the trainer state needed before the first rollout.

    Optionally initialises distributed training, then builds the
    environments, the actor-critic agent and the rollout storage, writes
    the first (reset) observation batch into the storage and initialises
    the reward/statistics accumulators and timers stored on ``self``.
    """
    if self.config.RL.DDPPO.force_distributed:
        self._is_distributed = True

    if is_slurm_batch_job():
        add_signal_handlers()

    if self._is_distributed:
        local_rank, tcp_store = init_distrib_slurm(
            self.config.RL.DDPPO.distrib_backend
        )
        if rank0_only():
            logger.info(
                "Initialized DD-PPO with {} workers".format(
                    torch.distributed.get_world_size()
                )
            )

        # Each worker uses the GPU that matches its local rank.
        self.config.defrost()
        self.config.TORCH_GPU_ID = local_rank
        self.config.SIMULATOR_GPU_ID = local_rank
        # Multiply by the number of simulators to make sure they also get unique seeds
        self.config.TASK_CONFIG.SEED += (
            torch.distributed.get_rank() * self.config.NUM_ENVIRONMENTS
        )
        self.config.freeze()

        random.seed(self.config.TASK_CONFIG.SEED)
        np.random.seed(self.config.TASK_CONFIG.SEED)
        torch.manual_seed(self.config.TASK_CONFIG.SEED)
        # Key/value counter stored under the "rollout_tracker" prefix,
        # initialised to "0" here.
        self.num_rollouts_done_store = torch.distributed.PrefixStore(
            "rollout_tracker", tcp_store
        )
        self.num_rollouts_done_store.set("num_done", "0")

    if rank0_only() and self.config.VERBOSE:
        logger.info(f"config: {self.config}")

    profiling_wrapper.configure(
        capture_start_step=self.config.PROFILING.CAPTURE_START_STEP,
        num_steps_to_capture=self.config.PROFILING.NUM_STEPS_TO_CAPTURE,
    )

    self._init_envs()

    ppo_cfg = self.config.RL.PPO
    if torch.cuda.is_available():
        self.device = torch.device("cuda", self.config.TORCH_GPU_ID)
        torch.cuda.set_device(self.device)
    else:
        self.device = torch.device("cpu")

    # Only rank 0 creates the checkpoint folder.
    if rank0_only() and not os.path.isdir(self.config.CHECKPOINT_FOLDER):
        os.makedirs(self.config.CHECKPOINT_FOLDER)

    self._setup_actor_critic_agent(ppo_cfg)
    if self._is_distributed:
        self.agent.init_distributed(find_unused_params=True)

    logger.info(
        "agent number of parameters: {}".format(
            sum(param.numel() for param in self.agent.parameters())
        )
    )

    obs_space = self.obs_space
    if self._static_encoder:
        # With a static encoder, the encoder output is stored in the
        # rollouts as an extra "visual_features" observation.
        self._encoder = self.actor_critic.net.visual_encoder
        obs_space = spaces.Dict(
            {
                "visual_features": spaces.Box(
                    low=np.finfo(np.float32).min,
                    high=np.finfo(np.float32).max,
                    shape=self._encoder.output_shape,
                    dtype=np.float32,
                ),
                **obs_space.spaces,
            }
        )

    # Number of environment groups; _collect_environment_result uses it to
    # slice the environments per buffer.
    self._nbuffers = 2 if ppo_cfg.use_double_buffered_sampler else 1
    self.rollouts = RolloutStorage(
        ppo_cfg.num_steps,
        self.envs.num_envs,
        obs_space,
        self.envs.action_spaces[0],
        ppo_cfg.hidden_size,
        num_recurrent_layers=self.actor_critic.net.num_recurrent_layers,
        is_double_buffered=ppo_cfg.use_double_buffered_sampler,
    )
    self.rollouts.to(self.device)

    observations = self.envs.reset()
    batch = batch_obs(
        observations, device=self.device, cache=self._obs_batching_cache
    )
    batch = apply_obs_transforms_batch(batch, self.obs_transforms)

    if self._static_encoder:
        with torch.no_grad():
            batch["visual_features"] = self._encoder(batch)

    # Seed step 0 of the rollout storage with the reset observations.
    self.rollouts.buffers["observations"][0] = batch

    # No device is given for these tensors, so they use torch's default.
    self.current_episode_reward = torch.zeros(self.envs.num_envs, 1)
    self.running_episode_stats = dict(
        count=torch.zeros(self.envs.num_envs, 1),
        reward=torch.zeros(self.envs.num_envs, 1),
    )
    # Each stat keeps at most reward_window_size entries.
    self.window_episode_stats = defaultdict(
        lambda: deque(maxlen=ppo_cfg.reward_window_size)
    )

    self.env_time = 0.0
    self.pth_time = 0.0
    self.t_start = time.time()
def train(self) -> None:
    r"""Main method for DD-PPO.

    Initialises the distributed backend, environments, agent and rollout
    storage, optionally resumes from a saved interrupted state, then runs
    the update loop: collect rollouts, update the agent, reduce statistics
    across workers, and (on world rank 0 only) log and checkpoint.

    Returns:
        None
    """
    self.local_rank, tcp_store = init_distrib_slurm(
        self.config.RL.DDPPO.distrib_backend
    )
    add_signal_handlers()

    # Stores the number of workers that have finished their rollout
    num_rollouts_done_store = distrib.PrefixStore(
        "rollout_tracker", tcp_store
    )
    num_rollouts_done_store.set("num_done", "0")

    self.world_rank = distrib.get_rank()
    self.world_size = distrib.get_world_size()

    # Each worker uses the GPU that matches its local rank.
    self.config.defrost()
    self.config.TORCH_GPU_ID = self.local_rank
    self.config.SIMULATOR_GPU_ID = self.local_rank
    # Multiply by the number of simulators to make sure they also get unique seeds
    self.config.TASK_CONFIG.SEED += (
        self.world_rank * self.config.NUM_PROCESSES
    )
    self.config.freeze()

    random.seed(self.config.TASK_CONFIG.SEED)
    np.random.seed(self.config.TASK_CONFIG.SEED)
    torch.manual_seed(self.config.TASK_CONFIG.SEED)

    if torch.cuda.is_available():
        self.device = torch.device("cuda", self.local_rank)
        torch.cuda.set_device(self.device)
    else:
        self.device = torch.device("cpu")

    self.envs = construct_envs(
        self.config,
        get_env_class(self.config.ENV_NAME),
        workers_ignore_signals=True,
    )

    ppo_cfg = self.config.RL.PPO
    # Only world rank 0 creates the checkpoint folder.
    if (
        not os.path.isdir(self.config.CHECKPOINT_FOLDER)
        and self.world_rank == 0
    ):
        os.makedirs(self.config.CHECKPOINT_FOLDER)

    self._setup_actor_critic_agent(ppo_cfg)
    self.agent.init_distributed(find_unused_params=True)

    if self.world_rank == 0:
        logger.info(
            "agent number of trainable parameters: {}".format(
                sum(
                    param.numel()
                    for param in self.agent.parameters()
                    if param.requires_grad
                )
            )
        )

    observations = self.envs.reset()
    batch = batch_obs(observations, device=self.device)
    batch = apply_obs_transforms_batch(batch, self.obs_transforms)

    obs_space = self.obs_space
    if self._static_encoder:
        # With a static encoder, the encoder output is stored in the
        # rollouts as an extra "visual_features" observation.
        self._encoder = self.actor_critic.net.visual_encoder
        obs_space = SpaceDict(
            {
                "visual_features": spaces.Box(
                    low=np.finfo(np.float32).min,
                    high=np.finfo(np.float32).max,
                    shape=self._encoder.output_shape,
                    dtype=np.float32,
                ),
                **obs_space.spaces,
            }
        )
        with torch.no_grad():
            batch["visual_features"] = self._encoder(batch)

    rollouts = RolloutStorage(
        ppo_cfg.num_steps,
        self.envs.num_envs,
        obs_space,
        self.envs.action_spaces[0],
        ppo_cfg.hidden_size,
        num_recurrent_layers=self.actor_critic.net.num_recurrent_layers,
    )
    rollouts.to(self.device)

    # Seed step 0 of the rollout storage with the reset observations.
    for sensor in rollouts.observations:
        rollouts.observations[sensor][0].copy_(batch[sensor])

    # batch and observations may contain shared PyTorch CUDA
    # tensors.  We must explicitly clear them here otherwise
    # they will be kept in memory for the entire duration of training!
    batch = None
    observations = None

    current_episode_reward = torch.zeros(
        self.envs.num_envs, 1, device=self.device
    )
    running_episode_stats = dict(
        count=torch.zeros(self.envs.num_envs, 1, device=self.device),
        reward=torch.zeros(self.envs.num_envs, 1, device=self.device),
    )
    # Each stat keeps at most reward_window_size snapshots.
    window_episode_stats = defaultdict(
        lambda: deque(maxlen=ppo_cfg.reward_window_size)
    )

    t_start = time.time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0
    start_update = 0
    # Wall-clock time carried over from a previous (requeued) run.
    prev_time = 0

    lr_scheduler = LambdaLR(
        optimizer=self.agent.optimizer,
        lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES),
    )

    # Resume counters, optimizer and scheduler if a previous run saved an
    # interrupted state.
    interrupted_state = load_interrupted_state()
    if interrupted_state is not None:
        self.agent.load_state_dict(interrupted_state["state_dict"])
        self.agent.optimizer.load_state_dict(
            interrupted_state["optim_state"]
        )
        lr_scheduler.load_state_dict(interrupted_state["lr_sched_state"])

        requeue_stats = interrupted_state["requeue_stats"]
        env_time = requeue_stats["env_time"]
        pth_time = requeue_stats["pth_time"]
        count_steps = requeue_stats["count_steps"]
        count_checkpoints = requeue_stats["count_checkpoints"]
        start_update = requeue_stats["start_update"]
        prev_time = requeue_stats["prev_time"]

    # Only world rank 0 gets a real writer; on other ranks
    # contextlib.suppress() yields None and `writer` is never used.
    with (
        TensorboardWriter(
            self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs
        )
        if self.world_rank == 0
        else contextlib.suppress()
    ) as writer:
        for update in range(start_update, self.config.NUM_UPDATES):
            if ppo_cfg.use_linear_lr_decay:
                lr_scheduler.step()

            if ppo_cfg.use_linear_clip_decay:
                self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
                    update, self.config.NUM_UPDATES
                )

            # Exit requested: close the envs, let rank 0 save the state
            # needed to resume when a requeue was also requested, then stop.
            if EXIT.is_set():
                self.envs.close()

                if REQUEUE.is_set() and self.world_rank == 0:
                    requeue_stats = dict(
                        env_time=env_time,
                        pth_time=pth_time,
                        count_steps=count_steps,
                        count_checkpoints=count_checkpoints,
                        start_update=update,
                        prev_time=(time.time() - t_start) + prev_time,
                    )
                    save_interrupted_state(
                        dict(
                            state_dict=self.agent.state_dict(),
                            optim_state=self.agent.optimizer.state_dict(),
                            lr_sched_state=lr_scheduler.state_dict(),
                            config=self.config,
                            requeue_stats=requeue_stats,
                        )
                    )

                requeue_job()
                return

            count_steps_delta = 0
            self.agent.eval()
            for step in range(ppo_cfg.num_steps):

                (
                    delta_pth_time,
                    delta_env_time,
                    delta_steps,
                ) = self._collect_rollout_step(
                    rollouts, current_episode_reward, running_episode_stats
                )
                pth_time += delta_pth_time
                env_time += delta_env_time
                count_steps_delta += delta_steps

                # This is where the preemption of workers happens.  If a
                # worker detects it will be a straggler, it preempts itself!
                if (
                    step
                    >= ppo_cfg.num_steps * self.SHORT_ROLLOUT_THRESHOLD
                ) and int(num_rollouts_done_store.get("num_done")) > (
                    self.config.RL.DDPPO.sync_frac * self.world_size
                ):
                    break

            # Announce that this worker finished its rollout.
            num_rollouts_done_store.add("num_done", 1)

            self.agent.train()
            if self._static_encoder:
                # The static encoder stays in eval mode during the update.
                self._encoder.eval()

            (
                delta_pth_time,
                value_loss,
                action_loss,
                dist_entropy,
            ) = self._update_agent(ppo_cfg, rollouts)
            pth_time += delta_pth_time

            # Sort the keys so the stacked tensor has the same layout in
            # every iteration before it is reduced.
            # NOTE(review): `distrib` is presumably torch.distributed, whose
            # all_reduce sums across workers by default -- confirm.
            stats_ordering = sorted(running_episode_stats.keys())
            stats = torch.stack(
                [running_episode_stats[k] for k in stats_ordering], 0
            )
            distrib.all_reduce(stats)

            for i, k in enumerate(stats_ordering):
                window_episode_stats[k].append(stats[i].clone())

            stats = torch.tensor(
                [value_loss, action_loss, count_steps_delta],
                device=self.device,
            )
            distrib.all_reduce(stats)
            count_steps += stats[2].item()

            if self.world_rank == 0:
                # Reset the finished-rollout counter for the next update.
                num_rollouts_done_store.set("num_done", "0")

                # Reduced losses are divided by the number of workers.
                losses = [
                    stats[0].item() / self.world_size,
                    stats[1].item() / self.world_size,
                ]
                # Change of each stat between the oldest and newest
                # snapshot in the window (or the single snapshot itself).
                deltas = {
                    k: (
                        (v[-1] - v[0]).sum().item()
                        if len(v) > 1
                        else v[0].sum().item()
                    )
                    for k, v in window_episode_stats.items()
                }
                # Avoid dividing by zero when no episode finished.
                deltas["count"] = max(deltas["count"], 1.0)

                writer.add_scalar(
                    "reward",
                    deltas["reward"] / deltas["count"],
                    count_steps,
                )

                # Check to see if there are any metrics
                # that haven't been logged yet
                metrics = {
                    k: v / deltas["count"]
                    for k, v in deltas.items()
                    if k not in {"reward", "count"}
                }
                if len(metrics) > 0:
                    writer.add_scalars("metrics", metrics, count_steps)

                writer.add_scalars(
                    "losses",
                    {k: l for l, k in zip(losses, ["value", "policy"])},
                    count_steps,
                )

                # log stats
                if update > 0 and update % self.config.LOG_INTERVAL == 0:
                    logger.info(
                        "update: {}\tfps: {:.3f}\t".format(
                            update,
                            count_steps
                            / ((time.time() - t_start) + prev_time),
                        )
                    )

                    logger.info(
                        "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
                        "frames: {}".format(
                            update, env_time, pth_time, count_steps
                        )
                    )
                    logger.info(
                        "Average window size: {}  {}".format(
                            len(window_episode_stats["count"]),
                            "  ".join(
                                "{}: {:.3f}".format(k, v / deltas["count"])
                                for k, v in deltas.items()
                                if k != "count"
                            ),
                        )
                    )

                # checkpoint model
                if update % self.config.CHECKPOINT_INTERVAL == 0:
                    self.save_checkpoint(
                        f"ckpt.{count_checkpoints}.pth",
                        dict(step=count_steps),
                    )
                    count_checkpoints += 1

        self.envs.close()
def train(self) -> None:
    r"""Main method for training PPO.

    Builds the environments, agent and rollout storage, then runs
    ``NUM_UPDATES`` iterations of: collect ``num_steps`` rollout steps,
    update the agent, log windowed statistics and losses to tensorboard,
    and periodically log to the logger and save a checkpoint.

    Returns:
        None
    """
    profiling_wrapper.configure(
        capture_start_step=self.config.PROFILING.CAPTURE_START_STEP,
        num_steps_to_capture=self.config.PROFILING.NUM_STEPS_TO_CAPTURE,
    )

    self.envs = construct_envs(self.config,
                               get_env_class(self.config.ENV_NAME))

    ppo_cfg = self.config.RL.PPO
    self.device = (torch.device("cuda", self.config.TORCH_GPU_ID)
                   if torch.cuda.is_available() else torch.device("cpu"))
    if not os.path.isdir(self.config.CHECKPOINT_FOLDER):
        os.makedirs(self.config.CHECKPOINT_FOLDER)
    self._setup_actor_critic_agent(ppo_cfg)
    logger.info("agent number of parameters: {}".format(
        sum(param.numel() for param in self.agent.parameters())))

    rollouts = RolloutStorage(
        ppo_cfg.num_steps,
        self.envs.num_envs,
        self.obs_space,
        self.envs.action_spaces[0],
        ppo_cfg.hidden_size,
    )
    rollouts.to(self.device)

    observations = self.envs.reset()
    batch = batch_obs(observations, device=self.device)
    batch = apply_obs_transforms_batch(batch, self.obs_transforms)

    # Seed step 0 of the rollout storage with the reset observations.
    for sensor in rollouts.observations:
        rollouts.observations[sensor][0].copy_(batch[sensor])

    # batch and observations may contain shared PyTorch CUDA
    # tensors.  We must explicitly clear them here otherwise
    # they will be kept in memory for the entire duration of training!
    batch = None
    observations = None

    # No device is given for these tensors, so they use torch's default.
    current_episode_reward = torch.zeros(self.envs.num_envs, 1)
    running_episode_stats = dict(
        count=torch.zeros(self.envs.num_envs, 1),
        reward=torch.zeros(self.envs.num_envs, 1),
    )
    # Each stat keeps at most reward_window_size snapshots.
    window_episode_stats: DefaultDict[str, deque] = defaultdict(
        lambda: deque(maxlen=ppo_cfg.reward_window_size))

    t_start = time.time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0

    lr_scheduler = LambdaLR(
        optimizer=self.agent.optimizer,
        lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES
                                         ),  # type: ignore
    )

    with TensorboardWriter(self.config.TENSORBOARD_DIR,
                           flush_secs=self.flush_secs) as writer:
        for update in range(self.config.NUM_UPDATES):
            # Every range_push below is matched by a range_pop that names
            # the range it closes.
            profiling_wrapper.on_start_step()
            profiling_wrapper.range_push("train update")

            if ppo_cfg.use_linear_lr_decay:
                lr_scheduler.step()  # type: ignore

            if ppo_cfg.use_linear_clip_decay:
                self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
                    update, self.config.NUM_UPDATES)

            profiling_wrapper.range_push("rollouts loop")
            for _step in range(ppo_cfg.num_steps):
                (
                    delta_pth_time,
                    delta_env_time,
                    delta_steps,
                ) = self._collect_rollout_step(rollouts,
                                               current_episode_reward,
                                               running_episode_stats)
                pth_time += delta_pth_time
                env_time += delta_env_time
                count_steps += delta_steps
            profiling_wrapper.range_pop()  # rollouts loop

            (
                delta_pth_time,
                value_loss,
                action_loss,
                dist_entropy,
            ) = self._update_agent(ppo_cfg, rollouts)
            pth_time += delta_pth_time

            # Snapshot the running stats; clone so later in-place updates
            # do not change the stored snapshot.
            for k, v in running_episode_stats.items():
                window_episode_stats[k].append(v.clone())

            # Change of each stat between the oldest and newest snapshot in
            # the window (or the single snapshot itself).
            deltas = {
                k: ((v[-1] - v[0]).sum().item()
                    if len(v) > 1 else v[0].sum().item())
                for k, v in window_episode_stats.items()
            }
            # Avoid dividing by zero when no episode finished.
            deltas["count"] = max(deltas["count"], 1.0)

            writer.add_scalar("reward", deltas["reward"] / deltas["count"],
                              count_steps)

            # Check to see if there are any metrics
            # that haven't been logged yet
            metrics = {
                k: v / deltas["count"]
                for k, v in deltas.items() if k not in {"reward", "count"}
            }
            if len(metrics) > 0:
                writer.add_scalars("metrics", metrics, count_steps)

            losses = [value_loss, action_loss]
            writer.add_scalars(
                "losses",
                {k: l
                 for l, k in zip(losses, ["value", "policy"])},
                count_steps,
            )

            # log stats
            if update > 0 and update % self.config.LOG_INTERVAL == 0:
                logger.info("update: {}\tfps: {:.3f}\t".format(
                    update, count_steps / (time.time() - t_start)))

                logger.info(
                    "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
                    "frames: {}".format(update, env_time, pth_time,
                                        count_steps))

                logger.info("Average window size: {}  {}".format(
                    len(window_episode_stats["count"]),
                    "  ".join("{}: {:.3f}".format(k, v / deltas["count"])
                              for k, v in deltas.items() if k != "count"),
                ))

            # checkpoint model
            if update % self.config.CHECKPOINT_INTERVAL == 0:
                self.save_checkpoint(f"ckpt.{count_checkpoints}.pth",
                                     dict(step=count_steps))
                count_checkpoints += 1

            profiling_wrapper.range_pop()  # train update

        self.envs.close()
def _collect_rollout_step(self, rollouts, current_episode_reward,
                          running_episode_stats):
    """Run one policy/environment step and record it in ``rollouts``.

    ``current_episode_reward`` and ``running_episode_stats`` are updated
    in place.

    Returns:
        A ``(pth_time, env_time, num_envs)`` tuple: seconds spent in policy
        and bookkeeping code, seconds spent stepping the environments, and
        ``self.envs.num_envs``.
    """
    sample_start = time.time()
    # sample actions
    with torch.no_grad():
        current_step = rollouts.step
        step_observation = {
            sensor: values[current_step]
            for sensor, values in rollouts.observations.items()
        }

        profiling_wrapper.range_push("compute actions")
        policy_output = self.actor_critic.act(
            step_observation,
            rollouts.recurrent_hidden_states[rollouts.step],
            rollouts.prev_actions[rollouts.step],
            rollouts.masks[rollouts.step],
        )
        (
            values,
            actions,
            actions_log_probs,
            recurrent_hidden_states,
        ) = policy_output
    sample_elapsed = time.time() - sample_start

    env_start = time.time()
    # NB: Move actions to CPU.  If CUDA tensors are
    # sent in to env.step(), that will create CUDA contexts
    # in the subprocesses.
    # For backwards compatibility, we also call .item() to convert to
    # an int
    step_data = [action.item() for action in actions.to(device="cpu")]
    profiling_wrapper.range_pop()  # compute actions

    outputs = self.envs.step(step_data)
    observations, rewards_l, dones, infos = [
        list(column) for column in zip(*outputs)
    ]
    env_time = time.time() - env_start

    stats_start = time.time()
    batch = apply_obs_transforms_batch(
        batch_obs(observations, device=self.device), self.obs_transforms)

    # Statistics tensors live on the same device as the reward accumulator.
    stats_device = current_episode_reward.device
    rewards = torch.tensor(rewards_l, dtype=torch.float,
                           device=stats_device).unsqueeze(1)
    # 1.0 while an episode is still running, 0.0 once it has finished.
    masks = torch.tensor(
        [[float(not done)] for done in dones],
        dtype=torch.float,
        device=stats_device,
    )
    finished = 1 - masks

    current_episode_reward += rewards
    running_episode_stats["reward"] += finished * current_episode_reward  # type: ignore
    running_episode_stats["count"] += finished  # type: ignore
    for name, raw_values in self._extract_scalars_from_infos(infos).items():
        value_column = torch.tensor(raw_values, dtype=torch.float,
                                    device=stats_device).unsqueeze(1)
        if name not in running_episode_stats:
            running_episode_stats[name] = torch.zeros_like(
                running_episode_stats["count"])
        running_episode_stats[name] += finished * value_column  # type: ignore

    # Zero the accumulator for environments whose episode just ended.
    current_episode_reward *= masks

    if self._static_encoder:
        with torch.no_grad():
            batch["visual_features"] = self._encoder(batch)

    rollouts.insert(
        batch,
        recurrent_hidden_states,
        actions,
        actions_log_probs,
        values,
        rewards,
        masks,
    )

    pth_time = sample_elapsed + (time.time() - stats_start)
    return pth_time, env_time, self.envs.num_envs
def _eval_checkpoint(
    self,
    checkpoint_path: str,
    writer: TensorboardWriter,
    checkpoint_index: int = 0,
) -> None:
    r"""Evaluates a single checkpoint.

    Loads the checkpoint, builds the evaluation environments, rolls the
    policy out until the requested number of distinct episodes has been
    collected, then logs the per-episode averages to the logger and to
    ``writer``.

    Args:
        checkpoint_path: path of checkpoint
        writer: tensorboard writer object for logging to tensorboard
        checkpoint_index: index of cur checkpoint for logging; used as the
            logging step unless the checkpoint stores
            ``extra_state["step"]``

    Returns:
        None
    """
    # Map location CPU is almost always better than mapping to a CUDA device.
    ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

    if self.config.EVAL.USE_CKPT_CONFIG:
        config = self._setup_eval_config(ckpt_dict["config"])
    else:
        config = self.config.clone()

    ppo_cfg = config.RL.PPO

    config.defrost()
    config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
    config.freeze()

    if len(self.config.VIDEO_OPTION) > 0:
        # Extra measurements are requested whenever videos are generated.
        config.defrost()
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
        config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS")
        config.freeze()

    logger.info(f"env config: {config}")
    self.envs = construct_envs(config, get_env_class(config.ENV_NAME))
    self._setup_actor_critic_agent(ppo_cfg)

    self.agent.load_state_dict(ckpt_dict["state_dict_agent"])
    self.actor_critic = self.agent.actor_critic

    observations = self.envs.reset()
    batch = batch_obs(observations, device=self.device)

    current_episode_reward = torch.zeros(self.envs.num_envs,
                                         1,
                                         device=self.device)

    test_recurrent_hidden_states = torch.zeros(
        self.actor_critic.net.num_recurrent_layers,
        self.config.NUM_PROCESSES,
        ppo_cfg.hidden_size,
        device=self.device,
    )
    prev_actions = torch.zeros(self.config.NUM_PROCESSES,
                               1,
                               device=self.device,
                               dtype=torch.long)
    not_done_masks = torch.zeros(self.config.NUM_PROCESSES,
                                 1,
                                 device=self.device)
    stats_episodes = dict()  # dict of dicts that stores stats per episode

    rgb_frames = [[] for _ in range(self.config.NUM_PROCESSES)
                  ]  # type: List[List[np.ndarray]]
    if len(self.config.VIDEO_OPTION) > 0:
        os.makedirs(self.config.VIDEO_DIR, exist_ok=True)

    # -1 means "evaluate on every episode the environments provide".
    number_of_eval_episodes = self.config.EVAL.EPISODE_COUNT
    if number_of_eval_episodes == -1:
        number_of_eval_episodes = sum(self.envs.number_of_episodes)
    else:
        total_num_eps = sum(self.envs.number_of_episodes)
        if total_num_eps < number_of_eval_episodes:
            # Both literals need the f prefix: implicit concatenation does
            # not propagate it, so the count was previously logged as the
            # raw text "{total_num_eps}".
            logger.warn(
                f"Config specified {number_of_eval_episodes} eval episodes"
                f", dataset only has {total_num_eps}.")
            logger.warn(f"Evaluating with {total_num_eps} instead.")
            number_of_eval_episodes = total_num_eps

    pbar = tqdm.tqdm(total=number_of_eval_episodes)
    self.actor_critic.eval()
    while (len(stats_episodes) < number_of_eval_episodes
           and self.envs.num_envs > 0):
        # Episodes that are running *before* the step; stats of an episode
        # that ends on this step are stored under these ids.
        current_episodes = self.envs.current_episodes()

        with torch.no_grad():
            (
                _,
                actions,
                _,
                test_recurrent_hidden_states,
            ) = self.actor_critic.act(
                batch,
                test_recurrent_hidden_states,
                prev_actions,
                not_done_masks,
                deterministic=False,
            )

            prev_actions.copy_(actions)

        outputs = self.envs.step([a[0].item() for a in actions])

        observations, rewards, dones, infos = [
            list(x) for x in zip(*outputs)
        ]
        batch = batch_obs(observations, device=self.device)

        # 1.0 while an episode is still running, 0.0 once it has finished.
        not_done_masks = torch.tensor(
            [[0.0] if done else [1.0] for done in dones],
            dtype=torch.float,
            device=self.device,
        )

        rewards = torch.tensor(rewards,
                               dtype=torch.float,
                               device=self.device).unsqueeze(1)
        current_episode_reward += rewards
        next_episodes = self.envs.current_episodes()
        envs_to_pause = []
        n_envs = self.envs.num_envs
        for i in range(n_envs):
            # An env whose next episode was already evaluated is paused so
            # no episode is counted twice.
            if (
                    next_episodes[i].scene_id,
                    next_episodes[i].episode_id,
            ) in stats_episodes:
                envs_to_pause.append(i)

            # episode ended
            if not_done_masks[i].item() == 0:
                pbar.update()
                episode_stats = dict()
                episode_stats["reward"] = current_episode_reward[i].item()
                episode_stats.update(
                    self._extract_scalars_from_info(infos[i]))
                current_episode_reward[i] = 0
                # use scene_id + episode_id as unique id for storing stats
                stats_episodes[(
                    current_episodes[i].scene_id,
                    current_episodes[i].episode_id,
                )] = episode_stats

                if len(self.config.VIDEO_OPTION) > 0:
                    generate_video(
                        video_option=self.config.VIDEO_OPTION,
                        video_dir=self.config.VIDEO_DIR,
                        images=rgb_frames[i],
                        episode_id=current_episodes[i].episode_id,
                        checkpoint_idx=checkpoint_index,
                        metrics=self._extract_scalars_from_info(infos[i]),
                        tb_writer=writer,
                    )

                    rgb_frames[i] = []

            # episode continues
            elif len(self.config.VIDEO_OPTION) > 0:
                frame = observations_to_image(observations[i], infos[i])
                rgb_frames[i].append(frame)

        (
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        ) = self._pause_envs(
            envs_to_pause,
            self.envs,
            test_recurrent_hidden_states,
            not_done_masks,
            current_episode_reward,
            prev_actions,
            batch,
            rgb_frames,
        )

    num_episodes = len(stats_episodes)
    aggregated_stats = dict()
    # The keys of the first stored episode define which stats are averaged.
    for stat_key in next(iter(stats_episodes.values())).keys():
        aggregated_stats[stat_key] = (
            sum([v[stat_key]
                 for v in stats_episodes.values()]) / num_episodes)

    for k, v in aggregated_stats.items():
        logger.info(f"Average episode {k}: {v:.4f}")

    step_id = checkpoint_index
    if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
        step_id = ckpt_dict["extra_state"]["step"]

    writer.add_scalars(
        "eval_reward",
        {"average reward": aggregated_stats["reward"]},
        step_id,
    )

    metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"}
    if len(metrics) > 0:
        writer.add_scalars("eval_metrics", metrics, step_id)

    self.envs.close()
def _collect_rollout_step(self, rollouts, current_episode_reward,
                          running_episode_stats):
    """Step every environment once and append the transition to ``rollouts``.

    Actions are sampled from ``self.actor_critic`` using the data stored at
    index ``rollouts.step``, ``self.envs`` is stepped with them, the episode
    statistics are updated, and the resulting transition is handed to
    ``rollouts.insert``.

    Args:
        rollouts: Rollout storage. Its ``observations``,
            ``recurrent_hidden_states``, ``prev_actions`` and ``masks`` are
            read at ``rollouts.step``; ``insert`` is called with the new data.
        current_episode_reward: Tensor holding the reward accumulated so far
            in each running episode. Modified in place; entries of
            environments whose episode just ended are multiplied by zero.
        running_episode_stats: Dict of tensors that must already contain
            ``"reward"`` and ``"count"``. Modified in place; one entry per
            scalar returned by ``self._extract_scalars_from_infos`` is
            created on first use.

    Returns:
        ``(pth_time, env_time, num_envs)`` where ``pth_time`` is the wall
        time spent sampling actions plus post-step bookkeeping, ``env_time``
        is the wall time spent inside ``self.envs.step``, and ``num_envs``
        is ``self.envs.num_envs``.
    """
    sample_start = time.time()
    step = rollouts.step

    # Query the policy for the current step without tracking gradients.
    with torch.no_grad():
        step_observation = {}
        for sensor, stored in rollouts.observations.items():
            step_observation[sensor] = stored[step]

        policy_out = self.actor_critic.act(
            step_observation,
            rollouts.recurrent_hidden_states[step],
            rollouts.prev_actions[step],
            rollouts.masks[step],
        )
        (
            values,
            actions,
            actions_log_probs,
            recurrent_hidden_states,
        ) = policy_out

    pth_time = time.time() - sample_start

    env_start = time.time()
    step_results = self.envs.step([action[0].item() for action in actions])
    # Transpose the per-environment tuples into four per-field lists.
    observations, rewards, dones, infos = map(list, zip(*step_results))
    env_time = time.time() - env_start

    bookkeeping_start = time.time()
    stats_device = current_episode_reward.device
    batch = batch_obs(observations, device=self.device)

    rewards = torch.tensor(
        rewards, dtype=torch.float, device=stats_device
    ).unsqueeze(1)
    # 0.0 where the episode finished on this step, 1.0 where it continues.
    masks = torch.tensor(
        [[0.0 if done else 1.0] for done in dones],
        dtype=torch.float,
        device=stats_device,
    )
    # Complement of ``masks``: selects the environments that just finished.
    finished = 1 - masks

    current_episode_reward += rewards
    running_episode_stats["reward"] += finished * current_episode_reward
    running_episode_stats["count"] += finished

    for name, raw in self._extract_scalars_from_infos(infos).items():
        stat = torch.tensor(
            raw, dtype=torch.float, device=stats_device
        ).unsqueeze(1)
        if name not in running_episode_stats:
            running_episode_stats[name] = torch.zeros_like(
                running_episode_stats["count"])
        running_episode_stats[name] += finished * stat

    # Restart the running reward for environments whose episode ended.
    current_episode_reward *= masks

    if self._static_encoder:
        with torch.no_grad():
            batch["visual_features"] = self._encoder(batch)

    rollouts.insert(
        batch,
        recurrent_hidden_states,
        actions,
        actions_log_probs,
        values,
        rewards,
        masks,
    )

    pth_time += time.time() - bookkeeping_start

    return pth_time, env_time, self.envs.num_envs
def evaluate(
    self,
    agent: "Agent",
    num_episodes: Optional[int] = None,
) -> Dict[str, float]:
    """Roll out ``agent`` until ``num_episodes`` episodes have been accepted.

    Each iteration resets ``self._gym_env`` and ``agent``, steps the
    environment with ``agent.act`` until it reports ``done``, and adds the
    episode metrics from ``self._env.get_metrics()`` to a running sum. An
    episode is *accepted* (its trajectory kept and counted towards
    ``num_episodes``) when ``self._should_save_fn`` is ``None`` or returns a
    truthy value for the episode metrics. When ``self._video_option`` is
    non-empty a video is generated per episode, and when
    ``self._traj_save_path`` is set the accepted trajectories are written
    there with ``torch.save``.

    Args:
        agent: Object providing ``reset()`` and ``act(observations)``.
        num_episodes: Number of accepted episodes to collect. Defaults to
            ``len(self._env.episodes)``.

    Returns:
        Mapping from metric name to its mean over every episode that was
        rolled out. Dict-valued metrics are flattened to ``"<metric>/<key>"``.

    Raises:
        AssertionError: If ``num_episodes`` exceeds the number of episodes
            in the environment or is not positive.
    """
    if num_episodes is None:
        num_episodes = len(self._env.episodes)
    else:
        assert num_episodes <= len(self._env.episodes), (
            "num_episodes({}) is larger than number of episodes "
            "in environment ({})".format(num_episodes,
                                         len(self._env.episodes)))

    assert num_episodes > 0, "num_episodes should be greater than 0"

    agg_metrics: Dict = defaultdict(float)
    should_render = len(self._video_option) > 0

    # Episodes accepted by the save filter; drives the loop and progress bar.
    count_episodes = 0
    # Every episode rolled out, accepted or not. Metrics are summed for all
    # of them, so this (not ``count_episodes``) is the right denominator.
    num_rollouts = 0

    all_dones = []
    all_obs = []
    all_next_obs = []
    all_actions = []
    all_episode_ids = []

    pbar = tqdm(total=num_episodes)

    while count_episodes < num_episodes:
        # Per-episode buffers. ``rgb_frames`` must start empty each episode,
        # otherwise every video would also contain all earlier episodes.
        rgb_frames = []
        traj_obs = []
        traj_dones = []
        traj_next_obs = []
        traj_actions = []
        traj_episode_ids = []

        observations = self._gym_env.reset()
        agent.reset()
        if should_render:
            rgb_frames.append(self._gym_env.render())

        done = False
        while not done:
            traj_obs.append(observations)
            action = agent.act(self._gym_env.orig_obs)
            traj_actions.append(action)
            traj_dones.append(False)
            traj_episode_ids.append(
                int(self._env.current_episode.episode_id))

            observations, _, done, _ = self._gym_env.direct_hab_step(action)
            traj_next_obs.append(observations)

            if should_render:
                rgb_frames.append(self._gym_env.render())

        # Only the final transition of the episode is terminal.
        traj_dones[-1] = True

        if self._should_save_fn is None or self._should_save_fn(
                self._env.get_metrics()):
            assert sum(traj_dones) == 1
            all_obs.extend(traj_obs)
            all_dones.extend(traj_dones)
            all_next_obs.extend(traj_next_obs)
            all_actions.extend(traj_actions)
            all_episode_ids.extend(traj_episode_ids)

            count_episodes += 1
            pbar.update(1)

        num_rollouts += 1
        metrics = self._env.get_metrics()
        for m, v in metrics.items():
            if isinstance(v, dict):
                for sub_m, sub_v in v.items():
                    agg_metrics[m + "/" + str(sub_m)] += sub_v
            else:
                agg_metrics[m] += v

        if should_render:
            generate_video(
                video_option=self._video_option,
                video_dir=self._video_dir,
                images=rgb_frames,
                episode_id=self._env.current_episode.episode_id,
                checkpoint_idx=0,
                metrics={
                    k: v
                    for k, v in metrics.items()
                    if k in self._vid_filename_metrics
                },
                tb_writer=self._writer,
                verbose=False,
            )

    if self._traj_save_path is not None:
        save_dir = osp.dirname(self._traj_save_path)
        os.makedirs(save_dir, exist_ok=True)
        # Dict observations are batched; other observation spaces are saved
        # as the plain per-step lists.
        if isinstance(self._gym_env.observation_space, spaces.Dict):
            all_obs = batch_obs(all_obs)
            all_next_obs = batch_obs(all_next_obs)
        torch.save(
            {
                "done": torch.FloatTensor(all_dones),
                "obs": all_obs,
                "next_obs": all_next_obs,
                "episode_ids": all_episode_ids,
                "actions": torch.tensor(
                    [compress_action(action) for action in all_actions]),
            },
            self._traj_save_path,
        )
        print(f"Saved trajectories to {self._traj_save_path}")

    # ``agg_metrics`` is only non-empty after at least one rollout, so the
    # division cannot hit a zero denominator.
    avg_metrics = {k: v / num_rollouts for k, v in agg_metrics.items()}
    pbar.close()

    return avg_metrics