def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    policy = baseline_registry.get_policy(self.config.RL.POLICY.name)
    self.actor_critic = policy(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
    )
    self.actor_critic.to(self.device)

    self.agent = PPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    )
def run_exp(exp_config: str, run_type: str, opts=None) -> None:
    r"""Runs experiment given mode and config.

    Args:
        exp_config: path to config file.
        run_type: "train" or "eval".
        opts: list of strings of additional config options.

    Returns:
        None.
    """
    config = get_config(exp_config, opts)
    logger.info(f"config: {config}")
    logger.add_filehandler(config.LOG_FILE)

    random.seed(config.TASK_CONFIG.SEED)
    np.random.seed(config.TASK_CONFIG.SEED)
    torch.manual_seed(config.TASK_CONFIG.SEED)
    torch.backends.cudnn.benchmark = True

    if run_type == "eval" and config.EVAL.EVAL_NONLEARNING:
        evaluate_agent(config)
        return

    trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME)
    assert trainer_init is not None, f"{config.TRAINER_NAME} is not supported"
    trainer = trainer_init(config)

    if run_type == "train":
        trainer.train()
    elif run_type == "eval":
        trainer.eval()
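A minimal sketch of how run_exp is typically wired to a command-line entrypoint. The flag names (--exp-config, --run-type) and the trailing opts list are illustrative assumptions, not taken from this file.

import argparse


def main():
    # Hypothetical entrypoint; flag names below are illustrative assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--exp-config", type=str, required=True, help="path to config yaml"
    )
    parser.add_argument(
        "--run-type", type=str, required=True, choices=["train", "eval"]
    )
    parser.add_argument(
        "opts",
        default=None,
        nargs=argparse.REMAINDER,
        help="extra config options as KEY VALUE pairs",
    )
    args = parser.parse_args()
    run_exp(args.exp_config, args.run_type, args.opts)


if __name__ == "__main__":
    main()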
def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    self.actor_critic = PointNavBaselinePolicy(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
        goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
    )
    self.actor_critic.to(self.device)

    self.agent = PPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    )
def _setup_actor_critic_agent(self, ppo_cfg: Config, ans_cfg: Config) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params
        ans_cfg: config node for ActiveNeuralSLAM model

    Returns:
        None
    """
    try:
        os.mkdir(self.config.TENSORBOARD_DIR)
    except OSError:
        pass
    logger.add_filehandler(os.path.join(self.config.TENSORBOARD_DIR, "run.log"))

    occ_cfg = ans_cfg.OCCUPANCY_ANTICIPATOR
    mapper_cfg = ans_cfg.MAPPER

    # Create occupancy anticipation model
    occupancy_model = OccupancyAnticipator(occ_cfg)
    occupancy_model = OccupancyAnticipationWrapper(
        occupancy_model, mapper_cfg.map_size, (128, 128)
    )

    # Create ANS model
    self.ans_net = ActiveNeuralSLAMNavigator(ans_cfg, occupancy_model)
    self.mapper = self.ans_net.mapper
    self.local_actor_critic = self.ans_net.local_policy

    # Create depth projection model to estimate visible occupancy
    self.depth_projection_net = DepthProjectionNet(
        ans_cfg.OCCUPANCY_ANTICIPATOR.EGO_PROJECTION
    )

    # Set to device
    self.mapper.to(self.device)
    self.local_actor_critic.to(self.device)
    self.depth_projection_net.to(self.device)
def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    policy = baseline_registry.get_policy(self.config.RL.POLICY.name)
    observation_space = self.obs_space
    self.obs_transforms = get_active_obs_transforms(self.config)
    observation_space = apply_obs_transforms_obs_space(
        observation_space, self.obs_transforms
    )
    self.actor_critic = policy.from_config(
        self.config, observation_space, self.envs.action_spaces[0]
    )
    self.obs_space = observation_space
    self.actor_critic.to(self.device)

    if (
        self.config.RL.DDPPO.pretrained_encoder
        or self.config.RL.DDPPO.pretrained
    ):
        pretrained_state = torch.load(
            self.config.RL.DDPPO.pretrained_weights, map_location="cpu"
        )

    if self.config.RL.DDPPO.pretrained:
        self.actor_critic.load_state_dict(
            {
                k[len("actor_critic."):]: v
                for k, v in pretrained_state["state_dict"].items()
            }
        )
    elif self.config.RL.DDPPO.pretrained_encoder:
        prefix = "actor_critic.net.visual_encoder."
        self.actor_critic.net.visual_encoder.load_state_dict(
            {
                k[len(prefix):]: v
                for k, v in pretrained_state["state_dict"].items()
                if k.startswith(prefix)
            }
        )

    if not self.config.RL.DDPPO.train_encoder:
        self._static_encoder = True
        for param in self.actor_critic.net.visual_encoder.parameters():
            param.requires_grad_(False)

    if self.config.RL.DDPPO.reset_critic:
        nn.init.orthogonal_(self.actor_critic.critic.fc.weight)
        nn.init.constant_(self.actor_critic.critic.fc.bias, 0)

    self.agent = (DDPPO if self._is_distributed else PPO)(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    )
def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    model_cfg = self.config.MODEL
    model_cfg.defrost()
    model_cfg.TORCH_GPU_ID = self.config.TORCH_GPU_ID
    model_cfg.freeze()

    assert model_cfg.POLICY in SUPPORTED_POLICIES, (
        f"{model_cfg.POLICY} not in {SUPPORTED_POLICIES}"
    )
    if model_cfg.POLICY == "seq2seq":
        self.actor_critic = Seq2SeqPolicy(
            observation_space=self.envs.observation_spaces[0],
            action_space=self.envs.action_spaces[0],
            model_config=self.config.MODEL,
        )
    elif model_cfg.POLICY == "cma":
        self.actor_critic = CMAPolicy(
            observation_space=self.envs.observation_spaces[0],
            action_space=self.envs.action_spaces[0],
            model_config=self.config.MODEL,
        )
    self.actor_critic.to(self.device)

    self.agent = PPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    )

    if self.config.LOAD_FROM_CKPT:
        ckpt_dict = self.load_checkpoint(
            self.config.LOAD_CKPT_PATH, map_location="cpu"
        )
        self.actor_critic.load_state_dict(ckpt_dict["state_dict_ac"])
        self.agent.load_state_dict(ckpt_dict["state_dict_agent"])
        logger.info(
            f"Loaded weights from checkpoint: {self.config.LOAD_CKPT_PATH}"
        )
    logger.info("Finished setting up actor critic model.")
def _setup_dqn_agent(
    self,
    ppo_cfg: Config,
    task_cfg: Config,
    aux_cfg: Config = None,
    aux_tasks=[],
) -> None:
    r"""Sets up the Q-networks and optimizer for DQN.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    if ppo_cfg.policy not in POLICY_CLASSES:
        raise Exception(
            f"Illegal policy {ppo_cfg.policy} provided. "
            f"Valid policies are {POLICY_CLASSES.keys()}"
        )
    if len(aux_tasks) != 0 and len(aux_tasks) != len(aux_cfg.tasks):
        raise Exception(
            f"Policy specifies {len(aux_cfg.tasks)} tasks "
            f"but {len(aux_tasks)} were initialized."
        )
    policy_class = POLICY_CLASSES[ppo_cfg.policy]

    # Default policy settings for object nav
    is_objectnav = "ObjectNav" in task_cfg.TYPE
    additional_sensors = []
    embed_goal = False
    if is_objectnav:
        additional_sensors = ["gps", "compass"]
        embed_goal = True

    self.q_network = QNetwork(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
        num_heads=ppo_cfg.num_heads,
        goal_sensor_uuid=task_cfg.GOAL_SENSOR_UUID,
        num_tasks=len(aux_cfg.tasks),  # passed in to support eval, where no aux modules are made
        additional_sensors=additional_sensors,
    ).to(self.device)

    self.q_network_target = QNetwork(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
        num_heads=ppo_cfg.num_heads,
        goal_sensor_uuid=task_cfg.GOAL_SENSOR_UUID,
        num_tasks=len(aux_cfg.tasks),  # passed in to support eval, where no aux modules are made
        additional_sensors=additional_sensors,
    ).to(self.device)
    self.q_network_target.eval()

    self.optimizer = optim.Adam(
        list(filter(lambda p: p.requires_grad, self.q_network.parameters())),
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
    )
    self.sync_model()
def _setup_actor_critic_agent(self, ppo_cfg: Config, train: bool = True) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    # TODO: ugly workaround needed for YOLO to work
    # torch.backends.cudnn.enabled = True
    torch.cuda.set_device(self.device.index)

    # Get object index
    logger.add_filehandler(self.config.LOG_FILE)

    # First pass: add detector_features memory to the rollouts
    self.envs.observation_spaces[0].spaces["detector_features"] = spaces.Box(
        low=np.finfo(np.float32).min,
        high=np.finfo(np.float32).max,
        shape=(765 // (3 * 3), 32, 32),
        dtype=np.float32,
    )
    # Generate feature converter for the YOLO classes
    self.detector_class_select = YoloDetector.class_selector()

    self.actor_critic = ObjectClassNavBaselinePolicy(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
        goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
        detector_config=self.config.DETECTOR,
        device=self.device,
    )
    self.actor_critic.to(self.device)

    self.agent = PPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
    )
def _setup_anticipator(self, ppo_cfg: Config, ans_cfg: Config) -> None:
    r"""Sets up the semantic anticipator (mapper) and its update agent.

    Args:
        ppo_cfg: config node with relevant params
        ans_cfg: config node for ActiveNeuralSLAM model

    Returns:
        None
    """
    try:
        os.mkdir(self.config.TENSORBOARD_DIR)
    except OSError:
        pass
    logger.add_filehandler(os.path.join(self.config.TENSORBOARD_DIR, "run.log"))

    sem_cfg = ans_cfg.SEMANTIC_ANTICIPATOR
    mapper_cfg = ans_cfg.MAPPER

    # Create semantic anticipation model
    [imgh, imgw] = ans_cfg["image_scale_hw"]
    sem_model = SemAnticipationWrapper(
        SemAnticipator(sem_cfg), mapper_cfg.map_size, (imgh, imgw)
    )

    self.mapper = Mapper(mapper_cfg, sem_model)
    self.mapper_agent = MapUpdate(
        self.mapper,
        lr=mapper_cfg.lr,
        eps=mapper_cfg.eps,
        label_id=mapper_cfg.label_id,
        max_grad_norm=mapper_cfg.max_grad_norm,
        pose_loss_coef=mapper_cfg.pose_loss_coef,
        semantic_anticipator_type=ans_cfg.SEMANTIC_ANTICIPATOR.type,
        freeze_projection_unit=mapper_cfg.freeze_projection_unit,
        num_update_batches=mapper_cfg.num_update_batches,
        batch_size=mapper_cfg.map_batch_size,
        mapper_rollouts=self.mapper_rollouts,
    )

    if ans_cfg.model_path != "":
        self.resume_checkpoint(ans_cfg.model_path)
def _setup_actor_critic_agent(self, ppo_cfg: Config, observation_space=None) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    if observation_space is None:
        observation_space = self.envs.observation_spaces[0]
    self.actor_critic = AudioNavBaselinePolicy(
        observation_space=observation_space,
        hidden_size=ppo_cfg.hidden_size,
        goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
        masking=self.config.MASKING,
        encode_rgb=self.config.ENCODE_RGB,
        encode_depth=self.config.ENCODE_DEPTH,
        action_map_size=self.config.TASK_CONFIG.TASK.ACTION_MAP.MAP_SIZE,
    )
    self.actor_critic.to(self.device)

    self.agent = PPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
    )
def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    # Consolidate all net hyperparameters in net_args; the alternative would be
    # to modify this method, which would require further changes in this class.
    ppo_cfg.net_args.hidden_size = ppo_cfg.hidden_size
    self.actor_critic = PreTrainedPointNavPolicy(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.net_args,
        goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
    )
    self.actor_critic.to(self.device)

    self.agent = PPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    )
def _eval_checkpoint(
    self,
    checkpoint_path: str,
    writer: TensorboardWriter,
    checkpoint_index: int = 0,
) -> None:
    r"""Evaluates a single checkpoint.

    Args:
        checkpoint_path: path of checkpoint
        writer: tensorboard writer object for logging to tensorboard
        checkpoint_index: index of cur checkpoint for logging

    Returns:
        None
    """
    # Map location CPU is almost always better than mapping to a CUDA device.
    ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

    if self.config.EVAL.USE_CKPT_CONFIG:
        config = self._setup_eval_config(ckpt_dict["config"])
    else:
        config = self.config.clone()

    ppo_cfg = config.RL.PPO
    ans_cfg = config.RL.ANS

    config.defrost()
    config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
    config.freeze()

    self.envs = construct_envs(config, get_env_class(config.ENV_NAME))
    self._setup_actor_critic_agent(ppo_cfg, ans_cfg)

    # Convert the state_dict of mapper_agent to mapper
    mapper_dict = {
        k.replace("mapper.", ""): v
        for k, v in ckpt_dict["mapper_state_dict"].items()
    }
    # Converting the state_dict of local_agent to just the local_policy.
    local_dict = {
        k.replace("actor_critic.", ""): v
        for k, v in ckpt_dict["local_state_dict"].items()
    }
    # strict=False is set to handle the case where the pose_estimator is not
    # required.
    self.mapper.load_state_dict(mapper_dict, strict=False)
    self.local_actor_critic.load_state_dict(local_dict)

    # Set models to evaluation
    self.mapper.eval()
    self.local_actor_critic.eval()

    number_of_eval_episodes = self.config.TEST_EPISODE_COUNT
    if number_of_eval_episodes == -1:
        number_of_eval_episodes = sum(self.envs.number_of_episodes)
    else:
        total_num_eps = sum(self.envs.number_of_episodes)
        if total_num_eps < number_of_eval_episodes:
            logger.warn(
                f"Config specified {number_of_eval_episodes} eval episodes"
                f", dataset only has {total_num_eps}."
            )
            logger.warn(f"Evaluating with {total_num_eps} instead.")
            number_of_eval_episodes = total_num_eps

    M = ans_cfg.overall_map_size
    V = ans_cfg.MAPPER.map_size
    s = ans_cfg.MAPPER.map_scale
    imH, imW = ans_cfg.image_scale_hw

    assert (
        self.envs.num_envs == 1
    ), "Number of environments needs to be 1 for evaluation"

    # Define metric accumulators
    # Navigation metrics
    navigation_metrics = {
        "success_rate": Metric(),
        "spl": Metric(),
        "distance_to_goal": Metric(),
        "time": Metric(),
        "softspl": Metric(),
    }
    per_difficulty_navigation_metrics = {
        "easy": {
            "success_rate": Metric(),
            "spl": Metric(),
            "distance_to_goal": Metric(),
            "time": Metric(),
            "softspl": Metric(),
        },
        "medium": {
            "success_rate": Metric(),
            "spl": Metric(),
            "distance_to_goal": Metric(),
            "time": Metric(),
            "softspl": Metric(),
        },
        "hard": {
            "success_rate": Metric(),
            "spl": Metric(),
            "distance_to_goal": Metric(),
            "time": Metric(),
            "softspl": Metric(),
        },
    }

    times_per_episode = deque()
    times_per_step = deque()

    # A simple helper that returns episode difficulty based on the geodesic
    # distance to the goal.
    def classify_difficulty(gd):
        if gd < 5.0:
            return "easy"
        elif gd < 10.0:
            return "medium"
        else:
            return "hard"

    eval_start_time = time.time()
    # Reset environments only for the very first batch
    observations = self.envs.reset()
    for ep in range(number_of_eval_episodes):
        # ============================== Reset agent ==============================
        # Reset agent states
        state_estimates = {
            "pose_estimates": torch.zeros(self.envs.num_envs, 3).to(self.device),
            "map_states": torch.zeros(self.envs.num_envs, 2, M, M).to(self.device),
            "recurrent_hidden_states": torch.zeros(
                1, self.envs.num_envs, ans_cfg.LOCAL_POLICY.hidden_size
            ).to(self.device),
        }
        # Reset ANS states
        self.ans_net.reset()

        self.not_done_masks = torch.zeros(
            self.envs.num_envs, 1, device=self.device
        )
        self.prev_actions = torch.zeros(
            self.envs.num_envs, 1, device=self.device
        )
        self.prev_batch = None
        self.ep_time = torch.zeros(self.envs.num_envs, 1, device=self.device)

        # =========================== Episode loop ================================
        ep_start_time = time.time()
        current_episodes = self.envs.current_episodes()
        for ep_step in range(self.config.T_MAX):
            step_start_time = time.time()
            # ============================ Action step ============================
            batch = self._prepare_batch(observations)
            if self.prev_batch is None:
                self.prev_batch = copy.deepcopy(batch)

            prev_pose_estimates = state_estimates["pose_estimates"]
            with torch.no_grad():
                (
                    _,
                    _,
                    mapper_outputs,
                    local_policy_outputs,
                    state_estimates,
                ) = self.ans_net.act(
                    batch,
                    self.prev_batch,
                    state_estimates,
                    self.ep_time,
                    self.not_done_masks,
                    deterministic=ans_cfg.LOCAL_POLICY.deterministic_flag,
                )

                actions = local_policy_outputs["actions"]
                # Make masks not done till reset (end of episode)
                self.not_done_masks = torch.ones(
                    self.envs.num_envs, 1, device=self.device
                )
                self.prev_actions.copy_(actions)

            if ep_step == 0:
                state_estimates["pose_estimates"].copy_(prev_pose_estimates)

            self.ep_time += 1
            # Update prev batch
            for k, v in batch.items():
                self.prev_batch[k].copy_(v)

            # Remap actions from exploration to navigation agent.
            actions_rmp = self._remap_actions(actions)

            # =========================== Environment step ========================
            outputs = self.envs.step([a[0].item() for a in actions_rmp])

            observations, _, dones, infos = [list(x) for x in zip(*outputs)]

            times_per_step.append(time.time() - step_start_time)
            # ============================ Process metrics ========================
            if dones[0]:
                times_per_episode.append(time.time() - ep_start_time)
                mins_per_episode = np.mean(times_per_episode).item() / 60.0
                eta_completion = mins_per_episode * (
                    number_of_eval_episodes - ep - 1
                )
                secs_per_step = np.mean(times_per_step).item()
                for i in range(self.envs.num_envs):
                    episode_id = int(current_episodes[i].episode_id)
                    curr_metrics = {
                        "spl": infos[i]["spl"],
                        "softspl": infos[i]["softspl"],
                        "success_rate": infos[i]["success"],
                        "time": ep_step + 1,
                        "distance_to_goal": infos[i]["distance_to_goal"],
                    }
                    # Estimate difficulty of episode
                    episode_difficulty = classify_difficulty(
                        current_episodes[i].info["geodesic_distance"]
                    )
                    for k, v in curr_metrics.items():
                        navigation_metrics[k].update(v, 1.0)
                        per_difficulty_navigation_metrics[episode_difficulty][
                            k
                        ].update(v, 1.0)

                    logger.info(f"====> {ep}/{number_of_eval_episodes} done")
                    for k, v in curr_metrics.items():
                        logger.info(f"{k:25s} : {v:10.3f}")
                    logger.info("{:25s} : {:10d}".format("episode_id", episode_id))
                    logger.info(f"Time per episode: {mins_per_episode:.3f} mins")
                    logger.info(f"Time per step: {secs_per_step:.3f} secs")
                    logger.info(f"ETA: {eta_completion:.3f} mins")

                # For navigation, terminate the episode loop once done is signalled.
                break
    # done-for

    if checkpoint_index == 0:
        try:
            eval_ckpt_idx = self.config.EVAL_CKPT_PATH_DIR.split("/")[-1].split(
                "."
            )[1]
            logger.add_filehandler(
                f"{self.config.TENSORBOARD_DIR}/navigation_results_ckpt_final_{eval_ckpt_idx}.txt"
            )
        except IndexError:
            logger.add_filehandler(
                f"{self.config.TENSORBOARD_DIR}/navigation_results_ckpt_{checkpoint_index}.txt"
            )
    else:
        logger.add_filehandler(
            f"{self.config.TENSORBOARD_DIR}/navigation_results_ckpt_{checkpoint_index}.txt"
        )

    logger.info(
        f"======= Evaluating over {number_of_eval_episodes} episodes ============="
    )

    logger.info("=======> Navigation metrics")
    for k, v in navigation_metrics.items():
        logger.info(f"{k}: {v.get_metric():.3f}")
        writer.add_scalar(f"navigation/{k}", v.get_metric(), checkpoint_index)

    for diff, diff_metrics in per_difficulty_navigation_metrics.items():
        logger.info(f"=============== {diff:^10s} metrics ==============")
        for k, v in diff_metrics.items():
            logger.info(f"{k}: {v.get_metric():.3f}")
            writer.add_scalar(
                f"{diff}_navigation/{k}", v.get_metric(), checkpoint_index
            )

    total_eval_time = (time.time() - eval_start_time) / 60.0
    logger.info(f"Total evaluation time: {total_eval_time:.3f} mins")

    self.envs.close()
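The evaluation loop above accumulates each score with Metric.update(value, weight) and reports Metric.get_metric(). The Metric class itself is not shown here; a minimal sketch of such an accumulator, assuming it is a weighted running mean, is:

class Metric:
    """Weighted running mean; a stand-in sketch for the accumulator used above."""

    def __init__(self) -> None:
        self._sum = 0.0
        self._count = 0.0

    def update(self, value: float, weight: float = 1.0) -> None:
        # Accumulate a weighted sum and the total weight.
        self._sum += value * weight
        self._count += weight

    def get_metric(self) -> float:
        # Return the weighted mean (0.0 before any update).
        return self._sum / self._count if self._count > 0 else 0.0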
def main():
    parser = ppo_args()
    args = parser.parse_args()

    random.seed(args.seed)

    device = torch.device("cuda:{}".format(args.pth_gpu_id))

    logger.add_filehandler(args.log_file)

    if not os.path.isdir(args.checkpoint_folder):
        os.makedirs(args.checkpoint_folder)

    for p in sorted(list(vars(args))):
        logger.info("{}: {}".format(p, getattr(args, p)))

    envs = construct_envs(args)

    actor_critic = Policy(
        observation_space=envs.observation_spaces[0],
        action_space=envs.action_spaces[0],
        hidden_size=args.hidden_size,
    )
    actor_critic.to(device)

    agent = PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
    )

    logger.info(
        "agent number of parameters: {}".format(
            sum(param.numel() for param in agent.parameters())
        )
    )

    observations = envs.reset()

    batch = batch_obs(observations)

    rollouts = RolloutStorage(
        args.num_steps,
        envs.num_envs,
        envs.observation_spaces[0],
        envs.action_spaces[0],
        args.hidden_size,
    )
    for sensor in rollouts.observations:
        rollouts.observations[sensor][0].copy_(batch[sensor])
    rollouts.to(device)

    episode_rewards = torch.zeros(envs.num_envs, 1)
    episode_counts = torch.zeros(envs.num_envs, 1)
    current_episode_reward = torch.zeros(envs.num_envs, 1)
    window_episode_reward = deque()
    window_episode_counts = deque()

    t_start = time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0

    for update in range(args.num_updates):
        if args.use_linear_lr_decay:
            update_linear_schedule(
                agent.optimizer, update, args.num_updates, args.lr
            )

        agent.clip_param = args.clip_param * (1 - update / args.num_updates)

        for step in range(args.num_steps):
            t_sample_action = time()
            # sample actions
            with torch.no_grad():
                step_observation = {
                    k: v[step] for k, v in rollouts.observations.items()
                }

                (
                    values,
                    actions,
                    actions_log_probs,
                    recurrent_hidden_states,
                ) = actor_critic.act(
                    step_observation,
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                )
            pth_time += time() - t_sample_action

            t_step_env = time()

            outputs = envs.step([a[0].item() for a in actions])
            observations, rewards, dones, infos = [
                list(x) for x in zip(*outputs)
            ]

            env_time += time() - t_step_env

            t_update_stats = time()
            batch = batch_obs(observations)
            rewards = torch.tensor(rewards, dtype=torch.float)
            rewards = rewards.unsqueeze(1)

            masks = torch.tensor(
                [[0.0] if done else [1.0] for done in dones], dtype=torch.float
            )

            current_episode_reward += rewards
            episode_rewards += (1 - masks) * current_episode_reward
            episode_counts += 1 - masks
            current_episode_reward *= masks

            rollouts.insert(
                batch,
                recurrent_hidden_states,
                actions,
                actions_log_probs,
                values,
                rewards,
                masks,
            )

            count_steps += envs.num_envs
            pth_time += time() - t_update_stats

        if len(window_episode_reward) == args.reward_window_size:
            window_episode_reward.popleft()
            window_episode_counts.popleft()
        window_episode_reward.append(episode_rewards.clone())
        window_episode_counts.append(episode_counts.clone())

        t_update_model = time()
        with torch.no_grad():
            last_observation = {
                k: v[-1] for k, v in rollouts.observations.items()
            }
            next_value = actor_critic.get_value(
                last_observation,
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            ).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()
        pth_time += time() - t_update_model

        # log stats
        if update > 0 and update % args.log_interval == 0:
            logger.info(
                "update: {}\tfps: {:.3f}\t".format(
                    update, count_steps / (time() - t_start)
                )
            )
            logger.info(
                "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
                "frames: {}".format(update, env_time, pth_time, count_steps)
            )

            window_rewards = (
                window_episode_reward[-1] - window_episode_reward[0]
            ).sum()
            window_counts = (
                window_episode_counts[-1] - window_episode_counts[0]
            ).sum()

            if window_counts > 0:
                logger.info(
                    "Average window size {} reward: {:3f}".format(
                        len(window_episode_reward),
                        (window_rewards / window_counts).item(),
                    )
                )
            else:
                logger.info("No episodes finish in current window")

        # checkpoint model
        if update % args.checkpoint_interval == 0:
            checkpoint = {"state_dict": agent.state_dict()}
            torch.save(
                checkpoint,
                os.path.join(
                    args.checkpoint_folder,
                    "ckpt.{}.pth".format(count_checkpoints),
                ),
            )
            count_checkpoints += 1
def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    self.actor_critic = eval(self.config.POLICY)(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
        rnn_type=ppo_cfg.rnn_type,
        num_recurrent_layers=ppo_cfg.num_recurrent_layers,
        backbone=ppo_cfg.backbone,
        goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
        normalize_visual_inputs="rgb" in self.envs.observation_spaces[0].spaces,
        cfg=self.config,
    )
    self.actor_critic.to(self.device)

    if ppo_cfg.pretrained_encoder or ppo_cfg.pretrained or ppo_cfg.il_pretrained:
        pretrained_state = torch.load(
            ppo_cfg.pretrained_weights, map_location="cpu"
        )

    # TODO: edit this code to load your ckpt model
    if ppo_cfg.pretrained:
        try:
            self.actor_critic.load_state_dict(
                {
                    k[len("actor_critic."):]: v
                    for k, v in pretrained_state["state_dict"].items()
                }
            )
            self.resume_steps = pretrained_state["extra_state"]["step"]
        except (KeyError, RuntimeError):
            # Fall back to loading only the weights whose names and shapes
            # match the current model.
            initial_state_dict = self.actor_critic.state_dict()
            matching_weights = {
                k[len("actor_critic."):]: v
                for k, v in pretrained_state["state_dict"].items()
                if k[len("actor_critic."):] in initial_state_dict
                and v.shape == initial_state_dict[k[len("actor_critic."):]].shape
            }
            initial_state_dict.update(matching_weights)
            logger.info(f"Selectively loading keys: {list(matching_weights.keys())}")
            self.actor_critic.load_state_dict(initial_state_dict)
            logger.info("Loaded pretrained state dict selectively")
    elif ppo_cfg.pretrained_encoder:
        try:
            prefix = "actor_critic.net.visual_encoder."
            self.actor_critic.net.visual_encoder.load_state_dict(
                {
                    k[len(prefix):]: v
                    for k, v in pretrained_state["state_dict"].items()
                    if k.startswith(prefix)
                }
            )
            logger.info("Loaded pretrained visual encoder")
        except (KeyError, RuntimeError):
            prefix = "visual_encoder."
            initial_state_dict = self.actor_critic.net.visual_encoder.state_dict()
            initial_state_dict.update(
                {
                    k[len(prefix):]: v
                    for k, v in pretrained_state.items()
                    if k.startswith(prefix)
                }
            )
            self.actor_critic.net.visual_encoder.load_state_dict(initial_state_dict)
            logger.info(
                f"Loaded pretrained visual encoder {ppo_cfg.pretrained_weights}"
            )
    elif ppo_cfg.il_pretrained:
        pretrained_state = pretrained_state["state_dict"]
        self.actor_critic.load_state_dict(pretrained_state)
        self.resume_steps = 0
        logger.info("IL pretrained checkpoint loaded")

    if not ppo_cfg.train_encoder:
        self._static_encoder = True
        for param in self.actor_critic.net.visual_encoder.parameters():
            param.requires_grad_(False)

    if ppo_cfg.reset_critic:
        nn.init.orthogonal_(self.actor_critic.critic.fc.weight)
        nn.init.constant_(self.actor_critic.critic.fc.bias, 0)

    self.agent = PPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    )
def _setup_actor_critic_agent(
    self,
    ppo_cfg: Config,
    task_cfg: Config,
    aux_cfg: Config = None,
    aux_tasks=[],
) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    if ppo_cfg.policy not in POLICY_CLASSES:
        raise Exception(
            f"Illegal policy {ppo_cfg.policy} provided. "
            f"Valid policies are {POLICY_CLASSES.keys()}"
        )
    if len(aux_tasks) != 0 and len(aux_tasks) != len(aux_cfg.tasks):
        raise Exception(
            f"Policy specifies {len(aux_cfg.tasks)} tasks "
            f"but {len(aux_tasks)} were initialized."
        )
    policy_class = POLICY_CLASSES[ppo_cfg.policy]

    # Default policy settings for object nav
    is_objectnav = "ObjectNav" in task_cfg.TYPE
    additional_sensors = []
    embed_goal = False
    if is_objectnav:
        additional_sensors = ["gps", "compass"]
        embed_goal = True

    self.actor_critic = policy_class(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
        aux_tasks=aux_tasks,
        goal_sensor_uuid=task_cfg.GOAL_SENSOR_UUID,
        num_tasks=len(aux_cfg.tasks),  # passed in to support eval, where no aux modules are made
        additional_sensors=additional_sensors,
        embed_goal=embed_goal,
        device=self.device,
        config=ppo_cfg.POLICY,
    ).to(self.device)

    self.agent = self.get_ppo_class()(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        aux_loss_coef=ppo_cfg.aux_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        aux_tasks=aux_tasks,
        aux_cfg=aux_cfg,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    ).to(self.device)
def _setup_actor_critic_agent(self, ppo_cfg: Config, observation_space=None) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    if observation_space is None:
        observation_space = self.envs.observation_spaces[0]
    if not ppo_cfg.use_external_memory:
        self.actor_critic = AudioNavBaselinePolicy(
            observation_space=observation_space,
            action_space=self.envs.action_spaces[0],
            hidden_size=ppo_cfg.hidden_size,
            goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
            extra_rgb=self.config.EXTRA_RGB,
        )
    else:
        smt_cfg = ppo_cfg.SCENE_MEMORY_TRANSFORMER
        self.actor_critic = AudioNavSMTPolicy(
            observation_space=observation_space,
            action_space=self.envs.action_spaces[0],
            hidden_size=smt_cfg.hidden_size,
            nhead=smt_cfg.nhead,
            num_encoder_layers=smt_cfg.num_encoder_layers,
            num_decoder_layers=smt_cfg.num_decoder_layers,
            dropout=smt_cfg.dropout,
            activation=smt_cfg.activation,
            use_pretrained=smt_cfg.use_pretrained,
            pretrained_path=smt_cfg.pretrained_path,
            use_belief_as_goal=ppo_cfg.use_belief_predictor,
            use_label_belief=smt_cfg.use_label_belief,
            use_location_belief=smt_cfg.use_location_belief,
        )

        if ppo_cfg.use_belief_predictor:
            belief_cfg = ppo_cfg.BELIEF_PREDICTOR
            smt = self.actor_critic.net.smt_state_encoder
            self.belief_predictor = BeliefPredictor(
                belief_cfg,
                self.device,
                smt._input_size,
                smt._pose_indices,
                smt.hidden_state_size,
                self.envs.num_envs,
            ).to(device=self.device)
            for param in self.belief_predictor.parameters():
                param.requires_grad = False

    self.agent = PPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
    )

    if self.config.RESUME:
        ckpt_dict = self.load_checkpoint(
            "data/models/smt_with_pose/ckpt.400.pth", map_location="cpu"
        )
        self.agent.actor_critic.net.visual_encoder.load_state_dict(
            self.search_dict(ckpt_dict, "visual_encoder")
        )
        self.agent.actor_critic.net.goal_encoder.load_state_dict(
            self.search_dict(ckpt_dict, "goal_encoder")
        )
        self.agent.actor_critic.net.action_encoder.load_state_dict(
            self.search_dict(ckpt_dict, "action_encoder")
        )

    if ppo_cfg.use_external_memory and smt_cfg.freeze_encoders:
        self._static_smt_encoder = True
        self.actor_critic.net.freeze_encoders()

    self.actor_critic.to(self.device)
def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None:
    r"""Sets up actor critic and agent for DD-PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    self.actor_critic = PointNavResNetPolicy(
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
        rnn_type=self.config.RL.DDPPO.rnn_type,
        num_recurrent_layers=self.config.RL.DDPPO.num_recurrent_layers,
        backbone=self.config.RL.DDPPO.backbone,
        goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
        normalize_visual_inputs="rgb" in self.envs.observation_spaces[0].spaces,
    )
    self.actor_critic.to(self.device)

    if (
        self.config.RL.DDPPO.pretrained_encoder
        or self.config.RL.DDPPO.pretrained
    ):
        pretrained_state = torch.load(
            self.config.RL.DDPPO.pretrained_weights, map_location="cpu"
        )

    if self.config.RL.DDPPO.pretrained:
        self.actor_critic.load_state_dict(
            {
                k[len("actor_critic."):]: v
                for k, v in pretrained_state["state_dict"].items()
            }
        )
    elif self.config.RL.DDPPO.pretrained_encoder:
        prefix = "actor_critic.net.visual_encoder."
        self.actor_critic.net.visual_encoder.load_state_dict(
            {
                k[len(prefix):]: v
                for k, v in pretrained_state["state_dict"].items()
                if k.startswith(prefix)
            }
        )

    if not self.config.RL.DDPPO.train_encoder:
        self._static_encoder = True
        for param in self.actor_critic.net.visual_encoder.parameters():
            param.requires_grad_(False)

    if self.config.RL.DDPPO.reset_critic:
        nn.init.orthogonal_(self.actor_critic.critic.fc.weight)
        nn.init.constant_(self.actor_critic.critic.fc.bias, 0)

    self.agent = DDPPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    )
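The pretrained-weight loading above repeatedly strips a key prefix such as "actor_critic." or "actor_critic.net.visual_encoder." from the checkpoint state dict before calling load_state_dict. A small standalone sketch of that pattern (a hypothetical helper, not part of the source) is:

def strip_prefix(state_dict, prefix):
    """Keep only the entries whose keys start with `prefix`, with the prefix removed."""
    return {
        k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)
    }


# Example usage: load only the visual encoder weights from a full checkpoint.
# encoder_weights = strip_prefix(
#     pretrained_state["state_dict"], "actor_critic.net.visual_encoder."
# )
# actor_critic.net.visual_encoder.load_state_dict(encoder_weights)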
def _setup_actor_critic_agent(self, ppo_cfg: Config, observation_space=None) -> None:
    r"""Sets up actor critic and agent for DD-PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    logger.add_filehandler(self.config.LOG_FILE)

    action_space = self.envs.action_spaces[0]
    self.action_space = action_space

    has_distractor_sound = (
        self.config.TASK_CONFIG.SIMULATOR.AUDIO.HAS_DISTRACTOR_SOUND
    )
    if ppo_cfg.policy_type == "rnn":
        self.actor_critic = AudioNavBaselinePolicy(
            observation_space=self.envs.observation_spaces[0],
            action_space=self.action_space,
            hidden_size=ppo_cfg.hidden_size,
            goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
            extra_rgb=self.config.EXTRA_RGB,
            use_mlp_state_encoder=ppo_cfg.use_mlp_state_encoder,
        )

        if ppo_cfg.use_belief_predictor:
            belief_cfg = ppo_cfg.BELIEF_PREDICTOR
            bp_class = (
                BeliefPredictorDDP if belief_cfg.online_training else BeliefPredictor
            )
            self.belief_predictor = bp_class(
                belief_cfg,
                self.device,
                None,
                None,
                ppo_cfg.hidden_size,
                self.envs.num_envs,
                has_distractor_sound,
            ).to(device=self.device)
            if belief_cfg.online_training:
                params = list(self.belief_predictor.predictor.parameters())
                if belief_cfg.train_encoder:
                    params += (
                        list(self.actor_critic.net.goal_encoder.parameters())
                        + list(self.actor_critic.net.visual_encoder.parameters())
                        + list(self.actor_critic.net.action_encoder.parameters())
                    )
                self.belief_predictor.optimizer = torch.optim.Adam(
                    params, lr=belief_cfg.lr
                )
            self.belief_predictor.freeze_encoders()
    elif ppo_cfg.policy_type == "smt":
        smt_cfg = ppo_cfg.SCENE_MEMORY_TRANSFORMER
        belief_cfg = ppo_cfg.BELIEF_PREDICTOR
        self.actor_critic = AudioNavSMTPolicy(
            observation_space=self.envs.observation_spaces[0],
            action_space=self.envs.action_spaces[0],
            hidden_size=smt_cfg.hidden_size,
            nhead=smt_cfg.nhead,
            num_encoder_layers=smt_cfg.num_encoder_layers,
            num_decoder_layers=smt_cfg.num_decoder_layers,
            dropout=smt_cfg.dropout,
            activation=smt_cfg.activation,
            use_pretrained=smt_cfg.use_pretrained,
            pretrained_path=smt_cfg.pretrained_path,
            pretraining=smt_cfg.pretraining,
            use_belief_encoding=smt_cfg.use_belief_encoding,
            use_belief_as_goal=ppo_cfg.use_belief_predictor,
            use_label_belief=belief_cfg.use_label_belief,
            use_location_belief=belief_cfg.use_location_belief,
            normalize_category_distribution=belief_cfg.normalize_category_distribution,
            use_category_input=has_distractor_sound,
        )

        if smt_cfg.freeze_encoders:
            self._static_smt_encoder = True
            self.actor_critic.net.freeze_encoders()

        if ppo_cfg.use_belief_predictor:
            smt = self.actor_critic.net.smt_state_encoder
            bp_class = (
                BeliefPredictorDDP if belief_cfg.online_training else BeliefPredictor
            )
            self.belief_predictor = bp_class(
                belief_cfg,
                self.device,
                smt._input_size,
                smt._pose_indices,
                smt.hidden_state_size,
                self.envs.num_envs,
                has_distractor_sound,
            ).to(device=self.device)
            if belief_cfg.online_training:
                params = list(self.belief_predictor.predictor.parameters())
                if belief_cfg.train_encoder:
                    params += (
                        list(self.actor_critic.net.goal_encoder.parameters())
                        + list(self.actor_critic.net.visual_encoder.parameters())
                        + list(self.actor_critic.net.action_encoder.parameters())
                    )
                self.belief_predictor.optimizer = torch.optim.Adam(
                    params, lr=belief_cfg.lr
                )
            self.belief_predictor.freeze_encoders()
    else:
        raise ValueError(f"Policy type {ppo_cfg.policy_type} is not defined!")

    self.actor_critic.to(self.device)

    if self.config.RL.DDPPO.pretrained:
        # load weights for both actor critic and the encoder
        pretrained_state = torch.load(
            self.config.RL.DDPPO.pretrained_weights, map_location="cpu"
        )
        self.actor_critic.load_state_dict(
            {
                k[len("actor_critic."):]: v
                for k, v in pretrained_state["state_dict"].items()
                if "actor_critic.net.visual_encoder" not in k
                and "actor_critic.net.smt_state_encoder" not in k
            },
            strict=False,
        )
        self.actor_critic.net.visual_encoder.rgb_encoder.load_state_dict(
            {
                k[len("actor_critic.net.visual_encoder.rgb_encoder."):]: v
                for k, v in pretrained_state["state_dict"].items()
                if "actor_critic.net.visual_encoder.rgb_encoder." in k
            },
        )
        self.actor_critic.net.visual_encoder.depth_encoder.load_state_dict(
            {
                k[len("actor_critic.net.visual_encoder.depth_encoder."):]: v
                for k, v in pretrained_state["state_dict"].items()
                if "actor_critic.net.visual_encoder.depth_encoder." in k
            },
        )

    if self.config.RL.DDPPO.reset_critic:
        nn.init.orthogonal_(self.actor_critic.critic.fc.weight)
        nn.init.constant_(self.actor_critic.critic.fc.bias, 0)

    self.agent = DDPPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
        use_normalized_advantage=ppo_cfg.use_normalized_advantage,
    )
def _setup_actor_critic_agent(self, ppo_cfg: Config, train: bool = True) -> None:
    r"""Sets up actor critic and agent for PPO.

    Args:
        ppo_cfg: config node with relevant params

    Returns:
        None
    """
    cfg = self.config
    self._live_view_env = cfg.LIVE_VIEW_ENV

    # Get object index
    logger.add_filehandler(cfg.LOG_FILE)

    self.prev_pos = []

    # -- Reachability stuff
    # First pass add rollouts detector_features memory
    train_reachability = cfg.RL.REACHABILITY.train
    self.r_enabled = cfg.RL.REACHABILITY.enabled

    if self.r_enabled:
        self.r_policy = ReachabilityPolicy(
            cfg.RL.REACHABILITY,
            self.envs.num_envs,
            self.envs.observation_spaces[0],
            device=self.device,
            with_training=train_reachability,
            tb_dir=cfg.TENSORBOARD_DIR,
        )  # type: torch.nn.Module
        self.r_policy.to(self.device)
    else:
        self.r_policy = None

    # Add only intrinsic reward
    self.only_intrinsic_reward = cfg.RL.REACHABILITY.only_intrinsic_reward

    # Train PPO after rtrain
    self.skip_train_ppo_without_rtrain = (
        cfg.RL.REACHABILITY.skip_train_ppo_without_rtrain
    )

    # Map output of aux prediction from actor critic to next step observation
    self.map_aux_to_obs = cfg.RL.PPO.actor_critic.map_aux_to_obs

    self.actor_critic = ACTOR_CRITICS[cfg.RL.PPO.actor_critic.type](
        cfg=cfg.RL.PPO.actor_critic,
        observation_space=self.envs.observation_spaces[0],
        action_space=self.envs.action_spaces[0],
        hidden_size=ppo_cfg.hidden_size,
        goal_sensor_uuid=cfg.TASK_CONFIG.TASK.GOAL_SENSOR_UUID,
        with_target_encoding=cfg.TASK_CONFIG.TASK.WITH_TARGET_ENCODING,
        device=self.device,
        reachability_policy=self.r_policy,
        visual_encoder=ppo_cfg.visual_encoder,
        drop_prob=ppo_cfg.visual_encoder_dropout,
        channel_scale=ppo_cfg.channel_scale,
    )
    self.actor_critic.to(self.device)
    self.actor_critic.map_aux_to_obs = self.map_aux_to_obs

    for aux in self.actor_critic.net.aux_models.values():
        if getattr(aux, "master", False):
            aux.set_trainer(self)

    self.agent = AuxPPO(
        actor_critic=self.actor_critic,
        clip_param=ppo_cfg.clip_param,
        ppo_epoch=ppo_cfg.ppo_epoch,
        num_mini_batch=ppo_cfg.num_mini_batch,
        value_loss_coef=ppo_cfg.value_loss_coef,
        entropy_coef=ppo_cfg.entropy_coef,
        action_loss_coef=ppo_cfg.action_loss_coef,
        lr=ppo_cfg.lr,
        eps=ppo_cfg.eps,
        max_grad_norm=ppo_cfg.max_grad_norm,
    )  # type: AuxPPO