def forward(self, inputs, rnn_hxs, masks):
    # Normalize the RGB observation for time t.
    x_rgb = inputs["rgb_at_t"]
    x_rgb = process_image(x_rgb, self.img_mean, self.img_std)
    x_goal = inputs["goal_at_t"]
    x_time = inputs["t"].squeeze(1)
    # Convert the (x, y) egocentric goal offset to polar coordinates (rho, phi).
    x_rho = torch.norm(x_goal, dim=1)
    x_phi = torch.atan2(x_goal[:, 1], x_goal[:, 0])
    # Embed each input modality, then fuse into a single feature vector.
    x_rho_emb = self.distance_encoder(x_rho)
    x_phi_emb = self.angle_encoder(x_phi)
    x_time_emb = self.time_encoder(x_time)
    x_rgb_emb = self.rgb_encoder(x_rgb)
    embeddings = [x_rgb_emb, x_rho_emb, x_phi_emb, x_time_emb]
    x = self.fuse_embedding(torch.cat(embeddings, dim=1))
    # Recurrent policy: update the GRU hidden state, then produce the value estimate.
    x, rnn_hxs = self._forward_gru(x, rnn_hxs.squeeze(0), masks)
    rnn_hxs = rnn_hxs.unsqueeze(0)
    return self.critic_linear(x), x, rnn_hxs
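# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: a self-contained check
# of the polar-goal conversion used in forward() above, which maps a (B, 2)
# egocentric goal offset (x, y) to distance rho and heading phi before
# embedding. The tensor values here are hypothetical.
def _example_polar_goal_conversion():
    import torch

    goal = torch.tensor([[3.0, 4.0], [-1.0, 0.0]])  # hypothetical (B, 2) goals
    rho = torch.norm(goal, dim=1)  # distances to goal: [5.0, 1.0]
    phi = torch.atan2(goal[:, 1], goal[:, 0])  # headings: [0.9273, pi]
    return rho, phi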
def predict_deltas(self, x, masks=None):
    # Transpose multichannel inputs
    st_1 = process_image(x["rgb_at_t_1"], self.img_mean_t, self.img_std_t)
    dt_1 = transpose_image(x["depth_at_t_1"])
    ego_map_gt_at_t_1 = transpose_image(x["ego_map_gt_at_t_1"])
    st = process_image(x["rgb_at_t"], self.img_mean_t, self.img_std_t)
    dt = transpose_image(x["depth_at_t"])
    ego_map_gt_at_t = transpose_image(x["ego_map_gt_at_t"])
    # This happens only for a baseline
    if (
        "ego_map_gt_anticipated_at_t_1" in x
        and x["ego_map_gt_anticipated_at_t_1"] is not None
    ):
        ego_map_gt_anticipated_at_t_1 = transpose_image(
            x["ego_map_gt_anticipated_at_t_1"]
        )
        ego_map_gt_anticipated_at_t = transpose_image(
            x["ego_map_gt_anticipated_at_t"]
        )
    else:
        ego_map_gt_anticipated_at_t_1 = None
        ego_map_gt_anticipated_at_t = None
    # Compute past and current egocentric maps
    bs = st_1.size(0)
    pu_inputs_t_1 = {
        "rgb": st_1,
        "depth": dt_1,
        "ego_map_gt": ego_map_gt_at_t_1,
        "ego_map_gt_anticipated": ego_map_gt_anticipated_at_t_1,
    }
    pu_inputs_t = {
        "rgb": st,
        "depth": dt,
        "ego_map_gt": ego_map_gt_at_t,
        "ego_map_gt_anticipated": ego_map_gt_anticipated_at_t,
    }
    # Batch t-1 and t together so the projection unit runs only once.
    pu_inputs = self._safe_cat(pu_inputs_t_1, pu_inputs_t)
    pu_outputs = self.projection_unit(pu_inputs)
    pu_outputs_t = {k: v[bs:] for k, v in pu_outputs.items()}
    pt_1, pt = pu_outputs["occ_estimate"][:bs], pu_outputs["occ_estimate"][bs:]
    # Compute relative pose
    dx = subtract_pose(x["pose_at_t_1"], x["pose_at_t"])
    # Estimate pose
    dx_hat = dx
    xt_hat = x["pose_at_t"]
    all_pose_outputs = None
    if not self.config.ignore_pose_estimator:
        all_pose_outputs = {}
        pose_inputs = {}
        if "rgb" in self.config.pose_predictor_inputs:
            pose_inputs["rgb_t_1"] = st_1
            pose_inputs["rgb_t"] = st
        if "depth" in self.config.pose_predictor_inputs:
            pose_inputs["depth_t_1"] = dt_1
            pose_inputs["depth_t"] = dt
        if "ego_map" in self.config.pose_predictor_inputs:
            pose_inputs["ego_map_t_1"] = pt_1
            pose_inputs["ego_map_t"] = pt
        if self.config.detach_map:
            for k in pose_inputs.keys():
                pose_inputs[k] = pose_inputs[k].detach()
        n_pose_inputs = self._transform_observations(pose_inputs, dx)
        pose_outputs = self.pose_estimator(n_pose_inputs)
        # The pose estimator predicts a residual correction on top of dx.
        dx_hat = add_pose(dx, pose_outputs["pose"])
        all_pose_outputs["pose_outputs"] = pose_outputs
    # Estimate global pose
    xt_hat = add_pose(x["pose_hat_at_t_1"], dx_hat)
    # Zero out pose prediction based on the mask
    if masks is not None:
        xt_hat = xt_hat * masks
        dx_hat = dx_hat * masks
    outputs = {
        "pt": pt,
        "dx_hat": dx_hat,
        "xt_hat": xt_hat,
        "all_pu_outputs": pu_outputs_t,
        "all_pose_outputs": all_pose_outputs,
    }
    if "ego_map_hat" in pu_outputs_t:
        outputs["ego_map_hat_at_t"] = pu_outputs_t["ego_map_hat"]
    return outputs
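# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: predict_deltas() above
# relies on add_pose() inverting subtract_pose(), i.e. if
# dx = subtract_pose(p_t_1, p_t), then add_pose(p_t_1, dx) recovers p_t.
# Below is a minimal SE(2) version of that pair, assuming poses are
# (x, y, theta) tensors; the repository's actual helpers may use a different
# convention.
def _example_pose_round_trip():
    import torch

    def _subtract(p1, p2):
        # Express p2 in p1's egocentric frame.
        dxg, dyg = p2[:, 0] - p1[:, 0], p2[:, 1] - p1[:, 1]
        c, s = torch.cos(p1[:, 2]), torch.sin(p1[:, 2])
        return torch.stack(
            [c * dxg + s * dyg, -s * dxg + c * dyg, p2[:, 2] - p1[:, 2]], dim=1
        )

    def _add(p1, dp):
        # Compose a relative pose dp onto the global pose p1.
        c, s = torch.cos(p1[:, 2]), torch.sin(p1[:, 2])
        return torch.stack(
            [
                p1[:, 0] + c * dp[:, 0] - s * dp[:, 1],
                p1[:, 1] + s * dp[:, 0] + c * dp[:, 1],
                p1[:, 2] + dp[:, 2],
            ],
            dim=1,
        )

    p_t_1 = torch.tensor([[1.0, 2.0, 0.5]])  # hypothetical pose at t-1
    p_t = torch.tensor([[1.5, 2.5, 0.7]])  # hypothetical pose at t
    # Round trip: composing the relative pose recovers the absolute pose.
    assert torch.allclose(_add(p_t_1, _subtract(p_t_1, p_t)), p_t)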