def fuse_poses(opt, outputs, pose_fuse_mlp):
    """Fuse network-predicted and IMU-derived poses through a small MLP."""

    def transformation_to_tensor(tr_batch):
        # Flatten a batch of 4x4 transforms into (B, 12): 9 rotation + 3 translation values
        R, t = rot_translation_from_transformation(tr_batch)
        return torch.cat([R.reshape(-1, 9), t.reshape(-1, 3)], dim=1)

    for f_id in opt.frame_ids[1:]:
        pose_net = transformation_to_tensor(outputs[("cam_T_cam", 0, f_id)])
        pose_imu = transformation_to_tensor(outputs[("cam_T_cam_imu", 0, f_id)])
        pose_fuse_input = torch.cat([pose_net, pose_imu], dim=1)
        pose_fuse_output = pose_fuse_mlp(pose_fuse_input)
        # The MLP regresses a 6-DoF pose: 3 axis-angle + 3 translation parameters
        axisangle = pose_fuse_output[:, :3].reshape(-1, 1, 3)
        tr = pose_fuse_output[:, 3:6].reshape(-1, 1, 3)
        T = transformation_from_parameters(axisangle, tr)
        outputs[("cam_T_cam_fuse", 0, f_id)] = T

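# `rot_translation_from_transformation` is used above (and again in the IMU
# evaluation below) but is not defined in this section. The sketch that follows
# is an assumption about its behavior -- splitting a batch of homogeneous
# transforms into rotation and translation parts -- not the project's actual
# implementation.
def rot_translation_from_transformation(tr_batch):
    """Split (B, 4, 4) transforms into (B, 3, 3) rotations and (B, 3) translations."""
    return tr_batch[:, :3, :3], tr_batch[:, :3, 3]
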
def get_gt_poses(configs: List[Config]):
    """Yield ground-truth frame-to-frame transforms from absolute pose records."""
    for config in configs:
        with config.pose_data as d:
            for j in range(d.absolute_pose.shape[0] - 2):
                i = j + 1
                start = d.absolute_pose[i]
                end = d.absolute_pose[i + 1]
                transform = end[:3] - start[:3]
                # The last three components are assumed to be unit direction vectors
                start_dir = start[3:]
                end_dir = end[3:]
                # http://www.euclideanspace.com/maths/algebra/vectors/angleBetween/index.htm
                angle = np.arccos(np.dot(start_dir, end_dir))
                axis = np.cross(start_dir, end_dir)
                # normalize to unit vector
                axis = axis / np.linalg.norm(axis)
                yield transformation_from_parameters(angle * axis, transform)

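# `transformation_from_parameters` is called throughout this section. The
# sketch below follows the monodepth2 reference implementation (layers.py) and
# assumes torch inputs of shape (B, 1, 3); note that `get_gt_poses` above
# passes numpy arrays, so that project presumably uses its own numpy variant.
# Included for context only.
def get_translation_matrix(translation_vector):
    """Convert a (B, 3) translation vector into a (B, 4, 4) matrix."""
    T = torch.zeros(translation_vector.shape[0], 4, 4).to(device=translation_vector.device)
    t = translation_vector.contiguous().view(-1, 3, 1)
    T[:, 0, 0] = 1
    T[:, 1, 1] = 1
    T[:, 2, 2] = 1
    T[:, 3, 3] = 1
    T[:, :3, 3, None] = t
    return T


def rot_from_axisangle(vec):
    """Rodrigues' formula: (B, 1, 3) axis-angle -> (B, 4, 4) rotation."""
    angle = torch.norm(vec, 2, 2, True)
    axis = vec / (angle + 1e-7)

    ca = torch.cos(angle)
    sa = torch.sin(angle)
    C = 1 - ca

    x = axis[..., 0].unsqueeze(1)
    y = axis[..., 1].unsqueeze(1)
    z = axis[..., 2].unsqueeze(1)

    xs, ys, zs = x * sa, y * sa, z * sa
    xC, yC, zC = x * C, y * C, z * C
    xyC, yzC, zxC = x * yC, y * zC, z * xC

    rot = torch.zeros((vec.shape[0], 4, 4)).to(device=vec.device)
    rot[:, 0, 0] = torch.squeeze(x * xC + ca)
    rot[:, 0, 1] = torch.squeeze(xyC - zs)
    rot[:, 0, 2] = torch.squeeze(zxC + ys)
    rot[:, 1, 0] = torch.squeeze(xyC + zs)
    rot[:, 1, 1] = torch.squeeze(y * yC + ca)
    rot[:, 1, 2] = torch.squeeze(yzC - xs)
    rot[:, 2, 0] = torch.squeeze(zxC - ys)
    rot[:, 2, 1] = torch.squeeze(yzC + xs)
    rot[:, 2, 2] = torch.squeeze(z * zC + ca)
    rot[:, 3, 3] = 1
    return rot


def transformation_from_parameters(axisangle, translation, invert=False):
    """Compose a (B, 4, 4) transform from axis-angle rotation and translation."""
    R = rot_from_axisangle(axisangle)
    t = translation.clone()
    if invert:
        R = R.transpose(1, 2)
        t *= -1
    T = get_translation_matrix(t)
    return torch.matmul(R, T) if invert else torch.matmul(T, R)
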
def pose_infer(self, img1, img2):
    feed_height = self.opt.height
    feed_width = self.opt.width
    input_image1_resized = img1.resize((feed_width, feed_height), pil.LANCZOS)
    input_image2_resized = img2.resize((feed_width, feed_height), pil.LANCZOS)

    input_image1_pytorch = transforms.ToTensor()(input_image1_resized).unsqueeze(0)
    input_image2_pytorch = transforms.ToTensor()(input_image2_resized).unsqueeze(0)

    # the pose network expects the two frames concatenated along the channel axis
    input_images_pytorch = torch.cat(
        [input_image1_pytorch, input_image2_pytorch], 1)

    with torch.no_grad():
        features = self.pose_encoder(input_images_pytorch)
        axisangle, translation = self.pose_decoder([features])
        transf_mat = transformation_from_parameters(
            axisangle[:, 0], translation[:, 0]).cpu().numpy()

    return transf_mat

def evaluate(opt):
    pose_errors = []
    pose_encoder, pose_decoder = prepare_model_for_test(opt)
    filenames = readlines('./splits/scannet_test_pose_deepv2d.txt')
    dataset = ScannetTestPoseDataset(
        opt.data_path,
        filenames,
        opt.height,
        opt.width,
        frame_idxs=opt.frame_ids,
    )
    dataloader = DataLoader(
        dataset,
        1,
        shuffle=False,
        num_workers=opt.num_workers,
    )

    print("-> Computing pose predictions")
    with torch.no_grad():
        for ind, inputs in enumerate(tqdm(dataloader)):
            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()
            color = torch.cat(
                [inputs[("color", i, 0)] for i in opt.frame_ids],
                dim=1,
            )
            features = pose_encoder(color)
            axisangle, translation = pose_decoder([features])
            this_pose = transformation_from_parameters(axisangle[:, 0],
                                                       translation[:, 0])
            this_pose = this_pose.data.cpu().numpy()[0]
            gt_pose = inputs['pose_gt'].data.cpu().numpy()[0]
            pose_errors.append(compute_pose_errors(this_pose, gt_pose))

    mean_pose_errors = np.array(pose_errors).mean(0)
    print("\n  " + ("{:>8} | " * 3).format("rot", "tdeg", "tcm"))
    print(("&{: 8.3f}  " * 3).format(*mean_pose_errors.tolist()) + "\\\\")
    print("\n-> Done!")

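# `compute_pose_errors` is not defined here. A plausible implementation that
# matches the three reported columns (rotation error in degrees, translation
# direction error in degrees, translation magnitude error in cm) is sketched
# below; the exact metric definitions are assumptions.
def compute_pose_errors(pred, gt):
    """Return (rot [deg], tdeg [deg], tcm [cm]) errors between 4x4 poses."""
    # geodesic rotation error
    R_rel = pred[:3, :3].T @ gt[:3, :3]
    rot = np.rad2deg(np.arccos(np.clip((np.trace(R_rel) - 1) / 2, -1.0, 1.0)))
    # angle between translation directions
    t_pred = pred[:3, 3] / (np.linalg.norm(pred[:3, 3]) + 1e-8)
    t_gt = gt[:3, 3] / (np.linalg.norm(gt[:3, 3]) + 1e-8)
    tdeg = np.rad2deg(np.arccos(np.clip(np.dot(t_pred, t_gt), -1.0, 1.0)))
    # translation magnitude error in centimetres
    tcm = np.linalg.norm(pred[:3, 3] - gt[:3, 3]) * 100
    return rot, tdeg, tcm
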
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split == "odom_9" or opt.eval_split == "odom_10", \
        "eval_split should be either odom_9 or odom_10"

    sequence_id = int(opt.eval_split.split("_")[1])

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width,
                               [0, 1], 4, is_train=False)
    dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    pred_poses = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(
                    axisangle[:, 0], translation[:, 0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)

    gt_poses_path = os.path.join(opt.data_path, "poses",
                                 "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(
                np.dot(np.linalg.inv(gt_global_poses[i - 1]),
                       gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 5
    for i in range(0, num_frames - 1):
        local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1]))
        gt_local_xyzs = np.array(dump_xyz(gt_local_poses[i:i + track_length - 1]))
        ates.append(compute_ate(gt_local_xyzs, local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    save_path = os.path.join(opt.load_weights_folder, "poses.npy")
    np.save(save_path, pred_poses)
    print("-> Predictions saved to", save_path)

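# `dump_xyz` and `compute_ate` are used by several evaluation functions in
# this section but defined elsewhere. The sketches below follow the monodepth2
# reference implementation and are included for context.
def dump_xyz(source_to_target_transformations):
    """Chain relative 4x4 transforms and collect the camera positions."""
    xyzs = []
    cam_to_world = np.eye(4)
    xyzs.append(cam_to_world[:3, 3])
    for source_to_target_transformation in source_to_target_transformations:
        cam_to_world = np.dot(cam_to_world, source_to_target_transformation)
        xyzs.append(cam_to_world[:3, 3])
    return xyzs


def compute_ate(gtruth_xyz, pred_xyz_o):
    """Absolute trajectory error after aligning the origin and optimizing scale."""
    offset = gtruth_xyz[0] - pred_xyz_o[0]
    pred_xyz = pred_xyz_o + offset[None, :]
    # Optimize the scaling factor
    scale = np.sum(gtruth_xyz * pred_xyz) / np.sum(pred_xyz ** 2)
    alignment_error = pred_xyz * scale - gtruth_xyz
    rmse = np.sqrt(np.sum(alignment_error ** 2)) / gtruth_xyz.shape[0]
    return rmse
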
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    sequence_id = int(opt.eval_split.split("_")[1])
    opt.batch_size = 1

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width,
                               [0, -1, 1], 4, 1, is_train=False, img_ext='.png')
    dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    # pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    config_file = "./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"
    cfg.merge_from_file(config_file)
    cfg.freeze()
    maskrcnn_path = "./e2e_mask_rcnn_R_50_FPN_1x.pth"
    pose_encoder = networks.ResnetEncoder(cfg, maskrcnn_path)
    # pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    # pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(len(opt.frame_ids))
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    pred_poses = []

    print("-> Computing pose predictions")

    # opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    ii = 0
    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                if isinstance(ipt, torch.Tensor):
                    inputs[key] = ipt.cuda()

            # Batch all frames through the encoder at once, then split the
            # features back out per frame id
            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids])
            all_features = pose_encoder(all_color_aug)
            all_features = [torch.split(f, opt.batch_size) for f in all_features]

            features = {}
            for i, k in enumerate(opt.frame_ids):
                features[k] = [f[i] for f in all_features]

            pose_inputs = [features[i] for i in opt.frame_ids if i != "s"]
            axisangle, translation = pose_decoder(pose_inputs)

            if ii == 0:
                pred_poses.append(
                    transformation_from_parameters(
                        axisangle[:, 0], translation[:, 0], True).cpu().numpy())
            pred_poses.append(
                transformation_from_parameters(
                    axisangle[:, 1], translation[:, 1]).cpu().numpy())
            if ii % opt.log_frequency == 0:
                print("{:04d}-th image processing".format(ii))
            ii += 1
            # pred_poses.append(
            #     transformation_from_parameters(
            #         axisangle[:, 1], translation[:, 1]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)

    gt_poses_path = os.path.join(
        "/usr/stud/linp/storage/user/linp/results/kitti", "poses",
        "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape((-1, 3, 4))
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(
                np.dot(np.linalg.inv(gt_global_poses[i - 1]),
                       gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 3
    for i in range(0, num_frames - 1):
        local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1]))
        gt_local_xyzs = np.array(
            dump_xyz(gt_local_poses[i:i + track_length - 1]))
        ates.append(compute_ate(gt_local_xyzs, local_xyzs))
    # Alternative alignment that offsets the ground truth by one frame:
    # for i in range(0, num_frames - 2):
    #     local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1]))
    #     gt_local_xyzs = np.array(dump_xyz(gt_local_poses[i + 1:i + track_length]))
    #     ates.append(compute_ate(gt_local_xyzs, local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    save_path = os.path.join(opt.load_weights_folder, "poses.npy")
    np.save(save_path, pred_poses)
    print("-> Predictions saved to", save_path)

def evaluate(opt):
    """Evaluate odometry on the AirSim dataset
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array(
        [[0.5, 0, 0.5, 0], [0, 1.656, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", opt.eval_split,
                     "test_files.txt"))

    dataset = AirSimDataset(opt.data_path, filenames, opt.height, opt.width,
                            [0, 1], 4, is_train=False)
    dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")
    depth_encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    depth_decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    depth_encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_encoder_dict = torch.load(depth_encoder_path)
    model_dict = depth_encoder.state_dict()
    depth_encoder.load_state_dict(
        {k: v for k, v in depth_encoder_dict.items() if k in model_dict})

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    depth_decoder = networks.DepthDecoder(depth_encoder.num_ch_enc)
    depth_decoder.load_state_dict(torch.load(depth_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()
    depth_encoder.cuda()
    depth_encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    pred_poses = []
    pred_disps = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            input_color = inputs[("color", 0, 0)].cuda()
            depth_output = depth_decoder(depth_encoder(input_color))
            pred_disp, _ = disp_to_depth(depth_output[("disp", 0)],
                                         opt.min_depth, opt.max_depth)
            pred_disp = pred_disp.cpu()[:, 0].numpy()
            pred_disps.append(pred_disp)

            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()
            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(
                    axisangle[:, 0], translation[:, 0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)
    pred_disps = np.concatenate(pred_disps)

    gt_norms_div = []
    gt_norms = []
    pred_norms = []

    gt_poses_path = os.path.join(opt.data_path, "poses.txt")
    gt_local_poses = read_pose(gt_poses_path)

    # Per-frame ratio between ground-truth and predicted translation norms
    num_frames = gt_local_poses.shape[0]
    for i in range(num_frames):
        local_xyzs = pred_poses[i, :3, 3]
        gt_local_xyzs = gt_local_poses[i, :3, 3]
        gt_norm_div = np.linalg.norm(gt_local_xyzs) / np.linalg.norm(local_xyzs)
        gt_norms_div.append(gt_norm_div)

    save_path = os.path.join(os.path.dirname(__file__), "gt_norms_div_AirSim.npy")
    np.save(save_path, gt_norms_div)
    print("-> Predictions saved to", save_path)

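# `read_pose` is not shown in this section. A minimal sketch under the
# assumption that poses.txt stores one flattened 3x4 pose per line (KITTI
# style); the exact file format is an assumption.
def read_pose(path):
    """Load per-frame poses from a text file into an (N, 4, 4) array."""
    raw = np.loadtxt(path).reshape(-1, 3, 4)
    poses = np.concatenate(
        (raw, np.zeros((raw.shape[0], 1, 4), dtype=raw.dtype)), axis=1)
    poses[:, 3, 3] = 1
    return poses
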
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    # Depth
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)
    encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict(
        {k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    # Pose
    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    if opt.use_imu:
        imu_lstm = nn.LSTM(6, opt.lstm_hidden_size, opt.lstm_num_layers)
        imu_lstm.cuda()
        imu_lstm.eval()
        lstm_hs = None

        hidden_to_imu = torch.nn.Sequential(
            torch.nn.Linear(opt.lstm_hidden_size, 6),
        )
        hidden_to_imu.cuda()
        hidden_to_imu.eval()

    if opt.pose_fuse:
        pose_fuse_mlp = torch.nn.Sequential(
            torch.nn.Linear(24, opt.pose_mlp_hidden_size),
            torch.nn.Sigmoid(),
            torch.nn.Linear(opt.pose_mlp_hidden_size, 6),
        )
        pose_fuse_mlp.cuda()
        pose_fuse_mlp.eval()

    img_ext = '.png' if opt.png else '.jpg'

    pred_disps = []
    scale_factors = []

    kitti_odom = opt.eval_split.startswith("odom")

    splits_dir = os.path.join(os.path.dirname(__file__), "splits")
    if kitti_odom:
        ids = [int(opt.eval_split.split("_")[1])]
    else:
        videonames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_video_list.txt"))
        ids = videonames

    for videoname in ids:
        if kitti_odom:
            filenames = readlines(
                os.path.join(splits_dir, opt.eval_split,
                             "test_files_{:02d}.txt".format(videoname)))
        else:
            filenames = readlines(
                os.path.join(splits_dir, opt.eval_split, "test_files.txt"))

        if kitti_odom:
            dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height,
                                       opt.width, [0, 1], 4, is_train=False,
                                       use_imu=False)
            dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                                    num_workers=opt.num_workers,
                                    pin_memory=True, drop_last=False)
        else:
            if opt.use_imu:
                dataset = SequenceRawKittiDataset(
                    opt.data_path, [videoname], filenames, 1,
                    imu_data_path=opt.imu_data_path, img_ext=img_ext,
                    frame_idxs=[0, 1], height=encoder_dict['height'],
                    width=encoder_dict['width'], num_scales=4, is_train=False)
                dataloader = DataLoader(dataset, shuffle=False, num_workers=0)
            else:
                filenames = list(
                    filter(lambda f: f.startswith(videoname), filenames))
                dataset = KITTIRAWDataset(opt.data_path, filenames, opt.height,
                                          opt.width, [0, 1], 4, is_train=False,
                                          use_imu=False)
                dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                                        num_workers=opt.num_workers,
                                        pin_memory=True, drop_last=False)

        # pred_poses = [np.eye(4).reshape(1, 4, 4)]
        pred_poses = []
        imu_scale_factors = []

        print("EVALUATING ", opt.model_name)
        print("-> Computing pose predictions")

        opt.frame_ids = [0, 1]  # pose network only takes two frames as input

        with torch.no_grad():
            for inputs in dataloader:
                for key, ipt in inputs.items():
                    inputs[key] = ipt.cuda()
                    if opt.use_imu:
                        inputs[key] = inputs[key].squeeze(0)

                input_color = inputs[("color", 0, 0)]
                feature = encoder(input_color)
                output = depth_decoder(feature)

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()
                pred_disps.append(pred_disp)

                all_color_aug = torch.cat(
                    [inputs[("color_aug", i, 0)] for i in sorted(opt.frame_ids)], 1)
                features = [pose_encoder(all_color_aug)]
                axisangle, translation = pose_decoder(features)

                outputs = {}
                outputs[("cam_T_cam", 0, 1)] = transformation_from_parameters(
                    axisangle[:, 0], translation[:, 0], invert=False)
                T = outputs[("cam_T_cam", 0, 1)]

                if opt.use_imu:
                    outputs = predict_poses_from_imu2(opt, inputs, imu_lstm,
                                                      lstm_hs, hidden_to_imu)
                    T_better = outputs[("cam_T_cam_imu", 0, 1)]
                    if opt.pose_fuse:
                        fuse_poses(opt, outputs, pose_fuse_mlp)
                        T_better = outputs[("cam_T_cam_fuse", 0, 1)]

                    # Least-squares scale between the network and IMU translations
                    R, t = rot_translation_from_transformation(T)
                    Rb, tb = rot_translation_from_transformation(T_better)
                    imu_scale_factor = torch.sum(tb * t) / torch.sum(t**2)
                    imu_scale_factors.append(imu_scale_factor.cpu().numpy())
                    # scale_factors.append(imu_scale_factors)
                    T = T_better

                pred_poses.append(T.cpu().numpy())

        pred_poses = np.concatenate(pred_poses)

        if opt.eval_split.startswith("odom"):
            gt_poses_path = os.path.join(opt.data_path, "poses",
                                         "{:02d}.txt".format(videoname))
        else:
            gt_poses_path = os.path.join(opt.data_path, videoname, "oxts",
                                         "poses.txt")
        eval_pose(opt, pred_poses, gt_poses_path)

    scale_factors = {}
    if imu_scale_factors:
        scale_factors["IMU factor"] = imu_scale_factors

    pred_disps = np.concatenate(pred_disps)
    if not kitti_odom:
        eval_depth(opt, pred_disps, scale_factors)

def test_depth_pose(args):
    """Function to predict depth and pose
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained depth encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    print("   Loading pretrained pose encoder")
    pose_encoder = networks.ResnetEncoder(18, False, 2)
    loaded_dict_pose_enc = torch.load(pose_encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    pose_encoder.load_state_dict(loaded_dict_pose_enc)
    encoder.to(device)
    pose_encoder.to(device)
    encoder.eval()
    pose_encoder.eval()

    print("   Loading pretrained depth decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    print("   Loading pretrained pose decoder")
    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    loaded_dict_pose = torch.load(pose_decoder_path, map_location=device)
    pose_decoder.load_state_dict(loaded_dict_pose)

    depth_decoder.to(device)
    pose_decoder.to(device)
    depth_decoder.eval()
    pose_decoder.eval()

    print("-> Predicting on test images")
    pred_depths = []
    pred_poses = []

    backproject_depth = BackprojectDepth(1, feed_height, feed_width)
    backproject_depth.to(device)
    project_3d = Project3D(1, feed_height, feed_width)
    project_3d.to(device)

    K = np.array(
        [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)
    K[0, :] *= feed_width
    K[1, :] *= feed_height
    inv_K = np.linalg.pinv(K)
    K = torch.from_numpy(K).unsqueeze(0).to(device)
    inv_K = torch.from_numpy(inv_K).unsqueeze(0).to(device)

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for i in range(107):  # hard-coded number of frame pairs in the sequence
            # Load image and preprocess
            image_0_path = './kitti_data/01/{:010d}.jpg'.format(i)
            input_image_0 = Image.open(image_0_path).convert('RGB')
            original_width, original_height = input_image_0.size
            input_image_0 = input_image_0.resize((feed_width, feed_height),
                                                 Image.LANCZOS)
            input_image_0 = transforms.ToTensor()(input_image_0).unsqueeze(0)

            image_1_path = './kitti_data/01/{:010d}.jpg'.format(i + 1)
            input_image_1 = Image.open(image_1_path).convert('RGB')
            input_image_1 = input_image_1.resize((feed_width, feed_height),
                                                 Image.LANCZOS)
            input_image_1 = transforms.ToTensor()(input_image_1).unsqueeze(0)

            # PREDICTION for depth
            input_image_0 = input_image_0.to(device)
            features = encoder(input_image_0)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            # disp_resized = torch.nn.functional.interpolate(
            #     disp, (original_height, original_width), mode="bilinear",
            #     align_corners=False)
            _, pred_depth = disp_to_depth(disp, 0.1, 100)
            # keep the tensor for warping; store a numpy copy for saving
            pred_depth_np = pred_depth.cpu()[:, 0].numpy()
            pred_depths.append(pred_depth_np[0])
            print("   Predict Depth {:d}".format(i))

            # PREDICTION for pose
            input_image_1 = input_image_1.to(device)
            input_image_pose = torch.cat([input_image_0, input_image_1], 1)

            features_pose = pose_encoder(input_image_pose)
            features_pose = [features_pose]
            axisangle, translation = pose_decoder(features_pose)

            pred_pose = transformation_from_parameters(axisangle[:, 0],
                                                       translation[:, 0])
            pred_poses.append(pred_pose.cpu()[0].numpy())
            print("   Predict Pose {:d}".format(i))
            print(pred_pose)

            # WARPED image
            if RECONSTRUCTION:
                print("   Reconstruct image {:d}".format(i))
                # backproject/project expect the depth as a (1, 1, H, W) tensor
                cam_points = backproject_depth(pred_depth, inv_K)
                pix_coords = project_3d(cam_points, K, pred_pose)
                reconstruct_image_0 = torch.nn.functional.grid_sample(
                    input_image_1, pix_coords, padding_mode="border")

                print("   Saving reconstructed image...")
                reconstruct_image_0 = torch.nn.functional.interpolate(
                    reconstruct_image_0, (original_height, original_width),
                    mode="bilinear", align_corners=False)
                reconstruct_image_0_np = reconstruct_image_0.squeeze().cpu().numpy()
                reconstruct_image_0_np = (reconstruct_image_0_np * 255).astype(np.uint8)
                # CHW -> HWC
                reconstruct_image_0_np = np.concatenate(
                    [np.expand_dims(reconstruct_image_0_np[c], 2) for c in range(3)], 2)
                im = Image.fromarray(reconstruct_image_0_np, mode='RGB')
                name_dest_im = os.path.join("kitti_data/01", "warped",
                                            "{:010d}_warped.jpg".format(i))
                im.save(name_dest_im)
            print("...")

    np.save('kitti_data/pred_depth_01.npy', np.array(pred_depths))
    np.save('kitti_data/pred_pose_01.npy', np.array(pred_poses))
    print('-> Done!')

def evaluate(opt):
    """Evaluate odometry on the CARLA dataset
    """
    conv_layer, data_lambda, intrinsics = get_params(opt)
    configs = load_csv(opt.test_data)
    dataset = CarlaDataset(configs, data_lambda, intrinsics, [0, 1], 4,
                           is_train=False, is_cubemap=opt.mode is Mode.Cubemap,
                           width=opt.width, height=opt.height)
    dataloader = DataLoader(dataset, 16, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    if opt.eval_model is None:
        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)
    else:
        if opt.load_weights_folder is not None:
            raise ValueError(
                "Can't specify eval_model and load_weights_folder, they conflict")
        opt.eval_model = Path(opt.eval_model)
        # Pick the checkpoint folder with the highest epoch number
        models = Path(opt.eval_model) / "models"
        weights = [p for p in models.iterdir() if p.name.startswith("weights")]
        weights = [int(p.name.split("_")[1]) for p in weights]
        opt.load_weights_folder = models / f"weights_{max(weights)}"

    # assert os.path.isdir(opt.load_weights_folder), \
    #     "Cannot find a folder at {}".format(opt.load_weights_folder)

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(conv_layer, opt.num_layers, False, 2)
    pose_encoder.load_state_dict(un_mod(torch.load(pose_encoder_path)))

    pose_decoder = networks.PoseDecoder(conv_layer, pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(un_mod(torch.load(pose_decoder_path)))

    if opt.mode is Mode.Cubemap:
        cube_poses = CubePosesAndLoss(include_loss=False)
        cube_poses.cuda()
        cube_poses.eval()

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    pred_poses = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)
            cam_T_cam = transformation_from_parameters(axisangle[:, 0],
                                                       translation[:, 0])
            if opt.mode is Mode.Cubemap:
                cam_T_cam = cube_poses(cam_T_cam)
            pred_poses.append(cam_T_cam.cpu().numpy())

    pred_poses = np.concatenate(pred_poses)

    ates = []
    num_frames = pred_poses.shape[0]
    gt_poses = get_gt_poses(configs)
    for i in range(0, num_frames - 1):
        gt_pose = next(gt_poses)
        local_xyzs = np.array(dump_xyz(pred_poses[np.newaxis, i]))
        gt_local_xyzs = np.array(dump_xyz(gt_pose[np.newaxis, ...]))
        ates.append(compute_ate(gt_local_xyzs, local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    save_path = os.path.join(opt.load_weights_folder, "poses.npy")
    np.save(save_path, pred_poses)
    print("-> Predictions saved to", save_path)

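# `un_mod` is not defined in this section. It is presumably a state-dict
# cleaner for checkpoints saved from nn.DataParallel-wrapped models; the
# sketch below is an assumption, not the project's actual implementation.
def un_mod(state_dict):
    """Strip the 'module.' prefix that nn.DataParallel adds to parameter names."""
    return {k[len("module."):] if k.startswith("module.") else k: v
            for k, v in state_dict.items()}
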
def evaluate_pose(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split == "odom_09" or opt.eval_split == "odom_10", \
        "eval_split should be either odom_09 or odom_10"

    device = torch.device("cpu" if opt.no_cuda else "cuda")

    sequence_id = int(opt.eval_split.split("_")[-1])

    if opt.pose_model_input == "pairs":
        opt.frame_ids = [1, 0]  # pose network only takes two frames as input
        num_poses = 1
        filenames = readlines(
            os.path.join(os.path.dirname(__file__), "splits", "odom",
                         "test_files_{}_{:02d}.txt".format("pairs", sequence_id)))
    else:
        opt.frame_ids = [i for i in opt.frame_ids if i != "s"]
        num_poses = len(opt.frame_ids) - 1
        filenames = readlines(
            os.path.join(os.path.dirname(__file__), "splits", "odom",
                         "test_files_{}_{:02d}.txt".format(
                             "all" + str(num_poses + 1), sequence_id)))

    img_ext = '.png' if opt.png else '.jpg'
    dataset = datasets_dict[opt.eval_split](opt.data_path, filenames,
                                            opt.height, opt.width,
                                            opt.frame_ids, 4, is_train=False,
                                            img_ext=img_ext)
    dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, num_poses + 1)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, num_poses, 1)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.to(device)
    pose_encoder.eval()
    pose_decoder.to(device)
    pose_decoder.eval()

    pred_poses = []
    flip_pred_poses = []

    print("-> Computing pose predictions")

    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                inputs[key] = ipt.to(device)

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            if opt.post_process:
                # Left-Right Flip as Post-processing to further improve accuracy of pose estimation
                all_color_aug = torch.cat(
                    (all_color_aug, torch.flip(all_color_aug, [3])), 0)

            features = pose_encoder(all_color_aug)
            axisangle, translation = pose_decoder(features)

            if opt.post_process:
                N = axisangle.shape[0] // 2
                pred_poses.append(
                    transformation_from_parameters(
                        axisangle[:N].view(N * num_poses, 1, 3),
                        translation[:N].view(N * num_poses, 1, 3),
                        invert=True).cpu().numpy().reshape(N, num_poses, 4, 4))
                flip_pred_poses.append(
                    transformation_from_parameters(
                        axisangle[N:].view(N * num_poses, 1, 3),
                        translation[N:].view(N * num_poses, 1, 3),
                        invert=True).cpu().numpy().reshape(N, num_poses, 4, 4))
            else:
                N = axisangle.shape[0]
                pred_poses.append(
                    transformation_from_parameters(
                        axisangle.view(N * num_poses, 1, 3),
                        translation.view(N * num_poses, 1, 3),
                        invert=True).cpu().numpy().reshape(N, num_poses, 4, 4))

    pred_poses = np.concatenate(pred_poses)

    if opt.post_process:
        flip_pred_poses = np.concatenate(flip_pred_poses)
        # Undo the left-right flip in the pose parameterization before averaging
        flip_pred_poses[:, :, 1:3, 0] *= -1
        flip_pred_poses[:, :, 0, 1:] *= -1
        pred_poses = average_poses(np.array([pred_poses, flip_pred_poses]))

    gt_poses_path = os.path.join(opt.data_path, "poses",
                                 "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i]))
    gt_local_poses = np.expand_dims(np.array(gt_local_poses), axis=1)

    ATEs = []
    REs = []
    num_frames = gt_global_poses.shape[0]
    track_length = 5
    for i in range(0, num_frames - track_length):
        gt_odometry = local_poses_to_odometry(
            gt_local_poses[i:i + track_length - 1])
        pred_odometry = local_poses_to_odometry(
            pred_poses[i:i + track_length - num_poses])
        ATE, RE = compute_pose_error(gt_odometry, pred_odometry)
        ATEs.append(ATE)
        REs.append(RE)

    print("\n   Trajectory error: \n"
          "    ATE: {:0.4f}, std: {:0.4f} \n"
          "    RE: {:0.4f}, std: {:0.4f} \n ".format(
              np.mean(ATEs), np.std(ATEs), np.mean(REs), np.std(REs)))

    # compute the global monocular visual odometry and save it
    global_pred_odometry = local_poses_to_odometry(pred_poses)

    save_filename = opt.eval_split
    if opt.post_process:
        save_filename = save_filename + "_pp"
    save_path = os.path.join(opt.load_weights_folder, save_filename + ".txt")
    np.savetxt(save_path,
               global_pred_odometry[:, :-1, :].reshape(
                   global_pred_odometry.shape[0], -1),
               delimiter=' ', fmt='%1.8e')
    print("-> Predictions saved to", save_path)

def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array(
        [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split in ("odom_9", "odom_10", "odom_0"), \
        "eval_split should be odom_9, odom_10 or odom_0"

    sequence_id = int(opt.eval_split.split("_")[1])

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width,
                               [0, 1], 4, is_train=False)
    dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")
    depth_encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    depth_decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    depth_encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_encoder_dict = torch.load(depth_encoder_path)
    model_dict = depth_encoder.state_dict()
    depth_encoder.load_state_dict(
        {k: v for k, v in depth_encoder_dict.items() if k in model_dict})

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    depth_decoder = networks.DepthDecoder(depth_encoder.num_ch_enc)
    depth_decoder.load_state_dict(torch.load(depth_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()
    depth_encoder.cuda()
    depth_encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    pred_poses = []
    pred_disps = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            input_color = inputs[("color", 0, 0)].cuda()
            depth_output = depth_decoder(depth_encoder(input_color))
            pred_disp, _ = disp_to_depth(depth_output[("disp", 0)],
                                         opt.min_depth, opt.max_depth)
            pred_disp = pred_disp.cpu()[:, 0].numpy()
            pred_disps.append(pred_disp)

            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()
            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(
                    axisangle[:, 0], translation[:, 0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)
    pred_disps = np.concatenate(pred_disps)

    pred_poses_scaled = []
    ratios_d = []
    gt_norms_div = []
    gt_norms = []
    pred_norms = []
    poses_pred = []

    scale_recovery = ScaleRecovery(1, 192, 640, K).cuda()
    # the last pose is dropped because scale recovery uses the next frame's disparity
    for i in range(pred_poses.shape[0] - 1):
        pred_pose = pred_poses[i]
        pred_disp = pred_disps[i + 1]
        pred_depth = 1 / pred_disp

        pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
        ratio = scale_recovery(pred_depth).cpu().item()

        # scale the translation of the predicted pose by the recovered ratio
        pred_pose_scaled = pred_pose.copy()
        pred_pose_scaled[:3, 3] *= ratio

        poses_pred.append(pred_pose[:3, 3])
        pred_poses_scaled.append(pred_pose_scaled)
        ratios_d.append(ratio)

    gt_poses_path = os.path.join(opt.data_path, "poses",
                                 "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(
                np.dot(np.linalg.inv(gt_global_poses[i - 1]),
                       gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 5
    for i in range(0, num_frames - 1):
        local_xyzs = np.array(
            dump_xyz(pred_poses_scaled[i:i + track_length - 1]))
        gt_local_xyzs = np.array(
            dump_xyz(gt_local_poses[i:i + track_length - 1]))
        gt_norm_div = np.linalg.norm(gt_local_xyzs) / np.linalg.norm(local_xyzs)

        ates.append(compute_ate(gt_local_xyzs, local_xyzs))
        gt_norms_div.append(gt_norm_div)
        gt_norms.append(np.linalg.norm(gt_local_xyzs))
        pred_norms.append(np.linalg.norm(local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_scaled{:02d}.npy".format(sequence_id))
    np.save(save_path, pred_poses_scaled)
    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_gt{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_xyzs)
    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_pred{:02d}.npy".format(sequence_id))
    np.save(save_path, poses_pred)
    save_path = os.path.join(os.path.dirname(__file__),
                             "gt_norms{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_norms)
    save_path = os.path.join(os.path.dirname(__file__),
                             "gt_norms_div{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_norms_div)
    save_path = os.path.join(os.path.dirname(__file__),
                             "ratios_d{:02d}.npy".format(sequence_id))
    np.save(save_path, ratios_d)
    save_path = os.path.join(os.path.dirname(__file__),
                             "pred_norms{:02d}.npy".format(sequence_id))
    np.save(save_path, pred_norms)
    print("-> Predictions saved to", save_path)

def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained depth encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained depth decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(args.image_path))

    # don't try to predict disparity for a disparity image!
    paths = [img for img in paths if not img.endswith("_disp.jpg")]

    if len(paths) > 3:
        print("   Loading Pose network")
        pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
        pose_decoder_path = os.path.join(model_path, "pose.pth")

        pose_encoder = networks.ResnetEncoder(18, False, 2)
        pose_encoder.load_state_dict(torch.load(pose_encoder_path))

        pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
        pose_decoder.load_state_dict(torch.load(pose_decoder_path))

        pose_encoder.to(device)
        pose_encoder.eval()
        pose_decoder.to(device)
        pose_decoder.eval()

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        print("-> Predicting disparities on {:d} test images".format(len(paths)))
        processed_images = []
        for idx, image_path in enumerate(paths):
            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            processed_images += [input_image]
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".format(
                idx + 1, len(paths), name_dest_im))

        if len(processed_images) > 3:
            pred_poses = []
            rotations = []
            translations = []
            print("-> Predicting poses on {:d} test images".format(
                len(processed_images)))
            for idx, (a, b) in enumerate(
                    zip(processed_images[:-1], processed_images[1:])):
                all_color_aug = torch.cat([a, b], 1)
                features = [pose_encoder(all_color_aug)]
                axisangle, translation = pose_decoder(features)
                rotations += [axisangle[:, 0].cpu().numpy()]
                translations += [translation[:, 0].cpu().numpy()]
                pred_poses.append(
                    transformation_from_parameters(
                        axisangle[:, 0], translation[:, 0]).cpu().numpy())

            pred_poses = np.concatenate(pred_poses)
            save_path = os.path.join(args.image_path, "pred_poses.npy")
            np.save(save_path, pred_poses)
            print("-> Pose Predictions saved to", save_path)

            local_xyzs = np.array(dump_xyz(pred_poses))
            save_path = os.path.join(args.image_path, "pred_xyzs.npy")
            np.save(save_path, local_xyzs)
            print("-> Predicted path saved to", save_path)

            save_path = os.path.join(args.image_path, "axisangle.npy")
            np.save(save_path, np.concatenate(rotations))
            print("-> Predicted axis angles saved to", save_path)

            save_path = os.path.join(args.image_path, "translation.npy")
            np.save(save_path, np.concatenate(translations))
            print("-> Predicted translations saved to", save_path)

    print('-> Done!')

def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("   Loading pose networks")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    pose_encoder = networks.ResnetEncoder(18, False, 2)
    pose_encoder.load_state_dict(
        torch.load(pose_encoder_path, map_location=device))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(
        torch.load(pose_decoder_path, map_location=device))

    pose_encoder.to(device)
    pose_encoder.eval()
    pose_decoder.to(device)
    pose_decoder.eval()

    bag_name = '2019-12-17-13-24-03'
    map_name = "feature=base&ver=2019121700&base_pt=(32.75707,-111.55757)&end_pt=(32.092537212,-110.7892506)"
    begin = '0:36:00'
    end = '0:37:00'
    output_directory = "assets/"

    dataset = TSDataset(bag_name, begin, end)
    pred_depth = []
    pred_poses = []
    last_img = None

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, input_image in enumerate(dataset):
            # Load image and preprocess
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear",
                align_corners=False)

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)
            pred_depth.append(im)

            # Handle pose: pair the previous frame with the current one
            if last_img is None:
                last_img = input_image
            all_color_aug = torch.cat([last_img, input_image], 1)
            last_img = input_image

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)
            pose = transformation_from_parameters(
                axisangle[:, 0], translation[:, 0]).cpu().numpy()
            pred_poses.append(pose)

            print("   Processed {:d} of {:d} images".format(idx + 1, len(dataset)))

    pred_poses = np.concatenate(pred_poses, axis=0)
    print(pred_poses.shape)
    np.save("poses.npy", pred_poses)
    # save_video(pred_depth)
    print('-> Done!')