def local_info_full(self):
    '''
    Return the local info for the current temporal window, including
    {left/right cam_intrins, ref_dats, src_dats, left/right src_cam_poses, is_valid}.
    Each entry is a list with one element per batch item;
    src_cam_poses[i] has shape 1 x n_src x 4 x 4.
    '''
    is_valid = np.ones(self.batch_size, dtype=bool)
    ref_dats = []
    src_dats = []
    left_cam_intrins = []
    right_cam_intrins = []
    left_src_cam_poses = []
    right_src_cam_poses = []
    T_left2right = None

    for ibatch in range(self.batch_size):
        dat_array_ = self.dat_arrays[ibatch]
        ref_dat_, src_dat_ = m_misc.split_frame_list(dat_array_, self.t_win_r)
        # src_dat_ = dat_array_
        # ref_indx = int(len(dat_array_) / 2)
        # ref_dat_ = src_dat_[ref_indx]

        is_valid_ = _check_datArray_pose(dat_array_)
        is_valid[ibatch] = is_valid_

        if is_valid_:
            # Relative poses of the left-camera source frames w.r.t. the left reference frame.
            left_src_cam_extMs = m_misc.get_entries_list_dict_level(src_dat_, 'extM', 'left_camera')
            left_src_cam_pose_ = [
                warp_homo.get_rel_extrinsicM(ref_dat_['left_camera']['extM'], src_cam_extM_)
                for src_cam_extM_ in left_src_cam_extMs
            ]
            left_src_cam_pose_ = [
                torch.from_numpy(pose.astype(np.float32)).unsqueeze(0)
                for pose in left_src_cam_pose_
            ]
            left_src_cam_pose_ = torch.cat(left_src_cam_pose_, dim=0).unsqueeze(0)  # size: 1 x n_src x 4 x 4

            # Same for the right camera.
            right_src_cam_extMs = m_misc.get_entries_list_dict_level(src_dat_, 'extM', 'right_camera')
            right_src_cam_pose_ = [
                warp_homo.get_rel_extrinsicM(ref_dat_['right_camera']['extM'], src_cam_extM_)
                for src_cam_extM_ in right_src_cam_extMs
            ]
            right_src_cam_pose_ = [
                torch.from_numpy(pose.astype(np.float32)).unsqueeze(0)
                for pose in right_src_cam_pose_
            ]
            right_src_cam_pose_ = torch.cat(right_src_cam_pose_, dim=0).unsqueeze(0)  # size: 1 x n_src x 4 x 4

            T_left2right = torch.tensor(ref_dat_['T_left2right'].astype(np.float32))
        else:
            left_src_cam_pose_ = -1
            right_src_cam_pose_ = -1

        ref_dats.append(ref_dat_)
        src_dats.append(src_dat_)
        left_cam_intrins.append(self.dataset_batch[ibatch].left_cam_intrinsics)
        right_cam_intrins.append(self.dataset_batch[ibatch].right_cam_intrinsics)
        left_src_cam_poses.append(left_src_cam_pose_)
        right_src_cam_poses.append(right_src_cam_pose_)

    return {'is_valid': is_valid,
            'left_cam_intrins': left_cam_intrins,
            'right_cam_intrins': right_cam_intrins,
            'ref_dats': ref_dats,
            'src_dats': src_dats,
            'left_src_cam_poses': left_src_cam_poses,
            'right_src_cam_poses': right_src_cam_poses,
            'T_left2right': T_left2right}
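# Hedged sketch (assumption, not the repository implementation): the relative-pose
# helper used above, warp_homo.get_rel_extrinsicM, is defined elsewhere. Assuming
# every 'extM' is a 4x4 world-to-camera extrinsic, the relative pose mapping
# reference-camera coordinates to source-camera coordinates would be:
def get_rel_extrinsicM_sketch(ref_extM, src_extM):
    # T_src<-ref = T_src<-world @ inv(T_ref<-world)
    return np.matmul(src_extM, np.linalg.inv(ref_extM))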
def load_sample(self, list_ids):
    imgs, sdepths, cfds, Es, Ks, crops, gts = [], [], [], [], [], [], []

    for i in list_ids:
        name_video = self.dataset_batch[i]

        # Crop location inside the full-resolution frame.
        crop_at = torch.zeros(2)
        crop_at[0] = 130 + (240 - self.img_size[0]) // 2
        crop_at[1] = (1200 - self.img_size[1]) // 2

        K = self.intrinsic_data[name_video]
        K = torch.from_numpy(K).unsqueeze(0).float()

        # Relative pose from the previous frame to the current frame.
        Rt_cur = self.extrinsic_data[name_video][self.ptrs[i]]
        Rt_prev2cur = torch.eye(4).unsqueeze(0)
        if self.prev_poses[i] is not None:
            Rt_prev2cur = warp_homo.get_rel_extrinsicM(self.prev_poses[i], Rt_cur)
            Rt_prev2cur = torch.from_numpy(Rt_prev2cur).unsqueeze(0).float()

        self.ptrs[i] += 1
        self.prev_poses[i] = Rt_cur
        self.num_processed_frames += 1

        path_img = self.path_images[name_video][self.ptrs[i]]
        img = Image.open(path_img)
        img = self.transform_rgb(img)

        path_sdepth = self.path_sdepths[name_video][self.ptrs[i]]
        sdepth = Image.open(path_sdepth)
        sdepth = self.transform_depth(sdepth)

        path_gt_depth = self.path_gt_depths[name_video][self.ptrs[i]]
        gt = Image.open(path_gt_depth)
        gt = self.transform_depth(gt)

        # KITTI-style 16-bit depth PNGs encode depth in 1/256 m units.
        sdepth = sdepth.type(torch.FloatTensor) / 256
        gt = gt.type(torch.FloatTensor) / 256
        c = (sdepth > 0)  # confidence mask: valid sparse-depth pixels

        imgs.append(img.unsqueeze(0).float())
        sdepths.append(sdepth.unsqueeze(0).float())
        cfds.append(c.unsqueeze(0).float())
        Es.append(Rt_prev2cur)
        Ks.append(K)
        crops.append(crop_at.unsqueeze(0))
        gts.append(gt.unsqueeze(0).float())

    return torch.cat(imgs), torch.cat(sdepths), torch.cat(cfds), torch.cat(Es), \
        torch.cat(Ks), torch.cat(crops), torch.cat(gts)
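# Hedged sketch (assumption): load_sample() divides the depth tensors by 256, which
# matches the KITTI convention of 16-bit PNGs storing depth in 1/256 m units. A
# transform_depth compatible with that convention would keep the raw integer values
# and only convert PIL -> tensor, for example:
def transform_depth_sketch(pil_depth):
    arr = np.array(pil_depth, dtype=np.int32)   # H x W raw 16-bit values
    return torch.from_numpy(arr).unsqueeze(0)   # 1 x H x W; converted to meters later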
def local_info(self):
    '''
    Return the local info for the current temporal window, including
    {cam_intrins, ref_dats, src_dats, src_cam_poses, is_valid}.
    Each entry is a list with one element per batch item;
    src_cam_poses[i] has shape 1 x n_src x 4 x 4.
    '''
    is_valid = np.ones(self.batch_size, dtype=bool)
    ref_dats = []
    src_dats = []
    cam_intrins = []
    src_cam_poses = []

    for ibatch in range(self.batch_size):
        dat_array_ = self.dat_arrays[ibatch]
        ref_dat_, src_dat_ = m_misc.split_frame_list(dat_array_, self.t_win_r)
        is_valid_ = _check_datArray_pose(dat_array_)
        is_valid[ibatch] = is_valid_

        if is_valid_:
            src_cam_extMs = m_misc.get_entries_list_dict(src_dat_, 'extM')
            src_cam_pose_ = [
                warp_homo.get_rel_extrinsicM(ref_dat_['extM'], src_cam_extM_)
                for src_cam_extM_ in src_cam_extMs
            ]
            src_cam_pose_ = [
                torch.from_numpy(pose.astype(np.float32)).unsqueeze(0)
                for pose in src_cam_pose_
            ]
            src_cam_pose_ = torch.cat(src_cam_pose_, dim=0).unsqueeze(0)  # size: 1 x n_src x 4 x 4
        else:
            src_cam_pose_ = -1

        ref_dats.append(ref_dat_)
        src_dats.append(src_dat_)
        cam_intrins.append(self.dataset_batch[ibatch].cam_intrinsics)
        src_cam_poses.append(src_cam_pose_)

    return {'is_valid': is_valid,
            'cam_intrins': cam_intrins,
            'ref_dats': ref_dats,
            'src_dats': src_dats,
            'src_cam_poses': src_cam_poses}
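# Hedged sketch (assumption, not the repository implementation): m_misc.split_frame_list
# is assumed to take a (2 * t_win_r + 1)-frame window, return the middle frame as the
# reference and the remaining frames as sources:
def split_frame_list_sketch(dat_array, t_win_r):
    ref_dat = dat_array[t_win_r]
    src_dats = dat_array[:t_win_r] + dat_array[t_win_r + 1:]
    return ref_dat, src_dats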
def main():
    import argparse
    print('Parsing the arguments...')
    parser = argparse.ArgumentParser()

    # exp name #
    parser.add_argument('--exp_name', required=True, type=str,
                        help='The name of the experiment. Used to name the folders')

    # about testing #
    parser.add_argument('--model_path', type=str, required=True,
                        help='The pre-trained model path for KV-net')
    parser.add_argument('--split_file', type=str, required=True,
                        help='The split txt file')
    parser.add_argument('--frame_interv', default=5, type=int,
                        help='frame interval')
    parser.add_argument('--t_win', type=int, default=2,
                        help='The radius of the temporal window; default=2')
    parser.add_argument('--d_min', type=float, default=0,
                        help='The minimal depth value; default=0')
    parser.add_argument('--d_max', type=float, default=5,
                        help='The maximal depth value; default=5')
    parser.add_argument('--ndepth', type=int, default=64,
                        help='The # of candidate depth values; default=64')
    parser.add_argument('--sigma_soft_max', type=float, default=10.,
                        help='sigma_soft_max; default=10.')
    parser.add_argument('--feature_dim', type=int, default=64,
                        help='The feature dimension for the feature extractor; default=64')

    # about dataset #
    parser.add_argument('--dataset', type=str, default='scanNet',
                        help='Dataset name: {scanNet, 7scenes, kitti}')
    parser.add_argument('--dataset_path', type=str, default='.',
                        help='Path to the dataset')
    parser.add_argument('--change_aspect_ratio', action='store_true', default=False,
                        help='If we want to change the aspect ratio. This option is only useful for KITTI')

    # parsing parameters #
    args = parser.parse_args()
    exp_name = args.exp_name
    dataset_name = args.dataset
    t_win_r = args.t_win
    nDepth = args.ndepth
    d_candi = np.linspace(args.d_min, args.d_max, nDepth)
    sigma_soft_max = args.sigma_soft_max
    dnet_feature_dim = args.feature_dim
    frame_interv = args.frame_interv  # should be a multiple of 5 for the scanNet dataset
    split_file = args.split_file
    d_upsample = None
    d_candi_dmap_ref = d_candi
    nDepth_dmap_ref = nDepth

    # ===== Dataset selection ======== #
    dataset_path = args.dataset_path
    if dataset_name == 'scanNet':
        import mdataloader.scanNet as dl_scanNet
        dataset_init = dl_scanNet.ScanNet_dataset
        fun_get_paths = lambda traj_indx: dl_scanNet.get_paths(
            traj_indx, frame_interv=5, split_txt=split_file,
            database_path_base=dataset_path)
        img_size = [384, 256]
        # trajectory index for testing #
        n_scenes, _, _, _, _ = fun_get_paths(0)
        traj_Indx = np.arange(0, n_scenes)

    elif dataset_name == '7scenes':
        # 7 scenes video #
        import mdataloader.dl_7scenes as dl_7scenes
        dataset_init = dl_7scenes.SevenScenesDataset
        dat_indx_step = 3
        split_file = None if args.split_file == '.' else args.split_file
        fun_get_paths = lambda traj_indx: dl_7scenes.get_paths_1frame(
            traj_indx, database_path_base=dataset_path, split_txt=split_file,
            dat_indx_step=dat_indx_step)
        img_size = [384, 256]
        n_scenes, _, _, _, _ = fun_get_paths(0)
        traj_Indx = np.arange(0, n_scenes)

    elif dataset_name == 'kitti':
        import mdataloader.kitti as dl_kitti
        dataset_init = dl_kitti.KITTI_dataset
        if not dataset_path == '.':
            fun_get_paths = lambda traj_indx: dl_kitti.get_paths(
                traj_indx, split_txt=split_file, mode='val',
                database_path_base=dataset_path)
        else:  # use the default database path
            fun_get_paths = lambda traj_indx: dl_kitti.get_paths(
                traj_indx, split_txt=split_file, mode='val')
        if not args.change_aspect_ratio:
            # keep the aspect ratio and do cropping
            img_size = [768, 256]
            crop_w = None
        else:
            # change the aspect ratio and do NOT crop
            img_size = [384, 256]
            crop_w = None
        n_scenes, _, _, _, _ = fun_get_paths(0)
        traj_Indx = np.arange(0, n_scenes)

    else:
        raise Exception('dataset loader not implemented')

    fldr_path, img_paths, dmap_paths, poses, intrin_path = fun_get_paths(0)

    if dataset_name == 'kitti':
        dataset = dataset_init(True, img_paths, dmap_paths, poses,
                               intrin_path=intrin_path, img_size=img_size,
                               digitize=True, d_candi=d_candi_dmap_ref,
                               resize_dmap=.25, crop_w=crop_w)
        dataset_imgsize = dataset_init(True, img_paths, dmap_paths, poses,
                                       intrin_path=intrin_path, img_size=img_size,
                                       digitize=True, d_candi=d_candi_dmap_ref,
                                       resize_dmap=1)
    else:
        dataset = dataset_init(True, img_paths, dmap_paths, poses,
                               intrin_path=intrin_path, img_size=img_size,
                               digitize=True, d_candi=d_candi_dmap_ref,
                               resize_dmap=.25)
        dataset_imgsize = dataset_init(True, img_paths, dmap_paths, poses,
                                       intrin_path=intrin_path, img_size=img_size,
                                       digitize=True, d_candi=d_candi_dmap_ref,
                                       resize_dmap=1)
    # ================================ #

    print('Initializing the KV-Net')
    model_KVnet = m_kvnet.KVNET(feature_dim=dnet_feature_dim,
                                cam_intrinsics=dataset.cam_intrinsics,
                                d_candi=d_candi,
                                sigma_soft_max=sigma_soft_max,
                                KVNet_feature_dim=dnet_feature_dim,
                                d_upsample_ratio_KV_net=d_upsample,
                                t_win_r=t_win_r,
                                if_refined=True)
    model_KVnet = torch.nn.DataParallel(model_KVnet)
    model_KVnet.cuda()

    model_path_KV = args.model_path
    print('loading KV_net at %s' % (model_path_KV))
    utils_model.load_pretrained_model(model_KVnet, model_path_KV)
    print('Done')

    for traj_idx in traj_Indx:
        res_fldr = '../results/%s/traj_%d' % (exp_name, traj_idx)
        m_misc.m_makedir(res_fldr)
        scene_path_info = []

        print('Getting the paths for traj_%d' % (traj_idx))
        fldr_path, img_seq_paths, dmap_seq_paths, poses, intrin_path = fun_get_paths(traj_idx)
        dataset.set_paths(img_seq_paths, dmap_seq_paths, poses)

        if dataset_name == 'scanNet':
            # For each trajectory in the dataset, update the intrinsic matrix #
            dataset.get_cam_intrinsics(intrin_path)
        print('Done')

        dat_array = [dataset[idx] for idx in range(t_win_r * 2 + 1)]
        DMaps_meas = []
        traj_length = len(dataset)
        print('trajectory length = %d' % (traj_length))

        for frame_cnt, ref_indx in enumerate(range(t_win_r, traj_length - t_win_r - 1)):
            eff_iter = True
            valid_seq = check_datArray_pose(dat_array)

            # Read ref. and src. data in the local time window #
            ref_dat, src_dats = m_misc.split_frame_list(dat_array, t_win_r)

            if frame_cnt == 0:
                BVs_predict = None

            if valid_seq and eff_iter:
                # Get poses #
                src_cam_extMs = m_misc.get_entries_list_dict(src_dats, 'extM')
                src_cam_poses = [
                    warp_homo.get_rel_extrinsicM(ref_dat['extM'], src_cam_extM_)
                    for src_cam_extM_ in src_cam_extMs
                ]
                src_cam_poses = [
                    torch.from_numpy(pose.astype(np.float32)).cuda().unsqueeze(0)
                    for pose in src_cam_poses
                ]
                # src_cam_poses size: N V 4 4 #
                src_cam_poses = torch.cat(src_cam_poses, dim=0).unsqueeze(0)
                src_frames = [m_misc.get_entries_list_dict(src_dats, 'img')]

                if frame_cnt == 0 or BVs_predict is None:
                    # the first window of the trajectory
                    BVs_predict_in = None
                else:
                    BVs_predict_in = BVs_predict

                print('testing on %d/%d frame in traj %d/%d ... ' %
                      (frame_cnt + 1, traj_length - 2 * t_win_r, traj_idx + 1, len(traj_Indx)))

                # set trace for specific frame #
                BVs_measure, BVs_predict = test_KVNet.test(
                    model_KVnet, d_candi,
                    Ref_Dats=[ref_dat],
                    Src_Dats=[src_dats],
                    Cam_Intrinsics=[dataset.cam_intrinsics],
                    t_win_r=t_win_r,
                    Src_CamPoses=src_cam_poses,
                    BV_predict=BVs_predict_in,
                    R_net=True,
                    Cam_Intrinsics_imgsize=dataset_imgsize.cam_intrinsics,
                    ref_indx=ref_indx)

                # export_res.export_res_refineNet(ref_dat, BVs_measure, d_candi_dmap_ref,
                #                                 res_fldr, ref_indx,
                #                                 save_mat=True, output_pngs=False,
                #                                 output_dmap_ref=False)
                export_res.export_res_img(ref_dat, BVs_measure, d_candi_dmap_ref,
                                          res_fldr, frame_cnt)
                scene_path_info.append([frame_cnt, dataset[ref_indx]['img_path']])

            elif valid_seq is False:
                # the sequence contains invalid pose estimates
                BVs_predict = None
                print('frame_cnt :%d, include invalid poses' % (frame_cnt))

            elif eff_iter is False:
                BVs_predict = None

            # Update dat_array #
            dat_array.pop(0)
            dat_array.append(dataset[ref_indx + t_win_r + 1])

        m_misc.save_ScenePathInfo('%s/scene_path_info.txt' % (res_fldr), scene_path_info)
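# Hedged sketch (assumption, not the repository implementation): the pose-validity
# check used in the loop above is assumed to accept a window only when every frame
# carries a finite 4x4 extrinsic, roughly:
def check_datArray_pose_sketch(dat_array):
    for dat in dat_array:
        extM = dat.get('extM', None)
        if extM is None or np.asarray(extM).shape != (4, 4) or not np.all(np.isfinite(extM)):
            return False
    return True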
def main(): import argparse print('Parsing the arguments...') parser = argparse.ArgumentParser() # exp name # parser.add_argument( '--exp_name', required=True, type=str, help='The name of the experiment. Used to naming the folders') # about testing # parser.add_argument('--img_name_pattern', type=str, default='*.png', help='image name pattern') parser.add_argument('--model_path', type=str, default='.', help='The pre-trained model path for KV-net') parser.add_argument('--split_file', type=str, default='.', help='The split txt file') parser.add_argument('--t_win', type=int, default=2, help='The radius of the temporal window; default=2') parser.add_argument('--d_min', type=float, default=0, help='The minimal depth value; default=0') parser.add_argument('--d_max', type=float, default=5, help='The maximal depth value; default=15') parser.add_argument('--ndepth', type=int, default=64, help='The # of candidate depth values; default= 128') parser.add_argument('--sigma_soft_max', type=float, default=10., help='sigma_soft_max, default = 500.') parser.add_argument( '--feature_dim', type=int, default=64, help='The feature dimension for the feature extractor; default=64') # about pose # parser.add_argument('--intrin_path', type=str, required=True, help='camera intrinic path, saved as .mat') parser.add_argument( '--dso_res_path', type=str, default='dso_res/result_dso.txt', help= 'if use DSO pose, specify the path to the DSO results. Should be a .txt file' ) parser.add_argument('--opt_next_frame', action='store_true', help='') parser.add_argument('--use_gt_R', action='store_true', help='') parser.add_argument('--use_gt_t', action='store_true', help='') parser.add_argument('--use_dso_R', action='store_true', help='') parser.add_argument('--use_dso_t', action='store_true', help='') parser.add_argument('--min_frame_idx', type=int, help=' ', default=0) parser.add_argument('--max_frame_idx', type=int, help=' ', default=10000) parser.add_argument('--refresh_frames', type=int, help=' ', default=1000) parser.add_argument('--LBA_max_iter', type=int, help=' ') parser.add_argument('--opt_r', type=int, default=1, help=' ') parser.add_argument('--opt_t', type=int, default=1, help=' ') parser.add_argument('--LBA_step', type=float, help=' ') parser.add_argument('--frame_interv', type=int, default=5, help=' ') # about dataset # parser.add_argument('--dataset', type=str, default='7scenes', help='Dataset name: {scanNet, 7scenes}') parser.add_argument('--dataset_path', type=str, default='.', help='Path to the dataset') # about output # parser.add_argument('--output_pngs', action='store_true', help='if output pngs') # para config. # args = parser.parse_args() exp_name = args.exp_name dataset_name = args.dataset t_win_r = args.t_win nDepth = args.ndepth d_candi = np.linspace(args.d_min, args.d_max, nDepth) sigma_soft_max = args.sigma_soft_max #10.#500. dnet_feature_dim = args.feature_dim frame_interv = args.frame_interv d_candi_dmap_ref = d_candi nDepth_dmap_ref = nDepth # Initialize data-loader, model and optimizer # # ===== Dataset selection ======== # dataset_path = args.dataset_path if dataset_name == 'scanNet': # deal with 1-frame scanNet data import mdataloader.scanNet as dl_scanNet dataset_init = dl_scanNet.ScanNet_dataset split_txt = './mdataloader/scanNet_split/scannet_val.txt' if args.split_file == '.' 
else args.split_file if not dataset_path == '.': # if specify the path, we will assume we are using 1-frame-interval scanNet video # fun_get_paths = lambda traj_indx: dl_scanNet.get_paths_1frame( traj_indx, database_path_base=dataset_path, split_txt=split_txt) dat_indx_step = 5 #pick this value to make sure the camera baseline is big enough else: fun_get_paths = lambda traj_indx: dl_scanNet.get_paths( traj_indx, frame_interv=5, split_txt=split_txt) dat_indx_step = 1 img_size = [384, 256] # trajectory index for training # n_scenes, _, _, _, _ = fun_get_paths(0) traj_Indx = np.arange(0, n_scenes) elif dataset_name == '7scenes': # 7 scenes video # import mdataloader.dl_7scenes as dl_7scenes img_size = [384, 256] dataset_init = dl_7scenes.SevenScenesDataset dat_indx_step = 5 # pick this value to make sure the camera baseline is big enough # trajectory index for training # split_file = None if args.split_file == '.' else args.split_file fun_get_paths = lambda traj_indx: dl_7scenes.get_paths_1frame( traj_indx, database_path_base=dataset_path, split_txt=split_file, ) elif dataset_name == 'single_folder': # images in a single folder specified by the user # import mdataloader.mdata as mdata img_size = [384, 256] dataset_init = mdata.mData dat_indx_step = 5 # pick this value to make sure the camera baseline is big enough fun_get_paths = lambda traj_indx: mdata.get_paths_1frame( traj_indx, dataset_path, args.img_name_pattern) traj_Indx = [0] #dummy fldr_path, img_paths, dmap_paths, poses, intrin_path = fun_get_paths( traj_Indx[0]) if dataset_name == 'single_folder': intrin_path = args.intrin_path dataset = dataset_init( True, img_paths, dmap_paths, poses, intrin_path=intrin_path, img_size=img_size, digitize=True, d_candi=d_candi_dmap_ref, resize_dmap=.25, ) dataset_Himgsize = dataset_init( True, img_paths, dmap_paths, poses, intrin_path=intrin_path, img_size=img_size, digitize=True, d_candi=d_candi_dmap_ref, resize_dmap=.5, ) dataset_imgsize = dataset_init( True, img_paths, dmap_paths, poses, intrin_path=intrin_path, img_size=img_size, digitize=True, d_candi=d_candi_dmap_ref, resize_dmap=1, ) # ================================ # print('Initnializing the KV-Net') model_KVnet = m_kvnet.KVNET(\ feature_dim = dnet_feature_dim, cam_intrinsics = dataset.cam_intrinsics, d_candi = d_candi, sigma_soft_max = sigma_soft_max, KVNet_feature_dim = dnet_feature_dim, d_upsample_ratio_KV_net = None, t_win_r = t_win_r, if_refined = True) model_KVnet = torch.nn.DataParallel(model_KVnet) model_KVnet.cuda() model_path_KV = args.model_path print('loading KV_net at %s' % (model_path_KV)) utils_model.load_pretrained_model(model_KVnet, model_path_KV) print('Done') for traj_idx in traj_Indx: scene_path_info = [] print('Getting the paths for traj_%d' % (traj_idx)) fldr_path, img_seq_paths, dmap_seq_paths, poses, intrin_path = fun_get_paths( traj_idx) res_fldr = '../results/%s/traj_%d' % (exp_name, traj_idx) m_misc.m_makedir(res_fldr) dataset.set_paths(img_seq_paths, dmap_seq_paths, poses) if dataset_name == 'scanNet': # the camera intrinsic may be slightly different for different trajectories in scanNet # dataset.get_cam_intrinsics(intrin_path) print('Done') if args.min_frame_idx > 0: frame_idxs = np.arange(args.min_frame_idx - t_win_r, args.min_frame_idx + t_win_r) dat_array = [dataset[idx] for idx in frame_idxs] else: dat_array = [dataset[idx] for idx in range(t_win_r * 2 + 1)] DMaps_meas = [] dso_res_path = args.dso_res_path print('init initial pose from DSO estimations ...') traj_extMs = 
init_traj_extMs(traj_len=len(dataset), dso_res_path=dso_res_path, if_filter=True, min_idx=args.min_frame_idx, max_idx=args.max_frame_idx) traj_extMs_init = copy_list(traj_extMs) traj_length = min(len(dataset), len(traj_extMs)) first_frame = True for frame_cnt, ref_indx in enumerate( range(t_win_r * dat_indx_step + args.min_frame_idx, traj_length - t_win_r * dat_indx_step - dat_indx_step)): # ref_indx: the frame index for the reference frame # # Read ref. and src. data in the local time window # ref_dat, src_dats = m_misc.split_frame_list(dat_array, t_win_r) src_frame_idx = [ idx for idx in range( ref_indx - t_win_r * dat_indx_step, ref_indx, dat_indx_step) ] + \ [ idx for idx in range( ref_indx + dat_indx_step, ref_indx + t_win_r*dat_indx_step+1, dat_indx_step) ] valid_seq = dso_io.valid_poses(traj_extMs, src_frame_idx) # only look at a subset of frames # if ref_indx < args.min_frame_idx: valid_seq = False if ref_indx > args.max_frame_idx or ref_indx >= traj_length - t_win_r * dat_indx_step - dat_indx_step: break if frame_cnt == 0 or valid_seq is False: BVs_predict = None # refresh # if ref_indx % args.refresh_frames == 0: print('REFRESH !') BVs_predict = None BVs_predict_in = None first_frame = True traj_extMs = copy_list(traj_extMs_init) if valid_seq: # if the sequence does not contain invalid pose estimation # Get poses # src_cam_extMs = [traj_extMs[i] for i in src_frame_idx] ref_cam_extM = traj_extMs[ref_indx] src_cam_poses = [ warp_homo.get_rel_extrinsicM(ref_cam_extM, src_cam_extM_) for src_cam_extM_ in src_cam_extMs ] src_cam_poses = [ torch.from_numpy(pose.astype( np.float32)).cuda().unsqueeze(0) for pose in src_cam_poses ] # Load the gt pose if available # if 'extM' in dataset[0]: src_cam_extMs_ref = [ dataset[i]['extM'] for i in src_frame_idx ] ref_cam_extM_ref = dataset[ref_indx]['extM'] src_cam_poses_ref = [ warp_homo.get_rel_extrinsicM(ref_cam_extM_ref, src_cam_extM_) \ for src_cam_extM_ in src_cam_extMs_ref ] src_cam_poses_ref = [ torch.from_numpy(pose.astype(np.float32)).cuda().unsqueeze(0) \ for pose in src_cam_poses_ref ] # -- Determine the scale, mapping from DSO scale to our working scale -- # if frame_cnt == 0 or BVs_predict is None: # the first window for the traj. 
_, t_norm_single = get_fb(src_cam_poses, dataset.cam_intrinsics, src_cam_pose_next=None) # We need to heurisitcally determine scale_ without using GT pose # t_norms = get_t_norms(traj_extMs, dat_indx_step) scale_ = d_candi.max() / ( dataset.cam_intrinsics['focal_length'] * np.array(t_norm_single).max() / 2) scale_ = d_candi.max() / ( dataset.cam_intrinsics['focal_length'] * np.array(t_norms).max()) scale_ = d_candi.max() / ( dataset.cam_intrinsics['focal_length'] * np.array(t_norms).mean() / 2) rescale_traj_t(traj_extMs, scale_) traj_extMs_dso = copy_list(traj_extMs) # Get poses # src_cam_extMs = [traj_extMs[i] for i in src_frame_idx] ref_cam_extM = traj_extMs[ref_indx] src_cam_poses = [ warp_homo.get_rel_extrinsicM(ref_cam_extM, src_cam_extM_) for src_cam_extM_ in src_cam_extMs ] src_cam_poses = [ torch.from_numpy(pose.astype( np.float32)).cuda().unsqueeze(0) for pose in src_cam_poses ] # src_cam_poses size: N V 4 4 # src_cam_poses = torch.cat(src_cam_poses, dim=0).unsqueeze(0) src_frames = [m_misc.get_entries_list_dict(src_dats, 'img')] cam_pose_next = traj_extMs[ref_indx + 1] cam_pose_next = torch.FloatTensor( warp_homo.get_rel_extrinsicM(traj_extMs[ref_indx], cam_pose_next)).cuda() BVs_predict_in = None if frame_cnt == 0 or BVs_predict is None \ else BVs_predict BVs_measure, BVs_predict = test_KVNet.test( model_KVnet, d_candi, Ref_Dats=[ref_dat], Src_Dats=[src_dats], Cam_Intrinsics=[dataset.cam_intrinsics], t_win_r=t_win_r, Src_CamPoses=src_cam_poses, BV_predict=BVs_predict_in, R_net=True, cam_pose_next=cam_pose_next, ref_indx=ref_indx) # export_res.export_res_refineNet(ref_dat, BVs_measure, d_candi_dmap_ref, # res_fldr, ref_indx, # save_mat = True, output_pngs = args.output_pngs, output_dmap_ref=False) export_res.export_res_img(ref_dat, BVs_measure, d_candi_dmap_ref, res_fldr, frame_cnt) scene_path_info.append( [frame_cnt, dataset[ref_indx]['img_path']]) # UPDATE dat_array # if dat_indx_step > 1: # use one-interval video and the frame interval is larger than 5 print('updating array ...') dat_array = update_dat_array(dat_array, dataset, data_interv=1, frame_interv=5, ref_indx=ref_indx, t_win_r=t_win_r) print('done') else: dat_array.pop(0) new_dat = dataset[ref_indx + t_win_r + 1] dat_array.append(new_dat) # OPTMIZE POSES # idx_ref_ = ref_indx + 1 cam_pose_nextframe = traj_extMs[idx_ref_] cam_pose_nextframe = torch.FloatTensor( warp_homo.get_rel_extrinsicM(traj_extMs[ref_indx], cam_pose_nextframe)).cuda() # get depth and confidence map # BV_tmp_ = warp_homo.resample_vol_cuda(\ src_vol = BVs_measure, rel_extM = cam_pose_nextframe.inverse(), cam_intrinsic = dataset_imgsize.cam_intrinsics, d_candi = d_candi, d_candi_new = d_candi, padding_value = math.log(1. / float(len(d_candi))) ).clamp(max=0, min=-1000.) dmap_ref = m_misc.depth_val_regression(BVs_measure, d_candi, BV_log=True).squeeze() conf_map_ref, _ = torch.max(BVs_measure.squeeze(), dim=0) dmap_kf = m_misc.depth_val_regression(BV_tmp_.unsqueeze(0), d_candi, BV_log=True).squeeze() conf_map_kf, _ = torch.max(BV_tmp_.squeeze(), dim=0) # setup optimization # cams_intrin = [ dataset.cam_intrinsics, dataset_Himgsize.cam_intrinsics, dataset_imgsize.cam_intrinsics ] dw_scales = [4, 2, 1] LBA_max_iter = args.LBA_max_iter #10 # 20 LBA_step = args.LBA_step #.05 #.01 if LBA_max_iter <= 1: # do not do optimization LBA_step = 0. 
opt_vars = [args.opt_r, args.opt_t] # initialization for the first time window # if first_frame: first_frame = False # optimize the pose for all frames within the window # if LBA_max_iter <= 1: # for debugging: using GT pose initialization # rel_pose_inits_all_frame, srcs_idx_all_frame = m_misc.get_twin_rel_pose( traj_extMs, idx_ref_, t_win_r * dat_indx_step, 1, use_gt_R=True, use_gt_t=True, dataset=dataset, add_noise_gt=False, noise_sigmas=None) else: rel_pose_inits_all_frame, srcs_idx_all_frame = m_misc.get_twin_rel_pose( traj_extMs, ref_indx, t_win_r * dat_indx_step, 1, use_gt_R=False, use_gt_t=False, dataset=dataset, ) # opt. # img_ref = dataset[ref_indx]['img'] imgs_src = [dataset[i]['img'] for i in srcs_idx_all_frame] conf_map_ref = torch.exp(conf_map_ref).squeeze()**2 rel_pose_opt = opt_pose_numerical.local_BA_direct( img_ref, imgs_src, dmap_ref.unsqueeze(0).unsqueeze(0), conf_map_ref.unsqueeze(0).unsqueeze(0), cams_intrin, dw_scales, rel_pose_inits_all_frame, max_iter=LBA_max_iter, step=LBA_step, opt_vars=opt_vars) # update # for idx, srcidx in enumerate(srcs_idx_all_frame): traj_extMs[srcidx] = np.matmul( rel_pose_opt[idx].cpu().numpy(), traj_extMs[ref_indx]) # for next frame # if LBA_max_iter <= 1: # for debugging: using GT pose init. rel_pose_opt, srcs_idx = m_misc.get_twin_rel_pose( traj_extMs, idx_ref_, t_win_r, dat_indx_step, use_gt_R=True, use_gt_t=True, dataset=dataset, add_noise_gt=False, noise_sigmas=None, ) else: rel_pose_inits, srcs_idx = m_misc.get_twin_rel_pose( traj_extMs, idx_ref_, t_win_r, dat_indx_step, use_gt_R=args.use_gt_R, use_dso_R=args.use_dso_R, use_gt_t=args.use_gt_t, use_dso_t=args.use_dso_t, dataset=dataset, traj_extMs_dso=traj_extMs_dso, opt_next_frame=args.opt_next_frame) img_ref = dataset[idx_ref_]['img'] _, src_dats_opt = m_misc.split_frame_list( dat_array, t_win_r) imgs_src = [dat_['img'] for dat_ in src_dats_opt] img_ref = dataset[idx_ref_]['img'] imgs_src = [dataset[i] for i in srcs_idx] imgs_src = [img_['img'] for img_ in imgs_src] # opt. # conf_map_kf = torch.exp(conf_map_kf).squeeze()**2 rel_pose_opt = \ opt_pose_numerical.local_BA_direct_parallel( img_ref, imgs_src, dmap_kf.unsqueeze(0).unsqueeze(0), conf_map_kf.unsqueeze(0).unsqueeze(0), cams_intrin, dw_scales, rel_pose_inits, max_iter = LBA_max_iter, step = LBA_step, opt_vars = opt_vars) # update # print('idx_ref_: %d' % (idx_ref_)) print('srcs_idx : ') print(srcs_idx) print('updating pose ...') for idx, srcidx in enumerate(srcs_idx): traj_extMs[srcidx] = np.matmul( rel_pose_opt[idx].cpu().numpy(), traj_extMs[idx_ref_]) print('done') else: # if the sequence contains invalid pose estimation BVs_predict = None print('frame_cnt :%d, include invalid poses' % (frame_cnt)) # UPDATE dat_array # if dat_indx_step > 1: # use one-interval video and the frame interval is larger than 5 print('updating array ...') dat_array = update_dat_array(dat_array, dataset, data_interv=1, frame_interv=5, ref_indx=ref_indx, t_win_r=t_win_r) print('done') else: dat_array.pop(0) new_dat = dataset[ref_indx + t_win_r + 1] dat_array.append(new_dat) m_misc.save_ScenePathInfo('%s/scene_path_info.txt' % (res_fldr), scene_path_info)
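# Hedged sketch (assumption): rescale_traj_t, used above to map DSO's arbitrary
# scale to the working metric scale, is assumed to multiply the translation part
# of every 4x4 pose in the trajectory in-place, leaving rotations untouched:
def rescale_traj_t_sketch(traj_extMs, scale):
    for extM in traj_extMs:
        if isinstance(extM, np.ndarray) and extM.shape == (4, 4):
            extM[:3, 3] *= scale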
def get_twin_rel_pose(traj_extMs, ref_indx, t_win_r, dat_indx_step,
                      use_gt_R=False, use_gt_t=False, dataset=None,
                      add_noise_gt=False, noise_sigmas=None,
                      traj_extMs_dso=None, use_dso_R=False, use_dso_t=False,
                      opt_next_frame=False):
    '''
    Get the relative poses for the source frames in the local time window.

    NOTE: For the last frame in the local time window, we set its initial pose to the
    relative pose of frame t_win_r * dat_indx_step + ref_indx - 1, rather than
    t_win_r * dat_indx_step + ref_indx, assuming their poses are similar.
    '''
    if use_dso_R or use_dso_t:
        assert traj_extMs_dso is not None

    if not opt_next_frame:
        src_frame_idx = \
            [idx for idx in range(ref_indx - t_win_r * dat_indx_step, ref_indx, dat_indx_step)] \
            + [idx for idx in range(ref_indx + dat_indx_step, ref_indx + (t_win_r - 1) * dat_indx_step + 1, dat_indx_step)] \
            + [t_win_r * dat_indx_step + ref_indx - 1]
        src_frame_idx_opt = \
            [idx for idx in range(ref_indx - t_win_r * dat_indx_step, ref_indx, dat_indx_step)] \
            + [idx for idx in range(ref_indx + dat_indx_step, ref_indx + t_win_r * dat_indx_step + 1, dat_indx_step)]
    else:
        src_frame_idx = \
            [idx for idx in range(ref_indx - t_win_r * dat_indx_step, ref_indx, dat_indx_step)] \
            + [ref_indx + 1] \
            + [idx for idx in range(ref_indx + dat_indx_step, ref_indx + (t_win_r - 1) * dat_indx_step + 1, dat_indx_step)] \
            + [t_win_r * dat_indx_step + ref_indx - 1]
        src_frame_idx_opt = \
            [idx for idx in range(ref_indx - t_win_r * dat_indx_step, ref_indx, dat_indx_step)] \
            + [ref_indx + 1] \
            + [idx for idx in range(ref_indx + dat_indx_step, ref_indx + t_win_r * dat_indx_step + 1, dat_indx_step)]

    ref_cam_extM = traj_extMs[ref_indx]
    src_cam_extMs = [traj_extMs[i] for i in src_frame_idx]
    if isinstance(ref_cam_extM, torch.Tensor):
        pass

    src_cam_poses = [
        warp_homo.get_rel_extrinsicM(ref_cam_extM, src_cam_extM_)
        for src_cam_extM_ in src_cam_extMs
    ]
    src_cam_poses = [
        torch.from_numpy(pose.astype(np.float32)) for pose in src_cam_poses
    ]

    # DSO dR, dt #
    if traj_extMs_dso is not None:
        dRt = torch.FloatTensor(
            warp_homo.get_rel_extrinsicM(
                traj_extMs_dso[ref_indx].copy(),
                traj_extMs_dso[ref_indx + t_win_r * dat_indx_step].copy()))
        if use_dso_R:
            # use the DSO rotation (traj_extMs was initialized by DSO)
            src_cam_poses[-1][:3, :3] = dRt[:3, :3]
        if use_dso_t:
            # use the DSO translation (traj_extMs was initialized by DSO)
            src_cam_poses[-1][:3, 3] = dRt[:3, 3]

    if use_gt_R or use_gt_t:
        for idx, srcidx in enumerate(src_frame_idx_opt):
            pose_gt = warp_homo.get_rel_extrinsicM(dataset[ref_indx]['extM'],
                                                   dataset[srcidx]['extM'])
            R_gt = torch.from_numpy(pose_gt)[:3, :3]
            t_gt = torch.from_numpy(pose_gt)[:3, 3]
            if use_gt_R:
                print('USING GT R')
                if add_noise_gt:
                    print('add noise to GT')
                    R_gt += torch.randn(R_gt.shape).type_as(R_gt) * noise_sigmas[0]
                src_cam_poses[idx][:3, :3] = R_gt
            if use_gt_t:
                print('USING GT T')
                if add_noise_gt:
                    print('add noise to GT')
                    t_gt += torch.randn(t_gt.shape).type_as(t_gt) * noise_sigmas[1]
                src_cam_poses[idx][:3, 3] = t_gt

    return src_cam_poses, src_frame_idx_opt
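# Hypothetical usage mirroring the pose-optimization loop above; the frame index and
# window parameters are illustrative only, and traj_extMs / dataset come from the
# surrounding script:
# rel_pose_inits, srcs_idx = get_twin_rel_pose(
#     traj_extMs, ref_indx=100, t_win_r=2, dat_indx_step=5,
#     use_gt_R=False, use_gt_t=False, dataset=dataset)
# rel_pose_inits[i] is a 4x4 torch.FloatTensor: the initial relative pose of source
# frame srcs_idx[i] w.r.t. the reference frame.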
def main(): import argparse print('Parsing the arguments...') parser = argparse.ArgumentParser() # exp name # parser.add_argument( '--exp_name', required=True, type=str, help='The name of the experiment. Used to naming the folders') # about testing # parser.add_argument('--model_path', type=str, required=True, help='The pre-trained model path for KV-net') parser.add_argument('--split_file', type=str, default=True, help='The split txt file') parser.add_argument('--frame_interv', default=5, type=int, help='frame interval') parser.add_argument('--t_win', type=int, default=2, help='The radius of the temporal window; default=2') parser.add_argument('--d_min', type=float, default=0, help='The minimal depth value; default=0') parser.add_argument('--d_max', type=float, default=5, help='The maximal depth value; default=15') parser.add_argument('--ndepth', type=int, default=64, help='The # of candidate depth values; default= 128') parser.add_argument('--sigma_soft_max', type=float, default=10., help='sigma_soft_max, default = 500.') parser.add_argument( '--feature_dim', type=int, default=64, help='The feature dimension for the feature extractor; default=64') # about dataset # parser.add_argument('--dataset', type=str, default='scanNet', help='Dataset name: {scanNet, 7scenes, kitti}') parser.add_argument('--dataset_path', type=str, default='.', help='Path to the dataset') parser.add_argument( '--change_aspect_ratio', type=bool, default=False, help= 'If we want to change the aspect ratio. This option is only useful for KITTI' ) # parsing parameters # args = parser.parse_args() exp_name = args.exp_name dataset_name = args.dataset t_win_r = args.t_win nDepth = args.ndepth d_candi = np.linspace(args.d_min, args.d_max, nDepth) sigma_soft_max = args.sigma_soft_max #10.#500. dnet_feature_dim = args.feature_dim frame_interv = args.frame_interv # should be multiple of 5 for scanNet dataset d_upsample = None d_candi_dmap_ref = d_candi nDepth_dmap_ref = nDepth split_file = args.split_file # ===== Dataset selection ======== # dataset_path = args.dataset_path if dataset_name == 'kitti': import mdataloader.kitti as dl_kitti dataset_init = dl_kitti.KITTI_dataset fun_get_paths = lambda traj_indx: dl_kitti.get_paths( traj_indx, split_txt=split_file, mode='val') if not dataset_path == '.': fun_get_paths = lambda traj_indx: dl_kitti.get_paths( traj_indx, split_txt=split_file, mode='val', database_path_base=dataset_path) else: # use default database path fun_get_paths = lambda traj_indx: dl_kitti.get_paths( traj_indx, split_txt=split_file, mode='val') if not args.change_aspect_ratio: # we will keep the aspect ratio and do cropping img_size = [768, 356] crop_w = None else: # we will change the aspect ratio and NOT do cropping img_size = [768, 356] crop_w = None n_scenes, _, _, _, _ = fun_get_paths(0) traj_Indx = np.arange(0, n_scenes) elif dataset_name == 'dm': import mdataloader.dm as dl_dm dataset_init = dl_dm.DMdataset split_file = './mdataloader/dm_split/dm_split.txt' if args.split_file == '.' 
else args.split_file fun_get_paths = lambda traj_indx: dl_dm.get_paths( traj_indx, split_txt=split_file, mode='val') if not dataset_path == '.': fun_get_paths = lambda traj_indx: dl_dm.get_paths( traj_indx, split_txt=split_file, mode='val', database_path_base=dataset_path) else: # use default database path fun_get_paths = lambda traj_indx: dl_dm.get_paths( traj_indx, split_txt=split_file, mode='val') if not args.change_aspect_ratio: # we will keep the aspect ratio and do cropping img_size = [786, 256] crop_w = None else: # we will change the aspect ratio and NOT do cropping img_size = [786, 256] crop_w = None n_scenes, _, _, _, _ = fun_get_paths(0) traj_Indx = np.arange(0, n_scenes) else: raise Exception('dataset loader not implemented') fldr_path, img_paths, dmap_paths, poses, intrin_path = fun_get_paths(0) if dataset_name == 'kitti': dataset = dataset_init(True, img_paths, dmap_paths, poses, intrin_path=intrin_path, img_size=img_size, digitize=True, d_candi=d_candi_dmap_ref, resize_dmap=.25, crop_w=crop_w) dataset_imgsize = dataset_init(True, img_paths, dmap_paths, poses, intrin_path=intrin_path, img_size=img_size, digitize=True, d_candi=d_candi_dmap_ref, resize_dmap=1) else: dataset = dataset_init(True, img_paths, dmap_paths, poses, intrin_path=intrin_path, img_size=img_size, digitize=True, d_candi=d_candi_dmap_ref, resize_dmap=.25) dataset_imgsize = dataset_init(True, img_paths, dmap_paths, poses, intrin_path=intrin_path, img_size=img_size, digitize=True, d_candi=d_candi_dmap_ref, resize_dmap=1) # ================================ # print('Initnializing the KV-Net') model_KVnet = m_kvnet.KVNET(feature_dim=dnet_feature_dim, cam_intrinsics=dataset.cam_intrinsics, d_candi=d_candi, sigma_soft_max=sigma_soft_max, KVNet_feature_dim=dnet_feature_dim, d_upsample_ratio_KV_net=d_upsample, t_win_r=t_win_r, if_refined=True) model_KVnet = torch.nn.DataParallel(model_KVnet) model_KVnet.cuda() model_path_KV = args.model_path print('loading KV_net at %s' % (model_path_KV)) utils_model.load_pretrained_model(model_KVnet, model_path_KV) print('Done') rmse, absrel, lg10, squarel, rmselog, D1, D2, D3 = 0, 0, 0, 0, 0, 0, 0, 0 for traj_idx in traj_Indx: res_fldr = '../results/%s/traj_%d' % (exp_name, traj_idx) m_misc.m_makedir(res_fldr) scene_path_info = [] print('Getting the paths for traj_%d' % (traj_idx)) fldr_path, img_seq_paths, dmap_seq_paths, poses, intrin_path = fun_get_paths( traj_idx) dataset.set_paths(img_seq_paths, dmap_seq_paths, poses) if dataset_name is 'scanNet': # For each trajector in the dataset, we will update the intrinsic matrix # dataset.get_cam_intrinsics(intrin_path) print('Done') dat_array = [dataset[idx] for idx in range(t_win_r * 2 + 1)] DMaps_meas = [] traj_length = len(dataset) print('trajectory length = %d' % (traj_length)) average_meter = export_res.AverageMeter() ### inference time torch.cuda.synchronize() start = time.time() for frame_cnt, ref_indx in enumerate( range(t_win_r, traj_length - t_win_r - 1)): result = export_res.Result() torch.cuda.synchronize() data_time = time.time() - start eff_iter = True valid_seq = check_datArray_pose(dat_array) # Read ref. and src. 
data in the local time window # ref_dat, src_dats = m_misc.split_frame_list(dat_array, t_win_r) if frame_cnt == 0: BVs_predict = None if valid_seq and eff_iter: # Get poses # src_cam_extMs = m_misc.get_entries_list_dict(src_dats, 'extM') src_cam_poses = \ [warp_homo.get_rel_extrinsicM(ref_dat['extM'], src_cam_extM_) \ for src_cam_extM_ in src_cam_extMs ] src_cam_poses = [ torch.from_numpy(pose.astype( np.float32)).cuda().unsqueeze(0) for pose in src_cam_poses ] # src_cam_poses size: N V 4 4 # src_cam_poses = torch.cat(src_cam_poses, dim=0).unsqueeze(0) src_frames = [m_misc.get_entries_list_dict(src_dats, 'img')] if frame_cnt == 0 or BVs_predict is None: # the first window for the traj. BVs_predict_in = None else: BVs_predict_in = BVs_predict # print('testing on %d/%d frame in traj %d/%d ... '%\ # (frame_cnt+1, traj_length - 2*t_win_r, traj_idx+1, len(traj_Indx)) ) torch.cuda.synchronize() gpu_time = time.time() - start # set trace for specific frame # BVs_measure, BVs_predict = test_KVNet.test( model_KVnet, d_candi, Ref_Dats=[ref_dat], Src_Dats=[src_dats], Cam_Intrinsics=[dataset.cam_intrinsics], t_win_r=t_win_r, Src_CamPoses=src_cam_poses, BV_predict=BVs_predict_in, R_net=True, Cam_Intrinsics_imgsize=dataset_imgsize.cam_intrinsics, ref_indx=ref_indx) pred_depth, gt = export_res.do_evaluation( ref_dat, BVs_measure, d_candi_dmap_ref) # print(pred_depth.shape, gt.shape) result.evaluate(pred_depth.data, gt.data) average_meter.update(result, gpu_time, data_time, (traj_length - 2 * t_win_r)) scene_path_info.append( [frame_cnt, dataset[ref_indx]['img_path']]) elif valid_seq is False: # if the sequence contains invalid pose estimation BVs_predict = None print('frame_cnt :%d, include invalid poses' % (frame_cnt)) elif eff_iter is False: BVs_predict = None # Update dat_array # dat_array.pop(0) dat_array.append(dataset[ref_indx + t_win_r + 1]) avg = average_meter.average() print('\n*\n' 'RMSE={average.rmse:.3f}\n' 'AbsRel={average.absrel:.3f}\n' 'Log10={average.lg10:.3f}\n' 'SquaRel={average.squarel:.3f}\n' 'rmselog={average.rmselog:.3f}\n' 'Delta1={average.delta1:.3f}\n' 'Delta2={average.delta2:.3f}\n' 'Delta3={average.delta3:.3f}\n' 't_GPU={time:.3f}\n'.format(average=avg, time=avg.gpu_time)) ### inference time torch.cuda.synchronize() end = time.time() rmse += avg.rmse absrel += avg.absrel lg10 += avg.lg10 squarel += avg.squarel rmselog += avg.rmselog D1 += avg.delta1 D2 += avg.delta2 D3 += avg.delta3 print('rmse={%.3f}\n' % (rmse / (traj_idx + 1)), 'absrel={%.3f}\n' % (absrel / (traj_idx + 1)), 'lg10={%.3f}\n' % (lg10 / (traj_idx + 1)), 'squarel={%.3f}\n' % (squarel / (traj_idx + 1)), 'rmselog={%.3f}\n' % (rmselog / (traj_idx + 1)), 'D1={%.3f}\n' % (D1 / (traj_idx + 1)), 'D2={%.3f}\n' % (D2 / (traj_idx + 1)), 'D3={%.3f}\n' % (D3 / (traj_idx + 1))) print((end - start) / (traj_length - 2 * t_win_r)) m_misc.save_ScenePathInfo('%s/scene_path_info.txt' % (res_fldr), scene_path_info)
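# Hedged sketch (assumption, not the repository implementation): Result.evaluate()
# above is assumed to accumulate the standard single-image depth metrics reported at
# the end of the loop (RMSE, AbsRel, log10, SqRel, RMSElog, delta thresholds). Over
# the valid ground-truth pixels these are typically computed as:
def depth_metrics_sketch(pred, gt):
    mask = gt > 0
    pred = pred[mask].clamp(min=1e-6)
    gt = gt[mask]
    thresh = torch.max(gt / pred, pred / gt)
    return {
        'rmse': torch.sqrt(torch.mean((pred - gt) ** 2)).item(),
        'absrel': torch.mean(torch.abs(pred - gt) / gt).item(),
        'lg10': torch.mean(torch.abs(torch.log10(pred) - torch.log10(gt))).item(),
        'squarel': torch.mean((pred - gt) ** 2 / gt).item(),
        'rmselog': torch.sqrt(torch.mean((torch.log(pred) - torch.log(gt)) ** 2)).item(),
        'delta1': (thresh < 1.25).float().mean().item(),
        'delta2': (thresh < 1.25 ** 2).float().mean().item(),
        'delta3': (thresh < 1.25 ** 3).float().mean().item(),
    }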