def evaluate(opt):
    """Evaluate odometry on the KITTI dataset."""
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array([[0.58, 0, 0.5, 0],
                  [0, 1.92, 0.5, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]], dtype=np.float32)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split in ("odom_0", "odom_9", "odom_10"), \
        "eval_split should be odom_0, odom_9 or odom_10"

    sequence_id = int(opt.eval_split.split("_")[1])

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width,
                               [0, 1], 4, is_train=False)
    dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")
    depth_encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    depth_decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    depth_encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_encoder_dict = torch.load(depth_encoder_path)
    model_dict = depth_encoder.state_dict()
    depth_encoder.load_state_dict(
        {k: v for k, v in depth_encoder_dict.items() if k in model_dict})

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    depth_decoder = networks.DepthDecoder(depth_encoder.num_ch_enc)
    depth_decoder.load_state_dict(torch.load(depth_decoder_path))

    for model in (pose_encoder, pose_decoder, depth_encoder, depth_decoder):
        model.cuda()
        model.eval()

    pred_poses = []
    pred_disps = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            input_color = inputs[("color", 0, 0)].cuda()

            depth_output = depth_decoder(depth_encoder(input_color))
            pred_disp, _ = disp_to_depth(depth_output[("disp", 0)],
                                         opt.min_depth, opt.max_depth)
            pred_disps.append(pred_disp.cpu()[:, 0].numpy())

            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(
                    axisangle[:, 0], translation[:, 0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)
    pred_disps = np.concatenate(pred_disps)

    pred_poses_scaled = []
    poses_pred = []
    ratios_d = []
    gt_norms_div = []
    gt_norms = []
    pred_norms = []

    # Build the scale-recovery module once and reuse it for every frame.
    scale_recovery = ScaleRecovery(1, 192, 640, K).cuda()

    for i in range(pred_poses.shape[0]):
        pred_pose = pred_poses[i]
        pred_depth = 1 / pred_disps[i + 1]

        pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
        ratio = scale_recovery(pred_depth).cpu().item()

        # Scale the translation of the full 4x4 pose so the scaled poses can
        # still be chained by dump_xyz below.
        pred_pose_scaled = np.copy(pred_pose)
        pred_pose_scaled[:3, 3] *= ratio

        poses_pred.append(pred_pose[:3, 3])
        pred_poses_scaled.append(pred_pose_scaled)
        ratios_d.append(ratio)

    gt_poses_path = os.path.join(opt.data_path, "poses",
                                 "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(
                np.dot(np.linalg.inv(gt_global_poses[i - 1]),
                       gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 5
    for i in range(0, num_frames - 1):
        local_xyzs = np.array(dump_xyz(pred_poses_scaled[i:i + track_length - 1]))
        gt_local_xyzs = np.array(dump_xyz(gt_local_poses[i:i + track_length - 1]))
        gt_norm_div = np.linalg.norm(gt_local_xyzs) / np.linalg.norm(local_xyzs)

        ates.append(compute_ate(gt_local_xyzs, local_xyzs))
        gt_norms_div.append(gt_norm_div)
        gt_norms.append(np.linalg.norm(gt_local_xyzs))
        pred_norms.append(np.linalg.norm(local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    def save_array(name, arr):
        save_path = os.path.join(os.path.dirname(__file__),
                                 "{}{:02d}.npy".format(name, sequence_id))
        np.save(save_path, arr)
        return save_path

    save_array("poses_scaled", pred_poses_scaled)
    save_array("poses_gt", gt_xyzs)
    save_array("poses_pred", poses_pred)
    save_array("gt_norms", gt_norms)
    save_array("gt_norms_div", gt_norms_div)
    save_array("ratios_d", ratios_d)
    save_path = save_array("pred_norms", pred_norms)

    print("-> Predictions saved to", save_path)
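# The helpers `dump_xyz` and `compute_ate` used above are assumed to follow
# the monodepth2 reference implementations; a minimal sketch is included here
# so this section is self-contained. Treat it as an assumption, not as code
# from this repository.
import numpy as np


def dump_xyz(source_to_target_transformations):
    """Chain relative 4x4 poses into a list of camera positions."""
    xyzs = []
    cam_to_world = np.eye(4)
    xyzs.append(cam_to_world[:3, 3])
    for source_to_target_transformation in source_to_target_transformations:
        cam_to_world = np.dot(cam_to_world, source_to_target_transformation)
        xyzs.append(cam_to_world[:3, 3])
    return xyzs


def compute_ate(gtruth_xyz, pred_xyz_o):
    """Absolute trajectory error after first-frame alignment and an optimal scale."""
    offset = gtruth_xyz[0] - pred_xyz_o[0]
    pred_xyz = pred_xyz_o + offset[None, :]

    # Optimize the scaling factor
    scale = np.sum(gtruth_xyz * pred_xyz) / np.sum(pred_xyz ** 2)
    alignment_error = pred_xyz * scale - gtruth_xyz
    rmse = np.sqrt(np.sum(alignment_error ** 2)) / gtruth_xyz.shape[0]
    return rmse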
def __init__(self, options):
    self.opt = options
    self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

    # checking height and width are multiples of 32
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    self.models = {}
    self.parameters_to_train = []

    self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

    self.num_scales = len(self.opt.scales)
    self.num_input_frames = len(self.opt.frame_ids)
    self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" \
        else self.num_input_frames

    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

    if self.opt.use_stereo:
        self.opt.frame_ids.append("s")

    self.models["encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, self.opt.weights_init == "pretrained")
    self.models["encoder"].to(self.device)
    self.parameters_to_train += list(self.models["encoder"].parameters())

    self.models["depth"] = networks.DepthDecoder(
        self.models["encoder"].num_ch_enc, self.opt.scales)
    self.models["depth"].to(self.device)
    self.parameters_to_train += list(self.models["depth"].parameters())

    self.model_optimizer = optim.Adam(self.parameters_to_train,
                                      self.opt.learning_rate)
    self.model_lr_scheduler = optim.lr_scheduler.StepLR(
        self.model_optimizer, self.opt.scheduler_step_size, 0.1)

    if self.opt.load_weights_folder is not None:
        self.load_model()

    print("Training model named:\n  ", self.opt.model_name)
    print("Models and tensorboard events files are saved to:\n  ",
          self.opt.log_dir)
    print("Training is using:\n  ", self.device)

    # data
    datasets_dict = {"kitti": datasets.KITTIRAWDataset,
                     "kitti_odom": datasets.KITTIOdomDataset}
    self.dataset = datasets_dict[self.opt.dataset]

    fpath = os.path.join(os.path.dirname(__file__), "splits",
                         self.opt.split, "{}_files.txt")
    test_path = os.path.join(os.path.dirname(__file__), "splits",
                             "eigen_benchmark", "{}_files.txt")

    train_filenames = readlines(fpath.format("train"))
    val_filenames = readlines(fpath.format("val"))
    test_filenames = readlines(test_path.format("test"))
    img_ext = '.png' if self.opt.png else '.jpg'

    num_train_samples = len(train_filenames)
    self.num_total_steps = \
        num_train_samples // self.opt.batch_size * self.opt.num_epochs

    train_dataset = self.dataset(
        self.opt.data_path, train_filenames, self.opt.height, self.opt.width,
        self.opt.frame_ids, self.num_scales, is_train=True, img_ext=img_ext,
        is_flow=True, args=self.opt)
    self.train_loader = DataLoader(
        train_dataset, self.opt.batch_size, shuffle=True,
        num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)

    val_dataset = self.dataset(
        self.opt.data_path, val_filenames, self.opt.height, self.opt.width,
        self.opt.frame_ids, self.num_scales, is_train=False, img_ext=img_ext,
        is_flow=True, args=self.opt)
    self.val_loader = DataLoader(
        val_dataset, self.opt.batch_size, shuffle=True,
        num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)

    vid_dataset_val = self.dataset(
        self.opt.data_path, sorted(val_filenames), self.opt.height,
        self.opt.width, self.opt.frame_ids, self.num_scales, is_train=False,
        img_ext=img_ext, is_flow=False, args=self.opt)
    vid_dataset_test = self.dataset(
        self.opt.data_path, sorted(test_filenames), self.opt.height,
        self.opt.width, self.opt.frame_ids, self.num_scales, is_train=False,
        img_ext=img_ext, is_flow=False, args=self.opt)
    self.vid_loader_val = DataLoader(
        vid_dataset_val, 1, shuffle=False, num_workers=0, pin_memory=True,
        drop_last=True)
    self.vid_loader_test = DataLoader(
        vid_dataset_test, 1, shuffle=False, num_workers=0, pin_memory=True,
        drop_last=True)
    self.val_iter = iter(self.val_loader)

    self.writers = {}
    for mode in ["train", "val"]:
        self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode))

    if not self.opt.no_ssim:
        self.ssim = SSIM()
        self.ssim.to(self.device)

    self.backproject_depth = {}
    self.project_3d = {}
    for scale in self.opt.scales:
        h = self.opt.height // (2 ** scale)
        w = self.opt.width // (2 ** scale)

        self.backproject_depth[scale] = BackprojectDepth(
            self.opt.batch_size, h, w)
        self.backproject_depth[scale].to(self.device)

        self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
        self.project_3d[scale].to(self.device)

    self.depth_metric_names = [
        "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms",
        "da/a1", "da/a2", "da/a3"]

    print("Using split:\n  ", self.opt.split)
    print("There are {:d} training items and {:d} validation items\n".format(
        len(train_dataset), len(val_dataset)))

    self.save_opts()
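# A minimal usage sketch for the trainer initialized above. The class name
# `Trainer`, the `train()` entry point, and the `MonodepthOptions` front-end
# are assumptions based on the monodepth2-style structure of this __init__;
# the options object must expose the attributes referenced there (height,
# width, scales, frame_ids, batch_size, ...).
#
#   from options import MonodepthOptions
#
#   opts = MonodepthOptions().parse()
#   trainer = Trainer(opts)
#   trainer.train()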
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set."""
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either " \
        "--eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:
        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        filenames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_files.txt"))

        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path,
                                  map_location=torch.device("cuda:0"))

        dataset = datasets.KITTIRAWDataset(
            opt.data_path, filenames, encoder_dict['height'],
            encoder_dict['width'], [0], 4, is_train=False)
        # Samples do not all have the same length, so the default collate_fn
        # would fail; use the custom my_collate_fn instead.
        dataloader = DataLoader(
            dataset, 16, shuffle=False, num_workers=opt.num_workers,
            pin_memory=True, drop_last=False, collate_fn=my_collate_fn)

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        model_dict = encoder.state_dict()
        encoder.load_state_dict(
            {k: v for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(
            torch.load(decoder_path, map_location=torch.device("cuda:0")))

        encoder.cuda(0)
        encoder.eval()
        depth_decoder.cuda(0)
        depth_decoder.eval()

        pred_disps = []

        print("-> Computing predictions with size {}x{}".format(
            encoder_dict['width'], encoder_dict['height']))

        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda(0)

                if opt.post_process:
                    # Post-processed results require each image to have two
                    # forward passes
                    input_color = torch.cat(
                        (input_color, torch.flip(input_color, [3])), 0)

                output = depth_decoder(encoder(input_color))

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(
                        pred_disp[:N], pred_disp[N:, :, ::-1])

                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)

    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark",
                             "eigen_to_benchmark_ids.npy"))
            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.save_pred_disps:
        output_path = os.path.join(
            opt.load_weights_folder,
            "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()
    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder,
                                "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print("-> No ground truth is available for the KITTI benchmark, "
              "so not evaluating. Done.")
        quit()

    # ZMH: use the gt produced by vel_depth=False in
    # generate_depth_map_original
    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths_im_cus.npz")
    gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(
                  STEREO_SCALE_FACTOR))
        opt.disable_median_scaling = True
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if opt.eval_split == "eigen":
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)

            crop = np.array([0.40810811 * gt_height, 0.99189189 * gt_height,
                             0.03594771 * gt_width, 0.96405229 * gt_width]
                            ).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)
        else:
            mask = gt_depth > 0

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= opt.pred_depth_scale_factor
        if not opt.disable_median_scaling:
            ratio = np.median(gt_depth) / np.median(pred_depth)
            ratios.append(ratio)
            pred_depth *= ratio

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        errors.append(compute_errors(gt_depth, pred_depth))

    if not opt.disable_median_scaling:
        ratios = np.array(ratios)
        med = np.median(ratios)
        print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
            med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse",
                                           "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
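# `compute_errors` above is assumed to be the standard seven-metric depth
# evaluation used by monodepth2-style code; a sketch for reference:
import numpy as np


def compute_errors(gt, pred):
    """Computes abs_rel, sq_rel, rmse, rmse_log and the delta accuracies."""
    thresh = np.maximum((gt / pred), (pred / gt))
    a1 = (thresh < 1.25).mean()
    a2 = (thresh < 1.25 ** 2).mean()
    a3 = (thresh < 1.25 ** 3).mean()

    rmse = np.sqrt(((gt - pred) ** 2).mean())
    rmse_log = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())

    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean(((gt - pred) ** 2) / gt)

    return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3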
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set."""
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    print("-> Loading weights from {}".format(opt.load_weights_folder))

    filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt"))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)

    if opt.dataset == 'cityscape':
        dataset = datasets.CITYSCAPERawDataset(
            opt.data_path, filenames, encoder_dict['height'],
            encoder_dict['width'], [0], 4, is_train=False, tag=opt.dataset)
    elif opt.dataset == 'kitti':
        dataset = datasets.KITTIRAWDataset(
            opt.data_path, filenames, encoder_dict['height'],
            encoder_dict['width'], [0, 's'], 4, tag='kitti', is_train=False,
            img_ext='png', load_meta=False, is_load_semantics=True,
            is_predicted_semantics=True, load_morphed_depth=False)
    else:
        raise ValueError("No predefined dataset")

    dataloader = DataLoader(dataset, 16, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    encoder = networks.ResnetEncoder(opt.num_layers, False)
    if opt.switchMode == 'on':
        depth_decoder = networks.DepthDecoder(
            encoder.num_ch_enc, isSwitch=True, isMulChannel=opt.isMulChannel)
    else:
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict(
        {k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    sfx = torch.nn.Softmax(dim=1)

    print("Evaluation starts")

    confMatrix = generateMatrix(args)
    nbPixels = 0
    count255 = 0

    with torch.no_grad():
        for idx, inputs in enumerate(dataloader):
            input_color = inputs[("color", 0, 0)].cuda()
            outputs = depth_decoder(encoder(input_color),
                                    computeSemantic=True, computeDepth=False)

            gt = inputs['seman_gt_eval'].cpu().numpy().astype(np.uint8)
            pred = sfx(outputs[('seman', 0)]).detach()
            pred = torch.argmax(pred, dim=1).type(torch.float).unsqueeze(1)
            pred = F.interpolate(pred, [gt.shape[1], gt.shape[2]],
                                 mode='nearest')
            pred = pred.squeeze(1).cpu().numpy().astype(np.uint8)

            groundTruthNp = gt
            predictionNp = pred

            nbPixels += (groundTruthNp.shape[0] * groundTruthNp.shape[1]
                         * groundTruthNp.shape[2])

            # Encode each (gt, pred) pair into a single integer so that one
            # np.unique call counts all confusion-matrix entries at once.
            encoding_value = 256  # precomputed upper bound on the label ids
            encoded = (groundTruthNp.astype(np.int32) * encoding_value) \
                + predictionNp

            values, cnt = np.unique(encoded, return_counts=True)

            for value, c in zip(values, cnt):
                pred_id = value % encoding_value
                gt_id = int((value - pred_id) / encoding_value)
                if pred_id == 255 or gt_id == 255:
                    count255 += c
                    continue
                if gt_id not in args.evalLabels:
                    printError("Unknown label with id {:}".format(gt_id))
                confMatrix[gt_id][pred_id] += c

            print("Finish %dth batch" % idx)

    if confMatrix.sum() + count255 != nbPixels:
        printError(
            'Number of analyzed pixels and entries in confusion matrix '
            'disagree: confMatrix {}, pixels {}'.format(
                confMatrix.sum(), nbPixels))

    classScoreList = {}
    for label in args.evalLabels:
        labelName = trainId2label[label].name
        classScoreList[labelName] = getIouScoreForLabel(label, confMatrix,
                                                        args)

    vals = np.array(list(classScoreList.values()))
    mIOU = np.mean(vals[np.logical_not(np.isnan(vals))])

    print("mIOU is %f" % mIOU)
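# A simplified sketch of the per-label IoU that `getIouScoreForLabel` is
# expected to compute from the confusion matrix above. The Cityscapes
# reference additionally excludes labels marked ignoreInEval from the false
# positives; `iou_from_confusion` is a hypothetical name.
import numpy as np


def iou_from_confusion(conf_matrix, label):
    tp = conf_matrix[label, label]
    fp = conf_matrix[:, label].sum() - tp  # predicted as `label`, actually other
    fn = conf_matrix[label, :].sum() - tp  # actually `label`, predicted other
    denom = tp + fp + fn
    return float('nan') if denom == 0 else tp / denom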
def test_simple(args):
    """Function to predict for a single image or folder of images."""
    print(args.image_path)

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    print(args.model_path)
    model_path = os.path.join(args.model_path, args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # 1. LOADING PRETRAINED MODEL
    # 1.1 encoder
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items()
                         if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    # 1.2 decoder
    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # 2. FINDING INPUT IMAGES
    in_path = Path(args.image_path)
    if args.out_path is not None:
        out_path = Path(args.out_path)
    else:
        out_path = Path('./' + in_path.stem + '_out')
    out_path.mkdir_p()

    # 3. PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for image_path in tqdm(in_path.files()):
            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)  # torch.Size([1, 3, 192, 640])
            features = encoder(input_image)  # a list of 5 feature maps
            outputs = depth_decoder(features)  # dict with 4 disp tensors

            disp = outputs[("disp", 0)]  # has the same size as the input
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = image_path.stem
            if args.npy_out:
                name_dest_npy = os.path.join(
                    out_path, "{}_disp.npy".format(output_name))
                scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
                np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            name_dest_im = os.path.join(out_path,
                                        "{}_disp.png".format(output_name))
            plt.imsave(name_dest_im, disp_resized_np, cmap='magma', vmax=vmax)

    print('-> Done!')
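# `disp_to_depth` used throughout these scripts is assumed to be the
# monodepth2 conversion from the network's sigmoid output to depth; a sketch:
def disp_to_depth(disp, min_depth, max_depth):
    """Convert sigmoid output into (scaled_disp, depth)."""
    min_disp = 1 / max_depth
    max_disp = 1 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth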
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set."""
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array([[0.5, 0, 0.5, 0],
                  [0, 1.656, 0.5, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]], dtype=np.float32)

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either " \
        "--eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:
        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        filenames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_files.txt"))

        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path)

        dataset = datasets.AirSimDataset(
            opt.data_path, filenames, encoder_dict['height'],
            encoder_dict['width'], [0], 4, is_train=False)
        dataloader = DataLoader(dataset, 16, shuffle=False,
                                num_workers=opt.num_workers, pin_memory=True,
                                drop_last=False)

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        model_dict = encoder.state_dict()
        encoder.load_state_dict(
            {k: v for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(torch.load(decoder_path))

        encoder.cuda()
        encoder.eval()
        depth_decoder.cuda()
        depth_decoder.eval()

        pred_disps = []
        gt_depths = []

        print("-> Computing predictions with size {}x{}".format(
            encoder_dict['width'], encoder_dict['height']))

        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda()

                if opt.post_process:
                    # Post-processed results require each image to have two
                    # forward passes
                    input_color = torch.cat(
                        (input_color, torch.flip(input_color, [3])), 0)

                output = depth_decoder(encoder(input_color))

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(
                        pred_disp[:N], pred_disp[N:, :, ::-1])

                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)

    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark",
                             "eigen_to_benchmark_ids.npy"))
            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.eval_object:
        object_masks = []
        for line in filenames:
            line = line.split()
            folder, frame_index = line[0], int(line[1])
            object_mask_filename = os.path.join(
                os.path.dirname(__file__), "object_masks", folder,
                "{:010d}.npy".format(int(frame_index)))
            object_mask = np.load(object_mask_filename)
            object_masks.append(object_mask)

    if opt.save_pred_disps:
        output_path = os.path.join(
            opt.load_weights_folder,
            "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()
    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder,
                                "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print("-> No ground truth is available for the KITTI benchmark, "
              "so not evaluating. Done.")
        quit()

    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz")
    gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(
                  STEREO_SCALE_FACTOR))
        opt.scaling = "disable"
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []
    ratios_dgc = []
    ex_logs = []
    mean_scale = []
    side_map = {"2": 2, "3": 3, "l": 2, "r": 3}

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        line = filenames[i].split()
        folder = line[0]
        frame_index = line[1]
        side = side_map[line[2]]
        color = pil_loader(get_image_path(folder, int(frame_index), side))

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if opt.eval_split in ("eigen", "AirSim"):
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)
        else:
            mask = gt_depth > 0

        if opt.scaling == "gt":
            ratio = np.median(gt_depth[mask]) / np.median(pred_depth[mask])
            ratios.append(ratio)
            if opt.eval_object:
                object_mask = object_masks[i].astype(bool)
                mask = np.logical_and(mask, object_mask)
            # elif opt.scaling == "dgc":
            scale_recovery = ScaleRecovery(1, gt_height, gt_width, K).cuda()
            pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
            ratio = scale_recovery(pred_depth).cpu().item()
            ratios_dgc.append(ratio)
            pred_depth = pred_depth[0].cpu().numpy()
        else:
            ratio = 1

        pred_depth_ori = pred_depth * mask
        gt_depth_ori = gt_depth * mask
        pred_depth_ori = np.where(mask == 1, pred_depth_ori, 1)

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]
        mean_scale.append(np.mean(gt_depth / pred_depth))

        # Brute-force search for the scale that minimizes abs_rel.
        error_try = 100
        scale_abs = 0
        for ratio_try in np.arange(0.1, 50, step=0.1):
            pred_depth1 = pred_depth * ratio_try
            error_tmp = compute_errors(gt_depth, pred_depth1)[0]
            if error_tmp < error_try:
                error_try = error_tmp
                scale_abs = ratio_try
        ex_logs.append(scale_abs)

        div_scale = gt_depth_ori / pred_depth_ori
        div_values1 = div_scale[mask]
        div_scale = (div_scale - scale_abs) / scale_abs
        div_values = div_scale[mask]
        print("min/max of div_values:", min(div_values), max(div_values))

        mu = np.mean(div_values1)
        sigma = np.std(div_values1)
        print("min/max of div_values1:", min(div_values1), max(div_values1))

        fig, ax = plt.subplots()
        n, bins, patches = ax.hist(div_values1, 150, range=(3, 130),
                                   density=True)
        y = norm.pdf(bins, mu, sigma)
        ax.plot(bins, y, 'r')
        plt.xlabel('Scale')
        plt.ylabel('Density')
        plt.savefig(os.path.join(os.path.dirname(__file__),
                                 "hist_imgs_AirSim", "{}.jpg".format(i)))
        plt.close()

        blending_imgs(div_scale, color, i, mask)

        pred_depth *= ratio
        ratios.append(ratio)
        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        if len(gt_depth) != 0:
            errors.append(compute_errors(gt_depth, pred_depth))

    ratios_dgc = np.array(ratios_dgc)
    ratios = np.array(ratios)
    np.save('ideal_scale_AirSim.npy', ex_logs)
    np.save('median_ratios_AirSim.npy', ratios)
    np.save('dgc_ratios_AirSim.npy', ratios_dgc)

    med = np.median(ratios)
    print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
        med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse",
                                           "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
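# The brute-force scale search above can be isolated into a helper. A sketch
# with the same 0.1-granularity grid and abs_rel objective; `best_scale` is a
# hypothetical name, not part of this repository:
import numpy as np


def best_scale(gt_depth, pred_depth, lo=0.1, hi=50.0, step=0.1):
    """Return the scalar in [lo, hi) minimizing abs_rel of the scaled prediction."""
    best_err, best = float("inf"), lo
    for ratio in np.arange(lo, hi, step):
        err = np.mean(np.abs(gt_depth - pred_depth * ratio) / gt_depth)  # abs_rel
        if err < best_err:
            best_err, best = err, ratio
    return best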
def test_simple_inputs(image_path, model_name, output_path, cuda_is_available):
    """Function to predict for a single image or folder of images."""
    assert model_name is not None, \
        "You must specify the --model_name parameter; see README.md for " \
        "an example"

    if cuda_is_available:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    model_path = os.path.join("models", model_name)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items()
                         if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(image_path):
        # Only testing on a single image
        paths = [image_path]
        output_directory = os.path.dirname(output_path)
    elif os.path.isdir(image_path):
        # Searching folder for images
        paths = glob.glob(os.path.join(image_path, '*.jpg'))
        output_directory = image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(image_path))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpg".format(output_name))
            plt.imsave(name_dest_im, disp_resized_np, cmap='magma', vmax=vmax)
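# Example call (hypothetical paths), mirroring the CLI wrapper this helper
# was presumably extracted from:
#
#   test_simple_inputs(image_path="assets/test_image.jpg",
#                      model_name="mono_640x192",
#                      output_path="assets/out/test_image_disp.jpg",
#                      cuda_is_available=torch.cuda.is_available())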
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set."""
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    print("-> Loading weights from {}".format(opt.load_weights_folder))

    filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt"))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)

    if opt.use_stereo:
        opt.frame_ids.append("s")

    if opt.dataset == 'cityscape':
        dataset = datasets.CITYSCAPERawDataset(
            opt.data_path, filenames, encoder_dict['height'],
            encoder_dict['width'], opt.frame_ids, 4, is_train=False,
            tag=opt.dataset, load_meta=True, is_sep_train_seman=False)
    elif opt.dataset == 'kitti':
        dataset = datasets.KITTIRAWDataset(
            opt.data_path, filenames, encoder_dict['height'],
            encoder_dict['width'], opt.frame_ids, 4, is_train=False,
            tag=opt.dataset)
    else:
        raise ValueError("No predefined dataset")

    dataloader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=True)

    encoder = networks.ResnetEncoder(opt.num_layers, False)
    if opt.switchMode == 'on':
        depth_decoder = networks.DepthDecoder(
            encoder.num_ch_enc, isSwitch=True, isMulChannel=opt.isMulChannel)
    else:
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict(
        {k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    # -------------------- Visualization parameters --------------------
    sfx = torch.nn.Softmax(dim=1)
    mergeDisp = Merge_MultDisp(opt.scales, batchSize=opt.batch_size,
                               isMulChannel=opt.isMulChannel)
    svRoot = '/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/figure_visual'
    index = 0
    isvisualize = True
    viewEdgeMerge = False
    isHist = False
    useGtSeman = True
    viewSurfaceNormal = True
    viewSelfOcclu = True
    viewDispUp = True
    viewSmooth = True
    viewMulReg = True
    viewBorderRegress = False
    viewBorderSimilarity = False
    viewRandomSample = True
    viewSemanReg = False
    viewDepthGuess = False
    height = 256
    width = 512
    tensor23dPts = Tensor23dPts()

    if isHist:
        rec = np.zeros((19, 100))

    if opt.isMulChannel:
        app = os.path.join('mulDispOn', opt.model_name)
    else:
        app = os.path.join('mulDispOff', opt.model_name)

    dirpath = os.path.join(svRoot, app)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    if viewSmooth:
        comSmooth = ComputeSmoothLoss().cuda()
    if viewEdgeMerge:
        comp1dgrad = Comp1dgrad().cuda()
    if viewSurfaceNormal:
        compsn = ComputeSurfaceNormal(height=height, width=width,
                                      batch_size=opt.batch_size).cuda()
    if viewSelfOcclu:
        selfclu = SelfOccluMask().cuda()
    if viewDispUp:
        compDispUp = ComputeDispUpLoss().cuda()
    if viewMulReg:
        objReg = ObjRegularization()
        objReg.cuda()
    if viewBorderRegress:
        borderRegress = BorderRegression()
        borderRegress.cuda()
    if viewRandomSample:
        rdSampleOnBorder = RandomSampleNeighbourPts()
        rdSampleOnBorder.cuda()
    if viewSemanReg:
        rdSampleSeman = RandomSampleBorderSemanPts()
        rdSampleSeman.cuda()
    if viewDepthGuess:
        depthGuess = DepthGuessesBySemantics(batchNum=opt.batch_size,
                                             width=width, height=height)
        depthGuess.cuda()

    with torch.no_grad():
        for idx, inputs in enumerate(dataloader):
            for key, ipt in inputs.items():
                if not (key == 'height' or key == 'width' or key == 'tag'
                        or key == 'cts_meta'):
                    inputs[key] = ipt.to(torch.device("cuda"))
            input_color = inputs[("color", 0, 0)].cuda()
            features = encoder(input_color)
            outputs = dict()
            outputs.update(depth_decoder(features, computeSemantic=True,
                                         computeDepth=False))
            outputs.update(depth_decoder(features, computeSemantic=False,
                                         computeDepth=True))

            if isHist:
                mulDisp = outputs[('mul_disp', 0)]
                scaled_disp, mulDepth = disp_to_depth(mulDisp, 0.1, 100)
                mulDepth = mulDepth.cpu()
                for i in range(mulDisp.shape[1]):
                    rec[i, :] += torch.histc(mulDepth[:, i, :, :], bins=100,
                                             min=0, max=100).numpy()

            if isvisualize:
                if useGtSeman:
                    mergeDisp(inputs, outputs, eval=False)
                else:
                    mergeDisp(inputs, outputs, eval=True)

                dispMap = outputs[('disp', 0)]
                scaled_disp, depthMap = disp_to_depth(dispMap, 0.1, 100)
                depthMap = depthMap * STEREO_SCALE_FACTOR

                if viewDispUp:
                    fig_dispup = compDispUp.visualize(scaled_disp,
                                                      viewindex=index)

                if viewSmooth:
                    rgb = inputs[('color_aug', 0, 0)]
                    smoothfig = comSmooth.visualize(rgb=rgb, disp=scaled_disp,
                                                    viewindex=index)

                if useGtSeman:
                    fig_seman = tensor2semantic(inputs['seman_gt'], ind=index,
                                                isGt=True)
                else:
                    fig_seman = tensor2semantic(outputs[('seman', 0)],
                                                ind=index)

                if viewSemanReg:
                    # person, rider, car, truck, bus, train, motorcycle,
                    # bicycle
                    foregroundType = [11, 12, 13, 14, 15, 16, 17, 18]
                    softmaxedSeman = F.softmax(outputs[('seman', 0)], dim=1)
                    forePredMask = torch.sum(
                        softmaxedSeman[:, foregroundType, :, :], dim=1,
                        keepdim=True)

                    foreGtMask = torch.ones(dispMap.shape).cuda().byte()
                    for m in foregroundType:
                        foreGtMask = foreGtMask * (inputs['seman_gt'] != m)
                    foreGtMask = 1 - foreGtMask
                    foreGtMask = foreGtMask.float()

                    forePredMask[forePredMask > 0.5] = 1
                    forePredMask[forePredMask <= 0.5] = 0
                    forePredMask = foreGtMask

                    rdSampleSeman.visualizeBorderSample(dispMap, forePredMask,
                                                        gtMask=foreGtMask,
                                                        viewIndex=index)

                    cm = plt.get_cmap('magma')
                    viewForePred = forePredMask[index, :, :, :].squeeze(
                        0).detach().cpu().numpy()
                    viewForePred = (cm(viewForePred) * 255).astype(np.uint8)

                    viewForeGt = foreGtMask[index, :, :, :].squeeze(
                        0).detach().cpu().numpy()
                    viewForeGt = (cm(viewForeGt) * 255).astype(np.uint8)

                    forePredictCombined = np.concatenate(
                        [viewForePred, viewForeGt], axis=0)
                    pil.fromarray(forePredictCombined).save(
                        os.path.join(dirpath, str(idx) + '_fg.png'))

                if viewDepthGuess:
                    wallType = [2, 3, 4]  # Building, wall, fence
                    roadType = [0, 1, 9]  # road, sidewalk, terrain
                    # pole, traffic light, traffic sign, person, rider, car,
                    # truck, bus, train, motorcycle, bicycle
                    foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18]

                    wallTypeMask = torch.ones(dispMap.shape).cuda().byte()
                    roadTypeMask = torch.ones(dispMap.shape).cuda().byte()
                    foreGroundMask = torch.ones(dispMap.shape).cuda().byte()

                    with torch.no_grad():
                        for m in wallType:
                            wallTypeMask = wallTypeMask * (inputs['seman_gt'] != m)
                        wallTypeMask = (1 - wallTypeMask).float()

                        for m in roadType:
                            roadTypeMask = roadTypeMask * (inputs['seman_gt'] != m)
                        roadTypeMask = (1 - roadTypeMask).float()

                        for m in foregroundType:
                            foreGroundMask = foreGroundMask * (inputs['seman_gt'] != m)
                        foreGroundMask = (1 - foreGroundMask).float()

                    scaledIntrinsic = inputs['realIn']
                    depthGuess.visualizeDepthGuess(
                        realDepth=depthMap, dispAct=dispMap,
                        foredgroundMask=foreGroundMask,
                        wallTypeMask=wallTypeMask,
                        groundTypeMask=roadTypeMask,
                        intrinsic=scaledIntrinsic,
                        extrinsic=inputs['realEx'],
                        semantic=inputs['seman_gt_eval'],
                        cts_meta=inputs['cts_meta'], viewInd=index)

                fig_rgb = tensor2rgb(inputs[('color', 0, 0)], ind=index)
                fig_disp = tensor2disp(outputs[('disp', 0)], ind=index)
                fig_3d, veh_coord, veh_coord_gt = tensor23dPts.visualize3d(
                    depthMap, ind=index,
                    intrinsic=inputs['cts_meta']['intrinsic'][index, :, :],
                    extrinsic=inputs['cts_meta']['extrinsic'][index, :, :],
                    gtmask=inputs['cts_meta']['mask'][index, :, :],
                    gtdepth=inputs['cts_meta']['depthMap'][index, :, :],
                    semanticMap=inputs['seman_gt_eval'][index, :, :])

                fig_grad = None

                if viewSurfaceNormal:
                    surnorm = compsn.visualize(
                        depthMap=depthMap, invcamK=inputs['invcamK'].cuda(),
                        orgEstPts=veh_coord, gtEstPts=veh_coord_gt,
                        viewindex=index)
                    surnormMap = compsn(depthMap=depthMap,
                                        invcamK=inputs['invcamK'].cuda())

                if viewMulReg:
                    depthMapLoc = depthMap / STEREO_SCALE_FACTOR
                    skyId = 10
                    skyMask = inputs['seman_gt'] == skyId
                    skyerr = objReg.visualize_regularizeSky(depthMapLoc,
                                                            skyMask,
                                                            viewInd=index)

                    wallType = [2, 3, 4]  # Building, wall, fence
                    roadType = [0, 1, 9]  # road, sidewalk, terrain
                    permuType = [5, 7]  # Pole, traffic sign
                    chanWinSize = 5

                    wallMask = torch.ones_like(skyMask)
                    roadMask = torch.ones_like(skyMask)
                    permuMask = torch.ones_like(skyMask)

                    with torch.no_grad():
                        for m in wallType:
                            wallMask = wallMask * (inputs['seman_gt'] != m)
                        wallMask = 1 - wallMask
                        wallMask = wallMask[:, :, 1:-1, 1:-1]

                        for m in roadType:
                            roadMask = roadMask * (inputs['seman_gt'] != m)
                        roadMask = 1 - roadMask
                        roadMask = roadMask[:, :, 1:-1, 1:-1]

                        for m in permuType:
                            permuMask = permuMask * (inputs['seman_gt'] != m)
                        permuMask = 1 - permuMask
                        permuMask = permuMask[:, :, 1:-1, 1:-1]

                    BdErrFig, viewRdErrFig = objReg.visualize_regularizeBuildingRoad(
                        surnormMap, wallMask, roadMask, dispMap, viewInd=index)

                    padSize = int((chanWinSize - 1) / 2)
                    permuMask = permuMask[:, :, padSize:-padSize,
                                          padSize:-padSize]
                    surVarFig = objReg.visualize_regularizePoleSign(
                        surnormMap, permuMask, dispMap, viewInd=index)

                if viewBorderRegress:
                    # pole, traffic light, traffic sign, person, rider, car,
                    # truck, bus, train, motorcycle, bicycle
                    foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18]
                    # road, sidewalk, building, wall, fence, vegetation,
                    # terrain, sky
                    backgroundType = [0, 1, 2, 3, 4, 8, 9, 10]
                    suppressType = [255]  # Suppress no-label lines

                    foreGroundMask = torch.ones(dispMap.shape).cuda().byte()
                    backGroundMask = torch.ones(dispMap.shape).cuda().byte()
                    suppresMask = torch.ones(dispMap.shape).cuda().byte()

                    with torch.no_grad():
                        for m in foregroundType:
                            foreGroundMask = foreGroundMask * (inputs['seman_gt'] != m)
                        foreGroundMask = 1 - foreGroundMask
                        for m in backgroundType:
                            backGroundMask = backGroundMask * (inputs['seman_gt'] != m)
                        backGroundMask = 1 - backGroundMask
                        for m in suppressType:
                            suppresMask = suppresMask * (inputs['seman_gt'] != m)
                        suppresMask = 1 - suppresMask
                        suppresMask = suppresMask.float()
                        combinedMask = torch.cat(
                            [foreGroundMask, backGroundMask], dim=1).float()

                    borderRegFig = None
                else:
                    borderRegFig = None

                if viewRandomSample:
                    # pole, traffic light, traffic sign, person, rider, car,
                    # truck, bus, train, motorcycle, bicycle
                    foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18]
                    suppressType = [255]  # Suppress no-label lines

                    foreGroundMask = torch.ones(dispMap.shape).cuda().byte()
                    suppresMask = torch.ones(dispMap.shape).cuda().byte()

                    with torch.no_grad():
                        for m in foregroundType:
                            foreGroundMask = foreGroundMask * (inputs['seman_gt'] != m)
                        foreGroundMask = 1 - foreGroundMask
                        for m in suppressType:
                            suppresMask = suppresMask * (inputs['seman_gt'] != m)
                        suppresMask = 1 - suppresMask
                        suppresMask = suppresMask.float()
                        foreGroundMask = foreGroundMask.float()

                    rdSampleOnBorder.visualize_randomSample(
                        dispMap, foreGroundMask, suppresMask, viewIndex=index)

                if viewEdgeMerge:
                    grad_disp = comp1dgrad(outputs[('mul_disp', 0)])
                    fig_grad = tensor2disp(grad_disp, ind=index, vmax=1)
                    fig_grad = fig_grad.resize([512, 256])

                if viewSelfOcclu:
                    fl = inputs[("K", 0)][:, 0, 0]
                    bs = torch.abs(inputs["stereo_T"][:, 0, 3])
                    clufig, suppressedDisp = selfclu.visualize(dispMap,
                                                               viewind=index)

                if fig_grad is not None:
                    grad_seman = (
                        np.array(fig_grad)[:, :, 0:3].astype(np.float64) * 0.7
                        + np.array(fig_seman).astype(np.float64) * 0.3
                    ).astype(np.uint8)
                    combined = [grad_seman, np.array(fig_disp)[:, :, 0:3],
                                np.array(fig_rgb)]
                    combined = np.concatenate(combined, axis=1)
                else:
                    if viewSurfaceNormal and viewSelfOcclu:
                        surnorm = surnorm.resize([512, 256])
                        surnorm_mixed = pil.fromarray(
                            (np.array(surnorm) * 0.2
                             + np.array(fig_disp)[:, :, 0:3] * 0.8
                             ).astype(np.uint8))
                        disp_seman = (
                            np.array(fig_disp)[:, :, 0:3].astype(np.float64) * 0.8
                            + np.array(fig_seman).astype(np.float64) * 0.2
                        ).astype(np.uint8)
                        suppressed_disp_seman = (
                            np.array(suppressedDisp)[:, :, 0:3].astype(np.float64) * 0.8
                            + np.array(fig_seman).astype(np.float64) * 0.2
                        ).astype(np.uint8)
                        rgb_seman = (
                            np.array(fig_seman).astype(np.float64) * 0.5
                            + np.array(fig_rgb).astype(np.float64) * 0.5
                        ).astype(np.uint8)

                        comb1 = np.concatenate(
                            [np.array(suppressed_disp_seman)[:, :, 0:3],
                             np.array(suppressedDisp)[:, :, 0:3]], axis=1)
                        comb2 = np.concatenate(
                            [np.array(disp_seman)[:, :, 0:3],
                             np.array(fig_disp)[:, :, 0:3]], axis=1)
                        comb3 = np.concatenate(
                            [np.array(surnorm_mixed)[:, :, 0:3],
                             np.array(surnorm)[:, :, 0:3]], axis=1)
                        comb4 = np.concatenate(
                            [np.array(fig_seman)[:, :, 0:3],
                             np.array(rgb_seman)[:, :, 0:3]], axis=1)
                        comb6 = np.concatenate(
                            [np.array(clufig)[:, :, 0:3],
                             np.array(fig_dispup)[:, :, 0:3]], axis=1)

                        fig3dsize = np.ceil(np.array(
                            [comb4.shape[1],
                             comb4.shape[1] / fig_3d.size[0] * fig_3d.size[1]]
                        )).astype(int)
                        comb5 = np.array(fig_3d.resize(fig3dsize))

                        combined = np.concatenate([comb1, comb2, comb4, comb3],
                                                  axis=0)
                    else:
                        disp_seman = (
                            np.array(fig_disp)[:, :, 0:3].astype(np.float64) * 0.8
                            + np.array(fig_seman).astype(np.float64) * 0.2
                        ).astype(np.uint8)
                        rgb_seman = (
                            np.array(fig_seman).astype(np.float64) * 0.5
                            + np.array(fig_rgb).astype(np.float64) * 0.5
                        ).astype(np.uint8)
                        combined = [np.array(disp_seman)[:, :, 0:3],
                                    np.array(fig_disp)[:, :, 0:3],
                                    np.array(fig_seman),
                                    np.array(rgb_seman)]
                        combined = np.concatenate(combined, axis=1)

                fig = pil.fromarray(combined)
                fig.save(os.path.join(dirpath, str(idx) + '.png'))
                if borderRegFig is not None:
                    borderRegFig.save(
                        os.path.join(dirpath, str(idx) + '_borderRegress.png'))

                print("%dth saved" % idx)

    # If computing the histogram
    if isHist:
        svPath = '/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/mul_channel_depth'
        carId = 13
        prob = copy.deepcopy(rec)
        ind = np.arange(prob.shape[1] * 2)
        for i in range(prob.shape[0]):
            prob[i, :] = prob[i, :] / np.sum(prob[i, :])
        for i in range(prob.shape[0]):
            trainStr = trainId2label[i][0]
            fig, ax = plt.subplots()
            rects1 = ax.bar(ind[0::2], prob[carId, :], label='obj:car')
            rects2 = ax.bar(ind[1::2], prob[i, :], label='obj:' + trainStr)
            ax.set_ylabel('Meter in percentile')
            ax.set_xlabel('Meters')
            ax.set_title('Scale Changes between scale car and scale %s'
                         % trainStr)
            ax.legend()
            plt.savefig(os.path.join(svPath, str(i)), dpi=200)
            plt.close(fig)
def main():
    global args
    checkpoint = None
    is_eval = False

    if args.evaluate:
        args_new = args
        if os.path.isfile(args.evaluate):
            print("=> loading checkpoint '{}' ... ".format(args.evaluate),
                  end='')
            checkpoint = torch.load(args.evaluate, map_location=device)
            args = checkpoint['args']
            args.data_folder = args_new.data_folder
            args.val = args_new.val
            is_eval = True
            print("Completed.")
        else:
            print("No model found at '{}'".format(args.evaluate))
            return
    elif args.resume:  # optionally resume from a checkpoint
        args_new = args
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}' ... ".format(args.resume),
                  end='')
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch'] + 1
            args.data_folder = args_new.data_folder
            args.val = args_new.val
            print("Completed. Resuming from epoch {}.".format(
                checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))
            return

    # model
    print("=> creating model and optimizer ... ", end='')
    parameters_to_train = []

    encoder = networks.ResnetEncoder(num_layers=18)
    encoder.to(device)
    parameters_to_train += list(encoder.parameters())

    decoder = networks.DepthDecoder(encoder.num_ch_enc)
    decoder.to(device)
    parameters_to_train += list(decoder.parameters())

    optimizer = torch.optim.Adam(parameters_to_train, lr=args.lr,
                                 weight_decay=args.weight_decay)

    encoder = torch.nn.DataParallel(encoder)
    decoder = torch.nn.DataParallel(decoder)
    model = [encoder, decoder]
    print("completed.")

    # if checkpoint is not None:
    #     model.load_state_dict(checkpoint['model'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])
    #     print("=> checkpoint state loaded.")

    # Data loading code
    print("=> creating data loaders ... ")
    if not is_eval:
        train_dataset = KittiDepth('train', args)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size, shuffle=True,
            num_workers=args.workers, pin_memory=True, sampler=None)
        print("\t==> train_loader size:{}".format(len(train_loader)))

    val_dataset = KittiDepth('val', args)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=12, shuffle=False, num_workers=2,
        pin_memory=True)
    print("\t==> val_loader size:{}".format(len(val_loader)))

    # create backups and results folder
    logger = helper.logger(args)
    # if checkpoint is not None:
    #     logger.best_result = checkpoint['best_result']
    print("=> logger created.")

    if is_eval:
        print("=> starting model evaluation ...")
        result, is_best = iterate("val", args, val_loader, model, None,
                                  logger, checkpoint['epoch'])
        return

    # main loop
    print("=> starting main loop ...")
    for epoch in range(args.start_epoch, args.epochs):
        print("=> starting training epoch {} ..".format(epoch))
        # train for one epoch
        iterate("train", args, train_loader, model, optimizer, logger, epoch)
        # evaluate on validation set
        result, is_best = iterate("val", args, val_loader, model, None,
                                  logger, epoch)
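# The commented-out restore above fails because `model` here is a plain
# [encoder, decoder] list, not a single nn.Module. A per-module restore would
# look like the sketch below; the checkpoint keys 'encoder' and 'decoder' are
# assumptions, not confirmed by this repository:
#
#   if checkpoint is not None:
#       encoder.load_state_dict(checkpoint['encoder'])
#       decoder.load_state_dict(checkpoint['decoder'])
#       optimizer.load_state_dict(checkpoint['optimizer'])
#       print("=> checkpoint state loaded.")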
def main(args):
    """Function to predict for a single image or folder of images
    """
    print(args.dataset_path)
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    #download_model_if_doesnt_exist(args.model_path, args.model_name)
    model_path = Path(args.model_path) / args.model_name
    if not model_path.exists():
        print("{} does not exist".format(model_path))
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    #1 LOADING PRETRAINED MODEL
    #1.1 encoder
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    #1.2 decoder
    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    #2. FINDING INPUT IMAGES
    dataset_path = Path(args.dataset_path)

    #files
    root = Path(os.path.dirname(__file__))
    txt = root / 'splits' / args.split / args.txt_files
    print('-> inference file: ', txt)
    rel_paths = readlines(txt)

    #out
    if args.out_path is not None:
        out_path = Path(args.out_path)
    else:
        out_path = Path('./' + dataset_path.stem + '_out')
    out_path.mkdir_p()

    files = []
    #rel_paths 2 paths
    if args.split in ['custom', 'custom_lite', 'eigen', 'eigen_zhou']:  #kitti
        for item in rel_paths:
            item = item.split(' ')
            if item[2] == 'l':
                camera = 'image_02'
            elif item[2] == 'r':
                camera = 'image_01'
            files.append(dataset_path / item[0] / camera / 'data' /
                         "{:010d}.png".format(int(item[1])))
    elif args.split == 'mc':
        for item in rel_paths:
            #item = item.split('/')
            files.append(item)
    elif args.split in ['visdrone', 'visdrone_lite']:
        # was `args.split == 'visdrone' or 'visdrone_lite'`, which is always true
        for item in rel_paths:
            item = item.split('/')
            files.append(dataset_path / item[0] / item[1] + '.jpg')
    else:
        for item in rel_paths:
            item = item.split('/')
            files.append(dataset_path / item[0] / item[1] + '.jpg')

    cnt = 0

    #3. PREDICTING ON EACH IMAGE IN TURN
    print('\n-> inference ' + args.dataset_path)
    files.sort()
    for image_path in tqdm(files):
        # Load image and preprocess
        if args.split == 'mc':
            input_image = pil.open(dataset_path / image_path +
                                   '.png').convert('RGB')
        else:
            input_image = pil.open(image_path).convert('RGB')
        original_width, original_height = input_image.size
        input_image = input_image.resize((feed_width, feed_height),
                                         pil.LANCZOS)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)

        # PREDICTION
        input_image = input_image.to(device)  # torch.Size([1, 3, 192, 640])
        features = encoder(input_image)  # a list from 0 to 4
        outputs = depth_decoder(features)  # dict with 4 disp tensors
        cnt += 1
        disp = outputs[("disp", 0)]  # same size as the input
        disp_resized = torch.nn.functional.interpolate(
            disp, (original_height, original_width),
            mode="bilinear",
            align_corners=False)

        # Build the output file name
        if args.split == 'eigen' or args.split == 'custom':
            output_name = str(image_path).split('/')[-4] + '_{}'.format(
                image_path.stem)
        elif args.split == 'mc':
            block, p, color, frame = image_path.split('/')
            # note: do not append '.png' here; the save calls below add it
            output_name = str(image_path).replace('/', '_')
        elif args.split == 'visdrone' or args.split == 'visdrone_lite':
            # str.strip('.jpg') strips characters, not the suffix; use splitext
            output_name = os.path.splitext(
                image_path.relpath(dataset_path))[0].replace('/', '_')
        elif args.split == 'custom_mono':
            output_name = os.path.splitext(
                image_path.relpath(dataset_path))[0].replace('/', '_')
        else:
            output_name = os.path.splitext(
                image_path.relpath(dataset_path))[0].replace('/', '_')

        # Saving numpy file
        if args.npy_out:
            name_dest_npy = os.path.join(out_path,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

        # Saving colormapped depth image
        disp_resized_np = disp_resized.squeeze().cpu().numpy()
        vmax = np.percentile(disp_resized_np, 95)
        name_dest_im = Path(out_path) / "{}.png".format(output_name)
        plt.imsave(name_dest_im, disp_resized_np, cmap='magma', vmax=vmax)

    print(cnt)
    print('\n-> Done, saved to {}'.format(out_path))
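# For reference, the disp_to_depth used throughout these scripts follows the
# monodepth2 convention: the network's sigmoid output is mapped linearly onto
# the disparity range [1/max_depth, 1/min_depth] and then inverted.
def disp_to_depth(disp, min_depth, max_depth):
    """Convert the network's sigmoid output into depth in a calibrated range."""
    min_disp = 1 / max_depth
    max_disp = 1 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth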
def evaluate(opt): """Evaluates a pretrained model using a specified test set """ MIN_DEPTH = 1e-3 MAX_DEPTH = 80 selected_frame = 100 K = np.array([[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32) assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \ "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo" if opt.ext_disp_to_eval is None: opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) print("-> Loading weights from {}".format(opt.load_weights_folder)) sequence_id = 0 filenames = readlines(os.path.join(os.path.dirname(__file__), "splits", "odom", "test_files_{:02d}.txt".format(sequence_id))) encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) dataset = datasets.KITTIOdomDataset( opt.data_path, filenames, opt.height, opt.width, [0, 1], 4, is_train=False) dataloader = DataLoader( dataset, 16, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) encoder = networks.ResnetEncoder(opt.num_layers, False) depth_decoder = networks.DepthDecoder(encoder.num_ch_enc) model_dict = encoder.state_dict() encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() pred_disps = [] print("-> Computing predictions with size {}x{}".format( encoder_dict['width'], encoder_dict['height'])) with torch.no_grad(): for data in dataloader: input_color = data[("color", 0, 0)].cuda() if opt.post_process: # Post-processed results require each image to have two forward passes input_color = torch.cat((input_color, torch.flip(input_color, [3])), 0) output = depth_decoder(encoder(input_color)) pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth) pred_disp = pred_disp.cpu()[:, 0].numpy() if opt.post_process: N = pred_disp.shape[0] // 2 pred_disp = batch_post_process_disparity(pred_disp[:N], pred_disp[N:, :, ::-1]) pred_disps.append(pred_disp) pred_disps = np.concatenate(pred_disps) else: # Load predictions from file print("-> Loading predictions from {}".format(opt.ext_disp_to_eval)) pred_disps = np.load(opt.ext_disp_to_eval) if opt.eval_eigen_to_benchmark: eigen_to_benchmark_ids = np.load( os.path.join(splits_dir, "benchmark", "eigen_to_benchmark_ids.npy")) pred_disps = pred_disps[eigen_to_benchmark_ids] if opt.eval_object: object_masks = [] for line in filenames: line = line.split() folder, frame_index = line[0], int(line[1]) object_mask_filename = os.path.join( os.path.dirname(__file__), "object_masks", folder, "{:010d}.npy".format(int(frame_index))) object_mask = np.load(object_mask_filename) object_masks.append(object_mask) if opt.save_pred_disps: output_path = os.path.join( opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split)) print("-> Saving predicted disparities to ", output_path) np.save(output_path, pred_disps) if opt.no_eval: print("-> Evaluation disabled. 
Done.") quit() elif opt.eval_split == 'benchmark': save_dir = os.path.join(opt.load_weights_folder, "benchmark_predictions") print("-> Saving out benchmark predictions to {}".format(save_dir)) if not os.path.exists(save_dir): os.makedirs(save_dir) for idx in range(len(pred_disps)): disp_resized = cv2.resize(pred_disps[idx], (1216, 352)) depth = STEREO_SCALE_FACTOR / disp_resized depth = np.clip(depth, 0, 80) depth = np.uint16(depth * 256) save_path = os.path.join(save_dir, "{:010d}.png".format(idx)) cv2.imwrite(save_path, depth) print("-> No ground truth is available for the KITTI benchmark, so not evaluating. Done.") quit() gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths_odom_00.npz") gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1', allow_pickle=True)["data"] pred_poses = np.load('pred_poses_T.npy') norms_divs = np.load('gt_norms_div00.npy') scales_dgc = np.load('ratios_of_odom.npy') ''' gt_poses_path = os.path.join(opt.data_path, "poses", "{:02d}.txt".format(sequence_id)) gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4) gt_global_poses = np.concatenate( (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1) gt_global_poses[:, 3, 3] = 1 gt_xyzs = gt_global_poses[:, :3, 3] gt_local_poses = [] for i in range(1, len(gt_global_poses)): gt_local_poses.append( np.linalg.inv(np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i]))) ''' print("-> Evaluating") if opt.eval_stereo: print(" Stereo evaluation - " "disabling median scaling, scaling by {}".format(STEREO_SCALE_FACTOR)) opt.scaling = "disable" opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR else: print(" Mono evaluation - using median scaling") errors = [] ratios = [] ex_logs = [] mean_scale = [] side_map = {"2": 2, "3": 3, "l": 2, "r": 3} #resize_ori = transforms.Resize((pred_disps.shape[1],pred_disps.shape[2]),interpolation=Image.ANTIALIAS) for i in range(pred_disps.shape[0]): gt_depth = gt_depths[i] gt_height, gt_width = gt_depth.shape[:2] line = filenames[i].split() folder = line[0] frame_index = line[1] side = side_map[line[2]] color = pil_loader(get_image_path(folder,int(frame_index),side)) if i==selected_frame: color_grad = compute_grad(color) color_next = pil_loader(get_image_path(folder,int(frame_index)+1,side)) pred_disp = pred_disps[i] pred_disp = cv2.resize(pred_disp, (gt_width, gt_height)) pred_depth = 1 / pred_disp if opt.eval_split == "eigen": mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH) crop = np.array( [0.40810811 * gt_height, 0.99189189 * gt_height, 0.03594771 * gt_width, 0.96405229 * gt_width]).astype(np.int32) crop_mask = np.zeros(mask.shape) crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1 mask = np.logical_and(mask, crop_mask) if opt.eval_object: object_mask = object_masks[i].astype(np.bool) else: mask = gt_depth > 0 if opt.scaling == "gt": ratio = np.median(gt_depth[mask]) / np.median(pred_depth[mask]) if opt.eval_object: mask = np.logical_and(mask, object_mask) elif opt.scaling == "dgc": scale_recovery = ScaleRecovery(1, gt_height, gt_width, K).cuda() #scale_recovery = ScaleRecovery(1, 192, 640, K).cuda() pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda() ratio1,surface_normal1,ground_mask1,_,_,_,_ = scale_recovery(pred_depth) ratio = ratio1.cpu().item() surface_normal = surface_normal1.cpu()[0,:,:,:].numpy() ground_mask = ground_mask1.cpu()[0,0,:,:].numpy() pred_depth = pred_depth[0].cpu().numpy() else: ratio = 1 #print(ratio) #print(max(pred_depth)) #print(min(pred_depth)) if i==selected_frame: cords = 
find_cord(color_grad, mask) selected_points(cords, color, i) min_gt = gt_depth[cords[0][0]][cords[0][1]] max_gt = gt_depth[cords[1][0]][cords[1][1]] median_gt = gt_depth[cords[2][0]][cords[2][1]] print("min max median gt depths are", min_gt, max_gt, median_gt) to_tensor = transforms.ToTensor() color_tens = to_tensor(color) color_tens_next = to_tensor(color_next).unsqueeze(0) pred_pose = pred_poses[i] norms_div = norms_divs[i] scale_dgc = scales_dgc[i] pred_pose_tens = torch.from_numpy(pred_pose).unsqueeze(0).cuda() t_norm = np.linalg.norm(pred_pose[:3, 3]) print("gt_depth of min max median divided by norms of translation and scale of norm", min_gt/(t_norm*norms_div), max_gt/(t_norm*norms_div), median_gt/(t_norm*norms_div)) print("gt_depth of min max median divided by norms of translation and scale of dgc", min_gt/(t_norm*scale_dgc), max_gt/(t_norm*scale_dgc), median_gt/(t_norm*scale_dgc)) depth_tens = torch.from_numpy(pred_depth).unsqueeze(0).cuda() project_3d = Project3D(1, gt_height, gt_width).cuda() backproject_depth = BackprojectDepth(1, gt_height, gt_width).cuda() K_tens = torch.from_numpy(K).unsqueeze(0).cuda() inv_K = np.linalg.pinv(K) inv_K = torch.from_numpy(inv_K).unsqueeze(0).cuda() cam_points = backproject_depth(depth_tens, inv_K,torch.from_numpy(cords[2]).cuda()) pix_coords = np.array(project_3d(cam_points, K_tens, pred_pose_tens)) #print(pix_coords.shape) #pix_coords = pix_coords[0,:,:,:] l1_losses = [] ssim_losses = [] reprojection_losses = [] for pix_coord in pix_coords: pix_coord_tens = torch.from_numpy(pix_coord).unsqueeze(0) pred = F.grid_sample(color_tens_next, pix_coord_tens, padding_mode="border") l1_loss, ssim_loss, reprojection_loss = compute_reprojection_loss(pred, color_tens.unsqueeze(0),cords[2]) l1_losses.append(l1_loss) ssim_losses.append(ssim_loss) reprojection_losses.append(reprojection_loss) min_loss_pixel_index = np.argmin(reprojection_losses) visual_reprojection(color,cords[2],pix_coords[min_loss_pixel_index,cords[2,0],cords[2,1]],selected_frame) pred_depth_ori = pred_depth*mask gt_depth_ori = gt_depth*mask pred_depth_ori = np.where(mask==1,pred_depth_ori,1) pred_depth = pred_depth[mask] gt_depth = gt_depth[mask] mean_scale.append(np.mean(gt_depth/pred_depth)) ''' mu = np.mean(div_values1) sigma = np.std(div_values1) #print(min(div_values1),max(div_values1)) fig,ax=plt.subplots() n, bins, patches = ax.hist(div_values1,150,range=(3,130),density = True) y = norm.pdf(bins, mu, 0.8*sigma) ax.plot(bins, y, 'r') plt.xlabel('Scale') plt.ylabel('Density') plt.savefig(os.path.join(os.path.dirname(__file__), "hist_imgs2","{:010d}.jpg".format(i))) plt.close() #blend_img = blending_imgs(div_scale, color,i) #blend_img.save(os.path.join(os.path.dirname(__file__), "blend_imgs","{:010d}.jpg".format(i))) blending_imgs(surface_normal,color,i,'surface_normals') blending_imgs(ground_mask,color,i,'ground_masks') ''' pred_depth *= ratio ratios.append(ratio) pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH #blending_imgs(div_scale, color,i,mask) if len(gt_depth) != 0: errors.append(compute_errors(gt_depth, pred_depth)) save_path = os.path.join(os.path.dirname(__file__), "l1_losses_{}.npy".format(selected_frame)) np.save(save_path, l1_losses) save_path = os.path.join(os.path.dirname(__file__), "ssim_losses_{}.npy".format(selected_frame)) np.save(save_path, ssim_losses) save_path = os.path.join(os.path.dirname(__file__), "reprojection_losses_{}.npy".format(selected_frame)) np.save(save_path, reprojection_losses) ratios = 
np.array(ratios) med = np.median(ratios) print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(med, np.std(ratios / med))) mean_errors = np.array(errors).mean(0) print("\n " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3")) print(("&{: 8.3f} " * 7).format(*mean_errors.tolist()) + "\\\\") print("\n-> Done!")
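# For reference, compute_errors used above is the standard seven-metric Eigen
# evaluation (the variant in this repo may accept extra flags); it assumes
# numpy as np and masked 1-D ground-truth and prediction arrays:
def compute_errors(gt, pred):
    thresh = np.maximum((gt / pred), (pred / gt))
    a1 = (thresh < 1.25).mean()
    a2 = (thresh < 1.25 ** 2).mean()
    a3 = (thresh < 1.25 ** 3).mean()
    rmse = np.sqrt(((gt - pred) ** 2).mean())
    rmse_log = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())
    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean(((gt - pred) ** 2) / gt)
    return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3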
def main_with_masks(args):
    """Function to predict for a single image or folder of images
    """
    print(args.dataset_path)
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    out_path = Path(args.out_path)
    out_path.mkdir_p()
    dirs = {}
    for mask in args.results:
        dirs[mask] = (out_path / mask)
        (out_path / mask).mkdir_p()

    print('-> split:{}'.format(args.split))
    print('-> save to {}'.format(args.out_path))

    if args.split in ['custom', 'custom_lite', 'eigen', 'eigen_zhou']:
        feed_height = 192
        feed_width = 640
        min_depth = 0.1
        max_depth = 80
        full_height = 375
        full_width = 1242
        dataset = KITTIRAWDataset
    elif args.split in ["visdrone", "visdrone_lite"]:
        feed_width = 352
        feed_height = 192
        min_depth = 0.1
        max_depth = 255
        dataset = VSDataset
    elif args.split in ['mc', 'mc_lite']:
        feed_height = 288
        feed_width = 384
        min_depth = 0.1
        max_depth = 255
        dataset = MCDataset
    # note: these two lines override the split-specific sizes chosen above
    feed_height = 192
    feed_width = 640

    backproject_depth = BackprojectDepth(1, feed_height, feed_width).to(device)
    project_3d = Project3D(1, feed_height, feed_width)
    photometric_error = PhotometricError()

    txt_files = args.txt_files
    #data
    test_path = Path(args.wk_root) / "splits" / args.split / txt_files
    test_filenames = readlines(test_path)
    if args.as_name_sort:  # sort by sequence name so frames come in order
        test_filenames.sort()

    # check filenames: drop entries whose neighbouring frames would fall
    # outside the sequence
    for i, item in enumerate(test_filenames):
        if args.split in ['eigen', 'custom', 'custom_lite', 'eigen_zhou']:
            dirname, frame, lr = test_filenames[i].split()
            files = (Path(args.dataset_path) / dirname /
                     'image_02/data').files()
            files.sort()
            first_idx = int(files[0].stem)
            last_idx = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= first_idx or \
                    int(frame) + args.frame_ids[-1] >= last_idx:
                test_filenames[i] = ''
        if args.split in ['mc', 'mc_lite']:
            # already handled when the split was generated, but checked again
            block, trajactory, color, frame = test_filenames[i].split('/')
            files = (Path(args.dataset_path) / block / trajactory /
                     color).files()
            files.sort()
            first_idx = int(files[0].stem)
            last_idx = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= first_idx or \
                    int(frame) + args.frame_ids[-1] >= last_idx:
                test_filenames[i] = ''
        if args.split in ['visdrone', 'visdrone_lite']:
            # already handled when the split was generated, but checked again
            dirname, frame = test_filenames[i].split('/')
            files = (Path(args.dataset_path) / dirname).files()
            files.sort()
            first_idx = int(files[0].stem)
            last_idx = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= first_idx or \
                    int(frame) + args.frame_ids[-1] >= last_idx:
                test_filenames[i] = ''
    while '' in test_filenames:
        test_filenames.remove('')

    test_dataset = dataset(  # e.g. KITTIRAWDataset
        args.dataset_path,
        test_filenames,
        feed_height,
        feed_width,
        args.frame_ids,
        1,
        is_train=False,
        img_ext=args.ext)
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        drop_last=False)
    print('->items num: {}'.format(len(test_loader)))

    #layers
    #download_model_if_doesnt_exist(args.model_path, args.model_name)
    model_path = Path(args.model_path) / args.model_name
    if not model_path.exists():
        print("{} does not exist".format(model_path))
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    #1 LOADING PRETRAINED MODEL
    #1.1 encoder
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    #1.2 decoder
    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    #paths
    pose_encoder_path = Path(model_path) / "pose_encoder.pth"
    pose_decoder_path = Path(model_path) / 'pose.pth'

    #2.1 pose encoder
    print("   Loading pretrained pose encoder")
    pose_encoder = networks.ResnetEncoder(18, False, 2)
    pose_encoder.load_state_dict(
        torch.load(pose_encoder_path, map_location=device))
    pose_encoder.to(device)
    pose_encoder.eval()

    #2.2 pose decoder (the original constructed and loaded it twice;
    # once is enough)
    print("   Loading pretrained pose decoder")
    pose_decoder = networks.PoseDecoder(num_ch_enc=pose_encoder.num_ch_enc,
                                        num_input_features=1,
                                        num_frames_to_predict_for=2)
    pose_loaded_dict = torch.load(pose_decoder_path, map_location=device)
    pose_decoder.load_state_dict(pose_loaded_dict)
    pose_decoder.to(device)
    pose_decoder.eval()

    source_scale = 0
    scale = 0
    for batch_idx, inputs in tqdm(enumerate(test_loader)):
        for key, ipt in inputs.items():
            inputs[key] = ipt.to(device)
        features = encoder(inputs[("color", 0, 0)])  # a list from 0 to 4
        outputs = depth_decoder(features)  # dict with 4 disp tensors
        disp = outputs[("disp", 0)]  # same size as the input
        #disp_resized = torch.nn.functional.interpolate(disp, (full_height, full_width), mode="bilinear", align_corners=False)
        _, depth = disp_to_depth(disp, min_depth, max_depth)

        for f_i in [args.frame_ids[0], args.frame_ids[-1]]:
            if f_i < 0:
                pose_inputs = [
                    inputs[("color", f_i, 0)], inputs[("color", 0, 0)]
                ]
            else:
                pose_inputs = [
                    inputs[("color", 0, 0)], inputs[("color", f_i, 0)]
                ]
            pose_inputs = torch.cat(pose_inputs, 1)
            features = pose_encoder(pose_inputs)
            axisangle, translation = pose_decoder([features])
            outputs[("cam_T_cam", 0, f_i)] = transformation_from_parameters(
                axisangle[:, 0], translation[:, 0], invert=(f_i < 0))  # b44

            T = outputs[("cam_T_cam", 0, f_i)]
            cam_points = backproject_depth(depth,
                                           inputs[("inv_K", 0)])  # D@K_inv
            pix_coords = project_3d(cam_points, inputs[("K", 0)],
                                    T)  # K@D@K_inv
            outputs[("sample", f_i, 0)] = pix_coords  # rigid flow
            outputs[("color", f_i, 0)] = F.grid_sample(
                inputs[("color", f_i, 0)],
                outputs[("sample", f_i, 0)],
                padding_mode="border")  # outputs["color"] is the warped frame i
            # add a depth warp
            outputs[("color_identity", f_i, 0)] = inputs[("color", f_i, 0)]

        target = inputs[("color", 0, 0)]
        reprojection_losses = []
        for frame_id in [args.frame_ids[0], args.frame_ids[-1]]:
            pred = outputs[("color", frame_id, 0)]
            reprojection_losses.append(photometric_error.run(pred, target))
        reprojection_losses = torch.cat(reprojection_losses, 1)

        identity_reprojection_losses = []
        for frame_id in [args.frame_ids[0], args.frame_ids[-1]]:
            pred = inputs[("color", frame_id, source_scale)]
            identity_reprojection_losses.append(
                photometric_error.run(pred, target))
        identity_reprojection_losses = torch.cat(identity_reprojection_losses,
                                                 1)

        erro_maps = torch.cat(
            (identity_reprojection_losses, reprojection_losses), dim=1)  # b4hw

        identical_mask = IdenticalMask(erro_maps)
        identical_mask = identical_mask[0].detach().cpu().numpy()
        save_name = test_filenames[batch_idx].replace('/', '_')
        save_name = save_name.replace('l', '')
save_name = save_name.replace('r', '') save_name = save_name.replace(' ', '') if "identical_mask" in args.results: plt.imsave(dirs['identical_mask'] / "{}.png".format(save_name), identical_mask) if "depth" in args.results: # Saving colormapped depth image disp_np = disp[0, 0].detach().cpu().numpy() vmax = np.percentile(disp_np, 95) plt.imsave(dirs['depth'] / "{}.png".format(save_name), disp_np, cmap='magma', vmax=vmax) if "mean_mask" in args.results: mean_mask = MeanMask(erro_maps) mean_mask = mean_mask[0].detach().cpu().numpy() plt.imsave(dirs['mean_mask'] / "{}.png".format(save_name), mean_mask, cmap='bone') if "identical_mask" in args.results: identical_mask = IdenticalMask(erro_maps) identical_mask = identical_mask[0].detach().cpu().numpy() plt.imsave(dirs['identical_mask'] / "{}.png".format(save_name), identical_mask, cmap='bone') if "var_mask" in args.results: var_mask = VarMask(erro_maps) var_mask = var_mask[0].detach().cpu().numpy() plt.imsave(dirs["var_mask"] / "{}.png".format(save_name), var_mask, cmap='bone') if "final_mask" in args.results: identical_mask = IdenticalMask(erro_maps) mean_mask = MeanMask(erro_maps) var_mask = VarMask(erro_maps) final_mask = float8or(mean_mask * identical_mask, var_mask) final_mask = final_mask[0].detach().cpu().numpy() plt.imsave(dirs["final_mask"] / "{}.png".format(save_name), final_mask, cmap='bone')
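# The mask helpers used above (IdenticalMask, MeanMask, VarMask, float8or)
# live elsewhere in the repo. As a rough, hypothetical sketch of the idea:
# erro_maps stacks [identity losses | reprojection losses] along dim 1, and
# the "identical" (auto-)mask keeps pixels where warping beats doing nothing:
#
#     def IdenticalMask(erro_maps):
#         n = erro_maps.shape[1] // 2
#         identity, reproj = erro_maps[:, :n], erro_maps[:, n:]
#         return (reproj.min(1, keepdim=True)[0] <
#                 identity.min(1, keepdim=True)[0]).float()
#
#     def float8or(a, b):
#         return torch.clamp(a + b, 0, 1)  # elementwise OR on {0,1} float masks
#
# The real implementations may differ; this only documents the intent.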
def depth_Estimation(args):
    model_name = args.model_name

    # Setting up the network
    print("Loading model....")
    download_model_if_doesnt_exist(model_name)
    encoder_path = os.path.join("models", model_name, "encoder.pth")
    depth_decoder_path = os.path.join("models", model_name, "depth.pth")

    # LOADING PRETRAINED MODEL
    encoder = networks.ResnetEncoder(18, False)
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
    depth_decoder.load_state_dict(loaded_dict)
    encoder.eval()
    depth_decoder.eval()

    # Loading image
    print("Loading image....")
    image_path = args.image_path
    input_image = pil.open(image_path).convert('RGB')
    original_width, original_height = input_image.size
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    input_image_resized = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
    input_image_pytorch = transforms.ToTensor()(
        input_image_resized).unsqueeze(0)

    # prediction of disparity image
    with torch.no_grad():
        features = encoder(input_image_pytorch)
        outputs = depth_decoder(features)
    disp = outputs[("disp", 0)]

    # Scaling for the given resolution: interpolate the disparity map back up
    # to the resolution of the input image
    disp_resized = torch.nn.functional.interpolate(
        disp, (original_height, original_width),
        mode="bilinear",
        align_corners=False)
    disp_resized_np = disp_resized.squeeze().cpu().numpy()  # tensor -> numpy
    print("resized disp " + str(disp_resized_np.shape))

    print("Range of depth in image")
    # map sigmoid disparity to depth in the range 0.1 to 1000 units
    scaled, dep = disp_to_depth(disp_resized_np, 0.1, 1000)
    print("min -> " + str(dep.min()) + "  max -> " + str(dep.max()))

    # Preview of the RGB and depth images
    rgb = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    depth = dep.reshape((rgb.shape[0], rgb.shape[1]), order='C')
    plot(rgb, depth)
    return rgb, depth
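# `plot` is defined elsewhere; a minimal stand-in that shows the RGB frame and
# the depth map side by side (hypothetical helper, matplotlib only):
def plot(rgb, depth):
    import matplotlib.pyplot as plt
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].imshow(rgb)
    axes[0].set_title("RGB")
    axes[0].axis("off")
    im = axes[1].imshow(depth, cmap='magma')
    axes[1].set_title("Depth")
    axes[1].axis("off")
    fig.colorbar(im, ax=axes[1], fraction=0.046)
    plt.show()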
def __init__(self, options): self.opt = options self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.STEREO_SCALE_FACTOR = 5.4 val_biases, val_ranges = get_disparity_class_range( self.opt.disparity_class_num, self.opt.min_depth, self.opt.max_depth, self.opt.batch_size, self.opt.height, self.opt.width) self.val_biases = val_biases self.val_ranges = val_ranges self.disparity_class_num = len(self.val_biases) self.models = {} self.parameters_to_train = [] self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) self.semanticCoeff = self.opt.semanticCoeff self.sfx = nn.Softmax(dim=1) assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.opt.frame_ids.append("s") self.models["encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained") self.models["encoder"].to(self.device) self.parameters_to_train += list(self.models["encoder"].parameters()) self.models["depth"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales, isSwitch=self.opt.switchMode == "on", num_depth_cat=self.disparity_class_num) self.models["depth"].to(self.device) self.parameters_to_train += list(self.models["depth"].parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) self.set_dataset() self.writers = {} for mode in ["train", "val"]: self.writers[mode] = SummaryWriter( os.path.join(self.log_path, mode)) if not self.opt.no_ssim: self.ssim = SSIM() self.ssim.to(self.device) self.set_layers() self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3" ] print("Using split:\n ", self.opt.split) print( "There are {:d} training items and {:d} validation items\n".format( self.train_num, self.val_num)) self.save_opts()
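# For reference, the SSIM module instantiated above is, in monodepth2-style
# code, a 3x3 average-pool approximation of SSIM returning a per-pixel loss
# in [0, 1]:
import torch
from torch import nn

class SSIM(nn.Module):
    """Layer to compute the SSIM loss between a pair of images"""
    def __init__(self):
        super(SSIM, self).__init__()
        self.mu_x_pool = nn.AvgPool2d(3, 1)
        self.mu_y_pool = nn.AvgPool2d(3, 1)
        self.sig_x_pool = nn.AvgPool2d(3, 1)
        self.sig_y_pool = nn.AvgPool2d(3, 1)
        self.sig_xy_pool = nn.AvgPool2d(3, 1)
        self.refl = nn.ReflectionPad2d(1)
        self.C1 = 0.01 ** 2
        self.C2 = 0.03 ** 2

    def forward(self, x, y):
        x = self.refl(x)
        y = self.refl(y)
        mu_x = self.mu_x_pool(x)
        mu_y = self.mu_y_pool(y)
        sigma_x = self.sig_x_pool(x ** 2) - mu_x ** 2
        sigma_y = self.sig_y_pool(y ** 2) - mu_y ** 2
        sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y
        SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)
        SSIM_d = (mu_x ** 2 + mu_y ** 2 + self.C1) * \
                 (sigma_x + sigma_y + self.C2)
        return torch.clamp((1 - SSIM_n / SSIM_d) / 2, 0, 1)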
def evaluate(opt): """Evaluates a pretrained model using a specified test set """ MIN_DEPTH = 1e-3 MAX_DEPTH = 80 height, width = 192, 640 assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \ "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo" opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) print("-> Loading weights from {}".format(opt.load_weights_folder)) encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) data_images = sorted(os.listdir(os.path.join(opt.data_path, 'image'))) config_file = "./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" cfg.merge_from_file(config_file) cfg.freeze() normalize_transform = transforms.Normalize(mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD) to_bgr_transform = transforms.Lambda(lambda x: x * 255) transform_simvodis = transforms.Compose([ # transforms.ToPILImage(), transforms.Resize((height * 2, width * 2)), transforms.ToTensor(), to_bgr_transform, normalize_transform, ]) maskrcnn_path = "./e2e_mask_rcnn_R_50_FPN_1x.pth" encoder = networks.ResnetEncoder(cfg, maskrcnn_path) depth_decoder = networks.DepthDecoder(scales=opt.scales) model_dict = encoder.state_dict() encoder.load_state_dict( {k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() pred_disps, depths_gt = [], [] print("-> Computing predictions with size {}x{}".format( encoder_dict['width'], encoder_dict['height'])) if 'RGBD' in opt.data_path: pairing = open(os.path.join(opt.data_path, 'association.txt')).readlines() pairing = [item.split()[1:4:2] for item in pairing] pairing = {item[0][4:]: item[1][6:] for item in pairing} with torch.no_grad(): for one_image in data_images: file_path_img = os.path.join(opt.data_path, 'image', one_image) img_mat = pil_loader(file_path_img) if '7Scenes' in file_path_img: img_mat = img_mat.crop((0, int( (480 - 192) / 2), 640, int((480 + 192) / 2))) depth_mat = cv2.imread( os.path.join(opt.data_path, 'depth', one_image.split('.')[0] + '.depth.png'), cv2.IMREAD_ANYDEPTH) depth_mat = depth_mat[int((480 - 192) / 2):int((480 + 192) / 2), :] mask = (depth_mat != 65535) depth_mat = (depth_mat * mask) / 1000 gt_depth = np.expand_dims(depth_mat, axis=0) depths_gt.append(gt_depth) elif 'Make3D' in file_path_img: img_mat = img_mat.crop((0, int( (2272 - 511) / 2), 1704, int((2272 + 511) / 2))) mat = scipy.io.loadmat( os.path.join( opt.data_path, "depth", "depth_sph_corr-{}.mat".format(one_image[4:-4]))) ratio = 4.4 depth_new_height = 55 / ratio gt_depth = mat["Position3DGrid"][:, :, 3][int( (55 - depth_new_height) / 2):int((55 + depth_new_height) / 2)] gt_depth = np.expand_dims(gt_depth, axis=0) depths_gt.append(gt_depth) elif 'RGBD' in file_path_img: img_mat = img_mat.crop((0, int( (480 - 192) / 2), 640, int((480 + 192) / 2))) depth_mat = cv2.imread( os.path.join(opt.data_path, 'depth', pairing[one_image]), cv2.IMREAD_ANYDEPTH) depth_mat = depth_mat[int((480 - 192) / 2):int((480 + 192) / 2), :] mask = (depth_mat != 65535) depth_mat = (depth_mat * mask) / 1000 gt_depth = np.expand_dims(depth_mat, axis=0) depths_gt.append(gt_depth) input_color = transform_simvodis(img_mat).cuda() input_color = input_color.unsqueeze(0) if opt.post_process: # Post-processed results 
require each image to have two forward passes input_color = torch.cat( (input_color, torch.flip(input_color, [3])), 0) output = depth_decoder(encoder(input_color)) pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth) pred_disp = pred_disp.cpu()[:, 0].numpy() if opt.post_process: N = pred_disp.shape[0] // 2 pred_disp = batch_post_process_disparity( pred_disp[:N], pred_disp[N:, :, ::-1]) pred_disps.append(pred_disp) pred_disps = np.concatenate(pred_disps) if opt.save_pred_disps: output_path = os.path.join(opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split)) print("-> Saving predicted disparities to ", output_path) np.save(output_path, pred_disps) if opt.no_eval: print("-> Evaluation disabled. Done.") quit() print("-> Evaluating") print(" Mono evaluation - using median scaling") errors = [] ratios = [] gt_depths = np.concatenate(depths_gt) for i in range(pred_disps.shape[0]): gt_depth = gt_depths[i] gt_height, gt_width = gt_depth.shape[:2] pred_disp = pred_disps[i] pred_disp = cv2.resize(pred_disp, (gt_width, gt_height)) pred_depth = 1 / pred_disp mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH) pred_depth = pred_depth[mask] gt_depth = gt_depth[mask] pred_depth *= opt.pred_depth_scale_factor if not opt.disable_median_scaling: ratio = np.median(gt_depth) / np.median(pred_depth) ratios.append(ratio) pred_depth *= ratio pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH errors.append(compute_errors(gt_depth, pred_depth)) if not opt.disable_median_scaling: ratios = np.array(ratios) med = np.median(ratios) print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format( med, np.std(ratios / med))) mean_errors = np.array(errors).mean(0) print("\n " + ("{:>8} | " * 7 ).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3")) print(("&{: 8.3f} " * 7).format(*mean_errors.tolist()) + "\\\\") print("\n-> Done!")
def test_simple(args): """Function to predict for a single image or folder of images """ assert args.model_name is not None, \ "You must specify the --model_name parameter; see README.md for an example" if torch.cuda.is_available() and not args.no_cuda: device = torch.device("cuda") else: device = torch.device("cpu") download_model_if_doesnt_exist(args.model_name) model_path = os.path.join("models", args.model_name) print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") # LOADING PRETRAINED MODEL print(" Loading pretrained encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width = loaded_dict_enc['width'] filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()} encoder.load_state_dict(filtered_dict_enc) encoder.to(device) encoder.eval() print(" Loading pretrained decoder") depth_decoder = networks.DepthDecoder( num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) depth_decoder.to(device) depth_decoder.eval() # FINDING INPUT IMAGES if os.path.isfile(args.image_path): # Only testing on a single image paths = [args.image_path] output_directory = os.path.dirname(args.image_path) elif os.path.isdir(args.image_path): # Searching folder for images paths = glob.glob(os.path.join(args.image_path, '*.{}'.format(args.ext))) output_directory = args.image_path else: raise Exception("Can not find args.image_path: {}".format(args.image_path)) camera_intrinsics_px = [1242*0.58, 375*1.92, 1242*0.5, 375*0.5] # See datasets/kitti_dataset.py # TODO: improve loading intrinsics from file ? print("-> Predicting on {:d} test images".format(len(paths))) # PREDICTING ON EACH IMAGE IN TURN with torch.no_grad(): for idx, image_path in enumerate(paths): if image_path.endswith("_disp.jpeg"): # don't try to predict disparity for a disparity image! continue # Load image and preprocess input_image_original = pil.open(image_path).convert('RGB') original_width, original_height = input_image_original.size input_image = input_image_original.resize((feed_width, feed_height), pil.LANCZOS) input_image = transforms.ToTensor()(input_image).unsqueeze(0) # PREDICTION input_image = input_image.to(device) features = encoder(input_image) outputs = depth_decoder(features) disp = outputs[("disp", 0)] disp_resized = torch.nn.functional.interpolate( disp, (original_height, original_width), mode="bilinear", align_corners=False) # Saving numpy file output_name = os.path.splitext(os.path.basename(image_path))[0] name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name)) scaled_disp, depth = disp_to_depth(disp, 0.1, 100) np.save(name_dest_npy, scaled_disp.cpu().numpy()) # Save PLY pointcloud from depth map depth_resized = torch.nn.functional.interpolate( depth, (original_height, original_width), mode="nearest") # !! 
do not interpolate depth values depth_resized_np = depth_resized.cpu().numpy()[0][0] nbPts = 0 plypoints = "" for v in range(0, original_height): for u in range(0, original_width): d = depth_resized_np[v][u] if d <= 0.0: continue r,g,b = input_image_original.getpixel((u,v)) x = d * (float(u) - camera_intrinsics_px[2]) / camera_intrinsics_px[0] y = d * (float(v) - camera_intrinsics_px[3]) / camera_intrinsics_px[1] z = d * 1.0; nbPts += 1 plypoints += str(x) + " " + str(y) + " " + str(z) + " " + str(r) + " " + str(g) + " " + str(b) + "\n" plyhead = "ply\n" plyhead += "format ascii 1.0\n" plyhead += "element vertex " + str(nbPts) + "\n" plyhead += "property float x\n" plyhead += "property float y\n" plyhead += "property float z\n" plyhead += "property uchar red\n" plyhead += "property uchar green\n" plyhead += "property uchar blue\n" plyhead += "end_header\n" filePly = open(os.path.join(output_directory, "{}_disp.ply".format(output_name)), "w+") filePly.write(plyhead + plypoints + "\n") filePly.close() # Saving colormapped depth image disp_resized_np = disp_resized.squeeze().cpu().numpy() vmax = np.percentile(disp_resized_np, 95) normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax) mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8) im = pil.fromarray(colormapped_im) name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name)) im.save(name_dest_im) print(" Processed {:d} of {:d} images - saved prediction to {}".format( idx + 1, len(paths), name_dest_im)) print('-> Done!')
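# The nested per-pixel loop above is very slow in Python; the same point cloud
# can be built with numpy in one shot. A sketch using the variables from the
# function above (depth_resized_np, camera_intrinsics_px, input_image_original):
#
#     fx, fy, cx, cy = camera_intrinsics_px
#     u, v = np.meshgrid(np.arange(original_width), np.arange(original_height))
#     z = depth_resized_np
#     valid = z > 0.0
#     x = z * (u - cx) / fx
#     y = z * (v - cy) / fy
#     pts = np.stack([x[valid], y[valid], z[valid]], axis=1)
#     colors = np.asarray(input_image_original)[valid]  # HxWx3 RGB
#
# pts and colors can then be written row by row into the same PLY layout.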
import os

import torch
from torchvision import transforms
import PIL.Image as pil

import networks
from utils import download_model_if_doesnt_exist

model_name = "mono_640x192"

download_model_if_doesnt_exist(model_name)
encoder_path = os.path.join("models", model_name, "encoder.pth")
depth_decoder_path = os.path.join("models", model_name, "depth.pth")

# LOADING PRETRAINED MODEL
encoder = networks.ResnetEncoder(18, False)
depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                      scales=range(4))

loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
filtered_dict_enc = {
    k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
}
encoder.load_state_dict(filtered_dict_enc)

loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
depth_decoder.load_state_dict(loaded_dict)

encoder.eval()
depth_decoder.eval()

# image_path = "assets/006656.png"
image_path = "../data_sample/000039.png"

input_image = pil.open(image_path).convert('RGB')
original_width, original_height = input_image.size
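# The snippet stops after loading the image; the prediction step that usually
# follows looks like this (same pattern as the test_simple functions in this
# file):
feed_height = loaded_dict_enc['height']
feed_width = loaded_dict_enc['width']
input_image_resized = input_image.resize((feed_width, feed_height),
                                         pil.LANCZOS)
input_tensor = transforms.ToTensor()(input_image_resized).unsqueeze(0)

with torch.no_grad():
    features = encoder(input_tensor)
    outputs = depth_decoder(features)

disp = outputs[("disp", 0)]
disp_resized = torch.nn.functional.interpolate(
    disp, (original_height, original_width),
    mode="bilinear",
    align_corners=False)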
def get_pred_disps(opt, split, dataset_choice, tmp_dir_path, out_dir): if opt.ext_disp_to_eval is None: opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) print("-> Loading weights from {}".format(opt.load_weights_folder)) #filenames = readlines(os.path.join(splits_dir, opt.eval_split, "test_files.txt")) train_or_val = {"train": "train_files.txt", "val": "val_files.txt"} filenames = sorted( readlines( os.path.join( splits_dir, opt.eval_split, train_or_val[split]))) # sorted facilitates our life encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) datasets_dict = { "kitti": datasets.KITTIRAWDataset, "kitti_odom": datasets.KITTIOdomDataset, "carla": datasets.CarlaDataset, "waymo": datasets.WaymoDataset, "mixed": datasets.MixedDataset } dataset = datasets_dict[dataset_choice](opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'], [0], 4, is_train=False) # dataset = datasets.carla_dataset.CarlaDataset(opt.data_path, filenames, # encoder_dict['height'], encoder_dict['width'], # [0], 4, is_train=False) dataloader = DataLoader( dataset, 1, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False ) # Changed batch from 16 to 1 (before was evaluating only total/16 it seems?) encoder = networks.ResnetEncoder(opt.num_layers, False) depth_decoder = networks.DepthDecoder(encoder.num_ch_enc) model_dict = encoder.state_dict() encoder.load_state_dict( {k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() pred_disps = [] print("-> Computing predictions with size {}x{}".format( encoder_dict['width'], encoder_dict['height'])) with torch.no_grad(): for frame_idx, data in enumerate(dataloader): if frame_idx % 100 == 0: print( f"Creating disparity frame {frame_idx}/{len(dataloader)}" ) input_color = data[("color", 0, 0)].cuda() if opt.post_process: # Post-processed results require each image to have two forward passes input_color = torch.cat( (input_color, torch.flip(input_color, [3])), 0) output = depth_decoder(encoder(input_color)) pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth) pred_disp = pred_disp.cpu()[:, 0].numpy() if opt.post_process: N = pred_disp.shape[0] // 2 pred_disp = batch_post_process_disparity( pred_disp[:N], pred_disp[N:, :, ::-1]) #pred_disps.append(pred_disp) frame_name = filenames[frame_idx] output_path = os.path.join(tmp_dir_path, f"{frame_name}.npy") np.save(output_path, pred_disp) # pred_disps = np.concatenate(pred_disps) #output_path = os.path.join(opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split)) # print("-> Saving predicted disparities to ", output_path) print(f"Saved predicted disparities to temporary dir {tmp_dir_path}") disparity_files = [ os.path.join(tmp_dir_path, x) for x in os.listdir(tmp_dir_path) ] disparity_files = sorted(disparity_files) return disparity_files, filenames
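# Usage sketch: iterating over the per-frame .npy files returned above. Note
# that frame_name comes straight from the split file, so it may contain '/'
# and spaces; np.save will fail unless those subdirectories exist (or the
# name is flattened first).
#
#     disparity_files, filenames = get_pred_disps(opt, "val", "kitti",
#                                                 tmp_dir_path, out_dir)
#     for disp_path, name in zip(disparity_files, filenames):
#         pred_disp = np.load(disp_path)[0]           # (H, W) disparity
#         pred_depth = 1.0 / np.maximum(pred_disp, 1e-6)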
def getMonoDepth(input_image): if torch.cuda.is_available(): device = torch.device("cuda") else: device = torch.device("cpu") loc = baseLoc + 'monodepth2/' model_path = os.path.join(loc + "models", 'mono+stereo_640x192') encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") # LOADING PRETRAINED MODEL encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width = loaded_dict_enc['width'] filtered_dict_enc = { k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict() } encoder.load_state_dict(filtered_dict_enc) encoder.to(device) encoder.eval() depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) depth_decoder.to(device) depth_decoder.eval() with torch.no_grad(): input_image = pil.fromarray(input_image) # input_image = pil.open(image_path).convert('RGB') original_width, original_height = input_image.size input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS) input_image = transforms.ToTensor()(input_image).unsqueeze(0) # PREDICTION input_image = input_image.to(device) features = encoder(input_image) outputs = depth_decoder(features) disp = outputs[("disp", 0)] disp_resized = torch.nn.functional.interpolate( disp, (original_height, original_width), mode="bilinear", align_corners=False) # Saving colormapped depth image disp_resized_np = disp_resized.squeeze().cpu().numpy() vmax = np.percentile(disp_resized_np, 95) vmin = disp_resized_np.min() disp_resized_np = vmin + (disp_resized_np - vmin) * (vmax - vmin) / ( disp_resized_np.max() - vmin) disp_resized_np = (255 * (disp_resized_np - vmin) / (vmax - vmin)).astype(np.uint8) colormapped_im = cv2.applyColorMap(disp_resized_np, cv2.COLORMAP_HOT) colormapped_im = cv2.cvtColor(colormapped_im, cv2.COLOR_BGR2RGB) # normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax) # mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') # colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8) return colormapped_im
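# Usage sketch, assuming `frame` is an HxWx3 RGB numpy array (the function
# resizes internally and returns an RGB colormapped visualisation):
#
#     depth_vis = getMonoDepth(frame)
#     cv2.imshow("depth", cv2.cvtColor(depth_vis, cv2.COLOR_RGB2BGR))
#     cv2.waitKey(1)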
def test_simple(args): """Function to predict for a single image or folder of images""" if torch.cuda.is_available() and not args.no_cuda: device = torch.device("cuda") else: device = torch.device("cpu") download_model_if_doesnt_exist(args.model_name) model_path = os.path.join("models", args.model_name) print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") # LOADING PRETRAINED MODEL print(" Loading pretrained encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc["height"] feed_width = loaded_dict_enc["width"] filtered_dict_enc = { k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict() } encoder.load_state_dict(filtered_dict_enc) encoder.to(device) encoder.eval() print(" Loading pretrained decoder") depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) depth_decoder.to(device) depth_decoder.eval() # FINDING INPUT IMAGES if os.path.isfile(args.image_path): # Only testing on a single image paths = [args.image_path] output_directory = (os.path.dirname(args.image_path) if not args.dump_path else args.dump_path) elif os.path.isdir(args.image_path): # Searching folder for images paths = glob.glob( os.path.join(args.image_path, "*.{}".format(args.ext))) output_directory = args.image_path if not args.dump_path else args.dump_path else: raise Exception("Can not find args.image_path: {}".format( args.image_path)) print("-> Predicting on {:d} test images".format(len(paths))) # PREDICTING ON EACH IMAGE IN TURN with torch.no_grad(): mse = 0 for idx, image_path in enumerate(paths): if image_path.endswith("_disp.jpg"): # don't try to predict disparity for a disparity image! 
continue # Load image and preprocess input_image = pil.open(image_path).convert("RGB") original_width, original_height = input_image.size input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS) input_image = transforms.ToTensor()(input_image).unsqueeze(0) # PREDICTION input_image = input_image.to(device) features = encoder(input_image) outputs = depth_decoder(features) disp = outputs[("disp", 0)] disp_resized = torch.nn.functional.interpolate( disp, (original_height, original_width), mode="bilinear", align_corners=False, ) # Saving numpy file output_name = os.path.splitext(os.path.basename(image_path))[0] name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name)) scaled_disp, _ = disp_to_depth(disp, 0.1, 100) np.save(name_dest_npy, scaled_disp.cpu().numpy()) # Saving colormapped depth image disp_resized_np = disp_resized.squeeze().cpu().numpy() vmax = np.percentile(disp_resized_np, 95) vmin = disp_resized_np.min() normalizer = mpl.colors.Normalize(vmin=vmin, vmax=vmax) mapper = cm.ScalarMappable(norm=normalizer, cmap="magma") colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8) im = pil.fromarray(colormapped_im) name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name)) im.save(name_dest_im) # Calc error correct_file = re.sub(r"\.\w+", "_depth.npy", image_path) if os.path.exists(correct_file): correct = np.load(correct_file)[:, :, 0] disp_np = disp_resized.cpu().detach().numpy() disp_np = disp_np[0, 0, :, :] correct = ((correct - correct.min()) / (correct.max() - correct.min()) * 255) disp_np = ((disp_np - disp_np.min()) / (disp_np.max() - disp_np.min()) * 255) mse = mse + ((correct - disp_np)**2).mean()**0.5 / 255 print(" Processed {:d} of {:d} images - saved prediction to {}". format(idx + 1, len(paths), name_dest_im)) print(f"mse: {mse}") print("-> Done!")
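# Note on the error computed above: min-max normalising both maps makes the
# score depend on each image's value range, and it compares disparity against
# depth. A more conventional, scale-invariant check would invert the disparity
# and median-scale it first (sketch):
#
#     pred_depth = 1.0 / np.maximum(disp_np, 1e-6)
#     pred_depth *= np.median(correct) / np.median(pred_depth)
#     rmse = np.sqrt(((correct - pred_depth) ** 2).mean())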
def test_cam(args): if torch.cuda.is_available() and not args.no_cuda: device = torch.device("cuda") else: device = torch.device("cpu") download_model_if_doesnt_exist(args.model_name) model_path = os.path.join("models", args.model_name) print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") # LOADING PRETRAINED MODEL print(" Loading pretrained encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # Extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width = loaded_dict_enc['width'] filtered_dict_enc = { k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict() } encoder.load_state_dict(filtered_dict_enc) encoder.to(device) encoder.eval() print(" Loading pretrained decoder") depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) depth_decoder.to(device) depth_decoder.eval() print("-> Loading complete, initializing the camera") # Initialize camera to capture image stream # Change the value to 0 when using default camera #video_stream = WebcamVideoStream(src=args.webcam).start() if not args.no_display: # Object to display images image_display = DisplayImage(not args.no_process) # Flag that records when 'q' is pressed to break out of inference loop below quit_inference = False def on_release(key): if key == keyboard.KeyCode.from_char('q'): nonlocal quit_inference quit_inference = True #s.close() return False keyboard.Listener(on_release=on_release).start() # Number of frames to capture to calculate fps num_frames = 5 curr_time = np.zeros(num_frames) with torch.no_grad(): print("Loop has started") host = "0.0.0.0" port = 5015 s = socket.socket() try: s.bind((host, port)) except socket.error as e: print(str(e)) print("Socket setup") connected = True bufferSize = 8192 #c, addr = s.accept() #print("Connected to :", addr[0], ":",addr[1]) first_loop = True connection_ready = False while True: if quit_inference: if args.no_display: print('-> Done') break if first_loop: frame = cv2.imread('assets/test_image.jpg') print("Read test image") first_loop = False elif not connection_ready: s.listen(10) c, addr = s.accept() print("Connected to: ", addr[0], ":", addr[1]) connection_ready = True continue else: try: data = c.recv(11) print("data as a string: " + str(data)) if (str(data).startswith('b\'SIZE')): tmp = str(data).split() bufferSize = int(tmp[1][:-1]) print("tmp[1] :" + str(tmp[1])) c.sendall("yes".encode()) data = bytearray(c.recv(bufferSize)) print(data) #else: # data = bytearray(data) + bytearray(c.recv(bufferSize)) #data = bytearray(c.recv(bufferSize)) print("Data") print(data) frame_np = np.asarray(data, dtype=np.uint8) print("frame_np") print(frame_np) frame = cv2.imdecode(frame_np, cv2.IMREAD_COLOR) print("frame") print(frame) # print(frame.shape) except socket.error as e: connected = False print("Connection lost, reconnecting") while not connected: try: c.bind(("0.0.0.0", port)) c.listen() c.accept() print("Reconnection worked") connected = True except socket.error as e: print(e) # Capture frame-by-frame #frame = video_stream.read() # frame = np.asarray(data, dtype =np.uint8) #PUT IN THE ACTUAL IMAGE RETRIEVAL HERE #print (type(frame)) # Calculate the fps print("Got frame") curr_time[1:] = curr_time[:-1] curr_time[0] 
= time.time()
            fps = num_frames / (curr_time[0] - curr_time[len(curr_time) - 1])

            # Our operations on the frame come here
            #fh = open("testfile.jpg", "wb")
            #fh.write(data)
            #fh.close()
            input_image = pil.fromarray(frame).convert('RGB')
            #input_image = pil.frombytes('RGB', len(data), data, 'raw')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            print("Prediction starting")
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)
            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="nearest")

            # Get the predicted depth
            scaled_disp, pred_depth = disp_to_depth(disp_resized, 0.1, 100)
            pred_depth_np = pred_depth.squeeze().cpu().detach().numpy()

            # Initialize a 3x4 depth map: average depth over each cell of a
            # 3-row x 4-column grid. Rows index the image height, columns the
            # width, so the horizontal offsets must use j and the vertical
            # ones i (the original had them swapped, which reads past the
            # bottom of the image).
            depth_map = np.zeros([3, 4])
            grid_width = original_width // 4
            grid_height = original_height // 3
            for i in range(len(depth_map)):
                for j in range(len(depth_map[0])):
                    depth_map[i][j] = get_avg_depth(pred_depth_np,
                                                    grid_width * j,
                                                    grid_height * i,
                                                    grid_width * (j + 1),
                                                    grid_height * (i + 1))

            # A simple decision logic
            if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1 or \
                    depth_map[0, 2] <= 1 or depth_map[1, 2] <= 1:
                if depth_map[1, 1] <= 1 and depth_map[1, 2] <= 1:
                    print("Dangerous!!! AHEAD")
                else:
                    if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1:
                        print("Dangerous!!! LEFT")
                    if depth_map[0, 2] <= 1 or depth_map[1, 2] <= 1:
                        print("Dangerous!!! RIGHT")
            # the original tested the right-hand region twice here; the first
            # term should cover the left columns to match the inner checks
            elif np.sum(depth_map[0:2, 0:1]) <= 7 or np.sum(
                    depth_map[0:2, 2:3]) <= 7:
                if np.sum(depth_map[0:2, 0:1]) <= 7:
                    print("Careful!! LEFT")
                if np.sum(depth_map[0:2, 2:3]) <= 7:
                    print("Careful!! RIGHT")
            else:
                print("Clear")

            if not args.no_display:
                # DISPLAY: generate color-mapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().detach().numpy()
                image_display.display(frame, disp_resized_np, fps,
                                      original_width, original_height,
                                      blended=not args.no_blend)
            else:
                print(f"FPS: {fps}")

            # if quit_inference:
            #     if args.no_display:
            #         print('-> Done')
            #     break

        # When everything is done, stop the camera stream. Only valid if the
        # webcam stream above is actually started; it is currently commented
        # out, so calling stop() here would raise a NameError.
        #video_stream.stop()
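# get_avg_depth is defined elsewhere; with the (x0, y0, x1, y1) argument order
# assumed above, a minimal version would be:
#
#     def get_avg_depth(depth, x0, y0, x1, y1):
#         return float(depth[y0:y1, x0:x1].mean())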
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        # the original assigned "cpu" on both branches, so the model never
        # actually ran on the GPU
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):
            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            _, depth = disp_to_depth(disp_resized, 0.1, 100)
            depth_to_3d = BackprojectDepth(1, original_height, original_width)
            K = np.array([[[879.03824732, 0, 613.17597314, 0],
                           [0, 879.03824732, 524.14407205, 0],
                           [0, 0, 1, 0],
                           [0, 0, 0, 1]]],
                         dtype=np.float32)
            # scale fx, cx (row 0) and fy, cy (row 1); the original wrote
            # K[:2, :], which on a (1, 4, 4) array divides the whole matrix,
            # including the homogeneous rows, by 4
            K[:, :2, :] = K[:, :2, :] / 4
            inv_K = np.linalg.pinv(K)
            inv_K = torch.from_numpy(inv_K)
            pointclouds = depth_to_3d(depth, inv_K)
            points_to_TV = ProjectTV(1, original_width, original_height,
                                     original_width, original_height, 2)
            top_view = points_to_TV(pointclouds)
            print(top_view.shape)
            print(top_view)
            #print(np.nonzero(top_view))
            import matplotlib.pyplot as plt
            plt.imshow(top_view[0].T)
            plt.savefig('foo1.png')
            #plt.show()
            #save_topview(top_view, 'tv_test')

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)
            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    print('-> Done!')
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:
        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        filenames = readlines(os.path.join(splits_dir, opt.eval_split, "test_files.txt"))
        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path)

        dataset = datasets.KITTIRAWDataset(opt.data_path, filenames,
                                           encoder_dict['height'], encoder_dict['width'],
                                           [0], 4, is_train=False,
                                           load_semantics=opt.load_semantics,
                                           seman_path=opt.seman_path)
        dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                                num_workers=opt.num_workers, drop_last=False)

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        if opt.bnMorphLoss:
            from bnmorph.bnmorph import BNMorph
            bnmorph = BNMorph(height=encoder_dict['height'],
                              width=encoder_dict['width']).cuda()
            if opt.post_process:
                tool = grad_computation_tools(batch_size=opt.batch_size * 2,
                                              height=encoder_dict['height'],
                                              width=encoder_dict['width']).cuda()
            else:
                tool = grad_computation_tools(batch_size=opt.batch_size,
                                              height=encoder_dict['height'],
                                              width=encoder_dict['width']).cuda()

        model_dict = encoder.state_dict()
        encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(torch.load(decoder_path))

        encoder.cuda()
        encoder.eval()
        depth_decoder.cuda()
        depth_decoder.eval()

        pred_disps = []

        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda()

                if opt.post_process:
                    # Post-processed results require each image to have two forward passes
                    input_color = torch.cat((input_color, torch.flip(input_color, [3])), 0)
                    if 'seman_gt' in data:
                        data['seman_gt'] = torch.cat(
                            (data['seman_gt'], torch.flip(data['seman_gt'], [3])), 0)

                features = encoder(input_color)
                outputs = dict()
                outputs.update(depth_decoder(features))

                if opt.bnMorphLoss:
                    for key, ipt in data.items():
                        if key not in ('height', 'width', 'tag', 'cts_meta', 'file_add'):
                            data[key] = ipt.to(torch.device("cuda"))
                    disparity_grad_bin = tool.get_disparityEdge(outputs['disp', 0])
                    semantics_grad_bin = tool.get_semanticsEdge(data['seman_gt'])

                    morphedx, morphedy, coeff = bnmorph.find_corresponding_pts(
                        disparity_grad_bin, semantics_grad_bin)
                    morphedx = (morphedx / (encoder_dict['width'] - 1) - 0.5) * 2
                    morphedy = (morphedy / (encoder_dict['height'] - 1) - 0.5) * 2
                    grid = torch.cat([morphedx, morphedy], dim=1).permute(0, 2, 3, 1)
                    dispMaps_morphed = F.grid_sample(outputs['disp', 0], grid,
                                                     padding_mode="border")
                    outputs[("disp", 0)] = dispMaps_morphed

                pred_disp, _ = disp_to_depth(outputs[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(pred_disp[:N],
                                                             pred_disp[N:, :, ::-1])

                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)

    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark", "eigen_to_benchmark_ids.npy"))
            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.save_pred_disps:
        output_path = os.path.join(opt.load_weights_folder,
                                   "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()
    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder, "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print("-> No ground truth is available for the KITTI benchmark, "
              "so not evaluating. Done.")
        quit()

    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz")
    gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(STEREO_SCALE_FACTOR))
        opt.disable_median_scaling = True
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if opt.eval_split == "eigen" or opt.UseCustTest:
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)

            crop = np.array([0.40810811 * gt_height, 0.99189189 * gt_height,
                             0.03594771 * gt_width, 0.96405229 * gt_width]).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)
        else:
            mask = gt_depth > 0

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= opt.pred_depth_scale_factor
        if not opt.disable_median_scaling:
            ratio = np.median(gt_depth) / np.median(pred_depth)
            ratios.append(ratio)
            pred_depth *= ratio

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        errors.append(compute_errors(gt_depth, pred_depth,
                                     UseGtMedianScaling=opt.UseGtMedianScaling))

    if not opt.disable_median_scaling:
        ratios = np.array(ratios)
        med = np.median(ratios)
        print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse",
                                           "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
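# For reference: compute_errors is imported from elsewhere in this repo. A
# minimal sketch of the standard KITTI depth metrics it is expected to return,
# assuming the usual monodepth-style definitions (the UseGtMedianScaling
# keyword above is project-specific and omitted here):
def compute_errors_sketch(gt, pred):
    import numpy as np
    thresh = np.maximum(gt / pred, pred / gt)
    a1 = (thresh < 1.25).mean()
    a2 = (thresh < 1.25 ** 2).mean()
    a3 = (thresh < 1.25 ** 3).mean()
    rmse = np.sqrt(((gt - pred) ** 2).mean())
    rmse_log = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())
    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean(((gt - pred) ** 2) / gt)
    return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3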
def test_simple(args): """Function to predict for a single image or folder of images """ assert args.model_name is not None, \ "You must specify the --model_name parameter; see README.md for an example" if torch.cuda.is_available() and not args.no_cuda: device = torch.device("cuda") else: device = torch.device("cpu") download_model_if_doesnt_exist(args.model_name) model_path = os.path.join("models", args.model_name) print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") # LOADING PRETRAINED MODEL print(" Loading pretrained encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width = loaded_dict_enc['width'] filtered_dict_enc = { k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict() } encoder.load_state_dict(filtered_dict_enc) encoder.to(device) encoder.eval() print(" Loading pretrained decoder") depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) depth_decoder.to(device) depth_decoder.eval() # FINDING INPUT IMAGES if os.path.isfile(args.image_path): # Only testing on a single image paths = [args.image_path] output_directory = os.path.dirname(args.image_path) elif os.path.isdir(args.image_path): # Searching folder for images paths = glob.glob( os.path.join(args.image_path, '*.{}'.format(args.ext))) output_directory = args.image_path else: raise Exception("Can not find args.image_path: {}".format( args.image_path)) print("-> Predicting on {:d} test images".format(len(paths))) # PREDICTING ON EACH IMAGE IN TURN with torch.no_grad(): for idx, image_path in enumerate(paths): if image_path.endswith("_disp.jpg"): # don't try to predict disparity for a disparity image! 
            continue

        # Load image and preprocess
        input_image = pil.open(image_path).convert('RGB')
        original_width, original_height = input_image.size
        input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)

        # PREDICTION
        input_image = input_image.to(device)
        features = encoder(input_image)
        outputs = depth_decoder(features)

        disp = outputs[("disp", 0)]
        disp_resized = torch.nn.functional.interpolate(
            disp, (original_height, original_width),
            mode="bilinear", align_corners=False)

        # Saving numpy file
        output_name = os.path.splitext(os.path.basename(image_path))[0]
        name_dest_npy = os.path.join(output_directory,
                                     "{}_disp.npy".format(output_name))
        scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
        np.save(name_dest_npy, scaled_disp.cpu().numpy())

        # Cluster the resized disparity map into object / background.
        # The map is passed as `width` columns of `height` values, with the
        # origin at the bottom left.
        disp_resized_np = disp_resized.squeeze().cpu().numpy()
        labels, pts = get_clusters(list(disp_resized_np))

        ptsOfInterest = [pt for label, pt in zip(labels, pts) if label != 1]
        backgroundPts = [pt for label, pt in zip(labels, pts) if label == 1]

        # Guard against an empty cluster so we never divide by zero
        label0Distance = np.mean(ptsOfInterest) if ptsOfInterest else float("nan")
        label1Distance = np.mean(backgroundPts) if backgroundPts else float("nan")

        # `scalingFactor` (metres per disparity unit) is assumed to be
        # defined elsewhere in this module
        print("Distance to object:", label0Distance * scalingFactor, "meters")
        print("Distance to background:", label1Distance * scalingFactor, "meters")

        # Saving colormapped depth image
        vmax = np.percentile(disp_resized_np, 95)
        normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
        mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
        colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
        im = pil.fromarray(colormapped_im)

        name_dest_im = os.path.join(output_directory,
                                    "{}_disp.jpeg".format(output_name))
        im.save(name_dest_im)
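# Hypothetical sketch: get_clusters is defined elsewhere in this project. One
# plausible implementation is a two-way k-means over the disparity values,
# returning one label per pixel plus the flattened values (the function name,
# shapes, and the cluster-1-is-background convention are assumptions).
def get_clusters_sketch(disp_columns):
    import numpy as np
    from sklearn.cluster import KMeans
    pts = np.concatenate([np.asarray(c).ravel() for c in disp_columns])
    labels = KMeans(n_clusters=2, n_init=10).fit_predict(pts.reshape(-1, 1))
    # relabel so that cluster 1 is the far background, i.e. the cluster
    # with the smaller mean disparity
    if pts[labels == 0].mean() < pts[labels == 1].mean():
        labels = 1 - labels
    return labels, pts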
def evaluate(opt): """Evaluates a pretrained model using a specified test set """ MIN_DEPTH = 1e-3 MAX_DEPTH = 80 K = np.array( [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32) assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \ "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo" if opt.ext_disp_to_eval is None: opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) print("-> Loading weights from {}".format(opt.load_weights_folder)) filenames = readlines( os.path.join(splits_dir, opt.eval_split, "test_files.txt")) encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) img_ext = '.png' if opt.png else '.jpg' dataset = datasets.KITTIRAWDataset(opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'], [0], 4, is_train=False, img_ext=img_ext) dataloader = DataLoader(dataset, 16, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) encoder = networks.ResnetEncoder(opt.num_layers, False) depth_decoder = networks.DepthDecoder(encoder.num_ch_enc) model_dict = encoder.state_dict() encoder.load_state_dict( {k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() pred_disps = [] print("-> Computing predictions with size {}x{}".format( encoder_dict['width'], encoder_dict['height'])) with torch.no_grad(): for data in dataloader: input_color = data[("color", 0, 0)].cuda() if opt.post_process: # Post-processed results require each image to have two forward passes input_color = torch.cat( (input_color, torch.flip(input_color, [3])), 0) output = depth_decoder(encoder(input_color)) pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth) pred_disp = pred_disp.cpu()[:, 0].numpy() if opt.post_process: N = pred_disp.shape[0] // 2 pred_disp = batch_post_process_disparity( pred_disp[:N], pred_disp[N:, :, ::-1]) pred_disps.append(pred_disp) pred_disps = np.concatenate(pred_disps) else: # Load predictions from file print("-> Loading predictions from {}".format(opt.ext_disp_to_eval)) pred_disps = np.load(opt.ext_disp_to_eval) if opt.eval_eigen_to_benchmark: eigen_to_benchmark_ids = np.load( os.path.join(splits_dir, "benchmark", "eigen_to_benchmark_ids.npy")) pred_disps = pred_disps[eigen_to_benchmark_ids] if opt.eval_object: object_masks = [] for line in filenames: line = line.split() folder, frame_index = line[0], int(line[1]) object_mask_filename = os.path.join( os.path.dirname(__file__), "object_masks", folder, "{:010d}.npy".format(int(frame_index))) object_mask = np.load(object_mask_filename) object_masks.append(object_mask) if opt.save_pred_disps: output_path = os.path.join(opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split)) print("-> Saving predicted disparities to ", output_path) np.save(output_path, pred_disps) if opt.no_eval: print("-> Evaluation disabled. 
Done.")
        quit()
    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder, "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print("-> No ground truth is available for the KITTI benchmark, "
              "so not evaluating. Done.")
        quit()

    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz")
    gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(STEREO_SCALE_FACTOR))
        opt.scaling = "disable"
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if opt.eval_split == "eigen":
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)

            crop = np.array([0.40810811 * gt_height, 0.99189189 * gt_height,
                             0.03594771 * gt_width, 0.96405229 * gt_width]).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)

            if opt.eval_object:
                # np.bool was removed in recent NumPy releases; use the
                # builtin bool instead
                object_mask = object_masks[i].astype(bool)
        else:
            mask = gt_depth > 0

        if opt.scaling == "gt":
            ratio = np.median(gt_depth[mask]) / np.median(pred_depth[mask])
            if opt.eval_object:
                mask = np.logical_and(mask, object_mask)
        elif opt.scaling == "dgc":
            tensor_K = K.copy()
            tensor_K[0, :] *= gt_width
            tensor_K[1, :] *= gt_height
            tensor_K = torch.from_numpy(tensor_K).unsqueeze(0).cuda()
            cam_height = torch.tensor([opt.cam_height]).cuda()
            scale_recovery = ScaleRecovery(1, gt_height, gt_width).cuda()

            pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
            ratio = scale_recovery(pred_depth, tensor_K, cam_height).cpu().item()
            pred_depth = pred_depth[0].cpu().numpy()
        else:
            ratio = 1

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= ratio
        ratios.append(ratio)

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        if len(gt_depth) != 0:
            errors.append(compute_errors(gt_depth, pred_depth))

    ratios = np.array(ratios)
    med = np.median(ratios)
    print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse",
                                           "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
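# For reference: the --post_process path above blends predictions from each
# image and its horizontal flip. A sketch of the usual monodepth-style
# batch_post_process_disparity (assumed to match this repo's copy):
def batch_post_process_disparity_sketch(l_disp, r_disp):
    """l_disp / r_disp: (B, H, W) disparities from the image and its flip."""
    import numpy as np
    _, h, w = l_disp.shape
    m_disp = 0.5 * (l_disp + r_disp)
    l, _ = np.meshgrid(np.linspace(0, 1, w), np.linspace(0, 1, h))
    # favour each prediction near its own non-occluded image border
    l_mask = (1.0 - np.clip(20 * (l - 0.05), 0, 1))[None, ...]
    r_mask = l_mask[:, :, ::-1]
    return r_mask * l_disp + l_mask * r_disp + (1.0 - l_mask - r_mask) * m_disp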
def __init__(self, options):
    self.opt = options
    self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

    # checking height and width are multiples of 32
    # (the default input size is 640x192)
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    self.models = {}
    self.parameters_to_train = []

    self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

    # scales used in the loss
    self.num_scales = len(self.opt.scales)
    # frame_ids defaults to [0, -1, 1]; id 0 is the target frame
    self.num_input_frames = len(self.opt.frame_ids)
    self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

    if self.opt.use_stereo:
        self.opt.frame_ids.append("s")

    # self.opt.num_layers selects the depth of the ResNet encoder
    # (ResNet-18 by default); the encoder outputs features at 5 scales
    self.models["encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, self.opt.weights_init == "pretrained")
    self.models["encoder"].to(self.device)
    self.parameters_to_train += list(self.models["encoder"].parameters())

    self.models["depth"] = networks.DepthDecoder(
        self.models["encoder"].num_ch_enc, self.opt.scales)
    self.models["depth"].to(self.device)
    self.parameters_to_train += list(self.models["depth"].parameters())

    # Three ways to build the pose network; the paper's supplementary
    # material compares them, and separate_resnet performs best, so it is
    # the default
    if self.use_pose_net:
        # separate_resnet shares no weights with the depth encoder.
        # The pose encoder stacks the two input images into 6 channels and
        # outputs one feature map; the pose decoder takes that feature map
        # and predicts two poses
        if self.opt.pose_model_type == "separate_resnet":
            self.models["pose_encoder"] = networks.ResnetEncoder(
                self.opt.num_layers,
                self.opt.weights_init == "pretrained",
                num_input_images=self.num_pose_frames)
            self.models["pose_encoder"].to(self.device)
            self.parameters_to_train += list(self.models["pose_encoder"].parameters())

            self.models["pose"] = networks.PoseDecoder(
                self.models["pose_encoder"].num_ch_enc,
                num_input_features=1,
                num_frames_to_predict_for=2)

        # shared reuses the depth encoder: each image is encoded separately
        # (Siamese style) and the decoder takes two feature maps and
        # outputs one pose
        elif self.opt.pose_model_type == "shared":
            self.models["pose"] = networks.PoseDecoder(
                self.models["encoder"].num_ch_enc, self.num_pose_frames)

        # posecnn is the architecture proposed in "Learning Depth from
        # Monocular Videos using Direct Methods",
        # https://arxiv.org/pdf/1712.00175.pdf
        elif self.opt.pose_model_type == "posecnn":
            self.models["pose"] = networks.PoseCNN(
                self.num_input_frames if self.opt.pose_model_input == "all" else 2)

        self.models["pose"].to(self.device)
        self.parameters_to_train += list(self.models["pose"].parameters())

    # this is the explainability-style mask from SfMLearner
    if self.opt.predictive_mask:
        assert self.opt.disable_automasking, \
            "When using predictive_mask, please disable automasking with --disable_automasking"

        # Our implementation of the predictive masking baseline has the same
        # architecture as our depth decoder. We predict a separate mask for
        # each source frame.
self.models["predictive_mask"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales, num_output_channels=(len(self.opt.frame_ids) - 1)) self.models["predictive_mask"].to(self.device) self.parameters_to_train += list( self.models["predictive_mask"].parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) # data datasets_dict = { "kitti": datasets.KITTIRAWDataset, "kitti_odom": datasets.KITTIOdomDataset } self.dataset = datasets_dict[self.opt.dataset] fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt") train_filenames = readlines(fpath.format("train")) val_filenames = readlines(fpath.format("val")) img_ext = '.png' if self.opt.png else '.jpg' num_train_samples = len(train_filenames) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset = self.dataset(self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=True, img_ext=img_ext) self.train_loader = DataLoader(train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) val_dataset = self.dataset(self.opt.data_path, val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=False, img_ext=img_ext) self.val_loader = DataLoader(val_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) self.writers = {} for mode in ["train", "val"]: self.writers[mode] = SummaryWriter( os.path.join(self.log_path, mode)) # if set, disables ssim in the loss if not self.opt.no_ssim: self.ssim = SSIM() self.ssim.to(self.device) self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.opt.height // (2**scale) w = self.opt.width // (2**scale) self.backproject_depth[scale] = BackprojectDepth( self.opt.batch_size, h, w) self.backproject_depth[scale].to(self.device) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) self.project_3d[scale].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3" ] print("Using split:\n ", self.opt.split) print( "There are {:d} training items and {:d} validation items\n".format( len(train_dataset), len(val_dataset))) # save options self.save_opts()
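# Illustrative sketch (assumed tensor shapes): the per-scale BackprojectDepth
# and Project3D modules built above are chained to warp a source image into
# the target view; this warped image is what the photometric loss compares
# against. The method name below is hypothetical, not this repo's API.
def warp_source_to_target_sketch(self, src_img, depth, T, K, inv_K, scale):
    import torch.nn.functional as F
    cam_points = self.backproject_depth[scale](depth, inv_K)  # (B, 4, H*W)
    pix_coords = self.project_3d[scale](cam_points, K, T)     # (B, H, W, 2)
    return F.grid_sample(src_img, pix_coords, padding_mode="border")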
def __init__(self, options):
    self.opt = options
    self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

    # checking height and width are multiples of 32
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    self.models = {}
    self.parameters_to_train = []

    self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

    self.num_scales = len(self.opt.scales)
    self.num_input_frames = len(self.opt.frame_ids)
    self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

    if self.opt.use_stereo:
        self.opt.frame_ids.append("s")

    self.models["encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, self.opt.weights_init == "pretrained")
    self.models["encoder"].to(self.device)
    self.parameters_to_train += list(self.models["encoder"].parameters())

    self.models["depth"] = networks.DepthDecoder(
        self.models["encoder"].num_ch_enc, self.opt.scales)
    self.models["depth"].to(self.device)
    self.parameters_to_train += list(self.models["depth"].parameters())

    if self.use_pose_net:
        if self.opt.pose_model_type == "separate_resnet":
            self.models["pose_encoder"] = networks.ResnetEncoder(
                self.opt.num_layers,
                self.opt.weights_init == "pretrained",
                num_input_images=self.num_pose_frames)
            self.models["pose_encoder"].to(self.device)
            self.parameters_to_train += list(self.models["pose_encoder"].parameters())

            self.models["pose"] = networks.PoseDecoder(
                self.models["pose_encoder"].num_ch_enc,
                num_input_features=1,
                num_frames_to_predict_for=2)

        elif self.opt.pose_model_type == "shared":
            self.models["pose"] = networks.PoseDecoder(
                self.models["encoder"].num_ch_enc, self.num_pose_frames)

        elif self.opt.pose_model_type == "posecnn":
            self.models["pose"] = networks.PoseCNN(
                self.num_input_frames if self.opt.pose_model_input == "all" else 2)

        self.models["pose"].to(self.device)
        self.parameters_to_train += list(self.models["pose"].parameters())

    if self.opt.predictive_mask:
        assert self.opt.disable_automasking, \
            "When using predictive_mask, please disable automasking with --disable_automasking"

        # Our implementation of the predictive masking baseline has the same
        # architecture as our depth decoder. We predict a separate mask for
        # each source frame.
self.models["predictive_mask"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales, num_output_channels=(len(self.opt.frame_ids) - 1)) self.models["predictive_mask"].to(self.device) self.parameters_to_train += list( self.models["predictive_mask"].parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) # data datasets_dict = { "kitti": datasets.KITTIRAWDataset, "kitti_odom": datasets.KITTIOdomDataset, "kitti_depth": datasets.KITTIDepthDataset } self.dataset = datasets_dict[self.opt.dataset] fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files_p.txt") train_filenames = readlines(fpath.format("train")) val_filenames = readlines(fpath.format("val")) img_ext = '.png' if self.opt.png else '.jpg' num_train_samples = len(train_filenames) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset = self.dataset(self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=True, img_ext=img_ext) self.train_loader = DataLoader(train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) val_dataset = self.dataset(self.opt.data_path, val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=False, img_ext=img_ext) self.val_loader = DataLoader(val_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) self.writers = {} for mode in ["train", "val"]: self.writers[mode] = SummaryWriter( os.path.join(self.log_path, mode)) if not self.opt.no_ssim: self.ssim = SSIM() self.ssim.to(self.device) self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.opt.height // (2**scale) w = self.opt.width // (2**scale) self.backproject_depth[scale] = BackprojectDepth( self.opt.batch_size, h, w) self.backproject_depth[scale].to(self.device) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) self.project_3d[scale].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3" ] print("Using split:\n ", self.opt.split) print( "There are {:d} training items and {:d} validation items\n".format( len(train_dataset), len(val_dataset))) self.save_opts()
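# Illustrative sketch: with self.ssim built above, the monodepth-style
# photometric reprojection loss mixes SSIM and L1 terms. The 0.85 / 0.15
# weights are the usual defaults; this is an assumption, check this repo's
# compute_losses for the exact formula.
def reprojection_loss_sketch(self, pred, target):
    import torch
    abs_diff = torch.abs(target - pred)
    l1_loss = abs_diff.mean(1, True)
    ssim_loss = self.ssim(pred, target).mean(1, True)
    return 0.85 * ssim_loss + 0.15 * l1_loss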
def test_simple(model_name, paths, val_iter_list, batch_it_num,
                backproject_depth_l, project_3d_l, sv_path_l):
    """Function to predict for a single image or folder of images
    """
    device = torch.device("cuda")
    model_path = model_name
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(paths, model_path, "encoder.pth")
    depth_decoder_path = os.path.join(paths, model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    # feed_height = loaded_dict_enc['height']
    # feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items()
                         if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Predicting on test images")

    # PREDICTING ON EACH IMAGE IN TURN
    disp_resized_np_list = list()
    source_scale = 0
    with torch.no_grad():
        for count, val_iter in enumerate(val_iter_list):
            backproject_depth = backproject_depth_l[count]
            project_3d = project_3d_l[count]
            svcount = 0
            for k in range(batch_it_num[count]):
                try:
                    # iterator.next() was removed in Python 3; use next()
                    inputs = next(val_iter)
                except StopIteration:
                    print("Finished iterating all available data")
                    break
                T = inputs["stereo_T"].cuda()
                input_rgb = inputs[('color', 0, 0)].cuda()
                sample_rgb = inputs[('color', 's', 0)].cuda()
                features = encoder(input_rgb)
                outputs = depth_decoder(features)
                disp = outputs[("disp", 0)]
                _, depth = disp_to_depth(disp, 0.1, 100)

                # warp the stereo image into the target view via the
                # predicted depth
                cam_points = backproject_depth(depth, inputs[("inv_K", source_scale)].cuda())
                pix_coords = project_3d(cam_points, inputs[("K", source_scale)].cuda(), T)
                reconstructed_rgb = F.grid_sample(sample_rgb, pix_coords,
                                                  padding_mode="border")
                reconstructed_rgb = reconstructed_rgb.permute(0, 2, 3, 1).cpu()

                # stack source / reconstruction / target for visual comparison
                for picind in range(reconstructed_rgb.shape[0]):
                    c_sv_path = os.path.join(sv_path_l[count], str(svcount) + ".png")
                    img1 = inputs[('color', 's', 0)].permute(0, 2, 3, 1)[picind, :, :, :].numpy()
                    img2 = reconstructed_rgb[picind, :, :, :].numpy()
                    img3 = inputs[('color', 0, 0)].permute(0, 2, 3, 1)[picind, :, :, :].numpy()
                    combined_img = np.concatenate((img1, img2, img3), axis=0)
                    Image.fromarray((combined_img * 255).astype(np.uint8)).save(c_sv_path)
                    svcount = svcount + 1
                print("Finished dataset %d, batch %d" % (count, k))
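# Hypothetical usage sketch: this variant of test_simple expects parallel
# lists of validation iterators, warp modules, batch counts, and output dirs.
# The names below (val_loader, H, W) are illustrative assumptions only.
#
#   val_iter_list = [iter(val_loader)]
#   batch_it_num = [len(val_loader)]
#   backproject_depth_l = [BackprojectDepth(val_loader.batch_size, H, W).cuda()]
#   project_3d_l = [Project3D(val_loader.batch_size, H, W).cuda()]
#   sv_path_l = ["./stereo_recon_vis"]
#   test_simple("mono+stereo_640x192", "models", val_iter_list, batch_it_num,
#               backproject_depth_l, project_3d_l, sv_path_l)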
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    if args.pred_metric_depth and "stereo" not in args.model_name:
        print("Warning: The --pred_metric_depth flag only makes sense for stereo-trained "
              "KITTI models. For mono-trained models, output depths will not be in metric space.")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items()
                         if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
continue # Load image and preprocess input_image = pil.open(image_path).convert('RGB') original_width, original_height = input_image.size input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS) input_image = transforms.ToTensor()(input_image).unsqueeze(0) # PREDICTION input_image = input_image.to(device) features = encoder(input_image) outputs = depth_decoder(features) disp = outputs[("disp", 0)] disp_resized = torch.nn.functional.interpolate( disp, (original_height, original_width), mode="bilinear", align_corners=False) # Saving numpy file output_name = os.path.splitext(os.path.basename(image_path))[0] scaled_disp, depth = disp_to_depth(disp, 0.1, 100) if args.pred_metric_depth: name_dest_npy = os.path.join( output_directory, "{}_depth.npy".format(output_name)) metric_depth = STEREO_SCALE_FACTOR * depth.cpu().numpy() np.save(name_dest_npy, metric_depth) else: name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name)) np.save(name_dest_npy, scaled_disp.cpu().numpy()) # Saving colormapped depth image disp_resized_np = disp_resized.squeeze().cpu().numpy() vmax = np.percentile(disp_resized_np, 95) normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax) mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8) im = pil.fromarray(colormapped_im) name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name)) im.save(name_dest_im) print(" Processed {:d} of {:d} images - saved predictions to:". format(idx + 1, len(paths))) print(" - {}".format(name_dest_im)) print(" - {}".format(name_dest_npy)) print('-> Done!')
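# For reference: disp_to_depth converts the network's sigmoid output into
# (scaled_disp, depth), as used throughout these scripts with min_depth=0.1
# and max_depth=100. A sketch of the usual monodepth2 convention, assumed to
# match this repo's copy:
def disp_to_depth_sketch(disp, min_depth, max_depth):
    min_disp = 1 / max_depth   # e.g. 1/100
    max_disp = 1 / min_depth   # e.g. 1/0.1
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth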
def __init__(self, options):
    self.opt = options
    self.seed_everything()

    # create dirs for logs and predictions if they do not exist
    self.log_path = self.opt.log_dir
    if not os.path.exists(self.log_path):
        os.mkdir(self.log_path)
    preds_dir = os.path.join(self.log_path, "preds")
    if not os.path.exists(preds_dir):
        os.mkdir(preds_dir)

    # checking height and width are multiples of 32
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    # we don't expect anyone to run this on CPU
    self.device = torch.device("cuda")

    # model initialization
    self.models = {}
    self.parameters_to_train = []
    self.num_scales = len(self.opt.scales)
    self.num_input_frames = len(self.opt.frame_ids)
    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    self.models["encoder"] = networks.ResnetEncoder(self.opt.num_layers, True)
    self.models["depth"] = networks.DepthDecoder(
        self.models["encoder"].num_ch_enc, self.opt.scales)
    self.models["pose_encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, True, num_input_images=self.num_input_frames)
    self.models["pose"] = networks.PoseDecoder(
        self.models["pose_encoder"].num_ch_enc,
        num_input_features=1,
        num_frames_to_predict_for=2)

    for _, m in self.models.items():
        m.to(self.device)
        self.parameters_to_train += list(m.parameters())

    self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate)

    self.ssim = SSIM()
    self.ssim.to(self.device)

    self.backproject_depth = BackprojectDepth(
        self.opt.batch_size * self.num_scales, self.opt.height, self.opt.width)
    self.backproject_depth.to(self.device)
    self.project_3d = Project3D(
        self.opt.batch_size * (self.num_input_frames - 1) * self.num_scales,
        self.opt.height, self.opt.width)
    self.project_3d.to(self.device)

    # save adaptation parameters to the log dir
    self.save_opts()
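# Hypothetical sketch: seed_everything is called above but defined elsewhere.
# A typical implementation seeds Python, NumPy and torch for reproducibility
# (the attribute name self.opt.seed and the default value are assumptions).
def seed_everything_sketch(self):
    import random
    import numpy as np
    import torch
    seed = getattr(self.opt, "seed", 42)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)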