def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter."

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    print("-> Loading model from ", args.model_name)
    encoder_path = os.path.join(args.model_name, "encoder.pth")
    depth_decoder_path = os.path.join(args.model_name, "depth.pth")

    # LOADING PRETRAINED MODEL
    if args.no_ddv:
        encoder = networks.get_resnet101_asp_oc_dsn(
            2048, args.no_self_attention, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)
    else:
        encoder = networks.get_resnet101_asp_oc_dsn(
            128, args.no_self_attention, False)
        depth_decoder = networks.MSDepthDecoder(
            encoder.num_ch_enc, discretization=args.discretization)

    print("   Loading pretrained encoder")
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    print_size_of_model(encoder)
    print_size_of_model(depth_decoder)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception(
            "Cannot find args.image_path: {}".format(args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    timings = []
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize(
                (feed_width, feed_height), pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            st = time.time()
            features = encoder(input_image)
            if args.no_ddv:
                outputs = depth_decoder(features)
            else:
                # the DDV decoder expects a dict of named encoder features
                all_features = {}
                all_features['conv3'] = features[0]
                all_features['layer1'] = features[1]
                all_features['output'] = features[-1]
                outputs = depth_decoder(all_features)
            et = time.time()
            print('Elapsed time = {:0.4f} ms'.format((et - st) * 1000))
            timings.append((et - st) * 1000)

            # visualise the final encoder feature (the attention output)
            # instead of the decoder disparity, hence the "_attn" filename
            # disp = outputs[("disp", 0)]
            disp = features[-1]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear", align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(
                output_directory, "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(
                vmin=disp_resized_np.min(), vmax=vmax)
            # colormap names are case-sensitive: 'Blues', not 'blues'
            mapper = cm.ScalarMappable(norm=normalizer, cmap='Blues')
            colormapped_im = (
                mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(
                output_directory, "{}_attn.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    # first iterations are skipped as GPU warm-up
    # print('Mean time elapsed: {:0.4f}'.format(np.mean(timings[11:])))
    # print('Std time elapsed: {:0.4f}'.format(np.std(timings[11:])))
    print('-> Done!')
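# For reference: `disp_to_depth` is called above but defined elsewhere in the
# repo. Below is a minimal sketch, assuming the common Monodepth2 convention of
# mapping the network's sigmoid output into a bounded inverse-depth range; the
# actual helper in this codebase may differ. The `_sketch` suffix marks it as
# illustrative.
def disp_to_depth_sketch(disp, min_depth, max_depth):
    """Convert sigmoid disparity to (scaled disparity, depth) in [min_depth, max_depth]."""
    min_disp = 1 / max_depth  # disparity of the far plane
    max_disp = 1 / min_depth  # disparity of the near plane
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth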
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:
        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        filenames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_files.txt"))
        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path)

        img_ext = '.png' if opt.png else '.jpg'
        dataset = datasets.KITTIRAWDataset(opt.data_path, filenames,
                                           encoder_dict['height'],
                                           encoder_dict['width'], [0], 4,
                                           is_train=False, img_ext=img_ext)
        dataloader = DataLoader(dataset, 16, shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True, drop_last=False)

        if opt.no_ddv:
            encoder = networks.get_resnet101_asp_oc_dsn(
                2048, opt.no_self_attention, False)
            depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)
        else:
            encoder = networks.get_resnet101_asp_oc_dsn(
                128, opt.no_self_attention, False)
            depth_decoder = networks.MSDepthDecoder(
                encoder.num_ch_enc, discretization=opt.discretization)

        model_dict = encoder.state_dict()
        encoder.load_state_dict(
            {k: v for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(torch.load(decoder_path))

        encoder.cuda()
        encoder.eval()
        depth_decoder.cuda()
        depth_decoder.eval()

        pred_disps = []

        print("-> Computing predictions with size {}x{}".format(
            encoder_dict['width'], encoder_dict['height']))

        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda()

                if opt.post_process:
                    # Post-processed results require each image to have two forward passes
                    input_color = torch.cat(
                        (input_color, torch.flip(input_color, [3])), 0)

                features = encoder(input_color)
                if opt.no_ddv:
                    output = depth_decoder(features)
                else:
                    all_features = {}
                    all_features['conv3'] = features[0]
                    all_features['layer1'] = features[1]
                    all_features['output'] = features[-1]
                    output = depth_decoder(all_features)

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(
                        pred_disp[:N], pred_disp[N:, :, ::-1])

                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)

    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark",
                             "eigen_to_benchmark_ids.npy"))
            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.save_pred_disps:
        output_path = os.path.join(
            opt.load_weights_folder,
            "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()

    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder,
                                "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print("-> No ground truth is available for the KITTI benchmark, "
              "so not evaluating. Done.")
        quit()

    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz")
    gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(
                  STEREO_SCALE_FACTOR))
        opt.disable_median_scaling = True
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if opt.eval_split == "eigen":
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)

            crop = np.array([0.40810811 * gt_height, 0.99189189 * gt_height,
                             0.03594771 * gt_width, 0.96405229 * gt_width
                             ]).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)
        else:
            mask = gt_depth > 0

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= opt.pred_depth_scale_factor
        if not opt.disable_median_scaling:
            ratio = np.median(gt_depth) / np.median(pred_depth)
            ratios.append(ratio)
            pred_depth *= ratio

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        errors.append(compute_errors(gt_depth, pred_depth))

    if not opt.disable_median_scaling:
        ratios = np.array(ratios)
        med = np.median(ratios)
        print("   Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
            med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse",
                                           "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
def __init__(self, options):
    self.opt = options
    self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

    # checking height and width are multiples of 32
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    self.models = {}
    self.parameters_to_train = []

    self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

    self.num_scales = len(self.opt.scales)
    self.num_input_frames = len(self.opt.frame_ids)
    self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" \
        else self.num_input_frames

    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    self.use_pose_net = not (self.opt.use_stereo
                             and self.opt.frame_ids == [0])

    if self.opt.use_stereo:
        self.opt.frame_ids.append("s")

    if self.opt.no_ddv:
        self.models["encoder"] = networks.get_resnet101_asp_oc_dsn(
            2048, self.opt.no_self_attention,
            self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(
            self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())
    else:
        self.models["encoder"] = networks.get_resnet101_asp_oc_dsn(
            128, self.opt.no_self_attention,
            self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(
            self.models["encoder"].parameters())

        self.models["depth"] = networks.MSDepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales,
            discretization=self.opt.discretization)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

    if self.use_pose_net:
        if self.opt.pose_model_type == "separate_resnet":
            self.models["pose_encoder"] = networks.ResnetEncoder(
                self.opt.num_layers,
                self.opt.weights_init == "pretrained",
                num_input_images=self.num_pose_frames)
            self.models["pose_encoder"].to(self.device)
            self.parameters_to_train += list(
                self.models["pose_encoder"].parameters())

            self.models["pose"] = networks.PoseDecoder(
                self.models["pose_encoder"].num_ch_enc,
                num_input_features=1,
                num_frames_to_predict_for=2)
        elif self.opt.pose_model_type == "shared":
            self.models["pose"] = networks.PoseDecoder(
                self.models["encoder"].num_ch_enc, self.num_pose_frames)
        elif self.opt.pose_model_type == "posecnn":
            self.models["pose"] = networks.PoseCNN(
                self.num_input_frames
                if self.opt.pose_model_input == "all" else 2)

        self.models["pose"].to(self.device)
        self.parameters_to_train += list(self.models["pose"].parameters())

    if self.opt.predictive_mask:
        assert self.opt.disable_automasking, \
            "When using predictive_mask, please disable automasking with --disable_automasking"

        # Our implementation of the predictive masking baseline has the same
        # architecture as our depth decoder. We predict a separate mask for
        # each source frame.
        self.models["predictive_mask"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales,
            num_output_channels=(len(self.opt.frame_ids) - 1))
        self.models["predictive_mask"].to(self.device)
        self.parameters_to_train += list(
            self.models["predictive_mask"].parameters())

    self.model_optimizer = optim.Adam(self.parameters_to_train,
                                      self.opt.learning_rate)
    self.model_lr_scheduler = optim.lr_scheduler.StepLR(
        self.model_optimizer, self.opt.scheduler_step_size, 0.1)

    if self.opt.load_weights_folder is not None:
        self.load_model()

    print("Training model named:\n  ", self.opt.model_name)
    print("Models and tensorboard events files are saved to:\n  ",
          self.opt.log_dir)
    print("Training is using:\n  ", self.device)

    # data
    datasets_dict = {
        "kitti": datasets.KITTIRAWDataset,
        "kitti_odom": datasets.KITTIOdomDataset
    }
    self.dataset = datasets_dict[self.opt.dataset]

    fpath = os.path.join(os.path.dirname(__file__), "splits",
                         self.opt.split, "{}_files.txt")

    train_filenames = readlines(fpath.format("train"))
    val_filenames = readlines(fpath.format("val"))
    img_ext = '.png' if self.opt.png else '.jpg'

    num_train_samples = len(train_filenames)
    self.num_total_steps = \
        num_train_samples // self.opt.batch_size * self.opt.num_epochs

    train_dataset = self.dataset(self.opt.data_path, train_filenames,
                                 self.opt.height, self.opt.width,
                                 self.opt.frame_ids, 4,
                                 is_train=True, img_ext=img_ext)
    self.train_loader = DataLoader(train_dataset, self.opt.batch_size,
                                   shuffle=True,
                                   num_workers=self.opt.num_workers,
                                   pin_memory=True, drop_last=True)
    val_dataset = self.dataset(self.opt.data_path, val_filenames,
                               self.opt.height, self.opt.width,
                               self.opt.frame_ids, 4,
                               is_train=False, img_ext=img_ext)
    self.val_loader = DataLoader(val_dataset, self.opt.batch_size,
                                 shuffle=True,
                                 num_workers=self.opt.num_workers,
                                 pin_memory=True, drop_last=True)
    self.val_iter = iter(self.val_loader)

    self.writers = {}
    for mode in ["train", "val"]:
        self.writers[mode] = SummaryWriter(
            os.path.join(self.log_path, mode))

    if not self.opt.no_ssim:
        self.ssim = SSIM()
        self.ssim.to(self.device)

    self.backproject_depth = {}
    self.project_3d = {}
    for scale in self.opt.scales:
        h = self.opt.height // (2 ** scale)
        w = self.opt.width // (2 ** scale)

        self.backproject_depth[scale] = BackprojectDepth(
            self.opt.batch_size, h, w)
        self.backproject_depth[scale].to(self.device)

        self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
        self.project_3d[scale].to(self.device)

    self.depth_metric_names = [
        "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms",
        "da/a1", "da/a2", "da/a3"]

    print("Using split:\n  ", self.opt.split)
    print("There are {:d} training items and {:d} validation items\n".format(
        len(train_dataset), len(val_dataset)))

    self.save_opts()
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 0.001
    MAX_DEPTH = 80

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    print("-> Loading weights from {}".format(opt.load_weights_folder))

    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)

    # Evaluate on the Cityscapes dataset
    CITY_DIR = opt.data_path
    cityscapes = datasets.CityscapesData(CITY_DIR)
    cityscapes_dataloader = DataLoader(cityscapes, 4, shuffle=False,
                                       num_workers=6, pin_memory=True,
                                       drop_last=False)

    encoder = networks.get_resnet101_asp_oc_dsn(
        128, opt.no_self_attention, False)
    depth_decoder = networks.MSDepthDecoder(
        encoder.num_ch_enc, discretization=opt.discretization)

    model_dict = encoder.state_dict()
    encoder.load_state_dict(
        {k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    pred_disps = []

    print("-> Computing predictions with size {}x{}".format(
        encoder_dict['width'], encoder_dict['height']))

    with torch.no_grad():
        for data in cityscapes_dataloader:
            # dataset yields HWC uint images; convert to NCHW for the network
            input_color = data.permute(0, 3, 1, 2)
            input_color = input_color.cuda()

            output = depth_decoder(encoder(input_color))

            pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                         opt.min_depth, opt.max_depth)
            pred_disp = pred_disp.cpu()[:, 0].numpy()

            pred_disps.append(pred_disp)

    pred_disps = np.concatenate(pred_disps)

    gt_path = os.path.join(splits_dir, "gt_depths_cityscapes.npz")
    gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")
    print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]

        # decode the 16-bit Cityscapes disparity maps to metric depth:
        # disparity = (value - 1) / 256, then depth = baseline * focal / disparity
        gt_depth[gt_depth > 0] = (gt_depth[gt_depth > 0] - 1) / 256
        gt_depth[gt_depth > 0] = \
            (0.209313 * 2262.52) / gt_depth[gt_depth > 0]
        gt_depth[gt_depth > MAX_DEPTH] = 0
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        # crop out the ego-vehicle hood and image borders before masking
        mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)
        crop = np.array([0.05 * gt_height, 0.80 * gt_height,
                         0.05 * gt_width, 0.99 * gt_width]).astype(np.int32)
        crop_mask = np.zeros(mask.shape)
        crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
        mask = np.logical_and(mask, crop_mask)

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= opt.pred_depth_scale_factor
        if not opt.disable_median_scaling:
            ratio = np.median(gt_depth) / np.median(pred_depth)
            ratios.append(ratio)
            pred_depth *= ratio

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        errors.append(compute_errors(gt_depth, pred_depth))

    if not opt.disable_median_scaling:
        ratios = np.array(ratios)
        med = np.median(ratios)
        print("   Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
            med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse",
                                           "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
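# For reference: `compute_errors` above returns the seven metrics printed in the
# summary row (abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3). A minimal sketch of
# the standard Eigen et al. formulation; the repo's implementation may differ
# slightly. It uses the module-level `numpy as np` import, and the `_sketch`
# suffix marks it as illustrative.
def compute_errors_sketch(gt, pred):
    """Standard depth metrics between masked ground-truth and prediction arrays."""
    thresh = np.maximum(gt / pred, pred / gt)
    a1 = (thresh < 1.25).mean()       # fraction of pixels within 1.25x of gt
    a2 = (thresh < 1.25 ** 2).mean()
    a3 = (thresh < 1.25 ** 3).mean()

    rmse = np.sqrt(((gt - pred) ** 2).mean())
    rmse_log = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())

    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean(((gt - pred) ** 2) / gt)

    return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3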