class DepthPredictor:
    def __init__(self, model_type, model_path, optimize):
        print("initialize")

        # select device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device: %s" % self.device)

        # load network
        if model_type == "large":
            self.model = MidasNet(model_path, non_negative=True)
            self.net_w, self.net_h = 384, 384
        elif model_type == "small":
            self.model = MidasNet_small(model_path,
                                        features=64,
                                        backbone="efficientnet_lite3",
                                        exportable=True,
                                        non_negative=True,
                                        blocks={'expand': True})
            self.net_w, self.net_h = 256, 256
        else:
            raise ValueError(
                f"model_type '{model_type}' not implemented, use: --model_type large")

        self.transform = Compose([
            Resize(
                self.net_w,
                self.net_h,
                resize_target=None,
                keep_aspect_ratio=True,
                ensure_multiple_of=32,
                resize_method="upper_bound",
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ])

        self.model.eval()

        self.optimize = optimize
        if self.optimize:
            rand_example = torch.rand(1, 3, self.net_h, self.net_w)
            self.model(rand_example)
            traced_script_module = torch.jit.trace(self.model, rand_example)
            self.model = traced_script_module

            if self.device == torch.device("cuda"):
                self.model = self.model.to(memory_format=torch.channels_last)
                self.model = self.model.half()

        self.model.to(self.device)

    def process_video(self, filename, dir_name):
        cap = cv2.VideoCapture(filename)
        fps = cap.get(cv2.CAP_PROP_FPS)
        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if ret:
                # convert the frame to an RGB image with values in [0, 1]
                if len(frame.shape) == 2:
                    frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) / 255.0

                prediction = self.process_images(img)

                # output
                out_filename = os.path.join(dir_name, str(count / 30)[0])
                utils.write_depth(out_filename, prediction, bits=2)

                # skip ahead by one second's worth of frames
                count += fps
                cap.set(cv2.CAP_PROP_POS_FRAMES, count)
            else:
                cap.release()
                break

    def process_images(self, img):
        img_input = self.transform({"image": img})["image"]

        # compute
        with torch.no_grad():
            sample = torch.from_numpy(img_input).to(self.device).unsqueeze(0)
            if self.optimize and self.device == torch.device("cuda"):
                sample = sample.to(memory_format=torch.channels_last)
                sample = sample.half()
            prediction = self.model.forward(sample)
            prediction = (
                torch.nn.functional.interpolate(
                    prediction.unsqueeze(1),
                    size=img.shape[:2],
                    mode="bicubic",
                    align_corners=False,
                )
                .squeeze()
                .cpu()
                .numpy()
            )

        return prediction
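

# Example usage of DepthPredictor (a minimal sketch, not part of the original
# module): the checkpoint filename and the video/output paths below are
# illustrative assumptions only.
#
#   predictor = DepthPredictor(model_type="large",
#                              model_path="./pretrained/MiDaS_f6b98070.pt",
#                              optimize=True)
#   predictor.process_video("input.mp4", "depth_frames")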


class InferenceEngine:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Setup AdaBins models
        self.adabins_nyu_infer_helper = InferenceHelper(dataset='nyu', device=self.device)
        self.adabins_kitti_infer_helper = InferenceHelper(dataset='kitti', device=self.device)

        # Setup DiverseDepth model
        class DiverseDepthArgs:
            def __init__(self):
                self.resume = False
                self.cfg_file = "lib/configs/resnext50_32x4d_diversedepth_regression_vircam"
                self.load_ckpt = "pretrained/DiverseDepth.pth"

        diverse_depth_args = DiverseDepthArgs()
        merge_cfg_from_file(diverse_depth_args)
        self.diverse_depth_model = RelDepthModel()
        self.diverse_depth_model.eval()

        # load checkpoint
        load_ckpt(diverse_depth_args, self.diverse_depth_model)
        # TODO: update this - see how `device` argument should be processed
        if self.device == "cuda":
            self.diverse_depth_model.cuda()
        # wrap unconditionally: diverse_depth_predict accesses `.module` below
        self.diverse_depth_model = torch.nn.DataParallel(self.diverse_depth_model)

        # Setup MiDaS model
        self.midas_model_path = "./pretrained/MiDaS_f6b98070.pt"
        midas_model_type = "large"

        # load network
        if midas_model_type == "large":
            self.midas_model = MidasNet(self.midas_model_path, non_negative=True)
            self.midas_net_w, self.midas_net_h = 384, 384
        elif midas_model_type == "small":
            self.midas_model = MidasNet_small(self.midas_model_path,
                                              features=64,
                                              backbone="efficientnet_lite3",
                                              exportable=True,
                                              non_negative=True,
                                              blocks={'expand': True})
            self.midas_net_w, self.midas_net_h = 256, 256

        self.midas_transform = Compose([
            Resize(
                self.midas_net_w,
                self.midas_net_h,
                resize_target=None,
                keep_aspect_ratio=True,
                ensure_multiple_of=32,
                resize_method="upper_bound",
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ])

        self.midas_model.eval()

        self.midas_optimize = True
        if self.midas_optimize:
            rand_example = torch.rand(1, 3, self.midas_net_h, self.midas_net_w)
            self.midas_model(rand_example)
            traced_script_module = torch.jit.trace(self.midas_model, rand_example)
            self.midas_model = traced_script_module

            if self.device == "cuda":
                self.midas_model = self.midas_model.to(memory_format=torch.channels_last)
                self.midas_model = self.midas_model.half()

        self.midas_model.to(torch.device(self.device))

        # Setup SGDepth model
        self.sgdepth_model = InferenceEngine.SgDepthInference()

        # Setup monodepth2 model
        self.monodepth2_model_path = "pretrained/monodepth2_mono+stereo_640x192"
        monodepth2_device = torch.device(self.device)
        encoder_path = os.path.join(self.monodepth2_model_path, "encoder.pth")
        depth_decoder_path = os.path.join(self.monodepth2_model_path, "depth.pth")

        # LOADING PRETRAINED MODEL
        print("Loading Monodepth2 pretrained encoder")
        self.monodepth2_encoder = networks.ResnetEncoder(18, False)
        loaded_dict_enc = torch.load(encoder_path, map_location=monodepth2_device)

        # extract the height and width of image that this model was trained with
        self.feed_height = loaded_dict_enc['height']
        self.feed_width = loaded_dict_enc['width']
        filtered_dict_enc = {
            k: v
            for k, v in loaded_dict_enc.items()
            if k in self.monodepth2_encoder.state_dict()
        }
        self.monodepth2_encoder.load_state_dict(filtered_dict_enc)
        self.monodepth2_encoder.to(monodepth2_device)
        self.monodepth2_encoder.eval()

        print("Loading pretrained decoder")
        self.monodepth2_depth_decoder = networks.DepthDecoder(
            num_ch_enc=self.monodepth2_encoder.num_ch_enc, scales=range(4))

        loaded_dict = torch.load(depth_decoder_path, map_location=monodepth2_device)
        self.monodepth2_depth_decoder.load_state_dict(loaded_dict)
        self.monodepth2_depth_decoder.to(monodepth2_device)
        self.monodepth2_depth_decoder.eval()

    def adabins_nyu_predict(self, image):
        _, predicted_depth = self.adabins_nyu_infer_helper.predict_pil(image)
        predicted_depth = predicted_depth.squeeze()
        predicted_depth = cv2.resize(predicted_depth, (image.width, image.height))
        return predicted_depth

    def adabins_kitti_predict(self, image):
        _, predicted_depth = self.adabins_kitti_infer_helper.predict_pil(image)
        predicted_depth = predicted_depth.squeeze()
        predicted_depth = cv2.resize(predicted_depth, (image.width, image.height))
        return predicted_depth

    def diverse_depth_predict(self, image):
        img_torch = scale_torch(image, 255)
        img_torch = img_torch[np.newaxis, :]
        predicted_depth, _ = self.diverse_depth_model.module.depth_model(img_torch)
        predicted_depth = predicted_depth.detach().cpu().numpy()
        predicted_depth = predicted_depth.squeeze()
        return predicted_depth

    def midas_predict(self, image_path):
        """Run MonoDepthNN to compute a depth map.

        Args:
            image_path (str): path to input image
        """
        def read_image(path):
            """Read an image file and return an RGB image with values in [0, 1].

            Args:
                path (str): path to file

            Returns:
                array: RGB image (0-1)
            """
            img = cv2.imread(path)

            if img.ndim == 2:
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
            return img

        # get input
        img = read_image(image_path)
        img_input = self.midas_transform({"image": img})["image"]

        # compute
        with torch.no_grad():
            sample = torch.from_numpy(img_input).to(torch.device(self.device)).unsqueeze(0)
            if self.midas_optimize and self.device == "cuda":
                sample = sample.to(memory_format=torch.channels_last)
                sample = sample.half()
            prediction = self.midas_model.forward(sample)
            prediction = (torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=img.shape[:2],
                mode="bicubic",
                align_corners=False,
            ).squeeze().cpu().numpy())

        # normalize the relative prediction to [0, 1]
        depth_min = prediction.min()
        depth_max = prediction.max()
        prediction = (prediction - depth_min) / (depth_max - depth_min)
        # prediction *= 10

        return prediction

    class SgDepthInference:
        """Inference without harness or dataloader"""

        def __init__(self):
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model_path = "./pretrained/SGDepth_full.pth"
            self.num_classes = 20
            self.depth_min = 0.1
            self.depth_max = 100
            self.all_time = []
            self.labels = (
                ('CLS_ROAD', (128, 64, 128)),
                ('CLS_SIDEWALK', (244, 35, 232)),
                ('CLS_BUILDING', (70, 70, 70)),
                ('CLS_WALL', (102, 102, 156)),
                ('CLS_FENCE', (190, 153, 153)),
                ('CLS_POLE', (153, 153, 153)),
                ('CLS_TRLIGHT', (250, 170, 30)),
                ('CLS_TRSIGN', (220, 220, 0)),
                ('CLS_VEGT', (107, 142, 35)),
                ('CLS_TERR', (152, 251, 152)),
                ('CLS_SKY', (70, 130, 180)),
                ('CLS_PERSON', (220, 20, 60)),
                ('CLS_RIDER', (255, 0, 0)),
                ('CLS_CAR', (0, 0, 142)),
                ('CLS_TRUCK', (0, 0, 70)),
                ('CLS_BUS', (0, 60, 100)),
                ('CLS_TRAIN', (0, 80, 100)),
                ('CLS_MCYCLE', (0, 0, 230)),
                ('CLS_BCYCLE', (119, 11, 32)),
            )
            self.init_model()

        def init_model(self):
            sgdepth = SGDepth

            with torch.no_grad():
                # init 'empty' model
                self.model = sgdepth(
                    1,             # opt.model_split_pos
                    18,            # opt.model_num_layers
                    0.9,           # opt.train_depth_grad_scale
                    0.1,           # opt.train_segmentation_grad_scale
                    'pretrained',  # opt.train_weights_init
                    4,             # opt.model_depth_resolutions
                    18,            # opt.model_num_layers_pose
                )

                # load weights (copied from state manager)
                state = self.model.state_dict()
                to_load = torch.load(self.model_path)
                for (k, v) in to_load.items():
                    if k not in state:
                        print(f" - WARNING: Model file contains unknown key {k} ({list(v.shape)})")

                for (k, v) in state.items():
                    if k not in to_load:
                        print(f" - WARNING: Model file does not contain key {k} ({list(v.shape)})")
                    else:
                        state[k] = to_load[k]

                self.model.load_state_dict(state)
                self.model = self.model.eval()
                if self.device == "cuda":
                    self.model.cuda()

        def load_image(self, image):
            self.image = image
            self.image_o_width, self.image_o_height = self.image.size

            resize = transforms.Resize((192, 640))
            image = resize(self.image)  # resize to argument size

            # center_crop = transforms.CenterCrop((opt.inference_crop_height, opt.inference_crop_width))
            # image = center_crop(image)  # crop to input size

            to_tensor = transforms.ToTensor()  # transform to tensor
            self.input_image = to_tensor(image)  # save tensor image to self.input_image for saving later

            image = self.normalize(self.input_image)
            image = image.unsqueeze(0).float()
            if self.device == "cuda":
                image = image.cuda()

            # simulate the structure of a batch:
            image_dict = {('color_aug', 0, 0): image}  # dict
            image_dict[('color', 0, 0)] = image
            image_dict['domain'] = ['cityscapes_val_seg', ]
            image_dict['purposes'] = [['segmentation', ], ['depth', ]]
            image_dict['num_classes'] = torch.tensor([self.num_classes])
            image_dict['domain_idx'] = torch.tensor(0)
            self.batch = (image_dict, )  # batch tuple

        def normalize(self, tensor):
            mean = (0.485, 0.456, 0.406)
            std = (0.229, 0.224, 0.225)
            normalize = transforms.Normalize(mean, std)
            tensor = normalize(tensor)
            return tensor

        def get_depth_meters(self, image):
            # load image and transform it into the necessary batch format
            self.load_image(image)

            start = time.time()
            with torch.no_grad():
                output = self.model(self.batch)  # forward pass
            self.all_time.append(time.time() - start)

            disps_pred = output[0]["disp", 0]  # depth results
            segs_pred = output[0]['segmentation_logits', 0]  # segmentation results

            segs_pred = segs_pred.exp().cpu()
            segs_pred = segs_pred.numpy()  # transform preds to np array
            segs_pred = segs_pred.argmax(1)  # get the highest-scoring class per pixel

            depth_pred = disps_pred
            depth_pred = np.array(depth_pred[0][0].cpu())  # depth predictions to numpy and CPU

            def scale_depth(disp):
                min_disp = 1 / self.depth_max
                max_disp = 1 / self.depth_min
                return min_disp + (max_disp - min_disp) * disp

            depth_pred = scale_depth(depth_pred)  # depth map in meters
            return depth_pred

    def sgdepth_predict(self, image):
        return self.sgdepth_model.get_depth_meters(image)

    def monodepth2_predict(self, input_image):
        original_width, original_height = input_image.size
        input_image = input_image.resize((self.feed_width, self.feed_height), Image.LANCZOS)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)

        device = torch.device(self.device)
        input_image = input_image.to(device)
        features = self.monodepth2_encoder(input_image)
        outputs = self.monodepth2_depth_decoder(features)

        disp = outputs[("disp", 0)]
        # disp_resized is kept for reference; depth below is derived from the raw disparity
        disp_resized = torch.nn.functional.interpolate(
            disp, (original_height, original_width),
            mode="bilinear",
            align_corners=False)

        _, predicted_depth = disp_to_depth(disp, 0.1, 10)
        predicted_depth = predicted_depth.detach().cpu().numpy()
        predicted_depth = predicted_depth.squeeze()
        predicted_depth = cv2.resize(predicted_depth, (original_width, original_height))
        return predicted_depth

    def predict_depth(self, path):
        image = Image.open(path)
        original = cv2.imread(path)

        # Predict with AdaBins NYU pre-trained model
        adabins_nyu_prediction = self.adabins_nyu_predict(image)
        adabins_nyu_max = np.max(adabins_nyu_prediction)
        result_shape = adabins_nyu_prediction.shape

        # Predict with AdaBins KITTI pre-trained model
        adabins_kitti_prediction = self.adabins_kitti_predict(image)
        adabins_kitti_max = np.max(adabins_kitti_prediction)
        # Predict with DiverseDepth model
        diverse_depth_prediction = self.diverse_depth_predict(image)

        # Predict with MiDaS model (relative inverse depth); flip it so larger
        # values mean farther away
        midas_depth_prediction = self.midas_predict(path)
        midas_depth_prediction = (midas_depth_prediction - np.max(midas_depth_prediction)) * -1

        # Predict with SGDepth
        sgdepth_depth_prediction = self.sgdepth_predict(image)
        sgdepth_depth_prediction = 1 / sgdepth_depth_prediction
        sgdepth_depth_prediction = cv2.resize(sgdepth_depth_prediction,
                                              (result_shape[1], result_shape[0]))

        # Predict with monodepth2
        monodepth2_depth_prediction = self.monodepth2_predict(image)

        def print_min_max(label, d):
            print(label, "[" + str(np.min(d)) + ", " + str(np.max(d)) + "]")

        if adabins_nyu_max <= 6:  # ~19 ft
            # Probably an indoor scene: rescale the relative predictions to the
            # AdaBins NYU range and weight MiDaS 5x in the average (1+1+5+1+1 = 9)
            diverse_depth_prediction *= (adabins_nyu_max / np.max(diverse_depth_prediction))
            midas_depth_prediction *= (adabins_nyu_max / np.max(midas_depth_prediction))
            average_depth = (adabins_nyu_prediction + diverse_depth_prediction +
                             midas_depth_prediction * 5 + sgdepth_depth_prediction +
                             monodepth2_depth_prediction) / 9
        else:
            # Probably an outdoor scene: rescale DiverseDepth to the AdaBins KITTI
            # range and drop the NYU prediction from the average (1+5+1+1 = 8)
            diverse_depth_prediction *= (adabins_kitti_max / np.max(diverse_depth_prediction))
            midas_depth_prediction *= (adabins_nyu_max / np.max(midas_depth_prediction))
            average_depth = (diverse_depth_prediction + midas_depth_prediction * 5 +
                             sgdepth_depth_prediction + monodepth2_depth_prediction) / 8

        # print_min_max("Adabins NYU", adabins_nyu_prediction)
        # print_min_max("Adabins KITTI", adabins_kitti_prediction)
        # print_min_max("DiverseDepth", diverse_depth_prediction)
        # print_min_max("MiDaS", midas_depth_prediction)
        # print_min_max("SGDepth", sgdepth_depth_prediction)
        # print_min_max("Monodepth2", monodepth2_depth_prediction)
        # print_min_max("Average", average_depth)
        # print("---------------------------------------")

        return original, average_depth, colorize_depth(average_depth, 0, 20)
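

# A minimal usage sketch (not part of the original module). It assumes the
# pretrained checkpoints referenced in InferenceEngine.__init__ are present
# under ./pretrained/; the input image path below is an illustrative assumption.
if __name__ == "__main__":
    engine = InferenceEngine()
    original, average_depth, colored = engine.predict_depth("input.jpg")
    cv2.imwrite("depth_colored.png", colored)
    print("depth range: [%.2f, %.2f] m" % (average_depth.min(), average_depth.max()))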