Example #1
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array(
        [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split == "odom_9" or opt.eval_split == "odom_10" or opt.eval_split == "odom_0", \
        "eval_split should be either odom_9 or odom_10"

    sequence_id = int(opt.eval_split.split("_")[1])

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path,
                               filenames,
                               opt.height,
                               opt.width, [0, 1],
                               4,
                               is_train=False)
    dataloader = DataLoader(dataset,
                            opt.batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder,
                                     "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")
    depth_encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    depth_decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))
    depth_encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_encoder_dict = torch.load(depth_encoder_path)
    model_dict = depth_encoder.state_dict()
    depth_encoder.load_state_dict(
        {k: v
         for k, v in depth_encoder_dict.items() if k in model_dict})

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))
    depth_decoder = networks.DepthDecoder(depth_encoder.num_ch_enc)
    depth_decoder.load_state_dict(torch.load(depth_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()
    depth_encoder.cuda()
    depth_encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    pred_poses = []
    pred_disps = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            input_color = inputs[("color", 0, 0)].cuda()
            depth_output = depth_decoder(depth_encoder(input_color))

            pred_disp, _ = disp_to_depth(depth_output[("disp", 0)],
                                         opt.min_depth, opt.max_depth)
            pred_disp = pred_disp.cpu()[:, 0].numpy()

            pred_disps.append(pred_disp)

            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(
                    axisangle[:, 0], translation[:, 0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)
    pred_disps = np.concatenate(pred_disps)
    pred_poses_scaled = []
    ratios_d = []
    gt_norms_div = []
    gt_norms = []
    pred_norms = []
    td_divs_dgc = []
    poses_pred = []
    scale_recovery = ScaleRecovery(1, 192, 640, K).cuda()  # construct once and reuse for every frame
    for i in range(pred_poses.shape[0]):
        pred_pose = pred_poses[i]
        pred_disp = pred_disps[i + 1]
        pred_depth = 1 / pred_disp
        pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
        ratio = scale_recovery(pred_depth).cpu().item()
        pred_pose_scaled = pred_pose[:3, 3] * ratio
        poses_pred.append(pred_pose[:3, 3])
        pred_poses_scaled.append(pred_pose_scaled)
        ratios_d.append(ratio)

    gt_poses_path = os.path.join(opt.data_path, "poses",
                                 "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(
                np.dot(np.linalg.inv(gt_global_poses[i - 1]),
                       gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 5
    for i in range(0, num_frames - 1):
        local_xyzs = np.array(
            dump_xyz(pred_poses_scaled[i:i + track_length - 1]))
        gt_local_xyzs = np.array(
            dump_xyz(gt_local_poses[i:i + track_length - 1]))
        gt_norm_div = np.linalg.norm(gt_local_xyzs) / np.linalg.norm(
            local_xyzs)
        ates.append(compute_ate(gt_local_xyzs, local_xyzs))
        gt_norms_div.append(gt_norm_div)
        gt_norms.append(np.linalg.norm(gt_local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_scaled{:02d}.npy".format(sequence_id))
    np.save(save_path, pred_poses_scaled)
    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_gt{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_xyzs)
    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_pred{:02d}.npy".format(sequence_id))
    np.save(save_path, poses_pred)
    save_path = os.path.join(os.path.dirname(__file__),
                             "gt_norms{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_norms)
    save_path = os.path.join(os.path.dirname(__file__),
                             "gt_norms_div{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_norms_div)
    save_path = os.path.join(os.path.dirname(__file__),
                             "ratios_d{:02d}.npy".format(sequence_id))
    np.save(save_path, ratios_d)
    save_path = os.path.join(os.path.dirname(__file__),
                             "pred_norms{:02d}.npy".format(sequence_id))
    np.save(save_path, pred_norms)
    print("-> Predictions saved to", save_path)
Example #2
    def __init__(self, options):
        #pdb.set_trace()
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {
            "kitti": datasets.KITTIRAWDataset,
            "kitti_odom": datasets.KITTIOdomDataset
        }
        self.dataset = datasets_dict[self.opt.dataset]

        fpath = os.path.join(os.path.dirname(__file__), "splits",
                             self.opt.split, "{}_files.txt")
        test_path = os.path.join(os.path.dirname(__file__), "splits",
                                 'eigen_benchmark', "{}_files.txt")
        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        test_filenames = readlines(test_path.format("test"))

        img_ext = '.png' if self.opt.png else '.jpg'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     self.num_scales,
                                     is_train=True,
                                     img_ext=img_ext,
                                     is_flow=True,
                                     args=self.opt)
        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)
        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   self.num_scales,
                                   is_train=False,
                                   img_ext=img_ext,
                                   is_flow=True,
                                   args=self.opt)
        self.val_loader = DataLoader(val_dataset,
                                     self.opt.batch_size,
                                     True,
                                     num_workers=self.opt.num_workers,
                                     pin_memory=True,
                                     drop_last=True)

        vid_dataset_val = self.dataset(self.opt.data_path,
                                       sorted(val_filenames),
                                       self.opt.height,
                                       self.opt.width,
                                       self.opt.frame_ids,
                                       self.num_scales,
                                       is_train=False,
                                       img_ext=img_ext,
                                       is_flow=False,
                                       args=self.opt)

        vid_dataset_test = self.dataset(self.opt.data_path,
                                        sorted(test_filenames),
                                        self.opt.height,
                                        self.opt.width,
                                        self.opt.frame_ids,
                                        self.num_scales,
                                        is_train=False,
                                        img_ext=img_ext,
                                        is_flow=False,
                                        args=self.opt)

        self.vid_loader_val = DataLoader(vid_dataset_val,
                                         1,
                                         False,
                                         num_workers=0,
                                         pin_memory=True,
                                         drop_last=True)
        self.vid_loader_test = DataLoader(vid_dataset_test,
                                          1,
                                          False,
                                          num_workers=0,
                                          pin_memory=True,
                                          drop_last=True)

        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        self.save_opts()
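
For reference, a hypothetical options object covering the attributes this constructor reads (names match the code above; values are illustrative defaults, not the project's option parser, and the dataset classes may read further fields through args=self.opt):

from argparse import Namespace

opts = Namespace(
    log_dir="./logs", model_name="mono_model",
    height=192, width=640,                  # must be multiples of 32
    scales=[0, 1, 2, 3], frame_ids=[0, -1, 1],
    pose_model_input="pairs", use_stereo=False,
    num_layers=18, weights_init="pretrained",
    learning_rate=1e-4, scheduler_step_size=15,
    load_weights_folder=None,
    dataset="kitti", split="eigen_zhou", png=False,
    data_path="/path/to/kitti_data",
    batch_size=12, num_epochs=20, num_workers=12,
    no_cuda=False, no_ssim=False,
)
# trainer = Trainer(opts)  # the class name is assumed; only __init__ is shown above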
Example #3
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:

        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        filenames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_files.txt"))
        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path,
                                  map_location=torch.device("cuda:0"))

        dataset = datasets.KITTIRAWDataset(opt.data_path,
                                           filenames,
                                           encoder_dict['height'],
                                           encoder_dict['width'], [0],
                                           4,
                                           is_train=False)
        # dataloader = DataLoader(dataset, 16, shuffle=False, num_workers=opt.num_workers,
        #                         pin_memory=True, drop_last=False)
        dataloader = DataLoader(
            dataset,
            16,
            shuffle=False,
            num_workers=opt.num_workers,
            pin_memory=True,
            drop_last=False,
            collate_fn=my_collate_fn
        )  # the default collate_fn would fail because samples can have variable lengths

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        model_dict = encoder.state_dict()
        encoder.load_state_dict(
            {k: v
             for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(
            torch.load(decoder_path, map_location=torch.device("cuda:0")))

        encoder.cuda(0)
        encoder.eval()
        depth_decoder.cuda(0)
        depth_decoder.eval()

        pred_disps = []

        print("-> Computing predictions with size {}x{}".format(
            encoder_dict['width'], encoder_dict['height']))

        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda(0)

                if opt.post_process:
                    # Post-processed results require each image to have two forward passes
                    input_color = torch.cat(
                        (input_color, torch.flip(input_color, [3])), 0)

                output = depth_decoder(encoder(input_color))

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(
                        pred_disp[:N], pred_disp[N:, :, ::-1])

                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)

    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark",
                             "eigen_to_benchmark_ids.npy"))

            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.save_pred_disps:
        output_path = os.path.join(opt.load_weights_folder,
                                   "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()

    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder,
                                "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print(
            "-> No ground truth is available for the KITTI benchmark, so not evaluating. Done."
        )
        quit()

    # gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz")
    # gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths_im_ori.npz") ## ZMH: use the gt produced by vel_depth=False in generate_depth_map_original
    gt_path = os.path.join(
        splits_dir, opt.eval_split, "gt_depths_im_cus.npz"
    )  ## ZMH: use the gt produced by vel_depth=False in generate_depth_map_original
    ## ZMH:
    gt_depths = np.load(gt_path,
                        fix_imports=True,
                        encoding='latin1',
                        allow_pickle=True)["data"]
    # gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1')["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(
                  STEREO_SCALE_FACTOR))
        opt.disable_median_scaling = True
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []

    for i in range(pred_disps.shape[0]):

        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if opt.eval_split == "eigen":
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)

            crop = np.array([
                0.40810811 * gt_height, 0.99189189 * gt_height,
                0.03594771 * gt_width, 0.96405229 * gt_width
            ]).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)

        else:
            mask = gt_depth > 0

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= opt.pred_depth_scale_factor
        if not opt.disable_median_scaling:
            ratio = np.median(gt_depth) / np.median(pred_depth)
            ratios.append(ratio)
            pred_depth *= ratio

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        errors.append(compute_errors(gt_depth, pred_depth))

    if not opt.disable_median_scaling:
        ratios = np.array(ratios)
        med = np.median(ratios)
        print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
            med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " +
          ("{:>8} | " * 7
           ).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
Example #4
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    print("-> Loading weights from {}".format(opt.load_weights_folder))

    filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt"))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)

    if opt.dataset == 'cityscape':
        dataset = datasets.CITYSCAPERawDataset(opt.data_path, filenames,
                                               encoder_dict['height'],
                                               encoder_dict['width'],
                                               [0], 4, is_train=False,
                                               tag=opt.dataset)
    elif opt.dataset == 'kitti':
        dataset = datasets.KITTIRAWDataset(
            opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'],
            [0,'s'], 4, tag='kitti', is_train=False, img_ext='png',
            load_meta=False, is_load_semantics=True,
            is_predicted_semantics=True, load_morphed_depth=False)
    else:
        raise ValueError("No predefined dataset")
    dataloader = DataLoader(dataset, 16, shuffle=False, num_workers=opt.num_workers,
                            pin_memory=True, drop_last=False)

    encoder = networks.ResnetEncoder(opt.num_layers, False)
    if opt.switchMode == 'on':
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc, isSwitch=True, isMulChannel=opt.isMulChannel)
    else:
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()
    sfx = torch.nn.Softmax(dim=1)

    print("Evaluation starts")

    confMatrix = generateMatrix(args)
    nbPixels = 0
    count255 = 0
    with torch.no_grad():
        for idx, inputs in enumerate(dataloader):
            input_color = inputs[("color", 0, 0)].cuda()
            outputs = depth_decoder(encoder(input_color),
                                    computeSemantic=True, computeDepth=False)

            gt = inputs['seman_gt_eval'].cpu().numpy().astype(np.uint8)
            pred = sfx(outputs[('seman', 0)]).detach()
            pred = torch.argmax(pred, dim=1).type(torch.float).unsqueeze(1)
            pred = F.interpolate(pred, [gt.shape[1], gt.shape[2]], mode='nearest')
            pred = pred.squeeze(1).cpu().numpy().astype(np.uint8)
            # visualize_semantic(gt[0,:,:]).show()
            # visualize_semantic(pred[0,:,:]).show()

            groundTruthNp = gt
            predictionNp = pred
            nbPixels = nbPixels + groundTruthNp.shape[0] * groundTruthNp.shape[1] * groundTruthNp.shape[2]

            # encoding_value = max(groundTruthNp.max(), predictionNp.max()).astype(np.int32) + 1
            encoding_value = 256  # precomputed
            encoded = (groundTruthNp.astype(np.int32) * encoding_value) + predictionNp

            values, cnt = np.unique(encoded, return_counts=True)

            for value, c in zip(values, cnt):
                pred_id = value % encoding_value
                gt_id = int((value - pred_id) / encoding_value)
                if pred_id == 255 or gt_id == 255:
                    count255 = count255 + c
                    continue
                if gt_id not in args.evalLabels:
                    printError("Unknown label with id {:}".format(gt_id))
                confMatrix[gt_id][pred_id] += c
            print("Finish %dth batch" % idx)
    if confMatrix.sum() + count255 != nbPixels:
        printError(
            'Number of analyzed pixels and entries in confusion matrix disagree: confMatrix {}, pixels {}'.format(
                confMatrix.sum(), nbPixels))

    classScoreList = {}
    for label in args.evalLabels:
        labelName = trainId2label[label].name
        classScoreList[labelName] = getIouScoreForLabel(label, confMatrix, args)
    vals = np.array(list(classScoreList.values()))
    mIOU = np.mean(vals[np.logical_not(np.isnan(vals))])
    # if opt.save_pred_disps:
    #     output_path = os.path.join(
    #         opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split))
    #     print("-> Saving predicted disparities to ", output_path)
    #     np.save(output_path, pred_disps)

    print("mIOU is %f" % mIOU)
Example #5
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    print(args.image_path)
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(args.model_path)
    #download_model_if_doesnt_exist(args.model_path,args.model_name)

    model_path = os.path.join(args.model_path, args.model_name)

    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    #1 LOADING PRETRAINED MODEL
    #1.1 encoder
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    #1.2 decoder
    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    #2. FINDING INPUT IMAGES

    in_path = Path(args.image_path)
    if args.out_path is not None:
        out_path = Path(args.out_path)
    else:
        out_path = Path('./' + in_path.stem + '_out')

    out_path.mkdir_p()

    #3. PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for image_path in tqdm(in_path.files()):

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)  # torch.Size([1, 3, 192, 640])
            features = encoder(input_image)  # list of encoder feature maps (indices 0-4)
            outputs = depth_decoder(features)  # dict holding disparity tensors at 4 scales

            #            disp = outputs[("disp", 0,0)]# has a same size with input
            disp = outputs[("disp", 0)]  # same size as the network input

            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = image_path.stem
            if args.npy_out:
                name_dest_npy = os.path.join(out_path,
                                             "{}_disp.npy".format(output_name))
                scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
                np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            name_dest_im = os.path.join(out_path,
                                        "{}_disp.png".format(output_name))
            plt.imsave(name_dest_im, disp_resized_np, cmap='magma', vmax=vmax)

    print('-> Done!')
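
disp_to_depth is used by both functions in this example and throughout the others; a sketch of the usual monodepth2 convention, where the network's sigmoid disparity output is mapped linearly to [1/max_depth, 1/min_depth] before inverting (treat this as an assumption about the helper):

def disp_to_depth(disp, min_depth, max_depth):
    """Convert network disparity to (scaled disparity, depth)."""
    min_disp = 1 / max_depth
    max_disp = 1 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth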
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array(
        [[0.5, 0, 0.5, 0], [0, 1.656, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:

        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        filenames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_files.txt"))
        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path)

        dataset = datasets.AirSimDataset(opt.data_path,
                                         filenames,
                                         encoder_dict['height'],
                                         encoder_dict['width'], [0],
                                         4,
                                         is_train=False)
        dataloader = DataLoader(dataset,
                                16,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True,
                                drop_last=False)

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        model_dict = encoder.state_dict()
        encoder.load_state_dict(
            {k: v
             for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(torch.load(decoder_path))

        encoder.cuda()
        encoder.eval()
        depth_decoder.cuda()
        depth_decoder.eval()

        pred_disps = []
        gt_depths = []

        print("-> Computing predictions with size {}x{}".format(
            encoder_dict['width'], encoder_dict['height']))

        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda()

                if opt.post_process:
                    # Post-processed results require each image to have two forward passes
                    input_color = torch.cat(
                        (input_color, torch.flip(input_color, [3])), 0)

                output = depth_decoder(encoder(input_color))

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(
                        pred_disp[:N], pred_disp[N:, :, ::-1])

                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)

    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark",
                             "eigen_to_benchmark_ids.npy"))

            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.eval_object:
        object_masks = []
        for line in filenames:
            line = line.split()
            folder, frame_index = line[0], int(line[1])

            object_mask_filename = os.path.join(
                os.path.dirname(__file__), "object_masks", folder,
                "{:010d}.npy".format(int(frame_index)))
            object_mask = np.load(object_mask_filename)
            object_masks.append(object_mask)

    if opt.save_pred_disps:
        output_path = os.path.join(opt.load_weights_folder,
                                   "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()

    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder,
                                "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print(
            "-> No ground truth is available for the KITTI benchmark, so not evaluating. Done."
        )
        quit()
    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz")
    gt_depths = np.load(gt_path,
                        fix_imports=True,
                        encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(
                  STEREO_SCALE_FACTOR))
        opt.scaling = "disable"
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []
    ratios_dgc = []
    ex_logs = []
    mean_scale = []
    side_map = {"2": 2, "3": 3, "l": 2, "r": 3}
    #resize_ori = transforms.Resize((pred_disps.shape[1],pred_disps.shape[2]),interpolation=Image.ANTIALIAS)

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]
        line = filenames[i].split()
        folder = line[0]
        frame_index = line[1]
        side = side_map[line[2]]
        color = pil_loader(get_image_path(folder, int(frame_index), side))
        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if (opt.eval_split == "eigen") | (opt.eval_split == "AirSim"):
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)
            '''
            crop = np.array(
                [0.40810811 * gt_height, 0.99189189 * gt_height,
                 0.03594771 * gt_width,  0.96405229 * gt_width]).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)
            if opt.eval_object:
                object_mask = object_masks[i].astype(np.bool)
            '''

        else:
            mask = gt_depth > 0

        if opt.scaling == "gt":
            ratio = np.median(gt_depth[mask]) / np.median(pred_depth[mask])
            ratios.append(ratio)
            if opt.eval_object:
                mask = np.logical_and(mask, object_mask)
        #elif opt.scaling == "dgc":
            scale_recovery = ScaleRecovery(1, gt_height, gt_width, K).cuda()
            #scale_recovery = ScaleRecovery(1, 192, 640, K).cuda()
            pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
            ratio1 = scale_recovery(pred_depth)
            ratio = ratio1.cpu().item()
            ratios_dgc.append(ratio)
            pred_depth = pred_depth[0].cpu().numpy()
            '''
            surface_normal = surface_normal1.cpu()[0,:,:,:].numpy()
            ground_mask = ground_mask1.cpu()[0,0,:,:].numpy()
            pred_depth = pred_depth[0].cpu().numpy()
            
            if i==28:
                np.save('pred_disp28.npy',pred_disp)
                np.save('surface_normal28.npy',surface_normal)
                np.save('ground_mask28.npy',ground_mask)
                print(np.min(pred_depth),np.max(pred_depth),ratio)
            '''
        else:
            ratio = 1
        #print(ratio)
        #print(max(pred_depth))
        #print(min(pred_depth))

        pred_depth_ori = pred_depth * mask
        gt_depth_ori = gt_depth * mask
        pred_depth_ori = np.where(mask == 1, pred_depth_ori, 1)
        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]
        mean_scale.append(np.mean(gt_depth / pred_depth))

        error_try = 100
        scale_abs = 0
        for ratio_try in np.arange(0.1, 50, step=0.1):
            pred_depth1 = pred_depth * ratio_try
            error_tmp = compute_errors(gt_depth, pred_depth1)[0]
            #print(error_tmp)
            if error_tmp < error_try:
                error_try = error_tmp
                scale_abs = ratio_try
        ex_logs.append(scale_abs)
        div_scale = gt_depth_ori / pred_depth_ori
        #print(div_scale.shape)
        div_values1 = div_scale[mask]
        div_scale = (div_scale - scale_abs) / scale_abs
        div_values = div_scale[mask]
        #div_rmse = sqrt(sum((div_values1-scale_abs)*(div_values1-scale_abs))/len(div_values1))
        print("min,max value of div_values no abs is", min(div_values),
              max(div_values))
        #ex_logs.append([i,min(div_values), max(div_values), div_rmse,scale_abs])
        #print(div_scale.shape)
        #div_scale = div_scale/np.max(div_scale)

        mu = np.mean(div_values1)
        sigma = np.std(div_values1)
        print("min,max of div_values1 is", min(div_values1), max(div_values1))
        fig, ax = plt.subplots()
        n, bins, patches = ax.hist(div_values1,
                                   150,
                                   range=(3, 130),
                                   density=True)
        y = norm.pdf(bins, mu, sigma)
        ax.plot(bins, y, 'r')
        plt.xlabel('Scale')
        plt.ylabel('Density')
        plt.savefig(
            os.path.join(os.path.dirname(__file__), "hist_imgs_AirSim",
                         "{}.jpg".format(i)))
        plt.close()

        blending_imgs(div_scale, color, i, mask)

        pred_depth *= ratio
        ratios.append(ratio)

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH
        #blending_imgs(div_scale, color,i,mask)

        if len(gt_depth) != 0:
            errors.append(compute_errors(gt_depth, pred_depth))
    ratios_dgc = np.array(ratios_dgc)
    ratios = np.array(ratios)
    np.save('ideal_scale_AirSim.npy', ex_logs)
    np.save('median_raitos_AirSim.npy', ratios)
    np.save('dgc_raitos_AirSim.npy', ratios_dgc)
    med = np.median(ratios)
    print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
        med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " +
          ("{:>8} | " * 7
           ).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")

    print("\n-> Done!")
Example #7
def test_simple_inputs(image_path, model_name, output_path, cuda_is_available):
    """Function to predict for a single image or folder of images
    """
    assert model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if cuda_is_available:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    #download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("models", model_name)
    #print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    #print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    #print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(image_path):
        # Only testing on a single image
        paths = [image_path]
        #output_directory = os.path.dirname(image_path)
        output_directory = os.path.dirname(output_path)
    elif os.path.isdir(image_path):
        # Searching folder for images
        paths = glob.glob(os.path.join(image_path, '*{}'.format('.jpg')))
        output_directory = image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(image_path))

    #print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpg".format(output_name))
            plt.imsave(name_dest_im, disp_resized_np, cmap='magma', vmax=vmax)
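
A hypothetical call to this function; the paths and model name below are placeholders, not files shipped with the snippet:

import torch

test_simple_inputs(
    image_path="assets/test_image.jpg",
    model_name="mono_640x192",
    output_path="assets/test_image_disp.jpg",
    cuda_is_available=torch.cuda.is_available(),
)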
Example #8
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    print("-> Loading weights from {}".format(opt.load_weights_folder))

    filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt"))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)

    if opt.use_stereo:
        opt.frame_ids.append("s")
    if opt.dataset == 'cityscape':
        dataset = datasets.CITYSCAPERawDataset(opt.data_path,
                                               filenames,
                                               encoder_dict['height'],
                                               encoder_dict['width'],
                                               opt.frame_ids,
                                               4,
                                               is_train=False,
                                               tag=opt.dataset,
                                               load_meta=True,
                                               is_sep_train_seman=False)
    elif opt.dataset == 'kitti':
        dataset = datasets.KITTIRAWDataset(opt.data_path,
                                           filenames,
                                           encoder_dict['height'],
                                           encoder_dict['width'],
                                           opt.frame_ids,
                                           4,
                                           is_train=False,
                                           tag=opt.dataset)
    else:
        raise ValueError("No predefined dataset")
    dataloader = DataLoader(dataset,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=True,
                            drop_last=True)

    encoder = networks.ResnetEncoder(opt.num_layers, False)
    if opt.switchMode == 'on':
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc,
                                              isSwitch=True,
                                              isMulChannel=opt.isMulChannel)
    else:
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict(
        {k: v
         for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    # x = torch.ones(2, 2, requires_grad=True)
    # print(x)
    # y = x + 2 + x
    # y = y.detach()
    # print(y)
    # z = y * y * 3
    # out = z.mean()
    # print(z, out)
    # out.backward()
    # print(x.grad)

    ##--------------------Visualization parameter here----------------------------##
    sfx = torch.nn.Softmax(dim=1)
    mergeDisp = Merge_MultDisp(opt.scales,
                               batchSize=opt.batch_size,
                               isMulChannel=opt.isMulChannel)
    svRoot = '/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/figure_visual'
    index = 0
    isvisualize = True
    viewEdgeMerge = False
    isHist = False
    useGtSeman = True
    viewSurfaceNormal = True
    viewSelfOcclu = True
    viewDispUp = True
    viewSmooth = True
    viewMulReg = True
    viewBorderRegress = False
    viewBorderSimilarity = False
    viewRandomSample = True
    viewSemanReg = False
    viewDepthGuess = False
    height = 256
    width = 512
    tensor23dPts = Tensor23dPts()

    if isHist:
        rec = np.zeros((19, 100))

    if opt.isMulChannel:
        app = os.path.join('mulDispOn', opt.model_name)
    else:
        app = os.path.join('mulDispOff', opt.model_name)

    dirpath = os.path.join(svRoot, app)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    if viewSmooth:
        comSmooth = ComputeSmoothLoss().cuda()

    if viewEdgeMerge:
        comp1dgrad = Comp1dgrad().cuda()

    if viewSurfaceNormal:
        compsn = ComputeSurfaceNormal(height=height,
                                      width=width,
                                      batch_size=opt.batch_size).cuda()

    if viewSelfOcclu:
        selfclu = SelfOccluMask().cuda()

    if viewDispUp:
        compDispUp = ComputeDispUpLoss().cuda()

    if viewMulReg:
        objReg = ObjRegularization()
        objReg.cuda()

    if viewBorderRegress:
        borderRegress = BorderRegression()
        borderRegress.cuda()

    if viewRandomSample:
        rdSampleOnBorder = RandomSampleNeighbourPts()
        rdSampleOnBorder.cuda()

    if viewSemanReg:
        rdSampleSeman = RandomSampleBorderSemanPts()
        rdSampleSeman.cuda()

    if viewDepthGuess:
        depthGuess = DepthGuessesBySemantics(batchNum=opt.batch_size,
                                             width=width,
                                             height=height)
        depthGuess.cuda()
    # if viewBorderSimilarity:
    #     borderSim = BorderSimilarity()
    #     borderSim.cuda()
    with torch.no_grad():
        for idx, inputs in enumerate(dataloader):
            # if idx != 12:
            #     continue
            for key, ipt in inputs.items():
                if not (key == 'height' or key == 'width' or key == 'tag'
                        or key == 'cts_meta'):
                    inputs[key] = ipt.to(torch.device("cuda"))
            input_color = inputs[("color", 0, 0)].cuda()
            # input_color = torch.flip(input_color, dims=[3])
            features = encoder(input_color)
            outputs = dict()
            outputs.update(
                depth_decoder(features,
                              computeSemantic=True,
                              computeDepth=False))
            outputs.update(
                depth_decoder(features,
                              computeSemantic=False,
                              computeDepth=True))

            # view the processed semantic seperate training data
            # for viewInd in range(opt.batch_size):
            #     label = inputs['semanTrain_label']
            #     visualize_semantic(label[viewInd, 0, :, :].cpu().numpy()).show()
            #     fig_rgb = inputs['semanTrain_rgb'][viewInd, :, :, :].permute(1, 2, 0).cpu().numpy()
            #     fig_rgb = (fig_rgb * 255).astype(np.uint8)
            #     fig_rgb = pil.fromarray(fig_rgb)
            #     fig_rgb.show()

            if isHist:
                mulDisp = outputs[('mul_disp', 0)]
                scaled_disp, mulDepth = disp_to_depth(mulDisp, 0.1, 100)
                mulDepth = mulDepth.cpu()
                for i in range(mulDisp.shape[1]):
                    rec[i, :] += torch.histc(mulDepth[:, i, :, :],
                                             bins=100,
                                             min=0,
                                             max=100).numpy()

            if isvisualize:
                if useGtSeman:
                    # outputs[('mul_disp', 0)][:,2,:,:] = outputs[('mul_disp', 0)][:,2,:,:] * 0
                    # outputs[('mul_disp', 0)][:, 12, :, :] = outputs[('mul_disp', 0)][:, 12, :, :] * 0
                    mergeDisp(inputs, outputs, eval=False)
                else:
                    mergeDisp(inputs, outputs, eval=True)

                dispMap = outputs[('disp', 0)]
                scaled_disp, depthMap = disp_to_depth(dispMap, 0.1, 100)
                depthMap = depthMap * STEREO_SCALE_FACTOR
                # _, mul_depthMap = disp_to_depth(outputs[('mul_disp', 0)], 0.1, 100)
                # mul_depthMap = mul_depthMap * STEREO_SCALE_FACTOR

                if viewDispUp:
                    fig_dispup = compDispUp.visualize(scaled_disp,
                                                      viewindex=index)

                if viewSmooth:
                    rgb = inputs[('color_aug', 0, 0)]
                    smoothfig = comSmooth.visualize(rgb=rgb,
                                                    disp=scaled_disp,
                                                    viewindex=index)

                if useGtSeman:
                    fig_seman = tensor2semantic(inputs['seman_gt'],
                                                ind=index,
                                                isGt=True)
                else:
                    fig_seman = tensor2semantic(outputs[('seman', 0)],
                                                ind=index)

                if viewSemanReg:
                    foregroundType = [
                        11, 12, 13, 14, 15, 16, 17, 18
                    ]  # person, rider, car, truck, bus, train, motorcycle, bicycle
                    softmaxedSeman = F.softmax(outputs[('seman', 0)], dim=1)
                    forePredMask = torch.sum(
                        softmaxedSeman[:, foregroundType, :, :],
                        dim=1,
                        keepdim=True)
                    foreGtMask = torch.ones(dispMap.shape).cuda().byte()

                    for m in foregroundType:
                        foreGtMask = foreGtMask * (inputs['seman_gt'] != m)
                    foreGtMask = 1 - foreGtMask
                    foreGtMask = foreGtMask.float()

                    forePredMask[forePredMask > 0.5] = 1
                    forePredMask[forePredMask <= 0.5] = 0

                    forePredMask = foreGtMask
                    rdSampleSeman.visualizeBorderSample(dispMap,
                                                        forePredMask,
                                                        gtMask=foreGtMask,
                                                        viewIndex=index)

                    cm = plt.get_cmap('magma')
                    viewForePred = forePredMask[index, :, :, :].squeeze(
                        0).detach().cpu().numpy()
                    viewForePred = (cm(viewForePred) * 255).astype(np.uint8)
                    # pil.fromarray(viewForePred).show()

                    viewForeGt = foreGtMask[index, :, :, :].squeeze(
                        0).detach().cpu().numpy()
                    viewForeGt = (cm(viewForeGt) * 255).astype(np.uint8)
                    # pil.fromarray(viewForeGt).show()
                    forePredictCombined = np.concatenate(
                        [viewForePred, viewForeGt], axis=0)
                    # pil.fromarray(forePredictCombined).show()
                    pil.fromarray(forePredictCombined).save(
                        os.path.join(dirpath,
                                     str(idx) + '_fg.png'))

                if viewDepthGuess:
                    wallType = [2, 3, 4]  # Building, wall, fence
                    roadType = [0, 1, 9]  # road, sidewalk, terrain
                    foregroundType = [
                        5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18
                    ]  # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle

                    wallTypeMask = torch.ones(dispMap.shape).cuda().byte()
                    roadTypeMask = torch.ones(dispMap.shape).cuda().byte()
                    foreGroundMask = torch.ones(dispMap.shape).cuda().byte()

                    with torch.no_grad():
                        for m in wallType:
                            wallTypeMask = wallTypeMask * (inputs['seman_gt']
                                                           != m)
                        wallTypeMask = (1 - wallTypeMask).float()

                        for m in roadType:
                            roadTypeMask = roadTypeMask * (inputs['seman_gt']
                                                           != m)
                        roadTypeMask = (1 - roadTypeMask).float()

                        for m in foregroundType:
                            foreGroundMask = foreGroundMask * (
                                inputs['seman_gt'] != m)
                        foreGroundMask = (1 - foreGroundMask).float()
                    originalSize = [2048, 1024]
                    # currentSize = np.array([dispMap.shape[3], dispMap.shape[2]])
                    # scaleFac = np.eye(4)
                    # scaleFac[0,0] = currentSize[0] / originalSize[0]
                    # scaleFac[1,1] = currentSize[1] / originalSize[1]
                    # scaleFac = torch.Tensor(scaleFac).view(1,4,4).repeat(opt.batch_size, 1, 1).cuda()
                    # scaledIntrinsic = scaleFac @ inputs['realIn']
                    scaledIntrinsic = inputs['realIn']
                    depthGuess.visualizeDepthGuess(
                        realDepth=depthMap,
                        dispAct=dispMap,
                        foredgroundMask=foreGroundMask,
                        wallTypeMask=wallTypeMask,
                        groundTypeMask=roadTypeMask,
                        intrinsic=scaledIntrinsic,
                        extrinsic=inputs['realEx'],
                        semantic=inputs['seman_gt_eval'],
                        cts_meta=inputs['cts_meta'],
                        viewInd=index)
                    # realDepth, foredgroundMask, wallTypeMask, groundTypeMask, intrinsic, extrinsic

                fig_rgb = tensor2rgb(inputs[('color', 0, 0)], ind=index)
                fig_disp = tensor2disp(outputs[('disp', 0)], ind=index)
                fig_3d, veh_coord, veh_coord_gt = tensor23dPts.visualize3d(
                    depthMap,
                    ind=index,
                    intrinsic=inputs['cts_meta']['intrinsic'][index, :, :],
                    extrinsic=inputs['cts_meta']['extrinsic'][index, :, :],
                    gtmask=inputs['cts_meta']['mask'][index, :, :],
                    gtdepth=inputs['cts_meta']['depthMap'][index, :, :],
                    semanticMap=inputs['seman_gt_eval'][index, :, :])
                # check:
                # torch.inverse(inputs['invcamK'][index, :, :] @ inputs['realIn'][index, :, :]) - inputs['cts_meta']['extrinsic'][index, :, :]
                fig_grad = None

                if viewSurfaceNormal:
                    # surnorm = compsn.visualize(depthMap = depthMap, invcamK = inputs['invcamK'].cuda(), orgEstPts = veh_coord, gtEstPts = veh_coord_gt, viewindex = index)
                    surnorm = compsn.visualize(
                        depthMap=depthMap,
                        invcamK=inputs['invcamK'].cuda(),
                        orgEstPts=veh_coord,
                        gtEstPts=veh_coord_gt,
                        viewindex=index)
                    surnormMap = compsn(depthMap=depthMap,
                                        invcamK=inputs['invcamK'].cuda())

                if viewMulReg:
                    depthMapLoc = depthMap / STEREO_SCALE_FACTOR
                    skyId = 10
                    skyMask = inputs['seman_gt'] == skyId
                    skyerr = objReg.visualize_regularizeSky(depthMapLoc,
                                                            skyMask,
                                                            viewInd=index)

                    wallType = [2, 3, 4]  # Building, wall, fence
                    roadType = [0, 1, 9]  # road, sidewalk, terrain
                    permuType = [5, 7]  # Pole, traffic sign
                    chanWinSize = 5

                    wallMask = torch.ones_like(skyMask)
                    roadMask = torch.ones_like(skyMask)
                    permuMask = torch.ones_like(skyMask)

                    with torch.no_grad():
                        for m in wallType:
                            wallMask = wallMask * (inputs['seman_gt'] != m)
                        wallMask = 1 - wallMask
                        wallMask = wallMask[:, :, 1:-1, 1:-1]

                        for m in roadType:
                            roadMask = roadMask * (inputs['seman_gt'] != m)
                        roadMask = 1 - roadMask
                        roadMask = roadMask[:, :, 1:-1, 1:-1]

                        for m in permuType:
                            permuMask = permuMask * (inputs['seman_gt'] != m)
                        permuMask = 1 - permuMask
                        permuMask = permuMask[:, :, 1:-1, 1:-1]

                    BdErrFig, viewRdErrFig = objReg.visualize_regularizeBuildingRoad(
                        surnormMap, wallMask, roadMask, dispMap, viewInd=index)

                    padSize = int((chanWinSize - 1) / 2)
                    permuMask = permuMask[:, :, padSize:-padSize,
                                          padSize:-padSize]
                    surVarFig = objReg.visualize_regularizePoleSign(
                        surnormMap, permuMask, dispMap, viewInd=index)

                if viewBorderRegress:
                    foregroundType = [
                        5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18
                    ]  # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle
                    backgroundType = [
                        0, 1, 2, 3, 4, 8, 9, 10
                    ]  # road, sidewalk, building, wall, fence, vegetation, terrain, sky
                    suppressType = [255]  # Suppress no label lines
                    # foreGroundMask = torch.sum(inputs['seman_gt'][:, foregroundType, :, :], dim=1, keepdim=True)
                    # backGroundMask = torch.sum(inputs['seman_gt'][:, backgroundType, :, :], dim=1, keepdim=True)
                    foreGroundMask = torch.ones(dispMap.shape).cuda().byte()
                    backGroundMask = torch.ones(dispMap.shape).cuda().byte()
                    suppresMask = torch.ones(dispMap.shape).cuda().byte()

                    with torch.no_grad():
                        for m in foregroundType:
                            foreGroundMask = foreGroundMask * (
                                inputs['seman_gt'] != m)
                        foreGroundMask = 1 - foreGroundMask
                        for m in backgroundType:
                            backGroundMask = backGroundMask * (
                                inputs['seman_gt'] != m)
                        backGroundMask = 1 - backGroundMask
                        for m in suppressType:
                            suppresMask = suppresMask * (inputs['seman_gt'] !=
                                                         m)
                        suppresMask = 1 - suppresMask
                        suppresMask = suppresMask.float()
                        combinedMask = torch.cat(
                            [foreGroundMask, backGroundMask], dim=1).float()

                    # borderRegFig = borderRegress.visualize_computeBorder(dispMap, combinedMask, suppresMask = suppresMask, viewIndex=index)
                    borderRegFig = None

                else:
                    borderRegFig = None

                # if viewBorderSimilarity:
                #     foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17,
                #                       18]  # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle
                #     backgroundType = [0, 1, 2, 3, 4, 8, 9,
                #                       10]  # road, sidewalk, building, wall, fence, vegetation, terrain, sky
                #     suppressType = [255]  # Suppress no label lines
                #     foreGroundMask = torch.ones(dispMap.shape).cuda().byte()
                #     backGroundMask = torch.ones(dispMap.shape).cuda().byte()
                #     suppresMask = torch.ones(dispMap.shape).cuda().byte()
                #
                #     with torch.no_grad():
                #         for m in foregroundType:
                #             foreGroundMask = foreGroundMask * (inputs['seman_gt'] != m)
                #         foreGroundMask = 1 - foreGroundMask
                #         for m in backgroundType:
                #             backGroundMask = backGroundMask * (inputs['seman_gt'] != m)
                #         backGroundMask = 1 - backGroundMask
                #         for m in suppressType:
                #             suppresMask = suppresMask * (inputs['seman_gt'] != m)
                #         suppresMask = 1 - suppresMask
                #         suppresMask = suppresMask.float()
                #         combinedMask = torch.cat([foreGroundMask, backGroundMask], dim=1).float()
                #
                #     borderSimFig = borderSim.visualize_borderSimilarity(dispMap, foreGroundMask.float(), suppresMask = suppresMask, viewIndex=index)

                if viewRandomSample:
                    foregroundType = [
                        5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18
                    ]  # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle
                    backgroundType = [
                        0, 1, 2, 3, 4, 8, 9, 10
                    ]  # road, sidewalk, building, wall, fence, vegetation, terrain, sky
                    suppressType = [255]  # Suppress no label lines
                    foreGroundMask = torch.ones(dispMap.shape).cuda().byte()
                    backGroundMask = torch.ones(dispMap.shape).cuda().byte()
                    suppresMask = torch.ones(dispMap.shape).cuda().byte()

                    with torch.no_grad():
                        for m in foregroundType:
                            foreGroundMask = foreGroundMask * (
                                inputs['seman_gt'] != m)
                        foreGroundMask = 1 - foreGroundMask
                        for m in suppressType:
                            suppresMask = suppresMask * (inputs['seman_gt'] !=
                                                         m)
                        suppresMask = 1 - suppresMask
                        suppresMask = suppresMask.float()
                        foreGroundMask = foreGroundMask.float()

                    rdSampleOnBorder.visualize_randomSample(dispMap,
                                                            foreGroundMask,
                                                            suppresMask,
                                                            viewIndex=index)
                    # rdSampleOnBorder.randomSampleReg(dispMap, foreGroundMask)

                if viewEdgeMerge:
                    grad_disp = comp1dgrad(outputs[('mul_disp', 0)])
                    fig_grad = tensor2disp(grad_disp, ind=index, vmax=1)
                    fig_grad = fig_grad.resize([512, 256])

                if viewSelfOcclu:
                    fl = inputs[("K", 0)][:, 0, 0]
                    bs = torch.abs(inputs["stereo_T"][:, 0, 3])
                    clufig, suppressedDisp = selfclu.visualize(dispMap,
                                                               viewind=index)

                if fig_grad is not None:
                    grad_seman = (
                        np.array(fig_grad)[:, :, 0:3].astype(np.float) * 0.7 +
                        np.array(fig_seman).astype(np.float) * 0.3).astype(
                            np.uint8)
                    # combined = [np.array(fig_disp)[:, :, 0:3], np.array(fig_grad)[:, :, 0:3], np.array(fig_seman), np.array(fig_rgb)]
                    combined = [
                        grad_seman,
                        np.array(fig_disp)[:, :, 0:3],
                        np.array(fig_rgb)
                    ]
                    combined = np.concatenate(combined, axis=1)
                else:
                    if viewSurfaceNormal and viewSelfOcclu:
                        surnorm = surnorm.resize([512, 256])
                        surnorm_mixed = pil.fromarray(
                            (np.array(surnorm) * 0.2 +
                             np.array(fig_disp)[:, :, 0:3] * 0.8).astype(
                                 np.uint8))
                        disp_seman = (
                            np.array(fig_disp)[:, :, 0:3].astype(np.float) *
                            0.8 +
                            np.array(fig_seman).astype(np.float) * 0.2).astype(
                                np.uint8)
                        suppressed_disp_seman = (
                            np.array(suppressedDisp)[:, :, 0:3].astype(
                                np.float) * 0.8 +
                            np.array(fig_seman).astype(np.float) * 0.2).astype(
                                np.uint8)
                        rgb_seman = (
                            np.array(fig_seman).astype(np.float) * 0.5 +
                            np.array(fig_rgb).astype(np.float) * 0.5).astype(
                                np.uint8)

                        # clud_disp = (np.array(clufig)[:, :, 0:3].astype(np.float) * 0.3 + np.array(fig_disp)[:, :, 0:3].astype(
                        #     np.float) * 0.7).astype(np.uint8)
                        comb1 = np.concatenate([
                            np.array(suppressed_disp_seman)[:, :, 0:3],
                            np.array(suppressedDisp)[:, :, 0:3]
                        ],
                                               axis=1)
                        comb2 = np.concatenate([
                            np.array(disp_seman)[:, :, 0:3],
                            np.array(fig_disp)[:, :, 0:3]
                        ],
                                               axis=1)
                        comb3 = np.concatenate([
                            np.array(surnorm_mixed)[:, :, 0:3],
                            np.array(surnorm)[:, :, 0:3]
                        ],
                                               axis=1)
                        comb4 = np.concatenate([
                            np.array(fig_seman)[:, :, 0:3],
                            np.array(rgb_seman)[:, :, 0:3]
                        ],
                                               axis=1)
                        comb6 = np.concatenate([
                            np.array(clufig)[:, :, 0:3],
                            np.array(fig_dispup)[:, :, 0:3]
                        ],
                                               axis=1)

                        fig3dsize = np.ceil(
                            np.array([
                                comb4.shape[1], comb4.shape[1] /
                                fig_3d.size[0] * fig_3d.size[1]
                            ])).astype(np.int)
                        comb5 = np.array(fig_3d.resize(fig3dsize))
                        # combined = np.concatenate([comb1, comb6, comb2, comb3, comb4, comb5], axis=0)
                        combined = np.concatenate([comb1, comb2, comb4, comb3],
                                                  axis=0)
                    else:
                        disp_seman = (
                            np.array(fig_disp)[:, :, 0:3].astype(np.float) *
                            0.8 +
                            np.array(fig_seman).astype(np.float) * 0.2).astype(
                                np.uint8)
                        rgb_seman = (
                            np.array(fig_seman).astype(np.float) * 0.5 +
                            np.array(fig_rgb).astype(np.float) * 0.5).astype(
                                np.uint8)
                        # combined = [np.array(disp_seman)[:,:,0:3], np.array(fig_disp)[:, :, 0:3], np.array(fig_seman), np.array(fig_rgb)]
                        combined = [
                            np.array(disp_seman)[:, :, 0:3],
                            np.array(fig_disp)[:, :, 0:3],
                            np.array(fig_seman),
                            np.array(rgb_seman)
                        ]
                        combined = np.concatenate(combined, axis=1)

                fig = pil.fromarray(combined)
                # fig.show()
                fig.save(os.path.join(dirpath, str(idx) + '.png'))
                if borderRegFig is not None:
                    borderRegFig.save(
                        os.path.join(dirpath,
                                     str(idx) + '_borderRegress.png'))
                # fig_3d.save(os.path.join(dirpath, str(idx) + '_fig3d.png'))
                # for k in range(10):
                #     fig_disp = tensor2disp(outputs[('disp', 0)], ind=k)
                #     fig_rgb = tensor2rgb(inputs[('color', 0, 0)], ind=k)
                #     combined = [np.array(fig_disp)[:, :, 0:3], np.array(fig_rgb)]
                #     combined = np.concatenate(combined, axis=1)
                #     fig = pil.fromarray(combined)
                #     fig.save(
                #         os.path.join('/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/MoredispOrg' + str(k) + '.png'))

                # fig_rgb.save(os.path.join(svRoot, app, 'rgb' + str(idx) + '.png'))
                # fig_seman.save(os.path.join(svRoot, app, 'semantic'+ str(idx) + '.png'))
                # fig_disp.save(os.path.join(svRoot, app, 'disp'+ str(idx) + '.png'))
                # a = inputs['seman_gt_eval']
                # scaled_disp, _ = disp_to_depth(outputs[('disp', 0)], 0.1, 100)
                print("%dth saved" % idx)
    # Optionally compute and plot the per-class depth-distribution histogram
    if isHist:
        svPath = '/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/mul_channel_depth'
        carId = 13
        prob = copy.deepcopy(rec)
        ind = np.arange(prob.shape[1] * 2)
        for i in range(prob.shape[0]):
            prob[i, :] = prob[i, :] / np.sum(prob[i, :])
        for i in range(prob.shape[0]):
            trainStr = trainId2label[i][0]
            fig, ax = plt.subplots()
            rects1 = ax.bar(ind[0::2], prob[carId, :], label='obj:car')
            rects2 = ax.bar(ind[1::2], prob[i, :], label='obj:' + trainStr)
            ax.set_ylabel('Meter in percentile')
            ax.set_xlabel('Meters')
            ax.set_title('Scale Changes between scale car and scale %s' %
                         trainStr)
            ax.legend()
            plt.savefig(os.path.join(svPath, str(i)), dpi=200)
            plt.close(fig)
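Note: the visualization code above repeatedly builds binary class masks from the Cityscapes trainId map by multiplying together inequality tests and inverting the result. A minimal, self-contained sketch of that pattern (the helper name and exact dtype handling are illustrative assumptions) is:

import torch

def build_class_mask(seman_gt, class_ids):
    """Return a float mask that is 1 wherever seman_gt holds any id in class_ids.

    seman_gt  : tensor of shape [B, 1, H, W] holding Cityscapes trainIds.
    class_ids : iterable of trainIds, e.g. [11, 12, 13] for person/rider/car.
    """
    mask = torch.ones_like(seman_gt, dtype=torch.bool)
    for class_id in class_ids:
        mask = mask & (seman_gt != class_id)  # stays True only where no id matches
    return (~mask).float()                    # invert so the selected classes become 1

With such a helper, the repeated loops above reduce to single calls, e.g. foreGroundMask = build_class_mask(inputs['seman_gt'], foregroundType).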
def main():
    global args
    checkpoint = None
    is_eval = False
    if args.evaluate:
        args_new = args
        if os.path.isfile(args.evaluate):
            print("=> loading checkpoint '{}' ... ".format(args.evaluate),
                  end='')
            checkpoint = torch.load(args.evaluate, map_location=device)
            args = checkpoint['args']
            args.data_folder = args_new.data_folder
            args.val = args_new.val
            is_eval = True
            print("Completed.")
        else:
            print("No model found at '{}'".format(args.evaluate))
            return
    elif args.resume:  # optionally resume from a checkpoint
        args_new = args
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}' ... ".format(args.resume),
                  end='')
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch'] + 1
            args.data_folder = args_new.data_folder
            args.val = args_new.val
            print("Completed. Resuming from epoch {}.".format(
                checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))
            return

    ################# model

    print("=> creating model and optimizer ... ", end='')
    parameters_to_train = []
    encoder = networks.ResnetEncoder(num_layers=18)
    encoder.to(device)
    parameters_to_train += list(encoder.parameters())
    decoder = networks.DepthDecoder(encoder.num_ch_enc)
    decoder.to(device)
    parameters_to_train += list(decoder.parameters())
    # encoder_named_params = [
    #     p for _, p in encoder.named_parameters() if p.requires_grad
    # ]
    optimizer = torch.optim.Adam(parameters_to_train,
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    encoder = torch.nn.DataParallel(encoder)
    decoder = torch.nn.DataParallel(decoder)
    model = [encoder, decoder]
    print("completed.")
    # if checkpoint is not None:
    #     model.load_state_dict(checkpoint['model'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])
    #     print("=> checkpoint state loaded.")

    # Data loading code
    print("=> creating data loaders ... ")
    if not is_eval:
        train_dataset = KittiDepth('train', args)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True,
                                                   sampler=None)
        print("\t==> train_loader size:{}".format(len(train_loader)))
    val_dataset = KittiDepth('val', args)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=12,  # use 1 for strictly per-image validation
        shuffle=False,
        num_workers=2,
        pin_memory=True)
    print("\t==> val_loader size:{}".format(len(val_loader)))

    ##############################################################

    # create backups and results folder
    logger = helper.logger(args)
    # if checkpoint is not None:
    #     logger.best_result = checkpoint['best_result']
    print("=> logger created.")

    if is_eval:
        print("=> starting model evaluation ...")
        result, is_best = iterate("val", args, val_loader, model, None, logger,
                                  checkpoint['epoch'])
        return

    # main loop
    print("=> starting main loop ...")
    for epoch in range(args.start_epoch, args.epochs):
        print("=> starting training epoch {} ..".format(epoch))
        iterate("train", args, train_loader, model, optimizer, logger,
                epoch)  # train for one epoch
        result, is_best = iterate("val", args, val_loader, model, None, logger,
                                  epoch)  # evaluate on validation set
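The commented-out restore logic above hints at checkpoint handling but is left incomplete for the encoder/decoder pair used here. A hedged sketch of saving and restoring both models plus the optimizer (the dictionary keys and function names below are assumptions, not this project's actual checkpoint format) could look like:

import torch

def save_checkpoint(path, epoch, encoder, decoder, optimizer):
    # Illustrative checkpoint layout; the keys are assumptions, not the project's format.
    torch.save({
        'epoch': epoch,
        'encoder': encoder.state_dict(),
        'decoder': decoder.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, path)

def load_checkpoint(path, encoder, decoder, optimizer, device):
    checkpoint = torch.load(path, map_location=device)
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch'] + 1  # epoch to resume training from

Note that if the models are wrapped in torch.nn.DataParallel before saving, the state_dict keys gain a "module." prefix, so saving and loading should be done consistently on either the wrapped or the unwrapped modules.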
Example #10
def main(args):
    """Function to predict for a single image or folder of images
    """
    print(args.dataset_path)
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    #download_model_if_doesnt_exist(args.model_path,args.model_name)

    model_path = Path(args.model_path) / args.model_name
    if not model_path.exists():
        print(model_path + " does not exist")

    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    #1 LOADING PRETRAINED MODEL
    #1.1 encoder
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    #1.2 decoder
    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    #2. FINDING INPUT IMAGES

    dataset_path = Path(args.dataset_path)

    #files
    root = Path(os.path.dirname(__file__))
    txt = root / 'splits' / args.split / args.txt_files
    print('-> inference file: ', txt)
    rel_paths = readlines(txt)
    #out
    if args.out_path is not None:
        out_path = Path(args.out_path)
    else:
        out_path = Path('./' + dataset_path.stem + '_out')
    out_path.mkdir_p()

    files = []
    # convert relative paths into full image paths
    if args.split in ['custom', 'custom_lite', 'eigen', 'eigen_zhou']:  #kitti
        for item in rel_paths:
            item = item.split(' ')
            if item[2] == 'l': camera = 'image_02'
            elif item[2] == 'r': camera = 'image_01'
            files.append(dataset_path / item[0] / camera / 'data' /
                         "{:010d}.png".format(int(item[1])))
    elif args.split == 'mc':
        for item in rel_paths:
            #item = item.split('/')
            files.append(item)
    elif args.split in ['visdrone', 'visdrone_lite']:
        for item in rel_paths:
            item = item.split('/')
            files.append(dataset_path / item[0] / item[1] + '.jpg')
    else:
        for item in rel_paths:
            item = item.split('/')
            files.append(dataset_path / item[0] / item[1] + '.jpg')


    #2.1

    cnt = 0
    #3. PREDICTING ON EACH IMAGE IN TURN
    print('\n-> inference ' + args.dataset_path)
    files.sort()
    for image_path in tqdm(files):

        # Load image and preprocess

        if args.split == 'mc':
            input_image = pil.open(dataset_path / image_path +
                                   '.png').convert('RGB')
        else:
            input_image = pil.open(image_path).convert('RGB')

        original_width, original_height = input_image.size
        input_image = input_image.resize((feed_width, feed_height),
                                         pil.LANCZOS)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)

        # PREDICTION
        input_image = input_image.to(device)  #torch.Size([1, 3, 192, 640])
        features = encoder(input_image)  # list of encoder feature maps, scales 0 to 4
        outputs = depth_decoder(features)  # dict of disparity tensors at 4 scales
        cnt += 1
        disp = outputs[("disp", 0)]  # same spatial size as the input
        disp_resized = torch.nn.functional.interpolate(
            disp, (original_height, original_width),
            mode="bilinear",
            align_corners=False)

        # Saving numpy file
        #if args.out_name=='num':
        if args.split == 'eigen' or args.split == 'custom':
            output_name = str(image_path).split('/')[-4] + '_{}'.format(
                image_path.stem)
        elif args.split == 'mc':
            block, p, color, frame = image_path.split('/')
            output_name = str(image_path).replace('/', '_') + '.png'
        elif args.split == 'visdrone' or args.split == 'visdrone_lite':
            output_name = image_path.relpath(dataset_path).replace(
                '.jpg', '').replace('/', '_')
            pass
        elif args.split == 'custom_mono':
            output_name = image_path.relpath(dataset_path).replace(
                '.jpg', '').replace('/', '_')
        else:
            output_name = image_path.relpath(dataset_path).replace(
                '.jpg', '').replace('/', '_')

        if args.npy_out:
            name_dest_npy = os.path.join(out_path,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

        # Saving colormapped depth image
        disp_resized_np = disp_resized.squeeze().cpu().numpy()
        vmax = np.percentile(disp_resized_np, 95)
        name_dest_im = Path(out_path) / "{}.png".format(output_name)
        plt.imsave(name_dest_im, disp_resized_np, cmap='magma', vmax=vmax)

    print(cnt)

    print('\n-> Done, saved at ' + str(out_path))
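For reference, the disp_to_depth call used throughout these examples follows the monodepth2 convention of mapping the network's sigmoid output to an inverse-depth range; a sketch consistent with that convention (shown here for orientation, not copied from this repository):

def disp_to_depth(disp, min_depth, max_depth):
    """Convert the network's sigmoid disparity output to depth (monodepth2 convention)."""
    min_disp = 1.0 / max_depth
    max_disp = 1.0 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1.0 / scaled_disp
    return scaled_disp, depth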
Example #11
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80
    selected_frame = 100

    K = np.array([[0.58, 0, 0.5, 0],
                  [0, 1.92, 0.5, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]], dtype=np.float32)

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:

        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        sequence_id = 0
        filenames = readlines(os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))
        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path)

        dataset = datasets.KITTIOdomDataset(
            opt.data_path, filenames, opt.height, opt.width,
            [0, 1], 4, is_train=False)
        dataloader = DataLoader(
            dataset, 16, shuffle=False, num_workers=opt.num_workers,
            pin_memory=True, drop_last=False)

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        model_dict = encoder.state_dict()
        encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(torch.load(decoder_path))

        encoder.cuda()
        encoder.eval()
        depth_decoder.cuda()
        depth_decoder.eval()

        pred_disps = []

        print("-> Computing predictions with size {}x{}".format(
            encoder_dict['width'], encoder_dict['height']))

        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda()

                if opt.post_process:
                    # Post-processed results require each image to have two forward passes
                    input_color = torch.cat((input_color, torch.flip(input_color, [3])), 0)

                output = depth_decoder(encoder(input_color))

                pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(pred_disp[:N], pred_disp[N:, :, ::-1])

                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)

    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark", "eigen_to_benchmark_ids.npy"))

            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.eval_object:
        object_masks = []
        for line in filenames:
            line = line.split()
            folder, frame_index = line[0], int(line[1])

            object_mask_filename = os.path.join(
                os.path.dirname(__file__),
                "object_masks",
                folder,
                "{:010d}.npy".format(int(frame_index)))
            object_mask = np.load(object_mask_filename)
            object_masks.append(object_mask)

    if opt.save_pred_disps:
        output_path = os.path.join(
            opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()

    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder, "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print("-> No ground truth is available for the KITTI benchmark, so not evaluating. Done.")
        quit()

    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths_odom_00.npz")
    gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1', allow_pickle=True)["data"]
    pred_poses = np.load('pred_poses_T.npy')
    norms_divs = np.load('gt_norms_div00.npy')
    scales_dgc = np.load('ratios_of_odom.npy')
    '''
    gt_poses_path = os.path.join(opt.data_path, "poses", "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i])))
    '''

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
            "disabling median scaling, scaling by {}".format(STEREO_SCALE_FACTOR))
        opt.scaling = "disable" 
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []
    ex_logs = []
    mean_scale = []
    side_map = {"2": 2, "3": 3, "l": 2, "r": 3}
    #resize_ori = transforms.Resize((pred_disps.shape[1],pred_disps.shape[2]),interpolation=Image.ANTIALIAS)

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]
        line = filenames[i].split()
        folder = line[0]
        frame_index = line[1]
        side = side_map[line[2]]
        color = pil_loader(get_image_path(folder,int(frame_index),side))
        if i==selected_frame:
            color_grad = compute_grad(color)     
            color_next = pil_loader(get_image_path(folder,int(frame_index)+1,side))
        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp
        
        if opt.eval_split == "eigen":
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)

            crop = np.array(
                [0.40810811 * gt_height, 0.99189189 * gt_height,
                 0.03594771 * gt_width,  0.96405229 * gt_width]).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)
            if opt.eval_object:
                object_mask = object_masks[i].astype(np.bool)

        else:
            mask = gt_depth > 0
        
        if opt.scaling == "gt":
            ratio = np.median(gt_depth[mask]) / np.median(pred_depth[mask])
            if opt.eval_object:
                mask = np.logical_and(mask, object_mask)
        elif opt.scaling == "dgc":
            scale_recovery = ScaleRecovery(1, gt_height, gt_width, K).cuda()
            #scale_recovery = ScaleRecovery(1, 192, 640, K).cuda()
            pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
            ratio1,surface_normal1,ground_mask1,_,_,_,_ = scale_recovery(pred_depth)
            ratio = ratio1.cpu().item()
            
            surface_normal = surface_normal1.cpu()[0,:,:,:].numpy()
            ground_mask = ground_mask1.cpu()[0,0,:,:].numpy()
            pred_depth = pred_depth[0].cpu().numpy()
        else:
            ratio = 1
        #print(ratio)
        #print(max(pred_depth))
        #print(min(pred_depth))
        if i==selected_frame:
            cords = find_cord(color_grad, mask)
            selected_points(cords, color, i)
            min_gt = gt_depth[cords[0][0]][cords[0][1]]
            max_gt = gt_depth[cords[1][0]][cords[1][1]]
            median_gt = gt_depth[cords[2][0]][cords[2][1]]
            print("min max median gt depths are", min_gt, max_gt, median_gt)

            to_tensor = transforms.ToTensor()
            color_tens = to_tensor(color)
            color_tens_next = to_tensor(color_next).unsqueeze(0)
            pred_pose = pred_poses[i]
            norms_div = norms_divs[i]
            scale_dgc = scales_dgc[i]
            pred_pose_tens = torch.from_numpy(pred_pose).unsqueeze(0).cuda()
            t_norm = np.linalg.norm(pred_pose[:3, 3])
            print("gt_depth of min max median divided by norms of translation and scale of norm", min_gt/(t_norm*norms_div), max_gt/(t_norm*norms_div), median_gt/(t_norm*norms_div))
            print("gt_depth of min max median divided by norms of translation and scale of dgc", min_gt/(t_norm*scale_dgc), max_gt/(t_norm*scale_dgc), median_gt/(t_norm*scale_dgc))

            depth_tens = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
            project_3d = Project3D(1, gt_height, gt_width).cuda()
            backproject_depth = BackprojectDepth(1, gt_height, gt_width).cuda()
            K_tens = torch.from_numpy(K).unsqueeze(0).cuda()
            inv_K = np.linalg.pinv(K)
            inv_K = torch.from_numpy(inv_K).unsqueeze(0).cuda()
            cam_points = backproject_depth(depth_tens, inv_K,torch.from_numpy(cords[2]).cuda())
            pix_coords = np.array(project_3d(cam_points, K_tens, pred_pose_tens))	
            #print(pix_coords.shape)
            #pix_coords = pix_coords[0,:,:,:]
            l1_losses = []
            ssim_losses = []
            reprojection_losses = []
             
            for pix_coord in pix_coords:
                pix_coord_tens = torch.from_numpy(pix_coord).unsqueeze(0)
                pred = F.grid_sample(color_tens_next, pix_coord_tens, padding_mode="border")
                l1_loss, ssim_loss, reprojection_loss = compute_reprojection_loss(pred, color_tens.unsqueeze(0),cords[2])
                l1_losses.append(l1_loss)
                ssim_losses.append(ssim_loss)
                reprojection_losses.append(reprojection_loss)
            min_loss_pixel_index = np.argmin(reprojection_losses)
            visual_reprojection(color,cords[2],pix_coords[min_loss_pixel_index,cords[2,0],cords[2,1]],selected_frame)

        pred_depth_ori = pred_depth*mask
        gt_depth_ori = gt_depth*mask
        pred_depth_ori = np.where(mask==1,pred_depth_ori,1)
        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]
        mean_scale.append(np.mean(gt_depth/pred_depth))
        
        '''
        mu = np.mean(div_values1)
        sigma = np.std(div_values1)
        #print(min(div_values1),max(div_values1))
        fig,ax=plt.subplots()
        n, bins, patches = ax.hist(div_values1,150,range=(3,130),density = True)
        y = norm.pdf(bins, mu, 0.8*sigma)
        ax.plot(bins, y, 'r')
        plt.xlabel('Scale')
        plt.ylabel('Density')
        plt.savefig(os.path.join(os.path.dirname(__file__), "hist_imgs2","{:010d}.jpg".format(i)))
        plt.close()
        
        #blend_img = blending_imgs(div_scale, color,i)
        #blend_img.save(os.path.join(os.path.dirname(__file__), "blend_imgs","{:010d}.jpg".format(i)))
        
        
        blending_imgs(surface_normal,color,i,'surface_normals')
        blending_imgs(ground_mask,color,i,'ground_masks')
        '''
        pred_depth *= ratio
        ratios.append(ratio)

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH
        #blending_imgs(div_scale, color,i,mask)

        if len(gt_depth) != 0:
            errors.append(compute_errors(gt_depth, pred_depth))
    save_path = os.path.join(os.path.dirname(__file__), "l1_losses_{}.npy".format(selected_frame))
    np.save(save_path, l1_losses)
    save_path = os.path.join(os.path.dirname(__file__), "ssim_losses_{}.npy".format(selected_frame))
    np.save(save_path, ssim_losses)
    save_path = os.path.join(os.path.dirname(__file__), "reprojection_losses_{}.npy".format(selected_frame))
    np.save(save_path, reprojection_losses)
    ratios = np.array(ratios)
    med = np.median(ratios)
    print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    
    print("\n-> Done!")
Example #12
def main_with_masks(args):
    """Function to predict for a single image or folder of images
    """
    print(args.dataset_path)
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    out_path = Path(args.out_path)
    out_path.mkdir_p()
    dirs = {}
    for mask in args.results:
        dirs[mask] = (out_path / mask)
        (out_path / mask).mkdir_p()

    print('-> split:{}'.format(args.split))
    print('-> save to {}'.format(args.out_path))

    if args.split in ['custom', 'custom_lite', 'eigen', 'eigen_zhou']:
        feed_height = 192
        feed_width = 640
        min_depth = 0.1
        max_depth = 80
        full_height = 375
        full_width = 1242
        dataset = KITTIRAWDataset

    elif args.split in ["visdrone", "visdrone_lite"]:
        feed_width = 352
        feed_height = 192
        min_depth = 0.1
        max_depth = 255
        dataset = VSDataset
    elif args.split in ['mc', 'mc_lite']:
        feed_height = 288
        feed_width = 384
        min_depth = 0.1
        max_depth = 255
        dataset = MCDataset

    # note: the two lines below override the per-split sizes chosen above
    feed_height = 192
    feed_width = 640

    backproject_depth = BackprojectDepth(1, feed_height, feed_width).to(device)

    project_3d = Project3D(1, feed_height, feed_width)

    photometric_error = PhotometricError()

    txt_files = args.txt_files
    #data
    test_path = Path(args.wk_root) / "splits" / args.split / txt_files
    test_filenames = readlines(test_path)
    if args.as_name_sort:  # sort file names so frames stay in sequence order
        test_filenames.sort()
    #check filenames:
    i = 0
    for i, item in enumerate(test_filenames):
        #item = test_filenames[i]
        if args.split in ['eigen', 'custom', 'custom_lite', 'eigen_zhou']:
            dirname, frame, lr = test_filenames[i].split()
            files = (Path(args.dataset_path) / dirname /
                     'image_02/data').files()
            files.sort()
            min = int(files[0].stem)
            max = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= min or int(
                    frame) + args.frame_ids[-1] >= max:
                test_filenames[i] = ''
        if args.split in ['mc', 'mc_lite']:  # already filtered when the split was built, checked again here
            block, trajactory, color, frame = test_filenames[i].split('/')
            files = (Path(args.dataset_path) / block / trajactory /
                     color).files()
            files.sort()
            min = int(files[0].stem)
            max = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= min or int(
                    frame) + args.frame_ids[-1] >= max:
                test_filenames[i] = ''
            pass
        if args.split in ['visdrone', 'visdrone_lite']:  # already filtered when the split was built, checked again here
            dirname, frame = test_filenames[i].split('/')
            files = (Path(args.dataset_path) / dirname).files()
            files.sort()
            min = int(files[0].stem)
            max = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= min or int(
                    frame) + args.frame_ids[-1] >= max:
                test_filenames[i] = ''

    while '' in test_filenames:
        test_filenames.remove('')

    test_dataset = dataset(  # KITTIRAWData
        args.dataset_path,
        test_filenames,
        feed_height,
        feed_width,
        args.frame_ids,
        1,
        is_train=False,
        img_ext=args.ext)

    test_loader = DataLoader(  # train_datasets:KITTIRAWDataset
        dataset=test_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        drop_last=False)

    print('->items num: {}'.format(len(test_loader)))

    #layers

    #download_model_if_doesnt_exist(args.model_path,args.model_name)

    model_path = Path(args.model_path) / args.model_name
    if not model_path.exists():
        print(model_path + " does not exist")

    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    #1 LOADING PRETRAINED MODEL
    #1.1 encoder
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    #1.2 decoder
    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    #paths
    pose_encoder_path = Path(model_path) / "pose_encoder.pth"
    pose_decoder_path = Path(model_path) / 'pose.pth'

    # 2.1 pose encoder
    print("   Loading pretrained pose encoder")

    pose_encoder = networks.ResnetEncoder(18, False, 2)
    pose_encoder.load_state_dict(
        torch.load(pose_encoder_path, map_location=device))

    pose_encoder.to(device)
    pose_encoder.eval()

    # 2.2 pose decoder
    print("   Loading pretrained decoder")
    pose_decoder = networks.PoseDecoder(num_ch_enc=pose_encoder.num_ch_enc,
                                        num_input_features=1,
                                        num_frames_to_predict_for=2)

    pose_loaded_dict = torch.load(pose_decoder_path, map_location=device)
    pose_decoder.load_state_dict(pose_loaded_dict)

    pose_decoder.to(device)
    pose_decoder.eval()
    source_scale = 0
    scale = 0
    for batch_idx, inputs in tqdm(enumerate(test_loader)):
        for key, ipt in inputs.items():
            inputs[key] = ipt.to(device)
        features = encoder(inputs[("color", 0, 0)])  # list of encoder feature maps, scales 0 to 4

        outputs = depth_decoder(features)  # dict of disparity tensors at 4 scales

        disp = outputs[("disp", 0)]  # same spatial size as the input

        #disp_resized = torch.nn.functional.interpolate(disp, (full_height, full_width), mode="bilinear", align_corners=False)

        _, depth = disp_to_depth(disp, min_depth, max_depth)

        for f_i in [args.frame_ids[0], args.frame_ids[-1]]:

            if f_i < 0:
                pose_inputs = [
                    inputs[("color", f_i, 0)], inputs[("color", 0, 0)]
                ]
            else:
                pose_inputs = [
                    inputs[("color", 0, 0)], inputs[("color", f_i, 0)]
                ]
            pose_inputs = torch.cat(pose_inputs, 1)
            features = pose_encoder(pose_inputs)
            axisangle, translation = pose_decoder([features])

            outputs[("cam_T_cam", 0, f_i)] = transformation_from_parameters(
                axisangle[:, 0], translation[:, 0], invert=(f_i < 0))  # b44
            T = outputs[("cam_T_cam", 0, f_i)]

            cam_points = backproject_depth(depth,
                                           inputs[("inv_K", 0)])  # D@K_inv
            pix_coords = project_3d(cam_points, inputs[("K", 0)],
                                    T)  # K@D@K_inv

            outputs[("sample", f_i, 0)] = pix_coords  # rigid_flow

            outputs[("color", f_i,
                     0)] = F.grid_sample(inputs[("color", f_i, 0)],
                                         outputs[("sample", f_i, 0)],
                                         padding_mode="border")
            # output"color" 就是i-warped

            # add a depth warp
            outputs[("color_identity", f_i, 0)] = inputs[("color", f_i, 0)]

        target = inputs[("color", 0, 0)]

        reprojection_losses = []
        for frame_id in [args.frame_ids[0], args.frame_ids[-1]]:
            pred = outputs[("color", frame_id, 0)]
            reprojection_losses.append(photometric_error.run(pred, target))

        reprojection_losses = torch.cat(reprojection_losses, 1)

        identity_reprojection_losses = []
        for frame_id in [args.frame_ids[0], args.frame_ids[-1]]:
            pred = inputs[("color", frame_id, source_scale)]
            identity_reprojection_losses.append(
                photometric_error.run(pred, target))
        identity_reprojection_losses = torch.cat(identity_reprojection_losses,
                                                 1)

        erro_maps = torch.cat(
            (identity_reprojection_losses, reprojection_losses), dim=1)  # b4hw

        identical_mask = IdenticalMask(erro_maps)
        identical_mask = identical_mask[0].detach().cpu().numpy()

        save_name = test_filenames[batch_idx].replace('/', '_')
        save_name = save_name.replace('l', '')
        save_name = save_name.replace('r', '')
        save_name = save_name.replace(' ', '')

        if "identical_mask" in args.results:
            plt.imsave(dirs['identical_mask'] / "{}.png".format(save_name),
                       identical_mask)

        if "depth" in args.results:
            # Saving colormapped depth image
            disp_np = disp[0, 0].detach().cpu().numpy()
            vmax = np.percentile(disp_np, 95)
            plt.imsave(dirs['depth'] / "{}.png".format(save_name),
                       disp_np,
                       cmap='magma',
                       vmax=vmax)

        if "mean_mask" in args.results:
            mean_mask = MeanMask(erro_maps)
            mean_mask = mean_mask[0].detach().cpu().numpy()
            plt.imsave(dirs['mean_mask'] / "{}.png".format(save_name),
                       mean_mask,
                       cmap='bone')

        if "identical_mask" in args.results:
            identical_mask = IdenticalMask(erro_maps)
            identical_mask = identical_mask[0].detach().cpu().numpy()
            plt.imsave(dirs['identical_mask'] / "{}.png".format(save_name),
                       identical_mask,
                       cmap='bone')

        if "var_mask" in args.results:
            var_mask = VarMask(erro_maps)
            var_mask = var_mask[0].detach().cpu().numpy()
            plt.imsave(dirs["var_mask"] / "{}.png".format(save_name),
                       var_mask,
                       cmap='bone')

        if "final_mask" in args.results:
            identical_mask = IdenticalMask(erro_maps)
            mean_mask = MeanMask(erro_maps)
            var_mask = VarMask(erro_maps)
            final_mask = float8or(mean_mask * identical_mask, var_mask)
            final_mask = final_mask[0].detach().cpu().numpy()
            plt.imsave(dirs["final_mask"] / "{}.png".format(save_name),
                       final_mask,
                       cmap='bone')
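MeanMask, VarMask, IdenticalMask and float8or are project helpers that are not defined in this excerpt. As an illustration only, float8or presumably performs an element-wise logical OR over float {0, 1} mask tensors, which could be sketched as:

import torch

def float8or(mask_a, mask_b):
    """Hypothetical element-wise OR of two float {0, 1} mask tensors."""
    return ((mask_a + mask_b) > 0).float()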
def depth_Estimation(args):
    model_name = args.model_name
    #Setting up the network
    print("Loading model....")
    download_model_if_doesnt_exist(model_name)
    encoder_path = os.path.join("models", model_name, "encoder.pth")
    depth_decoder_path = os.path.join("models", model_name, "depth.pth")

    # LOADING PRETRAINED MODEL
    encoder = networks.ResnetEncoder(18, False)
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)

    loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
    depth_decoder.load_state_dict(loaded_dict)

    encoder.eval()
    depth_decoder.eval()

    #Loading image
    print("Loading image....")
    image_path = args.image_path
    input_image = pil.open(image_path).convert('RGB')
    original_width, original_height = input_image.size
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    input_image_resized = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)

    input_image_pytorch = transforms.ToTensor()(input_image_resized).unsqueeze(
        0)
    input_npy = input_image_pytorch.squeeze().cpu().numpy()

    #prediction of disparity image
    with torch.no_grad():
        features = encoder(input_image_pytorch)
        outputs = depth_decoder(features)
        disp = outputs[("disp", 0)]

    #Scaling for given resolution
    disp_resized = torch.nn.functional.interpolate(
        disp, (original_height, original_width),
        mode="bilinear",
        align_corners=False
    )  # interpolate the values in to fit the given resolution of the image

    disp_resized_np = disp_resized.squeeze().cpu().numpy(
    )  # Converting tensor in pytorch to numpy array
    print("resized disp" + str(disp_resized_np.shape))
    print("Range of Depth in image")
    scaled, dep = disp_to_depth(
        disp_resized_np, 0.1, 1000)  # map disparity to depth in the 0.1 to 1000 range
    print("min->" + str(dep.min()) + "mx->" + str(dep.max()))
    #Preview of the rgb and Depth images
    rgb = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    depth = dep.reshape((rgb.shape[0], rgb.shape[1]), order='C')
    plot(rgb, depth)

    return rgb, depth
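plot(rgb, depth) is also not defined in this excerpt; a minimal matplotlib sketch that previews the RGB image next to the recovered depth map (the function name and layout are assumptions) is:

import matplotlib.pyplot as plt

def plot(rgb, depth):
    """Show the RGB input and the predicted depth map side by side."""
    fig, (ax_rgb, ax_depth) = plt.subplots(1, 2, figsize=(12, 5))
    ax_rgb.imshow(rgb)
    ax_rgb.set_title('RGB input')
    ax_rgb.axis('off')
    im = ax_depth.imshow(depth, cmap='magma')
    ax_depth.set_title('Predicted depth')
    ax_depth.axis('off')
    fig.colorbar(im, ax=ax_depth, fraction=0.046)
    plt.show()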
Example #14
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"
        self.STEREO_SCALE_FACTOR = 5.4
        val_biases, val_ranges = get_disparity_class_range(
            self.opt.disparity_class_num, self.opt.min_depth,
            self.opt.max_depth, self.opt.batch_size, self.opt.height,
            self.opt.width)
        self.val_biases = val_biases
        self.val_ranges = val_ranges
        self.disparity_class_num = len(self.val_biases)

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.semanticCoeff = self.opt.semanticCoeff
        self.sfx = nn.Softmax(dim=1)
        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"
        self.opt.frame_ids.append("s")

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc,
            self.opt.scales,
            isSwitch=self.opt.switchMode == "on",
            num_depth_cat=self.disparity_class_num)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        self.set_dataset()
        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.set_layers()
        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                self.train_num, self.val_num))

        self.save_opts()
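# The SSIM module instantiated above is normally blended with an L1 term when
# computing the photometric reprojection loss during training. A minimal sketch
# under the usual monodepth2 weighting; compute_reprojection_loss_sketch is an
# illustrative name and not part of this trainer class.
import torch

def compute_reprojection_loss_sketch(ssim, pred, target):
    # Photometric error: 0.85 * SSIM + 0.15 * L1, averaged over channels.
    abs_diff = torch.abs(target - pred)
    l1_loss = abs_diff.mean(1, True)
    ssim_loss = ssim(pred, target).mean(1, True)
    return 0.85 * ssim_loss + 0.15 * l1_loss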
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80
    height, width = 192, 640

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo"

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    print("-> Loading weights from {}".format(opt.load_weights_folder))

    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)

    data_images = sorted(os.listdir(os.path.join(opt.data_path, 'image')))

    config_file = "./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"
    cfg.merge_from_file(config_file)
    cfg.freeze()

    normalize_transform = transforms.Normalize(mean=cfg.INPUT.PIXEL_MEAN,
                                               std=cfg.INPUT.PIXEL_STD)
    to_bgr_transform = transforms.Lambda(lambda x: x * 255)
    transform_simvodis = transforms.Compose([
        # transforms.ToPILImage(),
        transforms.Resize((height * 2, width * 2)),
        transforms.ToTensor(),
        to_bgr_transform,
        normalize_transform,
    ])

    maskrcnn_path = "./e2e_mask_rcnn_R_50_FPN_1x.pth"
    encoder = networks.ResnetEncoder(cfg, maskrcnn_path)
    depth_decoder = networks.DepthDecoder(scales=opt.scales)

    model_dict = encoder.state_dict()
    encoder.load_state_dict(
        {k: v
         for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    pred_disps, depths_gt = [], []

    print("-> Computing predictions with size {}x{}".format(
        encoder_dict['width'], encoder_dict['height']))

    if 'RGBD' in opt.data_path:
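        # association.txt pairs each RGB frame with a depth frame; keep the two
        # file paths per line and strip the leading 'rgb/' and 'depth/' prefixes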
        pairing = open(os.path.join(opt.data_path,
                                    'association.txt')).readlines()
        pairing = [item.split()[1:4:2] for item in pairing]
        pairing = {item[0][4:]: item[1][6:] for item in pairing}

    with torch.no_grad():
        for one_image in data_images:
            file_path_img = os.path.join(opt.data_path, 'image', one_image)
            img_mat = pil_loader(file_path_img)
            if '7Scenes' in file_path_img:
                img_mat = img_mat.crop((0, int(
                    (480 - 192) / 2), 640, int((480 + 192) / 2)))
                depth_mat = cv2.imread(
                    os.path.join(opt.data_path, 'depth',
                                 one_image.split('.')[0] + '.depth.png'),
                    cv2.IMREAD_ANYDEPTH)
                depth_mat = depth_mat[int((480 - 192) / 2):int((480 + 192) /
                                                               2), :]
                mask = (depth_mat != 65535)
                depth_mat = (depth_mat * mask) / 1000
                gt_depth = np.expand_dims(depth_mat, axis=0)
                depths_gt.append(gt_depth)
            elif 'Make3D' in file_path_img:
                img_mat = img_mat.crop((0, int(
                    (2272 - 511) / 2), 1704, int((2272 + 511) / 2)))
                mat = scipy.io.loadmat(
                    os.path.join(
                        opt.data_path, "depth",
                        "depth_sph_corr-{}.mat".format(one_image[4:-4])))
                ratio = 4.4
                depth_new_height = 55 / ratio
                gt_depth = mat["Position3DGrid"][:, :, 3][int(
                    (55 - depth_new_height) / 2):int((55 + depth_new_height) /
                                                     2)]
                gt_depth = np.expand_dims(gt_depth, axis=0)
                depths_gt.append(gt_depth)
            elif 'RGBD' in file_path_img:
                img_mat = img_mat.crop((0, int(
                    (480 - 192) / 2), 640, int((480 + 192) / 2)))
                depth_mat = cv2.imread(
                    os.path.join(opt.data_path, 'depth', pairing[one_image]),
                    cv2.IMREAD_ANYDEPTH)
                depth_mat = depth_mat[int((480 - 192) / 2):int((480 + 192) /
                                                               2), :]
                mask = (depth_mat != 65535)
                depth_mat = (depth_mat * mask) / 1000
                gt_depth = np.expand_dims(depth_mat, axis=0)
                depths_gt.append(gt_depth)
            input_color = transform_simvodis(img_mat).cuda()
            input_color = input_color.unsqueeze(0)

            if opt.post_process:
                # Post-processed results require each image to have two forward passes
                input_color = torch.cat(
                    (input_color, torch.flip(input_color, [3])), 0)

            output = depth_decoder(encoder(input_color))

            pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth,
                                         opt.max_depth)
            pred_disp = pred_disp.cpu()[:, 0].numpy()

            if opt.post_process:
                N = pred_disp.shape[0] // 2
                pred_disp = batch_post_process_disparity(
                    pred_disp[:N], pred_disp[N:, :, ::-1])

            pred_disps.append(pred_disp)

    pred_disps = np.concatenate(pred_disps)

    if opt.save_pred_disps:
        output_path = os.path.join(opt.load_weights_folder,
                                   "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()

    print("-> Evaluating")
    print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []
    gt_depths = np.concatenate(depths_gt)

    for i in range(pred_disps.shape[0]):

        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= opt.pred_depth_scale_factor
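        # per-image median scaling aligns the scale-ambiguous monocular prediction with the ground truth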
        if not opt.disable_median_scaling:
            ratio = np.median(gt_depth) / np.median(pred_depth)
            ratios.append(ratio)
            pred_depth *= ratio

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        errors.append(compute_errors(gt_depth, pred_depth))

    if not opt.disable_median_scaling:
        ratios = np.array(ratios)
        med = np.median(ratios)
        print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
            med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " +
          ("{:>8} | " * 7
           ).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(args.image_path))

    camera_intrinsics_px = [1242*0.58, 375*1.92, 1242*0.5, 375*0.5] # See datasets/kitti_dataset.py
    # TODO: improve loading intrinsics from file ?

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpeg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image_original = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image_original.size
            input_image = input_image_original.resize((feed_width, feed_height), pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear", align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name))
            scaled_disp, depth = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Save PLY pointcloud from depth map
            depth_resized = torch.nn.functional.interpolate(
                depth, (original_height, original_width), mode="nearest") # !! do not interpolate depth values
            depth_resized_np = depth_resized.cpu().numpy()[0][0]
            nbPts = 0
            plypoints = ""
            for v in range(0, original_height):
                for u in range(0, original_width):
                    d = depth_resized_np[v][u]
                    if d <= 0.0:
                        continue
                    r,g,b = input_image_original.getpixel((u,v))
                    x = d * (float(u) - camera_intrinsics_px[2]) / camera_intrinsics_px[0]
                    y = d * (float(v) - camera_intrinsics_px[3]) / camera_intrinsics_px[1]
                    z = d * 1.0
                    nbPts += 1
                    plypoints += str(x) + " " + str(y) + " " + str(z) + " " + str(r) + " " + str(g) + " " + str(b) + "\n"
            plyhead = "ply\n"
            plyhead += "format ascii 1.0\n"
            plyhead += "element vertex " + str(nbPts) + "\n"
            plyhead += "property float x\n"
            plyhead += "property float y\n"
            plyhead += "property float z\n"
            plyhead += "property uchar red\n"
            plyhead += "property uchar green\n"
            plyhead += "property uchar blue\n"
            plyhead += "end_header\n"
            filePly = open(os.path.join(output_directory, "{}_disp.ply".format(output_name)), "w+")
            filePly.write(plyhead + plypoints + "\n")
            filePly.close()

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".format(
                idx + 1, len(paths), name_dest_im))

    print('-> Done!')
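# The per-pixel PLY loop above applies the standard pinhole back-projection.
# A vectorized sketch of the same computation; backproject_sketch and its
# argument names are illustrative, with fx, fy, cx, cy taken from the
# camera_intrinsics_px values assumed above.
import numpy as np

def backproject_sketch(depth, fx, fy, cx, cy):
    # X = Z * (u - cx) / fx, Y = Z * (v - cy) / fy, Z = depth, per pixel (u, v).
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    x = depth * (u - cx) / fx
    y = depth * (v - cy) / fy
    return np.stack([x, y, depth], axis=-1)  # (H, W, 3) points in camera space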
Exemple #17
0
import os

import PIL.Image as pil
import torch
from torchvision import transforms

import networks
from utils import download_model_if_doesnt_exist

model_name = "mono_640x192"

download_model_if_doesnt_exist(model_name)
encoder_path = os.path.join("models", model_name, "encoder.pth")
depth_decoder_path = os.path.join("models", model_name, "depth.pth")

# LOADING PRETRAINED MODEL
encoder = networks.ResnetEncoder(18, False)
depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))

loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
encoder.load_state_dict(filtered_dict_enc)

loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
depth_decoder.load_state_dict(loaded_dict)

encoder.eval()
depth_decoder.eval()

# image_path = "assets/006656.png"
image_path = "../data_sample/000039.png"
input_image = pil.open(image_path).convert('RGB')
original_width, original_height = input_image.size
Exemple #18
0
def get_pred_disps(opt, split, dataset_choice, tmp_dir_path, out_dir):
    if opt.ext_disp_to_eval is None:

        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))
        #filenames = readlines(os.path.join(splits_dir, opt.eval_split, "test_files.txt"))
        train_or_val = {"train": "train_files.txt", "val": "val_files.txt"}
        filenames = sorted(
            readlines(
                os.path.join(
                    splits_dir, opt.eval_split,
                    train_or_val[split])))  # sorting keeps filenames aligned with the saved per-frame predictions
        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path)
        datasets_dict = {
            "kitti": datasets.KITTIRAWDataset,
            "kitti_odom": datasets.KITTIOdomDataset,
            "carla": datasets.CarlaDataset,
            "waymo": datasets.WaymoDataset,
            "mixed": datasets.MixedDataset
        }
        dataset = datasets_dict[dataset_choice](opt.data_path,
                                                filenames,
                                                encoder_dict['height'],
                                                encoder_dict['width'], [0],
                                                4,
                                                is_train=False)
        #        dataset = datasets.carla_dataset.CarlaDataset(opt.data_path, filenames,
        #                                           encoder_dict['height'], encoder_dict['width'],
        #                                           [0], 4, is_train=False)
        dataloader = DataLoader(
            dataset,
            1,
            shuffle=False,
            num_workers=opt.num_workers,
            pin_memory=True,
            drop_last=False
        )  # batch size of 1: with a batch of 16, only about total/16 frames appeared to be evaluated

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        model_dict = encoder.state_dict()
        encoder.load_state_dict(
            {k: v
             for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(torch.load(decoder_path))

        encoder.cuda()
        encoder.eval()
        depth_decoder.cuda()
        depth_decoder.eval()

        pred_disps = []

        print("-> Computing predictions with size {}x{}".format(
            encoder_dict['width'], encoder_dict['height']))

        with torch.no_grad():
            for frame_idx, data in enumerate(dataloader):
                if frame_idx % 100 == 0:
                    print(
                        f"Creating disparity frame {frame_idx}/{len(dataloader)}"
                    )

                input_color = data[("color", 0, 0)].cuda()

                if opt.post_process:
                    # Post-processed results require each image to have two forward passes
                    input_color = torch.cat(
                        (input_color, torch.flip(input_color, [3])), 0)

                output = depth_decoder(encoder(input_color))

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(
                        pred_disp[:N], pred_disp[N:, :, ::-1])

                #pred_disps.append(pred_disp)
                frame_name = filenames[frame_idx]
                output_path = os.path.join(tmp_dir_path, f"{frame_name}.npy")
                np.save(output_path, pred_disp)


#        pred_disps = np.concatenate(pred_disps)

#output_path = os.path.join(opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split))
#    print("-> Saving predicted disparities to ", output_path)
    print(f"Saved predicted disparities to temporary dir {tmp_dir_path}")
    disparity_files = [
        os.path.join(tmp_dir_path, x) for x in os.listdir(tmp_dir_path)
    ]
    disparity_files = sorted(disparity_files)
    return disparity_files, filenames
Exemple #19
0
def getMonoDepth(input_image):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    loc = baseLoc + 'monodepth2/'

    model_path = os.path.join(loc + "models", 'mono+stereo_640x192')
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    with torch.no_grad():
        input_image = pil.fromarray(input_image)
        # input_image = pil.open(image_path).convert('RGB')
        original_width, original_height = input_image.size
        input_image = input_image.resize((feed_width, feed_height),
                                         pil.LANCZOS)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)

        # PREDICTION
        input_image = input_image.to(device)
        features = encoder(input_image)
        outputs = depth_decoder(features)

        disp = outputs[("disp", 0)]
        disp_resized = torch.nn.functional.interpolate(
            disp, (original_height, original_width),
            mode="bilinear",
            align_corners=False)

        # Saving colormapped depth image
        disp_resized_np = disp_resized.squeeze().cpu().numpy()
        vmax = np.percentile(disp_resized_np, 95)
        vmin = disp_resized_np.min()
        disp_resized_np = vmin + (disp_resized_np - vmin) * (vmax - vmin) / (
            disp_resized_np.max() - vmin)
        disp_resized_np = (255 * (disp_resized_np - vmin) /
                           (vmax - vmin)).astype(np.uint8)
        colormapped_im = cv2.applyColorMap(disp_resized_np, cv2.COLORMAP_HOT)
        colormapped_im = cv2.cvtColor(colormapped_im, cv2.COLOR_BGR2RGB)
        # normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
        # mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
        # colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
    return colormapped_im
Exemple #20
0
def test_simple(args):
    """Function to predict for a single image or folder of images"""
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)

    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc["height"]
    feed_width = loaded_dict_enc["width"]
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = (os.path.dirname(args.image_path)
                            if not args.dump_path else args.dump_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, "*.{}".format(args.ext)))
        output_directory = args.image_path if not args.dump_path else args.dump_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        mse = 0
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert("RGB")
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp,
                (original_height, original_width),
                mode="bilinear",
                align_corners=False,
            )

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            vmin = disp_resized_np.min()
            normalizer = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap="magma")
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            # Accumulate a normalized RMSE against the ground-truth depth map (reported as "mse" below)
            correct_file = re.sub(r"\.\w+", "_depth.npy", image_path)
            if os.path.exists(correct_file):
                correct = np.load(correct_file)[:, :, 0]
                disp_np = disp_resized.cpu().detach().numpy()
                disp_np = disp_np[0, 0, :, :]

                correct = ((correct - correct.min()) /
                           (correct.max() - correct.min()) * 255)
                disp_np = ((disp_np - disp_np.min()) /
                           (disp_np.max() - disp_np.min()) * 255)

                mse = mse + ((correct - disp_np)**2).mean()**0.5 / 255

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    print(f"mse: {mse}")
    print("-> Done!")
Exemple #21
0
def test_cam(args):

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # Extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Loading complete, initializing the camera")

    # Initialize camera to capture image stream
    # Change the value to 0 when using default camera
    #video_stream = WebcamVideoStream(src=args.webcam).start()

    if not args.no_display:
        # Object to display images
        image_display = DisplayImage(not args.no_process)

    # Flag that records when 'q' is pressed to break out of inference loop below
    quit_inference = False

    def on_release(key):
        if key == keyboard.KeyCode.from_char('q'):
            nonlocal quit_inference
            quit_inference = True
            #s.close()
            return False

    keyboard.Listener(on_release=on_release).start()

    # Number of frames to capture to calculate fps
    num_frames = 5
    curr_time = np.zeros(num_frames)

    with torch.no_grad():
        print("Loop has started")
        host = "0.0.0.0"
        port = 5015
        s = socket.socket()
        try:
            s.bind((host, port))
        except socket.error as e:
            print(str(e))
        print("Socket setup")
        connected = True
        bufferSize = 8192
        #c, addr = s.accept()
        #print("Connected to :", addr[0], ":",addr[1])
        first_loop = True
        connection_ready = False
        while True:
            if quit_inference:
                if args.no_display:
                    print('-> Done')
                break

            if first_loop:
                frame = cv2.imread('assets/test_image.jpg')
                print("Read test image")
                first_loop = False
            elif not connection_ready:
                s.listen(10)
                c, addr = s.accept()
                print("Connected to: ", addr[0], ":", addr[1])
                connection_ready = True
                continue
            else:
                try:

                    data = c.recv(11)
                    print("data as a string: " + str(data))
                    if (str(data).startswith('b\'SIZE')):
                        tmp = str(data).split()
                        bufferSize = int(tmp[1][:-1])
                        print("tmp[1] :" + str(tmp[1]))

                        c.sendall("yes".encode())
                        data = bytearray(c.recv(bufferSize))
                        print(data)
                    #else:
                    #   data = bytearray(data) + bytearray(c.recv(bufferSize))

                    #data = bytearray(c.recv(bufferSize))
                    print("Data")
                    print(data)
                    frame_np = np.asarray(data, dtype=np.uint8)
                    print("frame_np")
                    print(frame_np)
                    frame = cv2.imdecode(frame_np, cv2.IMREAD_COLOR)
                    print("frame")
                    print(frame)
                # print(frame.shape)
                except socket.error as e:
                    connected = False
                    print("Connection lost, reconnecting")
                    while not connected:
                        try:
                            c.bind(("0.0.0.0", port))
                            c.listen()
                            c.accept()
                            print("Reconnection worked")
                            connected = True
                        except socket.error as e:
                            print(e)

            # Capture frame-by-frame
            #frame = video_stream.read()
        # frame = np.asarray(data, dtype =np.uint8)
        #PUT IN THE ACTUAL IMAGE RETRIEVAL HERE

        #print (type(frame))
        # Calculate the fps
            print("Got frame")
            curr_time[1:] = curr_time[:-1]
            curr_time[0] = time.time()
            fps = num_frames / (curr_time[0] - curr_time[len(curr_time) - 1])

            # Our operations on the frame come here
            #                input_image = pil.fromarray(frame).convert('RGB')
            #fh = open("testfile.jpg","wb")
            #fh.write(data)
            #fh.close()
            input_image = pil.fromarray(frame).convert('RGB')
            #          img = pil.open(fh)
            #           img.save(data, format ='jpg')
            #               print("type: "+ type(img))
            # input_image = pil.frombytes('RGB', len(data), data, 'raw')
            #input_image = pil.fromarray(data).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            print("Prediction starting")
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="nearest")

            # Get the predict depth
            scaled_disp, pred_depth = disp_to_depth(disp_resized, 0.1, 100)
            pred_depth_np = pred_depth.squeeze().cpu().detach().numpy()

            # Initialize a 3x4 depth map
            depth_map = np.zeros([3, 4])
            grid_width = original_width // 4
            grid_height = original_height // 3
            for i in range(len(depth_map)):
                for j in range(len(depth_map[0])):
                    # Cut and store the average value of depth information of 640x480 into 3x4 grid
                    depth_map[i][j] = get_avg_depth(pred_depth_np,
                                                    grid_width * i,
                                                    grid_height * j,
                                                    grid_width * (i + 1),
                                                    grid_height * (j + 1))

            # Giving a simple decision logic
            if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1 or depth_map[
                    0, 2] <= 1 or depth_map[1, 2] <= 1:
                if depth_map[1, 1] <= 1 and depth_map[1, 2] <= 1:
                    print("Dangerous!!! AHEAD")
                else:
                    if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1:
                        print("Dangerous!!! LEFT")
                    if depth_map[0, 2] <= 1 or depth_map[1, 2] <= 1:
                        print("Dangerous!!! RIGHT")
            elif np.sum(depth_map[0:2, 0:1]) <= 7 or np.sum(
                    depth_map[0:2, 2:3]) <= 7:
                if np.sum(depth_map[0:2, 0:1]) <= 7:
                    print("Careful!! LEFT")
                if np.sum(depth_map[0:2, 2:3]) <= 7:
                    print("Careful!! RIGHT")
            else:
                print("Clear")

            if not args.no_display:
                # DISPLAY
                # Generate color-mapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().detach().numpy()
                image_display.display(frame,
                                      disp_resized_np,
                                      fps,
                                      original_width,
                                      original_height,
                                      blended=not args.no_blend)
            else:
                print(f"FPS: {fps}")

            # if quit_inference:
            #    if args.no_display:
            #        print('-> Done')
            #    break

    # When everything is done, stop the camera stream (left commented out because
    # the WebcamVideoStream setup above is also commented out)
    # video_stream.stop()
Exemple #22
0
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            _, depth = disp_to_depth(disp_resized, 0.1, 100)
            depth_to_3d = BackprojectDepth(1, original_height, original_width)
            K = np.array([[[879.03824732, 0, 613.17597314, 0],
                           [0, 879.03824732, 524.14407205, 0], [0, 0, 1, 0],
                           [0, 0, 0, 1]]],
                         dtype=np.float32)
            K[:2, :] = K[:2, :] / 4
            inv_K = np.linalg.pinv(K)
            inv_K = torch.from_numpy(inv_K)
            pointclouds = depth_to_3d(depth, inv_K)
            points_to_TV = ProjectTV(1, original_width, original_height,
                                     original_width, original_height, 2)
            top_view = points_to_TV(pointclouds)
            print(top_view.shape)
            print(top_view)
            #print(np.nonzero(top_view))
            import matplotlib.pyplot as plt
            plt.imshow(top_view[0].T)
            plt.savefig('foo1.png')
            #plt.show()
            #save_topview(top_view, 'tv_test')

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    print('-> Done!')
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80
    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:

        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        filenames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_files.txt"))

        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path)

        dataset = datasets.KITTIRAWDataset(opt.data_path,
                                           filenames,
                                           encoder_dict['height'],
                                           encoder_dict['width'], [0],
                                           4,
                                           is_train=False,
                                           load_semantics=opt.load_semantics,
                                           seman_path=opt.seman_path)

        dataloader = DataLoader(dataset,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                drop_last=False)

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        if opt.bnMorphLoss:
            from bnmorph.bnmorph import BNMorph
            bnmorph = BNMorph(height=encoder_dict['height'],
                              width=encoder_dict['width']).cuda()
            if opt.post_process:
                tool = grad_computation_tools(
                    batch_size=opt.batch_size * 2,
                    height=encoder_dict['height'],
                    width=encoder_dict['width']).cuda()
            else:
                tool = grad_computation_tools(
                    batch_size=opt.batch_size,
                    height=encoder_dict['height'],
                    width=encoder_dict['width']).cuda()

        model_dict = encoder.state_dict()
        encoder.load_state_dict(
            {k: v
             for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(torch.load(decoder_path))

        encoder.cuda()
        encoder.eval()
        depth_decoder.cuda()
        depth_decoder.eval()

        pred_disps = []
        count = 0
        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda()
                if opt.post_process:
                    input_color = torch.cat(
                        (input_color, torch.flip(input_color, [3])), 0)
                    if 'seman_gt' in data:
                        data['seman_gt'] = torch.cat(
                            (data['seman_gt'], torch.flip(
                                data['seman_gt'], [3])), 0)

                features = encoder(input_color)
                outputs = dict()
                outputs.update(depth_decoder(features))

                if opt.bnMorphLoss:
                    for key, ipt in data.items():
                        if not (key == 'height' or key == 'width'
                                or key == 'tag' or key == 'cts_meta'
                                or key == 'file_add'):
                            data[key] = ipt.to(torch.device("cuda"))

                    disparity_grad_bin = tool.get_disparityEdge(outputs['disp',
                                                                        0])
                    semantics_grad_bin = tool.get_semanticsEdge(
                        data['seman_gt'])

                    morphedx, morphedy, coeff = bnmorph.find_corresponding_pts(
                        disparity_grad_bin, semantics_grad_bin)
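                    # map morphed pixel coordinates into the [-1, 1] range expected by F.grid_sample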
                    morphedx = (morphedx /
                                (encoder_dict['width'] - 1) - 0.5) * 2
                    morphedy = (morphedy /
                                (encoder_dict['height'] - 1) - 0.5) * 2
                    grid = torch.cat([morphedx, morphedy],
                                     dim=1).permute(0, 2, 3, 1)
                    dispMaps_morphed = F.grid_sample(outputs['disp', 0],
                                                     grid,
                                                     padding_mode="border")
                    outputs[("disp", 0)] = dispMaps_morphed

                count = count + 1
                pred_disp, _ = disp_to_depth(outputs[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(
                        pred_disp[:N], pred_disp[N:, :, ::-1])
                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)
    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark",
                             "eigen_to_benchmark_ids.npy"))

            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.save_pred_disps:
        output_path = os.path.join(opt.load_weights_folder,
                                   "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()

    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder,
                                "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print(
            "-> No ground truth is available for the KITTI benchmark, so not evaluating. Done."
        )
        quit()

    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz")
    gt_depths = np.load(gt_path,
                        fix_imports=True,
                        encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(
                  STEREO_SCALE_FACTOR))
        opt.disable_median_scaling = True
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []

    for i in range(pred_disps.shape[0]):

        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if opt.eval_split == "eigen" or opt.UseCustTest:
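            # Eigen split: mask out depths outside [MIN_DEPTH, MAX_DEPTH] and apply
            # the standard Garg crop, expressed as fractions of the image size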
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)

            crop = np.array([
                0.40810811 * gt_height, 0.99189189 * gt_height,
                0.03594771 * gt_width, 0.96405229 * gt_width
            ]).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)
        else:
            mask = gt_depth > 0

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= opt.pred_depth_scale_factor
        if not opt.disable_median_scaling:
            ratio = np.median(gt_depth) / np.median(pred_depth)
            ratios.append(ratio)
            pred_depth *= ratio

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH
        errors.append(
            compute_errors(
                gt_depth,
                pred_depth,
                UseGtMedianScaling=(opt.UseGtMedianScaling == True)))

    if not opt.disable_median_scaling:
        ratios = np.array(ratios)
        med = np.median(ratios)
        print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
            med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " +
          ("{:>8} | " * 7
           ).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
Exemple #24
0
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            print("original width: ", original_width, " original height: ",
                  original_height)
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)
            print(type(outputs))
            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)
            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            theList = list(disp_resized_np)
            count = 0
            # theList holds one array per image row (original_height entries,
            # each of length original_width)
            # origin is at bottom left

            labels, pts = get_clusters(theList)
            ptsOfInterest = []
            backgroundPts = []
            for label, pt in zip(labels, pts):
                if label == 1:
                    backgroundPts.append(pt)
                else:
                    ptsOfInterest.append(pt)

            # mean value within each cluster; scaled to metres below
            label0Distance = sum(ptsOfInterest) / len(ptsOfInterest)
            label1Distance = sum(backgroundPts) / len(backgroundPts)
            print("Distance to object:", label0Distance * scalingFactor,
                  "meters")
            print("Distance to background:", label1Distance * scalingFactor,
                  "meters")

            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)
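
The snippet above relies on a get_clusters helper and a scalingFactor that are defined earlier in the original script and are not shown here. A minimal sketch of such a helper, assuming a two-cluster KMeans over the flattened disparity map (a hypothetical illustration, not the original implementation):

# Hypothetical helper, assumed for illustration only: cluster per-pixel
# disparity values into two groups and return a per-pixel depth proxy.
import numpy as np
from sklearn.cluster import KMeans


def get_clusters(rows, n_clusters=2):
    disp = np.asarray(rows, dtype=np.float32).ravel()
    inv_disp = 1.0 / np.maximum(disp, 1e-6)  # inverse disparity as a depth proxy
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(
        inv_disp.reshape(-1, 1))
    # KMeans labels are arbitrary; the caller above treats label 1 as the
    # background cluster.
    return labels, inv_disp
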
Exemple #25
0
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array(
        [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)

    assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \
        "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo"

    if opt.ext_disp_to_eval is None:

        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

        assert os.path.isdir(opt.load_weights_folder), \
            "Cannot find a folder at {}".format(opt.load_weights_folder)

        print("-> Loading weights from {}".format(opt.load_weights_folder))

        filenames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_files.txt"))
        encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

        encoder_dict = torch.load(encoder_path)

        img_ext = '.png' if opt.png else '.jpg'
        dataset = datasets.KITTIRAWDataset(opt.data_path,
                                           filenames,
                                           encoder_dict['height'],
                                           encoder_dict['width'], [0],
                                           4,
                                           is_train=False,
                                           img_ext=img_ext)
        dataloader = DataLoader(dataset,
                                16,
                                shuffle=False,
                                num_workers=opt.num_workers,
                                pin_memory=True,
                                drop_last=False)

        encoder = networks.ResnetEncoder(opt.num_layers, False)
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

        model_dict = encoder.state_dict()
        encoder.load_state_dict(
            {k: v
             for k, v in encoder_dict.items() if k in model_dict})
        depth_decoder.load_state_dict(torch.load(decoder_path))

        encoder.cuda()
        encoder.eval()
        depth_decoder.cuda()
        depth_decoder.eval()

        pred_disps = []

        print("-> Computing predictions with size {}x{}".format(
            encoder_dict['width'], encoder_dict['height']))

        with torch.no_grad():
            for data in dataloader:
                input_color = data[("color", 0, 0)].cuda()

                if opt.post_process:
                    # Post-processed results require each image to have two forward passes
                    input_color = torch.cat(
                        (input_color, torch.flip(input_color, [3])), 0)

                output = depth_decoder(encoder(input_color))

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                if opt.post_process:
                    N = pred_disp.shape[0] // 2
                    pred_disp = batch_post_process_disparity(
                        pred_disp[:N], pred_disp[N:, :, ::-1])

                pred_disps.append(pred_disp)

        pred_disps = np.concatenate(pred_disps)

    else:
        # Load predictions from file
        print("-> Loading predictions from {}".format(opt.ext_disp_to_eval))
        pred_disps = np.load(opt.ext_disp_to_eval)

        if opt.eval_eigen_to_benchmark:
            eigen_to_benchmark_ids = np.load(
                os.path.join(splits_dir, "benchmark",
                             "eigen_to_benchmark_ids.npy"))

            pred_disps = pred_disps[eigen_to_benchmark_ids]

    if opt.eval_object:
        object_masks = []
        for line in filenames:
            line = line.split()
            folder, frame_index = line[0], int(line[1])

            object_mask_filename = os.path.join(
                os.path.dirname(__file__), "object_masks", folder,
                "{:010d}.npy".format(int(frame_index)))
            object_mask = np.load(object_mask_filename)
            object_masks.append(object_mask)

    if opt.save_pred_disps:
        output_path = os.path.join(opt.load_weights_folder,
                                   "disps_{}_split.npy".format(opt.eval_split))
        print("-> Saving predicted disparities to ", output_path)
        np.save(output_path, pred_disps)

    if opt.no_eval:
        print("-> Evaluation disabled. Done.")
        quit()

    elif opt.eval_split == 'benchmark':
        save_dir = os.path.join(opt.load_weights_folder,
                                "benchmark_predictions")
        print("-> Saving out benchmark predictions to {}".format(save_dir))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for idx in range(len(pred_disps)):
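            # Convert disparity to metric depth with the stereo scale factor and
            # save it in the 16-bit PNG format (depth * 256) expected by the
            # KITTI online benchmark.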
            disp_resized = cv2.resize(pred_disps[idx], (1216, 352))
            depth = STEREO_SCALE_FACTOR / disp_resized
            depth = np.clip(depth, 0, 80)
            depth = np.uint16(depth * 256)
            save_path = os.path.join(save_dir, "{:010d}.png".format(idx))
            cv2.imwrite(save_path, depth)

        print(
            "-> No ground truth is available for the KITTI benchmark, so not evaluating. Done."
        )
        quit()

    gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz")
    gt_depths = np.load(gt_path,
                        fix_imports=True,
                        encoding='latin1',
                        allow_pickle=True)["data"]

    print("-> Evaluating")

    if opt.eval_stereo:
        print("   Stereo evaluation - "
              "disabling median scaling, scaling by {}".format(
                  STEREO_SCALE_FACTOR))
        opt.scaling = "disable"
        opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR
    else:
        print("   Mono evaluation - using median scaling")

    errors = []
    ratios = []

    for i in range(pred_disps.shape[0]):
        gt_depth = gt_depths[i]
        gt_height, gt_width = gt_depth.shape[:2]

        pred_disp = pred_disps[i]
        pred_disp = cv2.resize(pred_disp, (gt_width, gt_height))
        pred_depth = 1 / pred_disp

        if opt.eval_split == "eigen":
            mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH)
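            # Restrict evaluation to the standard Garg/Eigen crop defined below;
            # metrics are computed only inside this central image region.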

            crop = np.array([
                0.40810811 * gt_height, 0.99189189 * gt_height,
                0.03594771 * gt_width, 0.96405229 * gt_width
            ]).astype(np.int32)
            crop_mask = np.zeros(mask.shape)
            crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1
            mask = np.logical_and(mask, crop_mask)
            if opt.eval_object:
                object_mask = object_masks[i].astype(bool)  # np.bool is removed in recent NumPy

        else:
            mask = gt_depth > 0

        if opt.scaling == "gt":
            ratio = np.median(gt_depth[mask]) / np.median(pred_depth[mask])
            if opt.eval_object:
                mask = np.logical_and(mask, object_mask)
        elif opt.scaling == "dgc":
            tensor_K = K.copy()
            tensor_K[0, :] *= gt_width
            tensor_K[1, :] *= gt_height
            tensor_K = torch.from_numpy(tensor_K).unsqueeze(0).cuda()

            cam_height = torch.tensor([opt.cam_height]).cuda()

            scale_recovery = ScaleRecovery(1, gt_height, gt_width).cuda()
            pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
            ratio = scale_recovery(pred_depth, tensor_K,
                                   cam_height).cpu().item()
            pred_depth = pred_depth[0].cpu().numpy()
        else:
            ratio = 1

        pred_depth = pred_depth[mask]
        gt_depth = gt_depth[mask]

        pred_depth *= ratio
        ratios.append(ratio)

        pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH
        pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH

        if len(gt_depth) != 0:
            errors.append(compute_errors(gt_depth, pred_depth))

    ratios = np.array(ratios)
    med = np.median(ratios)
    print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(
        med, np.std(ratios / med)))

    mean_errors = np.array(errors).mean(0)

    print("\n  " +
          ("{:>8} | " * 7
           ).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3"))
    print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
    print("\n-> Done!")
Exemple #26
0
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        # the default input size is 640x192
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        # "scales used in the loss"
        self.num_scales = len(self.opt.scales)

        # frame_ids defaults to [0, -1, 1]; the target frame has id 0
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        # self.opt.num_layers is the depth of the ResNet encoder (ResNet-18 by default);
        # the encoder outputs features at 5 scales
        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        # Three ways of building the pose network are supported; the table in the
        # paper's supplementary material compares them, and separate_resnet works
        # best, so it is the default.
        if self.use_pose_net:
            # Does not share parameters with the depth encoder.
            # The pose encoder stacks the two input images along the channel
            # dimension (6 channels) and outputs one set of features; the pose
            # decoder takes those features and outputs two poses.
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(
                    self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            # Shares parameters with the depth encoder.
            # The encoder processes each image separately (Siamese-style);
            # the decoder takes two sets of features and outputs one pose.
            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            # posecnn is the approach from "Learning Depth from Monocular Videos
            # using Direct Methods", see https://arxiv.org/pdf/1712.00175.pdf
            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input ==
                    "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        # This mask corresponds to the explainability mask from SfMLearner
        if self.opt.predictive_mask:
            assert self.opt.disable_automasking, \
                "When using predictive_mask, please disable automasking with --disable_automasking"

            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc,
                self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(
                self.models["predictive_mask"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {
            "kitti": datasets.KITTIRAWDataset,
            "kitti_odom": datasets.KITTIOdomDataset
        }
        self.dataset = datasets_dict[self.opt.dataset]

        fpath = os.path.join(os.path.dirname(__file__), "splits",
                             self.opt.split, "{}_files.txt")

        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png' if self.opt.png else '.jpg'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     4,
                                     is_train=True,
                                     img_ext=img_ext)
        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)
        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   4,
                                   is_train=False,
                                   img_ext=img_ext)
        self.val_loader = DataLoader(val_dataset,
                                     self.opt.batch_size,
                                     True,
                                     num_workers=self.opt.num_workers,
                                     pin_memory=True,
                                     drop_last=True)
        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        # SSIM is used in the photometric loss unless --no_ssim is set
        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        # save options
        self.save_opts()
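
A constructor like the one above is normally driven by a small entry script. A hypothetical example, assuming a MonodepthOptions-style argument parser and a Trainer class that wraps this __init__ and exposes a train() method:

# Hypothetical entry point; the module and class names are assumptions.
from options import MonodepthOptions
from trainer import Trainer

if __name__ == "__main__":
    opts = MonodepthOptions().parse()
    trainer = Trainer(opts)
    trainer.train()
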
Exemple #27
0
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        if self.use_pose_net:
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(
                    self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input ==
                    "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        if self.opt.predictive_mask:
            assert self.opt.disable_automasking, \
                "When using predictive_mask, please disable automasking with --disable_automasking"

            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc,
                self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(
                self.models["predictive_mask"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {
            "kitti": datasets.KITTIRAWDataset,
            "kitti_odom": datasets.KITTIOdomDataset,
            "kitti_depth": datasets.KITTIDepthDataset
        }
        self.dataset = datasets_dict[self.opt.dataset]

        fpath = os.path.join(os.path.dirname(__file__), "splits",
                             self.opt.split, "{}_files_p.txt")
        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png' if self.opt.png else '.jpg'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     4,
                                     is_train=True,
                                     img_ext=img_ext)
        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)
        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   4,
                                   is_train=False,
                                   img_ext=img_ext)
        self.val_loader = DataLoader(val_dataset,
                                     self.opt.batch_size,
                                     True,
                                     num_workers=self.opt.num_workers,
                                     pin_memory=True,
                                     drop_last=True)
        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        self.save_opts()
Exemple #28
0
def test_simple(model_name, paths, val_iter_list, batch_it_num,
                backproject_depth_l, project_3d_l, sv_path_l):
    """Function to predict for a single image or folder of images
    """
    device = torch.device("cuda")
    model_path = model_name
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(paths, model_path, "encoder.pth")
    depth_decoder_path = os.path.join(paths, model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    # feed_height = loaded_dict_enc['height']
    # feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Predicting on test images")

    # PREDICTING ON EACH IMAGE IN TURN
    disp_resized_np_list = list()
    source_scale = 0
    with torch.no_grad():
        for count, val_iter in enumerate(val_iter_list):
            backproject_depth = backproject_depth_l[count]
            project_3d = project_3d_l[count]
            svcount = 0
            for k in range(batch_it_num[count]):
                try:
                    inputs = next(val_iter)  # iterator .next() does not exist in Python 3
                except StopIteration:
                    print("Finished iterating over all available data")
                    break
                T = inputs["stereo_T"].cuda()
                input_rgb = inputs[('color', 0, 0)].cuda()
                sample_rgb = inputs[('color', 's', 0)].cuda()
                features = encoder(input_rgb)
                outputs = depth_decoder(features)
                disp = outputs[("disp", 0)]
                _, depth = disp_to_depth(disp, 0.1, 100)
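                # Warp the stereo image into the target view: backproject the
                # predicted depth to 3D points, project them into the stereo
                # camera using the extrinsics T, and sample the stereo image at
                # the projected coordinates.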

                cam_points = backproject_depth(
                    depth, inputs[("inv_K", source_scale)].cuda())
                pix_coords = project_3d(cam_points,
                                        inputs[("K", source_scale)].cuda(), T)
                reconstructed_rgb = F.grid_sample(sample_rgb,
                                                  pix_coords,
                                                  padding_mode="border")
                reconstructed_rgb = reconstructed_rgb.permute(0, 2, 3, 1).cpu()
                for picind in range(reconstructed_rgb.shape[0]):
                    c_sv_path = os.path.join(sv_path_l[count],
                                             str(svcount) + ".png")
                    img1 = inputs[('color', 's',
                                   0)].permute(0, 2, 3,
                                               1)[picind, :, :, :].numpy()
                    img2 = reconstructed_rgb[picind, :, :, :].numpy()
                    img3 = inputs[('color', 0,
                                   0)].permute(0, 2, 3,
                                               1)[picind, :, :, :].numpy()
                    combined_img = np.concatenate((img1, img2, img3), axis=0)
                    Image.fromarray(
                        (combined_img * 255).astype(np.uint8)).save(c_sv_path)
                    svcount = svcount + 1
                print("finish %dth dataset %dth batch" % (count, k))
Exemple #29
0
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    if args.pred_metric_depth and "stereo" not in args.model_name:
        print(
            "Warning: The --pred_metric_depth flag only makes sense for stereo-trained KITTI "
            "models. For mono-trained models, output depths will not in metric space."
        )

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            scaled_disp, depth = disp_to_depth(disp, 0.1, 100)
            if args.pred_metric_depth:
                name_dest_npy = os.path.join(
                    output_directory, "{}_depth.npy".format(output_name))
                metric_depth = STEREO_SCALE_FACTOR * depth.cpu().numpy()
                np.save(name_dest_npy, metric_depth)
            else:
                name_dest_npy = os.path.join(output_directory,
                                             "{}_disp.npy".format(output_name))
                np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved predictions to:".
                  format(idx + 1, len(paths)))
            print("   - {}".format(name_dest_im))
            print("   - {}".format(name_dest_npy))

    print('-> Done!')
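
Several of the examples above convert the network's sigmoid output with disp_to_depth(disp, 0.1, 100). A sketch of that conversion, following the usual monodepth2 convention of mapping the [0, 1] output to a disparity range determined by min_depth and max_depth:

def disp_to_depth(disp, min_depth, max_depth):
    # Map the network's [0, 1] sigmoid output to a disparity in
    # [1/max_depth, 1/min_depth], then invert it to obtain depth.
    min_disp = 1.0 / max_depth
    max_disp = 1.0 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1.0 / scaled_disp
    return scaled_disp, depth
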
Exemple #30
0
    def __init__(self, options):
        self.opt = options
        self.seed_everything()

        # create dirs for logs and predictions if do not exist
        self.log_path = self.opt.log_dir
        if not os.path.exists(self.log_path):
            os.mkdir(self.log_path)
        preds_dir = os.path.join(self.log_path, "preds")
        if not os.path.exists(preds_dir):
            os.mkdir(preds_dir)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        # we don't expect anyone to run this on CPU
        self.device = torch.device("cuda")

        # model initialization
        self.models = {}
        self.parameters_to_train = []

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, True)
        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["pose_encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, True, num_input_images=self.num_input_frames)
        self.models["pose"] = networks.PoseDecoder(
            self.models["pose_encoder"].num_ch_enc,
            num_input_features=1,
            num_frames_to_predict_for=2)

        for _, m in self.models.items():
            m.to(self.device)
            self.parameters_to_train += list(m.parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)

        self.ssim = SSIM()
        self.ssim.to(self.device)

        self.backproject_depth = BackprojectDepth(
            self.opt.batch_size * self.num_scales, self.opt.height,
            self.opt.width)
        self.backproject_depth.to(self.device)

        self.project_3d = Project3D(
            self.opt.batch_size * (self.num_input_frames - 1) *
            self.num_scales, self.opt.height, self.opt.width)
        self.project_3d.to(self.device)

        # save adaptation parameters to the log dir
        self.save_opts()