Example #1
def prepare_model_for_test(opt):
    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)
    print("-> Loading weights from {}".format(opt.load_weights_folder))
    pose_encoder_path = os.path.join(opt.load_weights_folder,
                                     "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)

    pose_encoder.load_state_dict(torch.load(pose_encoder_path))
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda().eval()
    pose_decoder.cuda().eval()

    return pose_encoder, pose_decoder
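
A minimal usage sketch (not part of the original example), assuming an argparse-style options object exposing load_weights_folder and num_layers; the folder path is a placeholder and a CUDA device is required because the helper calls .cuda():

from argparse import Namespace

# hypothetical options namespace; the weights folder path is a placeholder
opt = Namespace(load_weights_folder="~/tmp/mono_model/models/weights_19",
                num_layers=18)
pose_encoder, pose_decoder = prepare_model_for_test(opt)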
Example #2
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        # the default input size is 640x192
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        # "scales used in the loss"
        self.num_scales = len(self.opt.scales)

        # defaults to [0, -1, 1]; the target frame has id 0
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        # self.opt.num_layers is the depth of the ResNet encoder (ResNet-18 by default)
        # the encoder outputs features at 5 scales
        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        # Three ways of handling the pose network; the table in the paper's Supplementary
        # Material compares them. separate_resnet performs best and is the default.
        if self.use_pose_net:
            # does not share parameters with the depth encoder
            # the pose encoder stacks the two images along the channel dimension (6 channels) and outputs one set of features
            # the pose decoder takes one set of features and outputs two poses
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(
                    self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            # shares parameters with the depth encoder
            # the encoder takes one image at a time (similar to a Siamese network)
            # the decoder takes two sets of features and outputs one pose
            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            # posecnn is the method proposed in "Learning Depth from Monocular Videos using Direct Methods",
            # see https://arxiv.org/pdf/1712.00175.pdf
            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input ==
                    "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        # this mask corresponds to the SfMLearner explainability mask
        if self.opt.predictive_mask:
            assert self.opt.disable_automasking, \
                "When using predictive_mask, please disable automasking with --disable_automasking"

            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc,
                self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(
                self.models["predictive_mask"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {
            "kitti": datasets.KITTIRAWDataset,
            "kitti_odom": datasets.KITTIOdomDataset
        }
        self.dataset = datasets_dict[self.opt.dataset]

        fpath = os.path.join(os.path.dirname(__file__), "splits",
                             self.opt.split, "{}_files.txt")

        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png' if self.opt.png else '.jpg'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     4,
                                     is_train=True,
                                     img_ext=img_ext)
        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)
        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   4,
                                   is_train=False,
                                   img_ext=img_ext)
        self.val_loader = DataLoader(val_dataset,
                                     self.opt.batch_size,
                                     True,
                                     num_workers=self.opt.num_workers,
                                     pin_memory=True,
                                     drop_last=True)
        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        # SSIM is used in the photometric loss unless --no_ssim is set
        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        # save options
        self.save_opts()
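
A small sketch (tensor shapes only, no trained weights assumed) of the pairing described in the comments above: with pose_model_input == "pairs", the two input frames are concatenated along the channel dimension before entering the pose encoder.

import torch

# two RGB frames at the default 192x640 resolution, batch size 1
frame_t0 = torch.randn(1, 3, 192, 640)
frame_t1 = torch.randn(1, 3, 192, 640)

# the pose encoder receives both frames stacked channel-wise -> 6 input channels
pose_input = torch.cat([frame_t0, frame_t1], dim=1)
print(pose_input.shape)  # torch.Size([1, 6, 192, 640])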
Example #3
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    print("Loading pose networks")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    pose_encoder = networks.ResnetEncoder(18, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path, map_location=device))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path, map_location=device))

    # use the selected device (may be CPU) instead of unconditionally calling .cuda()
    pose_encoder.to(device)
    pose_encoder.eval()
    pose_decoder.to(device)
    pose_decoder.eval()
    

    bag_name = '2019-12-17-13-24-03'
    map_name = "feature=base&ver=2019121700&base_pt=(32.75707,-111.55757)&end_pt=(32.092537212,-110.7892506)"
    begin = '0:36:00'
    end = '0:37:00'
    output_directory = "assets/"

    dataset = TSDataset(bag_name, begin, end)
    pred_depth = []
    pred_poses = []
    last_img = None

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, input_image in enumerate(dataset):

            # Load image and preprocess
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear", align_corners=False)

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)
            pred_depth.append(im)

            # Handle pose
            if last_img is None:
                last_img = input_image
            all_color_aug = torch.cat([last_img, input_image], 1)
            last_img = input_image

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)
            pose = transformation_from_parameters(axisangle[:, 0], translation[:, 0]).cpu().numpy()
            pred_poses.append(pose)
            
            print("   Processed {:d} of {:d} images".format(
                idx + 1, len(dataset)))
    pred_poses = np.concatenate(pred_poses, axis=0)
    print(pred_poses.shape)
    np.save("poses.npy", pred_poses)

    # save_video(pred_depth)

    print('-> Done!')
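
The saved poses.npy above holds relative frame-to-frame 4x4 transforms. A minimal NumPy sketch (assuming that layout) of chaining them into camera centers for a rough trajectory:

import numpy as np

rel_poses = np.load("poses.npy")  # shape (N, 4, 4), frame-to-frame transforms

cam_to_world = np.eye(4)
trajectory = [cam_to_world[:3, 3].copy()]
for T in rel_poses:
    # accumulate the relative transforms to get each camera position
    cam_to_world = cam_to_world @ T
    trajectory.append(cam_to_world[:3, 3].copy())
trajectory = np.array(trajectory)  # (N + 1, 3) camera centers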
Example #4
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        if self.use_pose_net:
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(
                    self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input ==
                    "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        if self.opt.predictive_mask:
            assert self.opt.disable_automasking, \
                "When using predictive_mask, please disable automasking with --disable_automasking"

            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc,
                self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(
                self.models["predictive_mask"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        self.dataset = datasets.InteriorDataset

        fpath = os.path.join(os.path.dirname(__file__), "splits",
                             self.opt.split, "{}_files.txt")

        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     4,
                                     is_train=True,
                                     img_ext=img_ext)
        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)
        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   4,
                                   is_train=False,
                                   img_ext=img_ext)
        self.val_loader = DataLoader(val_dataset,
                                     self.opt.batch_size,
                                     True,
                                     num_workers=self.opt.num_workers,
                                     pin_memory=True,
                                     drop_last=True)
        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        self.save_opts()
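
readlines is not defined in these examples; in the monodepth2 code base it is a small utility that reads a text file into a list of lines. A sketch of the behaviour assumed here:

def readlines(filename):
    """Read all the lines in a text file and return them as a list (no newlines)."""
    with open(filename, 'r') as f:
        lines = f.read().splitlines()
    return lines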
Example #5
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    sequence_id = int(opt.eval_split.split("_")[1])
    opt.batch_size = 1

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path,
                               filenames,
                               opt.height,
                               opt.width, [0, -1, 1],
                               4,
                               1,
                               is_train=False,
                               img_ext='.png')
    dataloader = DataLoader(dataset,
                            opt.batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=True,
                            drop_last=False)

    # pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    config_file = "./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"
    cfg.merge_from_file(config_file)
    cfg.freeze()
    maskrcnn_path = "./e2e_mask_rcnn_R_50_FPN_1x.pth"
    pose_encoder = networks.ResnetEncoder(cfg, maskrcnn_path)
    # pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    # pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(len(opt.frame_ids))
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    pred_poses = []

    print("-> Computing pose predictions")

    # opt.frame_ids = [0, 1]  # pose network only takes two frames as input
    ii = 0
    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                if isinstance(ipt, torch.Tensor):
                    inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids])

            all_features = pose_encoder(all_color_aug)
            all_features = [
                torch.split(f, opt.batch_size) for f in all_features
            ]

            features = {}
            for i, k in enumerate(opt.frame_ids):
                features[k] = [f[i] for f in all_features]
            pose_inputs = [features[i] for i in opt.frame_ids if i != "s"]

            axisangle, translation = pose_decoder(pose_inputs)
            if ii == 0:
                pred_poses.append(
                    transformation_from_parameters(axisangle[:, 0],
                                                   translation[:, 0],
                                                   True).cpu().numpy())
            pred_poses.append(
                transformation_from_parameters(axisangle[:, 1],
                                               translation[:,
                                                           1]).cpu().numpy())
            if ii % opt.log_frequency == 0:
                print("{:04d}-th image processing".format(ii))
            ii += 1
        # pred_poses.append(
        #     transformation_from_parameters(axisangle[:, 1], translation[:, 1]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)

    gt_poses_path = os.path.join(
        "/usr/stud/linp/storage/user/linp/results/kitti", "poses",
        "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape((-1, 3, 4))
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(
                np.dot(np.linalg.inv(gt_global_poses[i - 1]),
                       gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 3

    for i in range(0, num_frames - 1):
        local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1]))
        gt_local_xyzs = np.array(
            dump_xyz(gt_local_poses[i:i + track_length - 1]))

        ates.append(compute_ate(gt_local_xyzs, local_xyzs))
    '''
    for i in range(0, num_frames - 2):
        local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1]))
        gt_local_xyzs = np.array(dump_xyz(gt_local_poses[i + 1:i + track_length]))

        ates.append(compute_ate(gt_local_xyzs, local_xyzs))
    '''

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    save_path = os.path.join(opt.load_weights_folder, "poses.npy")
    np.save(save_path, pred_poses)
    print("-> Predictions saved to", save_path)
Example #6
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    # Depth
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)
    encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict(
        {k: v
         for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    # Pose
    pose_encoder_path = os.path.join(opt.load_weights_folder,
                                     "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    if opt.use_imu:
        imu_lstm = nn.LSTM(6, opt.lstm_hidden_size, opt.lstm_num_layers)
        imu_lstm.cuda()
        imu_lstm.eval()
        lstm_hs = None

        hidden_to_imu = torch.nn.Sequential(
            torch.nn.Linear(opt.lstm_hidden_size, 6), )
        hidden_to_imu.cuda()
        hidden_to_imu.eval()

        if opt.pose_fuse:
            pose_fuse_mlp = torch.nn.Sequential(
                torch.nn.Linear(24, opt.pose_mlp_hidden_size),
                torch.nn.Sigmoid(),
                torch.nn.Linear(opt.pose_mlp_hidden_size, 6),
            )
            pose_fuse_mlp.cuda()
            pose_fuse_mlp.eval()

    img_ext = '.png' if opt.png else '.jpg'

    pred_disps = []
    scale_factors = []

    kitty_odom = False
    if opt.eval_split.startswith("odom"):
        kitty_odom = True

    # splits_dir is needed in both branches; it is also used below when kitty_odom is True
    splits_dir = os.path.join(os.path.dirname(__file__), "splits")
    if kitty_odom:
        ids = [int(opt.eval_split.split("_")[1])]
    else:
        videonames = readlines(
            os.path.join(splits_dir, opt.eval_split, "test_video_list.txt"))
        ids = videonames

    for videoname in ids:
        if kitty_odom:
            filenames = readlines(
                os.path.join(splits_dir, opt.eval_split,
                             "test_files_{:02d}.txt".format(videoname)))
        else:
            filenames = readlines(
                os.path.join(splits_dir, opt.eval_split, "test_files.txt"))
        if kitty_odom:
            dataset = KITTIOdomDataset(opt.data_path,
                                       filenames,
                                       opt.height,
                                       opt.width, [0, 1],
                                       4,
                                       is_train=False,
                                       use_imu=False)
            dataloader = DataLoader(dataset,
                                    opt.batch_size,
                                    shuffle=False,
                                    num_workers=opt.num_workers,
                                    pin_memory=True,
                                    drop_last=False)
        else:
            if opt.use_imu:
                dataset = SequenceRawKittiDataset(
                    opt.data_path, [videoname],
                    filenames,
                    1,
                    imu_data_path=opt.imu_data_path,
                    img_ext=img_ext,
                    frame_idxs=[0, 1],
                    height=encoder_dict['height'],
                    width=encoder_dict['width'],
                    num_scales=4,
                    is_train=False)
                dataloader = DataLoader(dataset, shuffle=False, num_workers=0)
            else:
                filenames = list(
                    filter(lambda f: f.startswith(videoname), filenames))
                dataset = KITTIRAWDataset(opt.data_path,
                                          filenames,
                                          opt.height,
                                          opt.width, [0, 1],
                                          4,
                                          is_train=False,
                                          use_imu=False)
                dataloader = DataLoader(dataset,
                                        opt.batch_size,
                                        shuffle=False,
                                        num_workers=opt.num_workers,
                                        pin_memory=True,
                                        drop_last=False)
        # pred_poses = [np.eye(4).reshape(1, 4, 4)]
        pred_poses = []
        imu_scale_factors = []

        print("EVALUATING ", opt.model_name)

        print("-> Computing pose predictions")

        opt.frame_ids = [0, 1]  # pose network only takes two frames as input

        with torch.no_grad():
            for inputs in dataloader:
                for key, ipt in inputs.items():
                    inputs[key] = ipt.cuda()
                    if opt.use_imu:
                        inputs[key] = inputs[key].squeeze(0)
                input_color = inputs[("color", 0, 0)]
                feature = encoder(input_color)
                output = depth_decoder(feature)

                pred_disp, _ = disp_to_depth(output[("disp", 0)],
                                             opt.min_depth, opt.max_depth)
                pred_disp = pred_disp.cpu()[:, 0].numpy()

                pred_disps.append(pred_disp)

                all_color_aug = torch.cat([
                    inputs[("color_aug", i, 0)] for i in sorted(opt.frame_ids)
                ], 1)

                features = [pose_encoder(all_color_aug)]
                axisangle, translation = pose_decoder(features)
                outputs = {}
                outputs[("cam_T_cam", 0,
                         1)] = transformation_from_parameters(axisangle[:, 0],
                                                              translation[:,
                                                                          0],
                                                              invert=False)

                T = outputs[("cam_T_cam", 0, 1)]
                if opt.use_imu:
                    outputs = predict_poses_from_imu2(opt, inputs, imu_lstm,
                                                      lstm_hs, hidden_to_imu)
                    T_better = outputs[("cam_T_cam_imu", 0, 1)]
                    if opt.pose_fuse:
                        fuse_poses(opt, outputs, pose_fuse_mlp)
                        T_better = outputs[("cam_T_cam_fuse", 0, 1)]

                    R, t = rot_translation_from_transformation(T)
                    Rb, tb = rot_translation_from_transformation(T_better)
                    imu_scale_factor = torch.sum(tb * t) / torch.sum(t**2)

                    imu_scale_factors.append(imu_scale_factor.cpu().numpy())
                    # scale_factors.append(imu_scale_factors)

                    T = T_better

                pred_poses.append(T.cpu().numpy())

            pred_poses = np.concatenate(pred_poses)

            if opt.eval_split.startswith("odom"):
                gt_poses_path = os.path.join(opt.data_path, "poses",
                                             "{:02d}.txt".format(videoname))
            else:
                gt_poses_path = os.path.join(opt.data_path, videoname, "oxts",
                                             "poses.txt")

            eval_pose(opt, pred_poses, gt_poses_path)
        scale_factors = {}
        if imu_scale_factors:
            scale_factors["IMU factor"] = imu_scale_factors
    pred_disps = np.concatenate(pred_disps)
    if not kitty_odom:
        eval_depth(opt, pred_disps, scale_factors)
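
disp_to_depth, used above, converts the sigmoid disparity output of the depth decoder into a depth map in [min_depth, max_depth]. A minimal sketch of the conversion following the monodepth2 convention (stated here as an assumption):

def disp_to_depth(disp, min_depth, max_depth):
    # map the network's [0, 1] disparity to the range [1/max_depth, 1/min_depth]
    min_disp = 1 / max_depth
    max_disp = 1 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth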
Example #7
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """

    conv_layer, data_lambda, intrinsics = get_params(opt)
    configs = load_csv(opt.test_data)
    dataset = CarlaDataset(configs,
                           data_lambda,
                           intrinsics, [0, 1],
                           4,
                           is_train=False,
                           is_cubemap=opt.mode is Mode.Cubemap,
                           width=opt.width,
                           height=opt.height)
    dataloader = DataLoader(dataset,
                            16,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=True,
                            drop_last=False)

    if opt.eval_model is None:
        opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)
    else:
        if opt.load_weights_folder is not None:
            raise ValueError(
                "Can't specify eval_model and load_weights_folder, they conflict"
            )

        opt.eval_model = Path(opt.eval_model)
        models = Path(opt.eval_model) / "models"
        weights = [p for p in models.iterdir() if p.name.startswith("weights")]
        weights = [int(p.name.split("_")[1]) for p in weights]
        opt.load_weights_folder = models / f"weights_{max(weights)}"

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    pose_encoder_path = os.path.join(opt.load_weights_folder,
                                     "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(conv_layer, opt.num_layers, False, 2)
    pose_encoder.load_state_dict(un_mod(torch.load(pose_encoder_path)))

    pose_decoder = networks.PoseDecoder(conv_layer, pose_encoder.num_ch_enc, 1,
                                        2)
    pose_decoder.load_state_dict(un_mod(torch.load(pose_decoder_path)))

    if opt.mode is Mode.Cubemap:
        cube_poses = CubePosesAndLoss(include_loss=False)
        cube_poses.cuda()
        cube_poses.eval()

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    pred_poses = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            cam_T_cam = transformation_from_parameters(axisangle[:, 0],
                                                       translation[:, 0])

            if opt.mode is Mode.Cubemap:
                cam_T_cam = cube_poses(cam_T_cam)

            pred_poses.append(cam_T_cam.cpu().numpy())

    pred_poses = np.concatenate(pred_poses)

    ates = []
    num_frames = pred_poses.shape[0]
    gt_poses = get_gt_poses(configs)
    for i in range(0, num_frames - 1):
        gt_pose = next(gt_poses)
        local_xyzs = np.array(dump_xyz(pred_poses[np.newaxis, i]))
        gt_local_xyzs = np.array(dump_xyz(gt_pose[np.newaxis, ...]))

        ates.append(compute_ate(gt_local_xyzs, local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    save_path = os.path.join(opt.load_weights_folder, "poses.npy")
    np.save(save_path, pred_poses)
    print("-> Predictions saved to", save_path)
Example #8
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        if self.use_pose_net:
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input == "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        if self.opt.predictive_mask:
            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc, self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(self.models["predictive_mask"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ", self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {'kitti': KITTIRAWDataset,
                         'kitti_odom': KITTIOdomDataset,
                         'FLIR': FlirDataset,
                         'KAIST': KAIST_Dataset}
        
        self.dataset = datasets_dict[self.opt.dataset]
        
        thermal = False
        if self.opt.dataset == 'FLIR':
            train_filenames = []

            train_files = os.listdir(os.path.join(self.opt.data_path, 'train/PreviewData/'))
            train_files.sort()
            train_filenames.extend(os.path.join(self.opt.data_path, 'train/PreviewData/') + 
                                   file for file in train_files[1:-1])

            video_files = os.listdir(os.path.join(self.opt.data_path, 'video/PreviewData/'))
            video_files.sort()
            train_filenames.extend(os.path.join(self.opt.data_path, 'video/PreviewData/') + 
                                   file for file in video_files[1:-1])

            val_filenames = []
            val_files = os.listdir(os.path.join(self.opt.data_path, 'valid/PreviewData/'))
            val_files.sort()
            val_filenames.extend(os.path.join(self.opt.data_path, 'valid/PreviewData/') + 
                                   file for file in val_files[1:-1])
            thermal = True 
        elif self.opt.dataset == 'KAIST':
            train_files = os.path.join(self.opt.data_path, 'training')
            train_filenames = []

            campus_train = os.listdir(os.path.join(train_files, 'Campus/THERMAL/'))
            campus_train.sort()
            residential_train = os.listdir(os.path.join(train_files, 'Residential/THERMAL/'))
            residential_train.sort()
            urban_train = os.listdir(os.path.join(train_files, 'Urban/THERMAL/'))
            urban_train.sort()

            train_filenames.extend(os.path.join(train_files, 'Campus/THERMAL/') +
                                   file for file in campus_train[1:-1])
            train_filenames.extend(os.path.join(train_files, 'Residential/THERMAL/') +
                                   file for file in residential_train[1:-1])
            train_filenames.extend(os.path.join(train_files, 'Urban/THERMAL/') + 
                                   file for file in urban_train[1:-1])
            
            val_files = os.path.join(self.opt.data_path, 'testing')
            val_filenames = []

            campus_val = os.listdir(os.path.join(val_files, 'Campus/THERMAL/'))
            campus_val.sort()
            residential_val = os.listdir(os.path.join(val_files, 'Residential/THERMAL/'))
            residential_val.sort()
            urban_val = os.listdir(os.path.join(val_files, 'Urban/THERMAL/'))
            urban_val.sort()

            val_filenames.extend(os.path.join(val_files, 'Campus/THERMAL/') + 
                                   file for file in campus_val[1:-1])
            val_filenames.extend(os.path.join(val_files, 'Residential/THERMAL/') + 
                                   file for file in residential_val[1:-1])
            val_filenames.extend(os.path.join(val_files, 'Urban/THERMAL/') + 
                                   file for file in urban_val[1:-1])
            thermal = True
        else:
            fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt")
            train_filenames = readlines(fpath.format("train"))
            val_filenames = readlines(fpath.format("val"))
            
        assert self.opt.img_ext in ('.png', '.jpg', '.jpeg'), \
            "Please provide a correct image extension"
        
        img_ext = self.opt.img_ext

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(
            self.opt.data_path, train_filenames, self.opt.height, self.opt.width,
            self.opt.frame_ids, 4, is_train=True, img_ext=img_ext, thermal=thermal)
        self.train_loader = DataLoader(
            train_dataset, self.opt.batch_size, True,
            num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)
        val_dataset = self.dataset(
            self.opt.data_path, val_filenames, self.opt.height, self.opt.width,
            self.opt.frame_ids, 4, is_train=False, img_ext=img_ext, thermal=thermal)
        self.val_loader = DataLoader(
            val_dataset, self.opt.batch_size, True,
            num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)
        self.val_iter = iter(self.val_loader)

       # self.writers = {}
       # for mode in ["train", "val"]:
       #     self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2 ** scale)
            w = self.opt.width // (2 ** scale)

            self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"]

        if self.opt.dataset.startswith('kitti'):
            print("Using split:\n  ", self.opt.split)
        else:
            print("Using dataset:\n  ", self.opt.dataset)
        
        print("There are {:d} training items and {:d} validation items\n".format(
            len(train_dataset), len(val_dataset)))

        self.save_opts()
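
The first and last frames of each FLIR / KAIST sequence are dropped above ([1:-1]) because the default frame_ids [0, -1, 1] require a previous and a next frame for every training sample. A sketch of that listing pattern as a reusable helper (hypothetical helper name; directory layout as assumed by the code above):

import os

def list_frames_with_neighbours(directory):
    # sort the frames and drop the first and last so that every remaining
    # sample has both a previous and a next frame available
    files = sorted(os.listdir(directory))
    return [os.path.join(directory, f) for f in files[1:-1]]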
Example #9
    def __init__(self, options):
        self.opt = options

        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        Path(self.log_path).mkdir(exist_ok=True, parents=True)
        (Path(self.log_path) / "command").open('w+').write(" ".join(sys.argv))

        # checking height and width are multiples of 32
        # assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        # assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")
        self.parallel = not self.opt.no_cuda and torch.cuda.device_count() > 1

        if self.parallel and self.opt.mode is Mode.Cubemap:
            assert self.opt.batch_size % torch.cuda.device_count() == 0, f"Cubemap batch size ({self.opt.batch_size})" \
                                                                         f" must be evenly divisible by the number of" \
                                                                         f" GPUs ({torch.cuda.device_count()})"

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        conv_layer, data_lambda, intrinsics = get_params(options)
        self.intrinsics = intrinsics

        self.height = self.opt.height or self.intrinsics.height
        self.width = self.opt.width or self.intrinsics.width

        self.models["encoder"] = networks.ResnetEncoder(
            conv_layer, self.opt.num_layers,
            self.opt.weights_init == "pretrained")
        self.store_model("encoder")

        self.models["depth"] = networks.DepthDecoder(
            conv_layer, self.get_num_ch_enc(self.models["encoder"]),
            self.opt.scales)
        self.store_model("depth")

        if self.use_pose_net:  # true
            if self.opt.pose_model_type == "separate_resnet":  # true
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    conv_layer,
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)
                self.store_model("pose_encoder")

                self.models["pose"] = networks.PoseDecoder(
                    conv_layer,
                    self.get_num_ch_enc(self.models["pose_encoder"]),
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    conv_layer, self.get_num_ch_enc(self.models["encoder"]),
                    self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    conv_layer, self.num_input_frames
                    if self.opt.pose_model_input == "all" else 2)

            self.store_model("pose")

        if self.opt.predictive_mask:  # false
            assert self.opt.disable_automasking, \
                "When using predictive_mask, please disable automasking with --disable_automasking"

            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                conv_layer,
                self.get_num_ch_enc(self.models["encoder"]),
                self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.store_model("predictive_mask")

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print(
            "Training is using:\n  ", f"{self.device}" +
            (f" on {torch.cuda.device_count()} GPUs" if self.parallel else ""))

        num_train_samples = len(load_csv(options.train_data)) * 1000
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset, val_dataset = get_datasets(options, data_lambda,
                                                  intrinsics)

        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)
        self.val_loader = DataLoader(val_dataset,
                                     self.opt.batch_size,
                                     True,
                                     num_workers=self.opt.num_workers,
                                     pin_memory=True,
                                     drop_last=True)
        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = self.wrap_model(SSIM())  # TODO can I parallelize?
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.height // (2**scale)
            w = self.width // (2**scale)

            # TODO should be able to parallelize
            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w, options.mode)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w,
                                               options.mode)
            self.project_3d[scale].to(self.device)

        if options.mode is Mode.Cubemap:
            self.models["cube_pose_and_loss"] = self.wrap_model(
                CubePosesAndLoss())
            self.models["cube_pose_and_loss"].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        self.train_items = len(train_dataset)
        self.val_items = len(val_dataset)

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                self.train_items, self.val_items))

        self.save_opts()
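
store_model, wrap_model, and get_num_ch_enc are helpers of this fork that are not shown here. A plausible sketch of what they are assumed to do, inferred from their call sites above (register a model, optionally wrap it in DataParallel, and read the encoder channel list through the wrapper):

    # hypothetical helpers, inferred from their call sites above
    def wrap_model(self, model):
        # wrap with DataParallel only when training on several GPUs
        return torch.nn.DataParallel(model) if self.parallel else model

    def store_model(self, name):
        # wrap, move to the device, and register the parameters for training
        self.models[name] = self.wrap_model(self.models[name])
        self.models[name].to(self.device)
        self.parameters_to_train += list(self.models[name].parameters())

    def get_num_ch_enc(self, encoder):
        # DataParallel hides the wrapped module's attributes behind .module
        module = encoder.module if isinstance(encoder, torch.nn.DataParallel) else encoder
        return module.num_ch_enc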
Example #10
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)
        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"
        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")
        # select the device; used together with .to(); must be set before any data is loaded

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames  # number of frames fed to the pose network

        assert self.opt.frame_ids[0] == 0, \
            "frame_ids must start with 0"  # raise an error if the frame ids do not start from 0

        self.use_pose_net = not (
            self.opt.use_stereo and self.opt.frame_ids == [0]
        )  # use_stereo means stereo training, otherwise monocular; stereo-only training ([0]) needs no pose network

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")  #加s表示是双目的  在最后的一位表示双目

        # set up the networks: encoder and decoder
        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained",
            self.opt.BA2M, self.opt.CBAM, self.opt.BAM)
        self.models["encoder"].to(self.device)  # must be done before any data is loaded
        self.parameters_to_train += list(
            self.models["encoder"].parameters())  # register the encoder parameters for the optimizer

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc,
            self.opt.scales)  #num_ch_enc在哪里加进去的????
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(
            self.models["depth"].parameters())  #获取网络的参数!!!!!

        # pose network
        if self.use_pose_net:
            if self.opt.pose_model_type == "separate_resnet":  # decides whether the pose encoder shares weights with the depth encoder
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    self.opt.BA2M,
                    self.opt.CBAM,
                    self.opt.BAM,
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(
                    self.models["pose_encoder"].parameters())  # register the pose encoder parameters for the optimizer

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)
                # note: the pose decoder's parameters are not collected here like the others (see below)
            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input ==
                    "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())
            # the pose decoder parameters are collected here, once the decoder exists for every pose_model_type

        # predictive masking baseline (used instead of the paper's auto-masking)
        if self.opt.predictive_mask:
            assert self.opt.disable_automasking, \
                "When using predictive_mask, please disable automasking with --disable_automasking"

            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc,
                self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(
                self.models["predictive_mask"].parameters())

        # optimizer and learning-rate scheduler
        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1
        )  # StepLR: new_lr = 0.1 * lr every scheduler_step_size epochs
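        # Illustrative schedule (assuming learning_rate=1e-4 and scheduler_step_size=15):
        # epochs 0-14 train with lr=1e-4, epochs 15-29 with lr=1e-5, and so on, since
        # the rate is multiplied by 0.1 every scheduler_step_size epochs.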

        # if pretrained weights are given, load them with load_model()
        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {
            "kitti": datasets.KITTIRAWDataset,
            "kitti_odom": datasets.KITTIOdomDataset
        }
        self.dataset = datasets_dict[self.opt.dataset]  # e.g. KITTIRAWDataset

        fpath = os.path.join(os.path.dirname(__file__), "splits",
                             self.opt.split, "{}_files.txt")
        # os.path.join(...) joins the pieces into a single path
        # os.path.dirname(__file__) is the directory of the current script
        # the chosen split determines which files are used for training and validation

        train_filenames = readlines(
            fpath.format("train"))  #此处的.format连接前面的参数fpath中的{}
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png' if self.opt.png else '.jpg'

        # file names of the training and validation samples

        num_train_samples = len(train_filenames)  # total number of training samples
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs  # total number of optimization steps; parameters are updated once per batch
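        # Worked example (illustrative numbers only): 39810 training files with
        # batch_size=12 and num_epochs=20 give 39810 // 12 * 20 = 66340 steps.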

        train_dataset = self.dataset(
            self.opt.data_path,
            train_filenames,
            self.opt.height,
            self.opt.width,  # these arguments initialize the underlying MonoDataset
            self.opt.frame_ids,
            4,
            is_train=True,
            img_ext=img_ext)  # the kitti / kitti_odom dataset classes inherit from MonoDataset
        self.train_loader = DataLoader(
            train_dataset,
            self.opt.batch_size,
            True,
            num_workers=self.opt.num_workers,
            pin_memory=True,
            drop_last=True)  # pin_memory uses page-locked host memory, which is never swapped out and speeds up host-to-GPU transfers
        # DataLoader(dataset, batch_size=..., shuffle=..., num_workers=...):
        # shuffle controls whether the data are reshuffled every epoch; num_workers sets the number of loading processes
        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   4,
                                   is_train=False,
                                   img_ext=img_ext)
        self.val_loader = DataLoader(  # validation data loader
            val_dataset,
            self.opt.batch_size,
            True,
            num_workers=self.opt.num_workers,
            pin_memory=True,
            drop_last=True)
        self.val_iter = iter(self.val_loader)  # explicit iterator so single validation batches can be drawn during training
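        # Usage sketch (an assumption, mirroring how a val() step typically uses it):
        #   try:
        #       inputs = next(self.val_iter)
        #   except StopIteration:
        #       self.val_iter = iter(self.val_loader)
        #       inputs = next(self.val_iter)
        # i.e. one validation batch is drawn per call without looping over the whole loader.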

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()  # SSIM is defined in the layers module
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)  # BackprojectDepth turns a depth map into a 3D point cloud
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)
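            # For example, with a 640x192 input and scales [0, 1, 2, 3] this builds layers
            # for 640x192, 320x96, 160x48 and 80x24 (h and w are halved once per scale).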

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        self.save_opts()  # save the options used for this run
Example #11
0
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)

    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    print("-> Loading weights from {}".format(opt.load_weights_folder))

    if viewStereoMask:
        stereoMaskComputer = StereoMask()
        stereoMaskComputer.cuda()

    if viewSurfaceNormal:
        compsurfnorm = ComputeSurfaceNormal(height=opt.height, width=opt.width, batch_size=opt.batch_size)
        compsurfnorm.cuda()

    if viewTypeWiseRegularization:
        typeWReg = TypeWiseRegularization()
        typeWReg.cuda()

    if viewBorderWiseRegularization:
        borderWiseReg = BorderWiseRegularization(batchNum=opt.batch_size, width=opt.width, height=opt.height).cuda()

    if viewMonoMsak:
        monoMask = MonocularMask()
        monoMask.cuda()
    filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt"))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    encoder_dict = torch.load(encoder_path)
    tensor23dPts = Tensor23dPts(height=opt.height, width=opt.width)

    if opt.use_stereo:
        opt.frame_ids.append("s")

    dataset = datasets.KITTIRAWDataset(opt.data_path, filenames, opt.height, opt.width, opt.frame_ids, 4,
                                       is_train=False, load_gt_semantics=opt.load_gt_semantics,
                                       load_gt_velodine=opt.load_gt_velodine)
    dataloader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers,
                            pin_memory=True, drop_last=True)

    encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    dirpath = '/media/shengjie/other/sceneUnderstanding/semantic_regularized_unsupervised_depth_estimation/visualization'
    sv_path = os.path.join(dirpath, opt.model_name)
    index = 0

    if viewMonoMsak:
        num_pose_frames = 2
        posenet_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
        posenet_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")
        posenet_encoder_dict = torch.load(posenet_encoder_path)
        posenet_decoder_dict = torch.load(posenet_decoder_path)
        posenet_encoder = networks.ResnetEncoder(
            opt.num_layers,
            opt.weights_init == "pretrained",
            num_input_images=num_pose_frames)

        posenet_decoder = networks.PoseDecoder(
            encoder.num_ch_enc,
            num_input_features=1,
            num_frames_to_predict_for=2)
        posenet_encoder.load_state_dict({k: v for k, v in posenet_encoder_dict.items() if k in posenet_encoder.state_dict()})
        posenet_decoder.load_state_dict({k: v for k, v in posenet_decoder_dict.items() if k in posenet_decoder.state_dict()})
        posenet_encoder = posenet_encoder.cuda()
        posenet_decoder = posenet_decoder.cuda()

    if not os.path.exists(sv_path):
        os.makedirs(sv_path)

    with torch.no_grad():
        for idx, inputs in enumerate(dataloader):
            for key, ipt in inputs.items():
                if not(key == 'height' or key == 'width' or key == 'tag' or key == 'cts_meta'):
                    inputs[key] = ipt.to(torch.device("cuda"))
            input_color = inputs[("color", 0, 0)]
            features = encoder(input_color)
            outputs = dict()
            outputs.update(depth_decoder(features))

            dispMap = outputs[('disp', 0)]
            scaledDisp, depthMap = disp_to_depth(dispMap, opt.min_depth, opt.max_depth)

            foreGroundMask = torch.ones(scaledDisp.shape, device=torch.device("cuda")).byte()
            scaled_smeantic_label = F.interpolate(inputs[('semantic_label', 0)].cpu().float(), size=(scaledDisp.shape[2], scaledDisp.shape[3]), mode='nearest').cuda().byte()
            for m in foregroundType:
                foreGroundMask = foreGroundMask * (scaled_smeantic_label != m)
            foreGroundMask = (1 - foreGroundMask)
            foreGroundMask = foreGroundMask.float()
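            # The mask starts as all ones, is zeroed wherever the semantic label matches
            # a foreground class, and 1 - mask then flips it so foreground pixels are 1.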

            if viewStereoMask:
                scale = 0
                T = inputs["stereo_T"]
                real_scale_disp = scaledDisp * (torch.abs(inputs[("K", scale)][:, 0, 0] * T[:, 0, 3]).view(opt.batch_size, 1, 1, 1).expand_as(scaledDisp))
                stereoMask = stereoMaskComputer.computeMask(real_scale_disp, T[:, 0, 3])
                stereoSemanticalMask = stereoMaskComputer.computeSemanticalMask(stereoMask, foreGroundMask, T[:, 0, 3])
                # stereoMask_fig = tensor2disp(stereoMask, ind=index, vmax=1)
                # stereoSemanticalMask_fig = tensor2disp(stereoSemanticalMask, ind=index, vmax=1)
                # foreGroundMask_fig = tensor2disp(foreGroundMask, ind=index, vmax=1)

            if viewSurfaceNormal:
                surnormMap_fig = compsurfnorm.visualize(depthMap=depthMap, invcamK=inputs['invcamK'], viewindex = index)
                surnormMap = compsurfnorm(depthMap=depthMap, invcamK=inputs['invcamK'])

            if viewTypeWiseRegularization:
                wallType = [2, 3, 4]  # Building, wall, fence
                roadType = [0, 1, 9]  # road, sidewalk, terrain
                permuType = [5, 7]  # Pole, traffic sign
                chanWinSize = 5

                wallMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8)
                roadMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8)
                permuMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8)

                for m in wallType:
                    wallMask = wallMask * (scaled_smeantic_label != m)
                wallMask = 1 - wallMask
                wallMask = wallMask[:, :, 1:-1, 1:-1]

                for m in roadType:
                    roadMask = roadMask * (scaled_smeantic_label != m)
                roadMask = 1 - roadMask
                roadMask = roadMask[:, :, 1:-1, 1:-1]

                for m in permuType:
                    permuMask = permuMask * (scaled_smeantic_label != m)
                permuMask = 1 - permuMask
                permuMask = permuMask[:, :, 1:-1, 1:-1]

                BdErrFig, viewRdErrFig = typeWReg.visualize_regularizeBuildingRoad(surnormMap, wallMask, roadMask,
                                                                                 dispMap, viewInd=index)
                padSize = int((chanWinSize - 1) / 2)
                permuMask = permuMask[:, :, padSize: -padSize, padSize: -padSize]
                surVarFig = typeWReg.visualize_regularizePoleSign(surnormMap, permuMask, dispMap, viewInd=index)

            if viewBorderWiseRegularization:
                wallType = [2, 3, 4]  # Building, wall, fence
                roadType = [0, 1, 9]  # road, sidewalk, terrain
                wallTypeMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8)
                roadTypeMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8)
                foreGroundMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8)

                for m in wallType:
                    wallTypeMask = wallTypeMask * (scaled_smeantic_label != m)
                wallTypeMask = (1 - wallTypeMask).float()

                for m in roadType:
                    roadTypeMask = roadTypeMask * (scaled_smeantic_label != m)
                roadTypeMask = (1 - roadTypeMask).float()

                for m in foregroundType:
                    foreGroundMask = foreGroundMask * (scaled_smeantic_label != m)
                foreGroundMask = (1 - foreGroundMask).float()

                borderWiseReg.visualize(
                    realDepth=depthMap, dispAct=depthMap,
                    foredgroundMask=foreGroundMask, wallTypeMask=wallTypeMask, groundTypeMask=roadTypeMask,
                    intrinsic=inputs['realIn'], extrinsic=inputs['realEx'], semantic=scaled_smeantic_label, viewInd=0)

            if viewMonoMsak:
                extrinsics = computePose(inputs, opt, depthMap, posenet_encoder, posenet_decoder)
                depthMap_cur = depthMap
                depthMap_prev = computeDepthMap(inputs['color', -1, 0], encoder, depth_decoder, opt.min_depth, opt.max_depth)
                depthMap_next = computeDepthMap(inputs['color', 1, 0], encoder, depth_decoder, opt.min_depth, opt.max_depth)
                pts_cur = depth23dpts(depthMap_cur, inputs['intrinsic'])
                pts_next = depth23dpts(depthMap_prev, inputs['intrinsic'], extrinsics)
                pts_prev = depth23dpts(depthMap_next, inputs['intrinsic'], extrinsics)

            if opt.eval_stereo:
                real_scale_depth = depthMap * STEREO_SCALE_FACTOR
            elif opt.eval_mono:
                ratio = torch.mean(inputs['depth_gt'][inputs['depth_gt'] > 0.1]) / torch.mean(depthMap)
                real_scale_depth = depthMap * ratio
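            # Monocular predictions are scale-ambiguous, so they are rescaled by the ratio
            # of mean ground-truth depth to mean predicted depth; stereo-trained models use
            # the fixed STEREO_SCALE_FACTOR instead.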

            gtmask = (inputs['depth_gt'] > 0).float()
            gtdepth = inputs['depth_gt']
            velo = inputs['velo']
            tensor23dPts.visualize3d(
                real_scale_depth, ind=index, intrinsic_in=inputs['realIn'], extrinsic_in=inputs['realEx'], gtmask_in=gtmask,
                gtdepth_in=gtdepth, semanticMap=scaled_smeantic_label, velo_in=velo, rgb_in=inputs[('color', 's', 0)],
                disp_in=outputs[('disp', 0)]
                                   )

            suppressed_disp_Map = dispMap * (1 - stereoSemanticalMask)
            semantic_fig = tensor2semantic(inputs[('semantic_label', 0)], ind=index, isGt=True).resize([opt.width, opt.height], pil.NEAREST)
            disp_fig = tensor2disp(dispMap, ind = index)
            suppressed_disp_Map_fig = tensor2disp(suppressed_disp_Map, ind = index)
            rgb_fig = tensor2rgb(inputs[("color", 0, 0)], ind = index)
            combined_fig1 = pil.fromarray((np.array(semantic_fig) * 0.15 + np.array(disp_fig)[:,:,0:3] * 0.85).astype(np.uint8))
            combined_fig2 = pil.fromarray(
                (np.array(rgb_fig) * 0.2 + np.array(disp_fig)[:, :, 0:3] * 0.8).astype(np.uint8))
            combined_fig = pil.fromarray(np.concatenate([np.array(combined_fig1), np.array(combined_fig2), np.array(suppressed_disp_Map_fig)[:,:,0:3], np.array(surnormMap_fig)], axis=0))
            combined_fig.save(os.path.join(sv_path, str(idx) + ".png"))
            print("save %s" % (str(idx) + ".png"))
Example #12
0
    def __init__(self, _host_frame, _target_frame):
        '''
        Initialize the random-pattern-based photometric residual wrapper.
        :param _host_frame: numpy ndarray H x W x 3 image.
        :param _target_frame: numpy ndarray image, same dimension as above.
        '''
        # load options
        options = MonodepthOptions()
        opts = options.parse()
        self.opt = opts
        self.num_input_frames = len(self.opt.frame_ids)
        # init model
        self.model_name = "mono_1024x320"

        download_model_if_doesnt_exist(self.model_name)
        self.encoder_path = os.path.join("models", self.model_name,
                                         "encoder.pth")
        self.depth_decoder_path = os.path.join("models", self.model_name,
                                               "depth.pth")
        self.pose_encoder_path = os.path.join("models", self.model_name,
                                              "pose_encoder.pth")
        self.pose_decoder_path = os.path.join("models", self.model_name,
                                              "pose.pth")

        # LOADING PRETRAINED MODEL
        self.encoder = networks.ResnetEncoder(18, False)
        self.depth_decoder = networks.DepthDecoder(
            num_ch_enc=self.encoder.num_ch_enc, scales=range(4))
        self.pose_encoder = networks.ResnetEncoder(self.opt.num_layers, False,
                                                   2)
        # self.pose_encoder = networks.PoseCNN(self.num_input_frames if self.opt.pose_model_input == "all" else 2)
        self.pose_decoder = networks.PoseDecoder(self.pose_encoder.num_ch_enc,
                                                 1, 2)
        # self.pose_decoder = networks.PoseDecoder(self.pose_encoder.num_ch_enc, num_input_features=1,
        #                                          num_frames_to_predict_for=2)

        self.loaded_dict_enc = torch.load(self.encoder_path,
                                          map_location='cpu')
        self.filtered_dict_enc = {
            k: v
            for k, v in self.loaded_dict_enc.items()
            if k in self.encoder.state_dict()
        }
        self.encoder.load_state_dict(self.filtered_dict_enc)

        self.loaded_dict_pose_enc = torch.load(self.pose_encoder_path,
                                               map_location='cpu')
        self.filtered_dict_pose_enc = {
            k: v
            for k, v in self.loaded_dict_pose_enc.items()
            if k in self.pose_encoder.state_dict()
        }
        self.pose_encoder.load_state_dict(self.filtered_dict_pose_enc)

        self.loaded_dict = torch.load(self.depth_decoder_path,
                                      map_location='cpu')
        self.depth_decoder.load_state_dict(self.loaded_dict)

        self.loaded_dict_pose = torch.load(self.pose_decoder_path,
                                           map_location='cpu')
        self.pose_decoder.load_state_dict(self.loaded_dict_pose)

        self.encoder.eval()
        self.depth_decoder.eval()

        self.pose_encoder.eval()
        self.pose_decoder.eval()
        self.isgood = []

        # define frames
        self.host_frame = _host_frame
        self.target_frame = _target_frame
        self.host_frame_dx, self.host_frame_dy = image_gradients(
            self.host_frame)
        self.target_frame_dx, self.target_frame_dy = image_gradients(
            self.target_frame)

        # dso's pattern:
        self.residual_pattern = np.array([
            [0, 0],
            [-2, 0],
            [2, 0],
            [-1, -1],
            [1, 1],
            [-1, 1],
            [1, -1],
            [0, 2],
            [0, -2],
        ])
Example #13
0
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained depth encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained depth decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    # don't try to predict disparity for a disparity image!
    paths = [img for img in paths if not img.endswith("_disp.jpg")]

    if len(paths) > 3:
        print("   Loading Pose network")
        pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
        pose_decoder_path = os.path.join(model_path, "pose.pth")

        pose_encoder = networks.ResnetEncoder(18, False, 2)
        pose_encoder.load_state_dict(torch.load(pose_encoder_path))

        pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
        pose_decoder.load_state_dict(torch.load(pose_decoder_path))

        pose_encoder.to(device)
        pose_encoder.eval()
        pose_decoder.to(device)
        pose_decoder.eval()

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        print("-> Predicting disparities on {:d} test images".format(
            len(paths)))
        processed_images = []
        for idx, image_path in enumerate(paths):
            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            processed_images += [input_image]

            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())
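            # disp_to_depth maps the sigmoid output to disparities in [1/100, 1/0.1],
            # i.e. depths between 0.1 and 100 (the usual Monodepth2 convention).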

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

        if len(processed_images) > 3:
            pred_poses = []
            rotations = []
            translations = []
            print("-> Predicting poses on {:d} test images".format(
                len(processed_images)))
            for idx, (a, b) in enumerate(
                    zip(processed_images[:-1], processed_images[1:])):
                all_color_aug = torch.cat([a, b], 1)

                features = [pose_encoder(all_color_aug)]
                axisangle, translation = pose_decoder(features)

                rotations += [axisangle[:, 0].cpu().numpy()]
                translations += [translation[:, 0].cpu().numpy()]

                pred_poses.append(
                    transformation_from_parameters(
                        axisangle[:, 0], translation[:, 0]).cpu().numpy())
            pred_poses = np.concatenate(pred_poses)
            save_path = os.path.join(args.image_path, "pred_poses.npy")
            np.save(save_path, pred_poses)
            print("-> Pose Predictions saved to", save_path)
            local_xyzs = np.array(dump_xyz(pred_poses))
            save_path = os.path.join(args.image_path, "pred_xyzs.npy")
            np.save(save_path, local_xyzs)
            print("-> Predicted path saved to", save_path)

            save_path = os.path.join(args.image_path, "axisangle.npy")
            np.save(save_path, np.concatenate(rotations))
            print("-> Predicted axis angles saved to", save_path)
            save_path = os.path.join(args.image_path, "translation.npy")
            np.save(save_path, np.concatenate(translations))
            print("-> Predicted translations saved to", save_path)

    print('-> Done!')
Example #14
0
def main(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    #assert opt.eval_split == "odom_9" or opt.eval_split == "odom_10", \
    #    "eval_split should be either odom_9 or odom_10"

    #sequence_id = int(opt.eval_split.split("_")[1])

    #filenames = readlines(
    #    os.path.join(os.path.dirname(__file__), "splits", "odom",
    #                 "test_files_{:02d}.txt".format(sequence_id)))
    # dataset = KITTIOdomDataset(opt.eval_pose_data_path, filenames, opt.height, opt.width,
    #                            [0, 1], 4, is_train=False)

    filenames = readlines(Path('./splits') / opt.split / 'test_files.txt')

    dataset = CustomMonoDataset(opt.dataset_path,
                                filenames,
                                opt.height,
                                opt.width, [0, 1],
                                1,
                                is_train=False)

    dataloader = DataLoader(dataset,
                            opt.batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=True,
                            drop_last=False)

    #model
    pose_encoder_path = Path(opt.load_weights_folder) / "pose_encoder.pth"
    pose_decoder_path = Path(opt.load_weights_folder) / "pose.pth"

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    pred_poses = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    print("-> eval " + opt.split)
    for inputs in tqdm(dataloader):
        for key, ipt in inputs.items():
            inputs[key] = ipt.cuda()

        all_color_aug = torch.cat(
            [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

        features = [pose_encoder(all_color_aug)]
        axisangle, translation = pose_decoder(features)

        pred_pose = transformation_from_parameters(axisangle[:, 0],
                                                   translation[:, 0])
        pred_pose = pred_pose.cpu().numpy()
        pred_poses.append(pred_pose)

    pred_poses = np.concatenate(pred_poses)
    length = pred_poses.shape[0]
    pred_poses.resize([length, 16])
    pred_poses = pred_poses[:, :12]
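    # Each 4x4 pose matrix is flattened to 16 values and truncated to the first 12,
    # i.e. the 3x4 [R|t] block, which matches the KITTI odometry pose file format.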
    filename = opt.dump_name
    np.savetxt(filename, pred_poses, delimiter=' ', fmt='%1.8e')

    print("-> Predictions saved to", filename)
Example #15
0
    def __init__(self, options, joint_training=False):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        config_file = "./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"
        cfg.merge_from_file(config_file)
        if joint_training:
            self.joint_training = True
            cfg.merge_from_list(options.opts)
        else:
            self.joint_training = False
            
        cfg.freeze()
        self.cfg = cfg
        # maskrcnn_path = "./e2e_mask_rcnn_R_50_FPN_1x.pth"
        maskrcnn_path = self.opt.maskrcnn_weights
        # maskrcnn_path = "./weights/encoder.pth"

        self.models["encoder"] = networks.ResnetEncoder(
            self.cfg, maskrcnn_path, joint_training=self.joint_training
        )
        self.models["encoder"].to(self.device)
        if self.joint_training:
            self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(scales=self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        if self.use_pose_net:
            self.models["pose"] = networks.PoseDecoder(self.num_pose_frames)
            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        if self.opt.predictive_mask:
            assert self.opt.disable_automasking, \
                "When using predictive_mask, please disable automasking with --disable_automasking"

            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc, self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(self.models["predictive_mask"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, self.opt.scheduler_gamma)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ", self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {"kitti": datasets.KITTIRAWDataset,
                         "kitti_odom": datasets.KITTIOdomDataset}
        if self.opt.dataset != 'mixed':
            self.dataset = datasets_dict[self.opt.dataset]
        else:
            self.dataset = datasets.MixedDataset

        fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt")

        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png' if self.opt.png else '.jpg'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(
            self.opt.data_path, train_filenames, self.opt.height, self.opt.width,
            self.opt.frame_ids, self.opt.step, self.num_scales, is_train=True, img_ext=img_ext)
        self.train_loader = DataLoader(
            train_dataset, self.opt.batch_size, True,
            num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)
        val_dataset = self.dataset(
            self.opt.data_path, val_filenames, self.opt.height, self.opt.width,
            self.opt.frame_ids, self.opt.step, self.num_scales, is_train=False, img_ext=img_ext)
        self.val_loader = DataLoader(
            val_dataset, self.opt.batch_size, True,
            num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)
        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2 ** scale)
            w = self.opt.width // (2 ** scale)

            self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"]

        print("Using split:\n  ", self.opt.split)
        print("There are {:d} training items and {:d} validation items\n".format(
            len(train_dataset), len(val_dataset)))

        self.save_opts()
Example #16
0
def evaluate_pose(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split == "odom_09" or opt.eval_split == "odom_10", \
        "eval_split should be either odom_9 or odom_10"

    device = torch.device("cpu" if opt.no_cuda else "cuda")

    sequence_id = int(opt.eval_split.split("_")[-1])

    if opt.pose_model_input == "pairs":
        opt.frame_ids = [1, 0]  # pose network only takes two frames as input
        num_poses = 1
        filenames = readlines(
            os.path.join(
                os.path.dirname(__file__), "splits", "odom",
                "test_files_{}_{:02d}.txt".format("pairs", sequence_id)))
    else:
        opt.frame_ids = [i for i in opt.frame_ids if i != "s"]
        num_poses = len(opt.frame_ids) - 1
        filenames = readlines(
            os.path.join(
                os.path.dirname(__file__), "splits", "odom",
                "test_files_{}_{:02d}.txt".format("all" + str(num_poses + 1),
                                                  sequence_id)))

    img_ext = '.png' if opt.png else '.jpg'
    dataset = datasets_dict[opt.eval_split](opt.data_path,
                                            filenames,
                                            opt.height,
                                            opt.width,
                                            opt.frame_ids,
                                            4,
                                            is_train=False,
                                            img_ext=img_ext)
    dataloader = DataLoader(dataset,
                            opt.batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder,
                                     "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, num_poses + 1)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, num_poses, 1)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.to(device)
    pose_encoder.eval()
    pose_decoder.to(device)
    pose_decoder.eval()

    pred_poses = []
    flip_pred_poses = []

    print("-> Computing pose predictions")

    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                inputs[key] = ipt.to(device)

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            if opt.post_process:
                # Left-Right Flip as Post-processing to further improve accuracy of pose estimation
                all_color_aug = torch.cat(
                    (all_color_aug, torch.flip(all_color_aug, [3])), 0)

            features = pose_encoder(all_color_aug)
            axisangle, translation = pose_decoder(features)

            if opt.post_process:
                N = axisangle.shape[0] // 2
                pred_poses.append(
                    transformation_from_parameters(
                        axisangle[:N].view(N * num_poses, 1, 3),
                        translation[:N].view(N * num_poses, 1, 3),
                        invert=True).cpu().numpy().reshape(N, num_poses, 4, 4))
                flip_pred_poses.append(
                    transformation_from_parameters(
                        axisangle[N:].view(N * num_poses, 1, 3),
                        translation[N:].view(N * num_poses, 1, 3),
                        invert=True).cpu().numpy().reshape(N, num_poses, 4, 4))
            else:
                N = axisangle.shape[0]
                pred_poses.append(
                    transformation_from_parameters(
                        axisangle.view(N * num_poses, 1, 3),
                        translation.view(N * num_poses, 1, 3),
                        invert=True).cpu().numpy().reshape(N, num_poses, 4, 4))

    pred_poses = np.concatenate(pred_poses)

    if opt.post_process:
        flip_pred_poses = np.concatenate(flip_pred_poses)
        flip_pred_poses[:, :, 1:3, 0] *= -1
        flip_pred_poses[:, :, 0, 1:] *= -1
        pred_poses = average_poses(np.array([pred_poses, flip_pred_poses]))
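        # Poses predicted from the horizontally flipped batch are mirrored back by the
        # sign flips above and then averaged with the unflipped predictions.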

    gt_poses_path = os.path.join(opt.data_path, "poses",
                                 "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i]))
    gt_local_poses = np.expand_dims(np.array(gt_local_poses), axis=1)

    ATEs = []
    REs = []
    num_frames = gt_global_poses.shape[0]
    track_length = 5
    for i in range(0, num_frames - track_length):
        gt_odometry = local_poses_to_odometry(gt_local_poses[i:i +
                                                             track_length - 1])
        pred_odometry = local_poses_to_odometry(pred_poses[i:i + track_length -
                                                           num_poses])
        ATE, RE = compute_pose_error(gt_odometry, pred_odometry)
        ATEs.append(ATE)
        REs.append(RE)

    print("\n Trajectory error: \n"
          "    ATE: {:0.4f}, std: {:0.4f} \n"
          "    RE: {:0.4f}, std: {:0.4f}  \n ".format(np.mean(ATEs),
                                                      np.std(ATEs),
                                                      np.mean(REs),
                                                      np.std(REs)))

    # compute the global monocular visual odometry and save it
    global_pred_odometry = local_poses_to_odometry(pred_poses)

    save_filename = opt.eval_split
    if opt.post_process:
        save_filename = save_filename + "_pp"
    save_path = os.path.join(opt.load_weights_folder, save_filename + ".txt")
    np.savetxt(save_path,
               global_pred_odometry[:, :-1, :].reshape(
                   global_pred_odometry.shape[0], -1),
               delimiter=' ',
               fmt='%1.8e')
    print("-> Predictions saved to", save_path)
Example #17
0
    def __init__(self, options):

        self.opt = options

        self.debug = self.opt.debug
        print('DEBUG: ', self.debug)

        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = True

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        if self.use_pose_net:
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(
                    self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input ==
                    "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.MultiStepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)
        print("Training is using frames: \n  ", self.opt.frame_ids_to_train)

        # data
        datasets_dict = {"nyu": datasets.NYUDataset}
        self.dataset = datasets_dict[self.opt.dataset]

        train_filenames = readlines('./splits/nyu_train_0_10_20_30_40.txt')

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     1,
                                     is_train=True,
                                     segment_path=self.opt.segment_path,
                                     return_segment=True,
                                     shared_dict=shared_dict)

        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)

        # validation
        filenames = readlines('./splits/nyu_test.txt')
        # filenames = [filename.replace("/p300/Code/self_depth/monodepth2/nyuv2/nyu_official",
        #                               self.opt.val_path) for filename in filenames]
        val_dataset = datasets.NYUDataset(self.opt.val_path,
                                          filenames,
                                          self.opt.height,
                                          self.opt.width, [0],
                                          1,
                                          is_train=False,
                                          return_segment=False)
        self.val_dataloader = DataLoader(val_dataset,
                                         1,
                                         shuffle=False,
                                         num_workers=2)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        self.ssim_sparse = SSIM_sparse()
        self.ssim_sparse.to(self.device)

        self.backproject_depth = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), -1))

        self.save_opts()
Example #18
0
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        if self.use_pose_net:
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input == "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        if self.opt.predictive_mask:
            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc, self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(self.models["predictive_mask"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ", self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # dataset options
        datasets_dict = {'kitti': KITTIRAWDataset,
                         'kitti_odom': KITTIOdomDataset,
                         'FLIR': FlirDataset,
                         'KAIST': KAIST_Dataset,
                         'CREOL': CreolDataset,
                         'all_thermal_data': [FlirDataset, KAIST_Dataset, CreolDataset]}

        assert (self.opt.img_ext == '.png') or (self.opt.img_ext == '.jpg') or (
                    self.opt.img_ext == '.jpeg'), "Please provide a correct image extension"

        img_ext = self.opt.img_ext

        self.dataset = datasets_dict[self.opt.dataset]

        if self.opt.dataset != 'all_thermal_data':
            train_filenames, val_filenames, thermal = get_filenames(self.opt.dataset, self.opt.data_path, self.opt.split)

            num_train_samples = len(train_filenames)
            num_val_samples = len(val_filenames)
            self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

            train_dataset = self.dataset(
                self.opt.data_path, train_filenames, self.opt.height, self.opt.width,
                self.opt.frame_ids, 4, is_train=True, img_ext=img_ext, thermal=thermal)

            self.train_loader = DataLoader(
                train_dataset, self.opt.batch_size, True,
                num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)

            val_dataset = self.dataset(
                self.opt.data_path, val_filenames, self.opt.height, self.opt.width,
                self.opt.frame_ids, 4, is_train=False, img_ext=img_ext, thermal=thermal)

            self.val_loader = DataLoader(
                val_dataset, self.opt.batch_size, True,
                num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)

            self.val_iter = iter(self.val_loader)
        else:
            datasets = ['FLIR', 'KAIST', 'CREOL']
            data_paths = ['/groups/mshah/data/FLIR/pre_dat/', '/groups/mshah/data/KAIST_multispectral/', '../robert_video/']

            train_datasets = []
            val_datasets = []

            num_train_samples = 0
            num_val_samples = 0

            for i, dataset in enumerate(self.dataset):
                train_filenames, val_filenames, thermal = get_filenames(datasets[i], data_paths[i], self.opt.split)

                print(datasets[i] + ' train: ' + data_paths[i] + ' - ' + str(len(train_filenames)))
                print(datasets[i] + ' val: ' + data_paths[i] + ' - ' + str(len(val_filenames)))

                num_train_samples += len(train_filenames)
                num_val_samples += len(val_filenames)

                train_datasets.append(dataset(
                    data_paths[i], train_filenames, self.opt.height, self.opt.width,
                    self.opt.frame_ids, 4, is_train=True, img_ext=img_ext, thermal=thermal))

                val_datasets.append(dataset(
                    data_paths[i], val_filenames, self.opt.height, self.opt.width,
                    self.opt.frame_ids, 4, is_train=False, img_ext=img_ext, thermal=thermal))

            self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

            self.train_loader = DataLoader(
                ConcatDataset(train_datasets), self.opt.batch_size, True,
                num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)

            self.val_loader = DataLoader(
                ConcatDataset(val_datasets), self.opt.batch_size, True,
                num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)

            self.val_iter = iter(self.val_loader)

       # self.writers = {}
       # for mode in ["train", "val"]:
       #     self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2 ** scale)
            w = self.opt.width // (2 ** scale)

            self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"]

        if self.opt.dataset.startswith('kitti'):
            print("Using split:\n  ", self.opt.split)
        else:
            print("Using dataset:\n  ", self.opt.dataset)
        
        print("There are {:d} training items and {:d} validation items\n".format(
            num_train_samples, num_val_samples))

        self.save_opts()
Example #19
0
    def __init__(self, options):
        self.opt = options
        self.seed_everything()

        # create dirs for logs and predictions if they do not exist
        self.log_path = self.opt.log_dir
        if not os.path.exists(self.log_path):
            os.mkdir(self.log_path)
        preds_dir = os.path.join(self.log_path, "preds")
        if not os.path.exists(preds_dir):
            os.mkdir(preds_dir)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        # we don't expect anyone to run this on CPU
        self.device = torch.device("cuda")

        # model initialization
        self.models = {}
        self.parameters_to_train = []

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, True)
        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["pose_encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, True, num_input_images=self.num_input_frames)
        self.models["pose"] = networks.PoseDecoder(
            self.models["pose_encoder"].num_ch_enc,
            num_input_features=1,
            num_frames_to_predict_for=2)

        for _, m in self.models.items():
            m.to(self.device)
            self.parameters_to_train += list(m.parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)

        self.ssim = SSIM()
        self.ssim.to(self.device)

        self.backproject_depth = BackprojectDepth(
            self.opt.batch_size * self.num_scales, self.opt.height,
            self.opt.width)
        self.backproject_depth.to(self.device)

        self.project_3d = Project3D(
            self.opt.batch_size * (self.num_input_frames - 1) *
            self.num_scales, self.opt.height, self.opt.width)
        self.project_3d.to(self.device)
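        # note: these layers are built once with an enlarged batch dimension
        # (all scales, and all source frames for Project3D) so a whole batch
        # can be warped in a single call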

        # save adaptation parameters to the log dir
        self.save_opts()
Example #20
0
def test_depth_pose(args):
    """Function to predict depth and pose
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained depth encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    print("   Loading pretrained pose encoder")
    pose_encoder = networks.ResnetEncoder(18, False, 2)
    loaded_dict_pose_enc = torch.load(pose_encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)

    pose_encoder.load_state_dict(loaded_dict_pose_enc)

    encoder.to(device)
    pose_encoder.to(device)
    encoder.eval()
    pose_encoder.eval()

    print("   Loading pretrained depth decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    print("   Loading pretrained pose decoder")
    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    loaded_dict_pose = torch.load(pose_decoder_path, map_location=device)
    pose_decoder.load_state_dict(loaded_dict_pose)

    depth_decoder.to(device)
    pose_decoder.to(device)
    depth_decoder.eval()
    pose_decoder.eval()

    print("-> Predicting on test images")

    pred_depths = []
    pred_poses = []

    backproject_depth = BackprojectDepth(1, feed_height, feed_width)
    backproject_depth.to(device)
    project_3d = Project3D(1, feed_height, feed_width)
    project_3d.to(device)

    K = np.array(
        [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)
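    # intrinsics in normalized image coordinates (monodepth2's averaged KITTI K);
    # the first two rows are scaled to the network input resolution below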
    K[0, :] *= feed_width
    K[1, :] *= feed_height
    inv_K = np.linalg.pinv(K)

    K = torch.from_numpy(K)
    K = K.unsqueeze(0).to(device)
    inv_K = torch.from_numpy(inv_K)
    inv_K = inv_K.unsqueeze(0).to(device)

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():

        for i in range(107):

            # Load image and preprocess
            image_0_path = './kitti_data/01/{:010d}.jpg'.format(i)
            input_image_0 = Image.open(image_0_path).convert('RGB')
            original_width, original_height = input_image_0.size
            input_image_0 = input_image_0.resize((feed_width, feed_height),
                                                 Image.LANCZOS)
            input_image_0 = transforms.ToTensor()(input_image_0).unsqueeze(0)

            image_1_path = './kitti_data/01/{:010d}.jpg'.format(i + 1)
            input_image_1 = Image.open(image_1_path).convert('RGB')
            input_image_1 = input_image_1.resize((feed_width, feed_height),
                                                 Image.LANCZOS)
            input_image_1 = transforms.ToTensor()(input_image_1).unsqueeze(0)

            # PREDICTION for depth
            input_image_0 = input_image_0.to(device)
            features = encoder(input_image_0)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            #disp_resized = torch.nn.functional.interpolate(
            #    disp, (original_height, original_width), mode="bilinear", align_corners=False)

            _, pred_depth = disp_to_depth(disp, 0.1, 100)
            # keep pred_depth as a tensor on the device for the warping step below;
            # store a numpy copy for saving
            pred_depth_np = pred_depth.cpu()[:, 0].numpy()

            pred_depths.append(pred_depth_np[0])

            print("   Predict Depth {:d}".format(i))

            # PREDICTION for pose
            input_image_1 = input_image_1.to(device)
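            # the pose encoder expects the two frames stacked along the channel axis (6 channels)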
            input_image_pose = torch.cat([input_image_0, input_image_1], 1)
            features_pose = pose_encoder(input_image_pose)
            features_pose = [features_pose]
            axisangle, translation = pose_decoder(features_pose)

            pred_pose = transformation_from_parameters(axisangle[:, 0],
                                                       translation[:, 0])
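            # transformation_from_parameters turns the predicted axis-angle rotation and
            # translation into a 4x4 relative camera pose between the two input frames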

            pred_poses.append(pred_pose.cpu()[0].numpy())

            print("   Predict Pose {:d}".format(i))
            print(pred_pose)

            # WARPED image
            if RECONSTRUCTION:
                print("   Reconstruct image {:d}".format(i))
                cam_points = backproject_depth(pred_depth, inv_K)
                pix_coords = project_3d(cam_points, K, pred_pose)
                reconstruct_image_0 = torch.nn.functional.grid_sample(
                    input_image_1, pix_coords, padding_mode="border")
                print("   Saving resonstructed image...")

                reconstruct_image_0 = torch.nn.functional.interpolate(
                    reconstruct_image_0, (original_height, original_width),
                    mode="bilinear",
                    align_corners=False)
                reconstruct_image_0_np = reconstruct_image_0.squeeze().cpu(
                ).numpy()
                reconstruct_image_0_np = (reconstruct_image_0_np * 255).astype(
                    np.uint8)
                reconstruct_image_0_np = np.concatenate([
                    np.expand_dims(reconstruct_image_0_np[i], 2)
                    for i in range(3)
                ], 2)
                im = Image.fromarray(reconstruct_image_0_np, mode='RGB')
                name_dest_im = os.path.join("kitti_data/01", "warped",
                                            "{:010d}_warped.jpg".format(i))
                im.save(name_dest_im)
            print("...")

    np.save('kitti_data/pred_depth_01.npy', np.array(pred_depths))
    np.save('kitti_data/pred_pose_01.npy', np.array(pred_poses))
    print('-> Done!')
Example #21
0
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array(
        [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split == "odom_9" or opt.eval_split == "odom_10" or opt.eval_split == "odom_0", \
        "eval_split should be either odom_9 or odom_10"

    sequence_id = int(opt.eval_split.split("_")[1])

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path,
                               filenames,
                               opt.height,
                               opt.width, [0, 1],
                               4,
                               is_train=False)
    dataloader = DataLoader(dataset,
                            opt.batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder,
                                     "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")
    depth_encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    depth_decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))
    depth_encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_encoder_dict = torch.load(depth_encoder_path)
    model_dict = depth_encoder.state_dict()
    depth_encoder.load_state_dict(
        {k: v
         for k, v in depth_encoder_dict.items() if k in model_dict})

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))
    depth_decoder = networks.DepthDecoder(depth_encoder.num_ch_enc)
    depth_decoder.load_state_dict(torch.load(depth_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()
    depth_encoder.cuda()
    depth_encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    pred_poses = []
    pred_disps = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            input_color = inputs[("color", 0, 0)].cuda()
            depth_output = depth_decoder(depth_encoder(input_color))

            pred_disp, _ = disp_to_depth(depth_output[("disp", 0)],
                                         opt.min_depth, opt.max_depth)
            pred_disp = pred_disp.cpu()[:, 0].numpy()

            pred_disps.append(pred_disp)

            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(axisangle[:, 0],
                                               translation[:,
                                                           0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)
    pred_disps = np.concatenate(pred_disps)
    pred_poses_scaled = []
    ratios_d = []
    gt_norms_div = []
    gt_norms = []
    pred_norms = []
    td_divs_dgc = []
    poses_pred = []
    # ScaleRecovery is built once outside the loop; pred_disps has the same length
    # as pred_poses, so stop one frame early to keep the i + 1 depth lookup in range
    scale_recovery = ScaleRecovery(1, 192, 640, K).cuda()
    for i in range(pred_poses.shape[0] - 1):
        pred_pose = pred_poses[i]
        pred_disp = pred_disps[i + 1]
        pred_depth = 1 / pred_disp
        pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda()
        # recover metric scale from the predicted depth and rescale the translation
        ratio = scale_recovery(pred_depth).cpu().item()
        pred_pose_scaled = pred_pose[:3, 3] * ratio
        poses_pred.append(pred_pose[:3, 3])
        pred_poses_scaled.append(pred_pose_scaled)
        ratios_d.append(ratio)

    gt_poses_path = os.path.join(opt.data_path, "poses",
                                 "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
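    # pad the 3x4 KITTI ground-truth poses to homogeneous 4x4 matrices and extract translations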
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
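    # relative pose between consecutive frames, derived from the global ground-truth poses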
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(
                np.dot(np.linalg.inv(gt_global_poses[i - 1]),
                       gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 5
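    # the trajectory error is averaged over overlapping 5-frame snippets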
    for i in range(0, num_frames - 1):
        local_xyzs = np.array(
            dump_xyz(pred_poses_scaled[i:i + track_length - 1]))
        gt_local_xyzs = np.array(
            dump_xyz(gt_local_poses[i:i + track_length - 1]))
        gt_norm_div = np.linalg.norm(gt_local_xyzs) / np.linalg.norm(
            local_xyzs)
        ates.append(compute_ate(gt_local_xyzs, local_xyzs))
        gt_norms_div.append(gt_norm_div)
        gt_norms.append(np.linalg.norm(gt_local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(
        np.mean(ates), np.std(ates)))

    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_scaled{:02d}.npy".format(sequence_id))
    np.save(save_path, np.array(pred_poses_scaled))
    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_gt{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_xyzs)
    save_path = os.path.join(os.path.dirname(__file__),
                             "poses_pred{:02d}.npy".format(sequence_id))
    np.save(save_path, np.array(poses_pred))
    save_path = os.path.join(os.path.dirname(__file__),
                             "gt_norms{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_norms)
    save_path = os.path.join(os.path.dirname(__file__),
                             "gt_norms_div{:02d}.npy".format(sequence_id))
    np.save(save_path, gt_norms_div)
    save_path = os.path.join(os.path.dirname(__file__),
                             "ratios_d{:02d}.npy".format(sequence_id))
    np.save(save_path, ratios_d)
    save_path = os.path.join(os.path.dirname(__file__),
                             "pred_norms{:02d}.npy".format(sequence_id))
    np.save(save_path, pred_norms)
    print("-> Predictions saved to", save_path)
Example #22
0
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    K = np.array(
        [[0.5, 0, 0.5, 0], [0, 1.656, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", opt.eval_split,
                     "test_files.txt"))

    dataset = AirSimDataset(opt.data_path,
                            filenames,
                            opt.height,
                            opt.width, [0, 1],
                            4,
                            is_train=False)
    dataloader = DataLoader(dataset,
                            opt.batch_size,
                            shuffle=False,
                            num_workers=opt.num_workers,
                            pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder,
                                     "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")
    depth_encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    depth_decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))
    depth_encoder = networks.ResnetEncoder(opt.num_layers, False)
    depth_encoder_dict = torch.load(depth_encoder_path)
    model_dict = depth_encoder.state_dict()
    depth_encoder.load_state_dict(
        {k: v
         for k, v in depth_encoder_dict.items() if k in model_dict})

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))
    depth_decoder = networks.DepthDecoder(depth_encoder.num_ch_enc)
    depth_decoder.load_state_dict(torch.load(depth_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()
    depth_encoder.cuda()
    depth_encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    pred_poses = []
    pred_disps = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            input_color = inputs[("color", 0, 0)].cuda()
            depth_output = depth_decoder(depth_encoder(input_color))

            pred_disp, _ = disp_to_depth(depth_output[("disp", 0)],
                                         opt.min_depth, opt.max_depth)
            pred_disp = pred_disp.cpu()[:, 0].numpy()

            pred_disps.append(pred_disp)

            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(axisangle[:, 0],
                                               translation[:,
                                                           0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)
    pred_disps = np.concatenate(pred_disps)

    gt_norms_div = []
    gt_norms = []
    pred_norms = []
    trans_pred = pred_poses[:, :3, 3]

    gt_poses_path = os.path.join(opt.data_path, "poses.txt")
    gt_local_poses = read_pose(gt_poses_path)
    num_frames = gt_local_poses.shape[0]
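    # per-frame ratio of ground-truth to predicted translation norm, i.e. the scale factor between them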
    for i in range(num_frames):
        local_xyzs = pred_poses[i, :3, 3]
        gt_local_xyzs = gt_local_poses[i, :3, 3]
        gt_norm_div = np.linalg.norm(gt_local_xyzs) / np.linalg.norm(
            local_xyzs)
        gt_norms_div.append(gt_norm_div)

    save_path = os.path.join(os.path.dirname(__file__),
                             "gt_norms_div_AirSim.npy")
    np.save(save_path, gt_norms_div)

    print("-> Predictions saved to", save_path)
Example #23
0
def main_with_masks(args):
    """Function to predict for a single image or folder of images
    """
    print(args.dataset_path)
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    out_path = Path(args.out_path)
    out_path.mkdir_p()
    dirs = {}
    for mask in args.results:
        dirs[mask] = (out_path / mask)
        (out_path / mask).mkdir_p()

    print('-> split:{}'.format(args.split))
    print('-> save to {}'.format(args.out_path))

    if args.split in ['custom', 'custom_lite', 'eigen', 'eigen_zhou']:
        feed_height = 192
        feed_width = 640
        min_depth = 0.1
        max_depth = 80
        full_height = 375
        full_width = 1242
        dataset = KITTIRAWDataset

    elif args.split in ["visdrone", "visdrone_lite"]:
        feed_width = 352
        feed_height = 192
        min_depth = 0.1
        max_depth = 255
        dataset = VSDataset
    elif args.split in ['mc', 'mc_lite']:
        feed_height = 288
        feed_width = 384
        min_depth = 0.1
        max_depth = 255
        dataset = MCDataset

    feed_height = 192
    feed_width = 640

    backproject_depth = BackprojectDepth(1, feed_height, feed_width).to(device)

    project_3d = Project3D(1, feed_height, feed_width).to(device)

    photometric_error = PhotometricError()

    txt_files = args.txt_files
    #data
    test_path = Path(args.wk_root) / "splits" / args.split / txt_files
    test_filenames = readlines(test_path)
    if args.as_name_sort:  # sort filenames by sequence order
        test_filenames.sort()
    # check filenames: drop entries whose neighbouring frames fall outside the sequence
    for i, item in enumerate(test_filenames):
        if args.split in ['eigen', 'custom', 'custom_lite', 'eigen_zhou']:
            dirname, frame, lr = test_filenames[i].split()
            files = (Path(args.dataset_path) / dirname /
                     'image_02/data').files()
            files.sort()
            min_frame = int(files[0].stem)
            max_frame = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= min_frame or int(
                    frame) + args.frame_ids[-1] >= max_frame:
                test_filenames[i] = ''
        if args.split in ['mc', 'mc_lite']:  # already filtered when the split was created
            block, trajactory, color, frame = test_filenames[i].split('/')
            files = (Path(args.dataset_path) / block / trajactory /
                     color).files()
            files.sort()
            min_frame = int(files[0].stem)
            max_frame = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= min_frame or int(
                    frame) + args.frame_ids[-1] >= max_frame:
                test_filenames[i] = ''
        if args.split in ['visdrone', 'visdrone_lite']:  # already filtered when the split was created
            dirname, frame = test_filenames[i].split('/')
            files = (Path(args.dataset_path) / dirname).files()
            files.sort()
            min_frame = int(files[0].stem)
            max_frame = int(files[-1].stem)
            if int(frame) + args.frame_ids[0] <= min_frame or int(
                    frame) + args.frame_ids[-1] >= max_frame:
                test_filenames[i] = ''

    test_filenames = [f for f in test_filenames if f]

    test_dataset = dataset(  # KITTIRAWData
        args.dataset_path,
        test_filenames,
        feed_height,
        feed_width,
        args.frame_ids,
        1,
        is_train=False,
        img_ext=args.ext)

    test_loader = DataLoader(  # train_datasets:KITTIRAWDataset
        dataset=test_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        drop_last=False)

    print('->items num: {}'.format(len(test_loader)))

    #layers

    #download_model_if_doesnt_exist(args.model_path,args.model_name)

    model_path = Path(args.model_path) / args.model_name
    if not model_path.exists():
        print(model_path + " does not exist")

    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    #1 LOADING PRETRAINED MODEL
    #1.1 encoder
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    #1.2 decoder
    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    #paths
    pose_encoder_path = Path(model_path) / "pose_encoder.pth"
    pose_decoder_path = Path(model_path) / 'pose.pth'

    # 2.1 pose encoder
    print("   Loading pretrained pose encoder")

    pose_encoder = networks.ResnetEncoder(18, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path, map_location=device))

    pose_encoder.to(device)
    pose_encoder.eval()

    # 2.2 pose decoder
    print("   Loading pretrained pose decoder")
    pose_decoder = networks.PoseDecoder(num_ch_enc=pose_encoder.num_ch_enc,
                                        num_input_features=1,
                                        num_frames_to_predict_for=2)

    pose_loaded_dict = torch.load(pose_decoder_path, map_location=device)
    pose_decoder.load_state_dict(pose_loaded_dict)

    pose_decoder.to(device)
    pose_decoder.eval()
    source_scale = 0
    scale = 0
    for batch_idx, inputs in tqdm(enumerate(test_loader)):
        for key, ipt in inputs.items():
            inputs[key] = ipt.to(device)
        features = encoder(inputs[("color", 0, 0)])  # a list from 0 to 4

        outputs = depth_decoder(features)  # dict , 4 disptensor

        disp = outputs[("disp", 0)]  # has a same size with input

        #disp_resized = torch.nn.functional.interpolate(disp, (full_height, full_width), mode="bilinear", align_corners=False)

        _, depth = disp_to_depth(disp, min_depth, max_depth)

        for f_i in [args.frame_ids[0], args.frame_ids[-1]]:
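            # order the pair (earlier, later) so the pose network always sees frames in
            # temporal order; the transform is inverted below for source frames before the target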

            if f_i < 0:
                pose_inputs = [
                    inputs[("color", f_i, 0)], inputs[("color", 0, 0)]
                ]
            else:
                pose_inputs = [
                    inputs[("color", 0, 0)], inputs[("color", f_i, 0)]
                ]
            pose_inputs = torch.cat(pose_inputs, 1)
            features = pose_encoder(pose_inputs)
            axisangle, translation = pose_decoder([features])

            outputs[("cam_T_cam", 0, f_i)] = transformation_from_parameters(
                axisangle[:, 0], translation[:, 0], invert=(f_i < 0))  # b44
            T = outputs[("cam_T_cam", 0, f_i)]

            cam_points = backproject_depth(depth,
                                           inputs[("inv_K", 0)])  # D@K_inv
            pix_coords = project_3d(cam_points, inputs[("K", 0)],
                                    T)  # K@D@K_inv

            outputs[("sample", f_i, 0)] = pix_coords  # rigid_flow

            outputs[("color", f_i,
                     0)] = F.grid_sample(inputs[("color", f_i, 0)],
                                         outputs[("sample", f_i, 0)],
                                         padding_mode="border")
            # output"color" 就是i-warped

            # add a depth warp
            outputs[("color_identity", f_i, 0)] = inputs[("color", f_i, 0)]

        target = inputs[("color", 0, 0)]

        reprojection_losses = []
        for frame_id in [args.frame_ids[0], args.frame_ids[-1]]:
            pred = outputs[("color", frame_id, 0)]
            reprojection_losses.append(photometric_error.run(pred, target))

        reprojection_losses = torch.cat(reprojection_losses, 1)

        identity_reprojection_losses = []
        for frame_id in [args.frame_ids[0], args.frame_ids[-1]]:
            pred = inputs[("color", frame_id, source_scale)]
            identity_reprojection_losses.append(
                photometric_error.run(pred, target))
        identity_reprojection_losses = torch.cat(identity_reprojection_losses,
                                                 1)

        erro_maps = torch.cat(
            (identity_reprojection_losses, reprojection_losses), dim=1)  # b4hw
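        # stack identity and reprojection losses so the masks computed below can compare
        # "do nothing" against "warp with predicted depth and pose" per pixel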

        save_name = test_filenames[batch_idx].replace('/', '_')
        save_name = save_name.replace('l', '')
        save_name = save_name.replace('r', '')
        save_name = save_name.replace(' ', '')

        if "depth" in args.results:
            # Saving colormapped depth image
            disp_np = disp[0, 0].detach().cpu().numpy()
            vmax = np.percentile(disp_np, 95)
            plt.imsave(dirs['depth'] / "{}.png".format(save_name),
                       disp_np,
                       cmap='magma',
                       vmax=vmax)

        if "mean_mask" in args.results:
            mean_mask = MeanMask(erro_maps)
            mean_mask = mean_mask[0].detach().cpu().numpy()
            plt.imsave(dirs['mean_mask'] / "{}.png".format(save_name),
                       mean_mask,
                       cmap='bone')

        if "identical_mask" in args.results:
            identical_mask = IdenticalMask(erro_maps)
            identical_mask = identical_mask[0].detach().cpu().numpy()
            plt.imsave(dirs['identical_mask'] / "{}.png".format(save_name),
                       identical_mask,
                       cmap='bone')

        if "var_mask" in args.results:
            var_mask = VarMask(erro_maps)
            var_mask = var_mask[0].detach().cpu().numpy()
            plt.imsave(dirs["var_mask"] / "{}.png".format(save_name),
                       var_mask,
                       cmap='bone')

        if "final_mask" in args.results:
            identical_mask = IdenticalMask(erro_maps)
            mean_mask = MeanMask(erro_maps)
            var_mask = VarMask(erro_maps)
            final_mask = float8or(mean_mask * identical_mask, var_mask)
            final_mask = final_mask[0].detach().cpu().numpy()
            plt.imsave(dirs["final_mask"] / "{}.png".format(save_name),
                       final_mask,
                       cmap='bone')
Example #24
0
    def __init__(self, options):
        self.opt = options
        self.refine = options.refine or options.inv_refine
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)
        self.crop_mode = options.crop_mode

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []
        self.parameters_to_train_refine = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")
        if self.refine:
            self.refine_stage = list(range(options.refine_stage))
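            # per-stage crop heights and widths used by the refinement module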
            if len(self.refine_stage) > 4:
                self.crop_h = [96, 128, 160, 192, 192]
                self.crop_w = [192, 256, 384, 448, 640]
            else:
                self.crop_h = [96, 128, 160, 192]
                self.crop_w = [192, 256, 384, 640]
            if self.opt.refine_model == 's':
                self.models["mid_refine"] = networks.Simple_Propagate(
                    self.crop_h, self.crop_w, self.crop_mode)
            elif self.opt.refine_model == 'i':
                self.models["mid_refine"] = networks.Iterative_Propagate_old(
                    self.crop_h, self.crop_w, self.crop_mode)
            for param in self.models["mid_refine"].parameters():
                param.requires_grad = False
            self.models["mid_refine"].to(self.device)
        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers,
            self.opt.weights_init == "pretrained",
            num_input_images=1)
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc,
            self.opt.scales,
            refine=self.refine)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        if self.use_pose_net:
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(
                    self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input ==
                    "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        parameters_to_train = self.parameters_to_train
        self.model_optimizer = optim.Adam(parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()
        if self.refine:
            self.models["encoder_nograd"] = copy.deepcopy(
                self.models["encoder"])
            for param in self.models["encoder_nograd"].parameters():
                param.requires_grad = False
            self.models["encoder_nograd"].to(self.device)
            self.models["depth_nograd"] = copy.deepcopy(self.models["depth"])
            for param in self.models["depth_nograd"].parameters():
                param.requires_grad = False
            self.models["depth_nograd"].to(self.device)

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {
            "kitti": datasets.KITTIRAWDataset,
            "kitti_odom": datasets.KITTIOdomDataset,
            "kitti_depth": datasets.KITTIDepthDataset
        }
        self.dataset = datasets_dict[self.opt.dataset]

        fpath = os.path.join(os.path.dirname(__file__), "splits",
                             self.opt.split, "{}_files_p.txt")
        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png' if self.opt.png else '.jpg'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     4,
                                     is_train=True,
                                     img_ext=img_ext,
                                     refine=False,
                                     crop_mode=self.crop_mode)
        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)
        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   4,
                                   is_train=False,
                                   img_ext=img_ext,
                                   refine=False,
                                   crop_mode=self.crop_mode)
        self.val_loader = DataLoader(val_dataset,
                                     self.opt.batch_size,
                                     True,
                                     num_workers=self.opt.num_workers,
                                     pin_memory=True,
                                     drop_last=True)
        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        self.save_opts()
Example #25
0
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.toolLayers = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        if self.opt.use_stereo:
            self.opt.frame_ids.append("s")

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")
        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models["depth"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        if self.use_pose_net:
            if self.opt.pose_model_type == "separate_resnet":
                self.models["pose_encoder"] = networks.ResnetEncoder(
                    self.opt.num_layers,
                    self.opt.weights_init == "pretrained",
                    num_input_images=self.num_pose_frames)

                self.models["pose_encoder"].to(self.device)
                self.parameters_to_train += list(
                    self.models["pose_encoder"].parameters())

                self.models["pose"] = networks.PoseDecoder(
                    self.models["pose_encoder"].num_ch_enc,
                    num_input_features=1,
                    num_frames_to_predict_for=2)

            elif self.opt.pose_model_type == "shared":
                self.models["pose"] = networks.PoseDecoder(
                    self.models["encoder"].num_ch_enc, self.num_pose_frames)

            elif self.opt.pose_model_type == "posecnn":
                self.models["pose"] = networks.PoseCNN(
                    self.num_input_frames if self.opt.pose_model_input ==
                    "all" else 2)

            self.models["pose"].to(self.device)
            self.parameters_to_train += list(self.models["pose"].parameters())

        if self.opt.predictive_mask:
            # Our implementation of the predictive masking baseline has the same architecture
            # as our depth decoder. We predict a separate mask for each source frame.
            self.models["predictive_mask"] = networks.DepthDecoder(
                self.models["encoder"].num_ch_enc,
                self.opt.scales,
                num_output_channels=(len(self.opt.frame_ids) - 1))
            self.models["predictive_mask"].to(self.device)
            self.parameters_to_train += list(
                self.models["predictive_mask"].parameters())

        self.foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18]  # pole, traffic light/sign, person, rider and vehicle classes (Cityscapes train ids)
        if self.opt.stereo_mask:
            self.toolLayers['compute_stereo_mask'] = StereoMask().cuda()

        if self.opt.typeWiseRegularization:
            self.toolLayers['compsurfnorm'] = ComputeSurfaceNormal(
                height=self.opt.height,
                width=self.opt.width,
                batch_size=self.opt.batch_size).cuda()
            self.toolLayers['typeWReg'] = TypeWiseRegularization().cuda()
            self.wallType = [2, 3, 4]  # Building, wall, fence
            self.roadType = [0, 1, 9]  # road, sidewalk, terrain
            self.permuType = [5, 7]  # Pole, traffic sign
            self.skyType = 10
            self.chanWinSize = 5

        if self.opt.borderWiseRegularization:
            self.wallType = [2, 3, 4]  # Building, wall, fence
            self.roadType = [0, 1, 9]  # road, sidewalk, terrain
            self.toolLayers['borderWiseReg'] = BorderWiseRegularization(
                batchNum=self.opt.batch_size,
                width=self.opt.width,
                height=self.opt.height).cuda()

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, self.opt.scheduler_step_size, 0.1)

        if self.opt.load_weights_folder is not None:
            self.load_model()

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {
            "kitti": datasets.KITTIRAWDataset,
            "kitti_odom": datasets.KITTIOdomDataset
        }
        self.dataset = datasets_dict[self.opt.dataset]

        fpath = os.path.join(os.path.dirname(__file__), "splits",
                             self.opt.split, "{}_files.txt")

        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     4,
                                     is_train=True,
                                     img_ext=img_ext)
        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)
        val_dataset = self.dataset(self.opt.data_path,
                                   val_filenames,
                                   self.opt.height,
                                   self.opt.width,
                                   self.opt.frame_ids,
                                   4,
                                   is_train=False,
                                   img_ext=img_ext)
        self.val_loader = DataLoader(val_dataset,
                                     self.opt.batch_size,
                                     True,
                                     num_workers=self.opt.num_workers,
                                     pin_memory=True,
                                     drop_last=True)
        self.val_iter = iter(self.val_loader)

        self.writers = {}
        for mode in ["train", "val"]:
            self.writers[mode] = SummaryWriter(
                os.path.join(self.log_path, mode))

        if not self.opt.no_ssim:
            self.ssim = SSIM()
            self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print(
            "There are {:d} training items and {:d} validation items\n".format(
                len(train_dataset), len(val_dataset)))

        self.save_opts()
Example #26
0
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split == "odom_9" or opt.eval_split == "odom_10", \
        "eval_split should be either odom_9 or odom_10"

    sequence_id = int(opt.eval_split.split("_")[1])

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width,
                               [0, 1], 4, is_train=False)
    dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True, drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    pred_poses = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat([inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(axisangle[:, 0], translation[:, 0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)

    gt_poses_path = os.path.join(opt.data_path, "poses", "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 5
    for i in range(0, num_frames - 1):
        local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1]))
        gt_local_xyzs = np.array(dump_xyz(gt_local_poses[i:i + track_length - 1]))

        ates.append(compute_ate(gt_local_xyzs, local_xyzs))

    print("\n   Trajectory error: {:0.3f}, std: {:0.3f}\n".format(np.mean(ates), np.std(ates)))

    save_path = os.path.join(opt.load_weights_folder, "poses.npy")
    np.save(save_path, pred_poses)
    print("-> Predictions saved to", save_path)
Example #27
0
    def __init__(self, options):
        self.opt = options
        self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

        # checking height and width are multiples of 32
        assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
        assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

        self.models = {}
        self.parameters_to_train = []

        self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

        self.num_scales = len(self.opt.scales)
        self.num_input_frames = len(self.opt.frame_ids)
        self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

        assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

        self.use_pose_net = not (self.opt.use_stereo
                                 and self.opt.frame_ids == [0])

        self.models["encoder"] = networks.ResnetEncoder(
            self.opt.num_layers, self.opt.weights_init == "pretrained")

        self.models["encoder"].to(self.device)
        self.parameters_to_train += list(self.models["encoder"].parameters())

        self.models['depth'] = networks.DepthDecoder(
            self.models['encoder'].num_ch_enc, self.opt.scales)
        self.models["depth"].to(self.device)
        self.parameters_to_train += list(self.models["depth"].parameters())

        self.models["pose_encoder"] = networks.ResnetEncoder(
            self.opt.num_layers,
            self.opt.weights_init == "pretrained",
            num_input_images=self.num_pose_frames)

        self.models["pose_encoder"].to(self.device)
        self.parameters_to_train += list(
            self.models["pose_encoder"].parameters())

        self.models["pose"] = networks.PoseDecoder(
            self.models["pose_encoder"].num_ch_enc,
            num_input_features=1,
            num_frames_to_predict_for=2)

        self.models["pose"].to(self.device)
        self.parameters_to_train += list(self.models["pose"].parameters())

        self.model_optimizer = optim.Adam(self.parameters_to_train,
                                          self.opt.learning_rate)

        print("Training model named:\n  ", self.opt.model_name)
        print("Models and tensorboard events files are saved to:\n  ",
              self.opt.log_dir)
        print("Training is using:\n  ", self.device)

        # data
        datasets_dict = {"kitti": datasets.KITTIRAWDataset}
        self.dataset = datasets_dict[self.opt.dataset]

        fpath = os.path.join(os.path.dirname(__file__), "splits", "subset.txt")

        train_filenames = readlines(fpath.format("train"))
        val_filenames = readlines(fpath.format("val"))
        img_ext = '.png' if self.opt.png else '.jpg'

        num_train_samples = len(train_filenames)
        self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

        train_dataset = self.dataset(self.opt.data_path,
                                     train_filenames,
                                     self.opt.height,
                                     self.opt.width,
                                     self.opt.frame_ids,
                                     4,
                                     is_train=False,
                                     img_ext=img_ext)
        self.train_loader = DataLoader(train_dataset,
                                       self.opt.batch_size,
                                       True,
                                       num_workers=self.opt.num_workers,
                                       pin_memory=True,
                                       drop_last=True)

        self.ssim = SSIM()
        self.ssim.to(self.device)

        self.backproject_depth = {}
        self.project_3d = {}
        for scale in self.opt.scales:
            h = self.opt.height // (2**scale)
            w = self.opt.width // (2**scale)

            self.backproject_depth[scale] = BackprojectDepth(
                self.opt.batch_size, h, w)
            self.backproject_depth[scale].to(self.device)

            self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
            self.project_3d[scale].to(self.device)

        self.depth_metric_names = [
            "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1",
            "da/a2", "da/a3"
        ]

        print("Using split:\n  ", self.opt.split)
        print("There are {:d} training items\n".format(len(train_dataset)))