def prepare_model_for_test(opt):
    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)
    print("-> Loading weights from {}".format(opt.load_weights_folder))

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)

    pose_encoder.load_state_dict(torch.load(pose_encoder_path))
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda().eval()
    pose_decoder.cuda().eval()

    return pose_encoder, pose_decoder
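A minimal usage sketch of the helper above, assuming the monodepth2-style `networks` and `layers` modules and an `opt` with `load_weights_folder` and `num_layers` set; `frame_a` and `frame_b` are hypothetical preprocessed 1x3xHxW CUDA tensors at the training resolution.

import torch
from layers import transformation_from_parameters  # monodepth2 helper (assumed available)

def predict_relative_pose(opt, frame_a, frame_b):
    pose_encoder, pose_decoder = prepare_model_for_test(opt)
    with torch.no_grad():
        # the pose encoder was built with num_input_images=2, so the two frames
        # are stacked along the channel axis (3 + 3 = 6 channels)
        pair = torch.cat([frame_a, frame_b], dim=1)
        axisangle, translation = pose_decoder([pose_encoder(pair)])
        # 4x4 relative camera transform for the pair
        return transformation_from_parameters(axisangle[:, 0], translation[:, 0])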
def __init__(self, options):
    self.opt = options
    self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

    # checking height and width are multiples of 32
    # the default size is 640x192
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    self.models = {}
    self.parameters_to_train = []

    self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

    # "scales used in the loss"
    self.num_scales = len(self.opt.scales)
    # frame_ids defaults to [0, -1, 1]; the target frame has id 0
    self.num_input_frames = len(self.opt.frame_ids)
    self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

    if self.opt.use_stereo:
        self.opt.frame_ids.append("s")

    # self.opt.num_layers is the depth of the ResNet encoder; ResNet-18 by default.
    # The encoder outputs features at 5 scales.
    self.models["encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, self.opt.weights_init == "pretrained")
    self.models["encoder"].to(self.device)
    self.parameters_to_train += list(self.models["encoder"].parameters())

    self.models["depth"] = networks.DepthDecoder(
        self.models["encoder"].num_ch_enc, self.opt.scales)
    self.models["depth"].to(self.device)
    self.parameters_to_train += list(self.models["depth"].parameters())

    # Three ways of handling the pose network; the table in the paper's supplementary
    # material compares them. separate_resnet works best and is the default.
    if self.use_pose_net:
        # does not share parameters with the depth encoder; the pose encoder stacks the
        # two images into 6 channels and outputs one set of features, and the pose
        # decoder takes those features and outputs two poses
        if self.opt.pose_model_type == "separate_resnet":
            self.models["pose_encoder"] = networks.ResnetEncoder(
                self.opt.num_layers,
                self.opt.weights_init == "pretrained",
                num_input_images=self.num_pose_frames)
            self.models["pose_encoder"].to(self.device)
            self.parameters_to_train += list(self.models["pose_encoder"].parameters())

            self.models["pose"] = networks.PoseDecoder(
                self.models["pose_encoder"].num_ch_enc,
                num_input_features=1,
                num_frames_to_predict_for=2)

        # shares parameters with the depth encoder; the encoder sees one image at a time
        # (similar to a Siamese network) and the decoder takes two sets of features and
        # outputs one pose
        elif self.opt.pose_model_type == "shared":
            self.models["pose"] = networks.PoseDecoder(
                self.models["encoder"].num_ch_enc, self.num_pose_frames)

        # posecnn is the method proposed in "Learning Depth from Monocular Videos using
        # Direct Methods", https://arxiv.org/pdf/1712.00175.pdf
        elif self.opt.pose_model_type == "posecnn":
            self.models["pose"] = networks.PoseCNN(
                self.num_input_frames if self.opt.pose_model_input == "all" else 2)

        self.models["pose"].to(self.device)
        self.parameters_to_train += list(self.models["pose"].parameters())

    # this mask is the SfMLearner-style predictive mask
    if self.opt.predictive_mask:
        assert self.opt.disable_automasking, \
            "When using predictive_mask, please disable automasking with --disable_automasking"

        # Our implementation of the predictive masking baseline has the same architecture
        # as our depth decoder. We predict a separate mask for each source frame.
        self.models["predictive_mask"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales,
            num_output_channels=(len(self.opt.frame_ids) - 1))
        self.models["predictive_mask"].to(self.device)
        self.parameters_to_train += list(self.models["predictive_mask"].parameters())

    self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate)
    self.model_lr_scheduler = optim.lr_scheduler.StepLR(
        self.model_optimizer, self.opt.scheduler_step_size, 0.1)

    if self.opt.load_weights_folder is not None:
        self.load_model()

    print("Training model named:\n ", self.opt.model_name)
    print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir)
    print("Training is using:\n ", self.device)

    # data
    datasets_dict = {
        "kitti": datasets.KITTIRAWDataset,
        "kitti_odom": datasets.KITTIOdomDataset
    }
    self.dataset = datasets_dict[self.opt.dataset]

    fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt")

    train_filenames = readlines(fpath.format("train"))
    val_filenames = readlines(fpath.format("val"))
    img_ext = '.png' if self.opt.png else '.jpg'

    num_train_samples = len(train_filenames)
    self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

    train_dataset = self.dataset(self.opt.data_path, train_filenames, self.opt.height,
                                 self.opt.width, self.opt.frame_ids, 4,
                                 is_train=True, img_ext=img_ext)
    self.train_loader = DataLoader(train_dataset, self.opt.batch_size, True,
                                   num_workers=self.opt.num_workers,
                                   pin_memory=True, drop_last=True)
    val_dataset = self.dataset(self.opt.data_path, val_filenames, self.opt.height,
                               self.opt.width, self.opt.frame_ids, 4,
                               is_train=False, img_ext=img_ext)
    self.val_loader = DataLoader(val_dataset, self.opt.batch_size, True,
                                 num_workers=self.opt.num_workers,
                                 pin_memory=True, drop_last=True)
    self.val_iter = iter(self.val_loader)

    self.writers = {}
    for mode in ["train", "val"]:
        self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode))

    # if set, disables ssim in the loss
    if not self.opt.no_ssim:
        self.ssim = SSIM()
        self.ssim.to(self.device)

    self.backproject_depth = {}
    self.project_3d = {}
    for scale in self.opt.scales:
        h = self.opt.height // (2**scale)
        w = self.opt.width // (2**scale)

        self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w)
        self.backproject_depth[scale].to(self.device)

        self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
        self.project_3d[scale].to(self.device)

    self.depth_metric_names = [
        "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"
    ]

    print("Using split:\n ", self.opt.split)
    print("There are {:d} training items and {:d} validation items\n".format(
        len(train_dataset), len(val_dataset)))

    # save options
    self.save_opts()
def test_simple(args): """Function to predict for a single image or folder of images """ assert args.model_name is not None, \ "You must specify the --model_name parameter; see README.md for an example" if torch.cuda.is_available() and not args.no_cuda: device = torch.device("cuda") else: device = torch.device("cpu") download_model_if_doesnt_exist(args.model_name) model_path = os.path.join("models", args.model_name) print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") # LOADING PRETRAINED MODEL print(" Loading pretrained encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width = loaded_dict_enc['width'] filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()} encoder.load_state_dict(filtered_dict_enc) encoder.to(device) encoder.eval() print("Loading pretrained decoder") depth_decoder = networks.DepthDecoder( num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) depth_decoder.to(device) depth_decoder.eval() print("Loading pose networks") pose_encoder_path = os.path.join(model_path, "pose_encoder.pth") pose_decoder_path = os.path.join(model_path, "pose.pth") pose_encoder = networks.ResnetEncoder(18, False, 2) pose_encoder.load_state_dict(torch.load(pose_encoder_path)) pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) pose_encoder.cuda() pose_encoder.eval() pose_decoder.cuda() pose_decoder.eval() bag_name = '2019-12-17-13-24-03' map_name = "feature=base&ver=2019121700&base_pt=(32.75707,-111.55757)&end_pt=(32.092537212,-110.7892506)" begin = '0:36:00' end = '0:37:00' output_directory = "assets/" dataset = TSDataset(bag_name, begin, end) pred_depth = [] pred_poses = [] last_img = None # PREDICTING ON EACH IMAGE IN TURN with torch.no_grad(): for idx, input_image in enumerate(dataset): # Load image and preprocess original_width, original_height = input_image.size input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS) input_image = transforms.ToTensor()(input_image).unsqueeze(0) # PREDICTION input_image = input_image.to(device) features = encoder(input_image) outputs = depth_decoder(features) disp = outputs[("disp", 0)] disp_resized = torch.nn.functional.interpolate( disp, (original_height, original_width), mode="bilinear", align_corners=False) # Saving colormapped depth image disp_resized_np = disp_resized.squeeze().cpu().numpy() vmax = np.percentile(disp_resized_np, 95) normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax) mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8) im = pil.fromarray(colormapped_im) pred_depth.append(im) # Handle pose if last_img is None: last_img = input_image all_color_aug = torch.cat([last_img, input_image], 1) last_img = input_image features = [pose_encoder(all_color_aug)] axisangle, translation = pose_decoder(features) pose = transformation_from_parameters(axisangle[:, 0], translation[:, 0]).cpu().numpy() pred_poses.append(pose) print(" Processed {:d} of {:d} images".format( idx + 1, len(dataset))) pred_poses = np.concatenate(pred_poses, axis=0) 
print(pred_poses.shape) np.save("poses.npy", pred_poses) # save_video(pred_depth) print('-> Done!')
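The saved poses.npy holds one relative transform per consecutive frame pair. A small sketch, in the spirit of monodepth2's dump_xyz, of chaining them into camera positions; the multiplication order below assumes the same convention as that helper, so treat the sign/inversion as an assumption when plotting.

import numpy as np

def accumulate_trajectory(rel_poses):
    """rel_poses: (N, 4, 4) relative transforms between consecutive frames.
    Returns (N + 1, 3) camera positions expressed in the first frame's coordinates."""
    cam_to_world = np.eye(4)
    xyzs = [cam_to_world[:3, 3].copy()]
    for T in rel_poses:
        cam_to_world = cam_to_world @ T
        xyzs.append(cam_to_world[:3, 3].copy())
    return np.asarray(xyzs)

# e.g. xyzs = accumulate_trajectory(np.load("poses.npy"))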
def __init__(self, options): self.opt = options self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.models = {} self.parameters_to_train = [] self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0]) if self.opt.use_stereo: self.opt.frame_ids.append("s") self.models["encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained") self.models["encoder"].to(self.device) self.parameters_to_train += list(self.models["encoder"].parameters()) self.models["depth"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales) self.models["depth"].to(self.device) self.parameters_to_train += list(self.models["depth"].parameters()) if self.use_pose_net: if self.opt.pose_model_type == "separate_resnet": self.models["pose_encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=self.num_pose_frames) self.models["pose_encoder"].to(self.device) self.parameters_to_train += list( self.models["pose_encoder"].parameters()) self.models["pose"] = networks.PoseDecoder( self.models["pose_encoder"].num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) elif self.opt.pose_model_type == "shared": self.models["pose"] = networks.PoseDecoder( self.models["encoder"].num_ch_enc, self.num_pose_frames) elif self.opt.pose_model_type == "posecnn": self.models["pose"] = networks.PoseCNN( self.num_input_frames if self.opt.pose_model_input == "all" else 2) self.models["pose"].to(self.device) self.parameters_to_train += list(self.models["pose"].parameters()) if self.opt.predictive_mask: assert self.opt.disable_automasking, \ "When using predictive_mask, please disable automasking with --disable_automasking" # Our implementation of the predictive masking baseline has the the same architecture # as our depth decoder. We predict a separate mask for each source frame. 
self.models["predictive_mask"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales, num_output_channels=(len(self.opt.frame_ids) - 1)) self.models["predictive_mask"].to(self.device) self.parameters_to_train += list( self.models["predictive_mask"].parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) # data self.dataset = datasets.InteriorDataset fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt") train_filenames = readlines(fpath.format("train")) val_filenames = readlines(fpath.format("val")) img_ext = '.png' num_train_samples = len(train_filenames) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset = self.dataset(self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=True, img_ext=img_ext) self.train_loader = DataLoader(train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) val_dataset = self.dataset(self.opt.data_path, val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=False, img_ext=img_ext) self.val_loader = DataLoader(val_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) self.writers = {} for mode in ["train", "val"]: self.writers[mode] = SummaryWriter( os.path.join(self.log_path, mode)) if not self.opt.no_ssim: self.ssim = SSIM() self.ssim.to(self.device) self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.opt.height // (2**scale) w = self.opt.width // (2**scale) self.backproject_depth[scale] = BackprojectDepth( self.opt.batch_size, h, w) self.backproject_depth[scale].to(self.device) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) self.project_3d[scale].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3" ] print("Using split:\n ", self.opt.split) print( "There are {:d} training items and {:d} validation items\n".format( len(train_dataset), len(val_dataset))) self.save_opts()
def evaluate(opt): """Evaluate odometry on the KITTI dataset """ assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) sequence_id = int(opt.eval_split.split("_")[1]) opt.batch_size = 1 filenames = readlines( os.path.join(os.path.dirname(__file__), "splits", "odom", "test_files_{:02d}.txt".format(sequence_id))) dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width, [0, -1, 1], 4, 1, is_train=False, img_ext='.png') dataloader = DataLoader(dataset, opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) # pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth") pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth") config_file = "./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" cfg.merge_from_file(config_file) cfg.freeze() maskrcnn_path = "./e2e_mask_rcnn_R_50_FPN_1x.pth" pose_encoder = networks.ResnetEncoder(cfg, maskrcnn_path) # pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2) # pose_encoder.load_state_dict(torch.load(pose_encoder_path)) pose_decoder = networks.PoseDecoder(len(opt.frame_ids)) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) pose_encoder.cuda() pose_encoder.eval() pose_decoder.cuda() pose_decoder.eval() pred_poses = [] print("-> Computing pose predictions") # opt.frame_ids = [0, 1] # pose network only takes two frames as input ii = 0 with torch.no_grad(): for inputs in dataloader: for key, ipt in inputs.items(): if isinstance(ipt, torch.Tensor): inputs[key] = ipt.cuda() all_color_aug = torch.cat( [inputs[("color_aug", i, 0)] for i in opt.frame_ids]) all_features = pose_encoder(all_color_aug) all_features = [ torch.split(f, opt.batch_size) for f in all_features ] features = {} for i, k in enumerate(opt.frame_ids): features[k] = [f[i] for f in all_features] pose_inputs = [features[i] for i in opt.frame_ids if i != "s"] axisangle, translation = pose_decoder(pose_inputs) if ii == 0: pred_poses.append( transformation_from_parameters(axisangle[:, 0], translation[:, 0], True).cpu().numpy()) pred_poses.append( transformation_from_parameters(axisangle[:, 1], translation[:, 1]).cpu().numpy()) if ii % opt.log_frequency == 0: print("{:04d}-th image processing".format(ii)) ii += 1 # pred_poses.append( # transformation_from_parameters(axisangle[:, 1], translation[:, 1]).cpu().numpy()) pred_poses = np.concatenate(pred_poses) gt_poses_path = os.path.join( "/usr/stud/linp/storage/user/linp/results/kitti", "poses", "{:02d}.txt".format(sequence_id)) gt_global_poses = np.loadtxt(gt_poses_path).reshape((-1, 3, 4)) gt_global_poses = np.concatenate( (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1) gt_global_poses[:, 3, 3] = 1 gt_xyzs = gt_global_poses[:, :3, 3] gt_local_poses = [] for i in range(1, len(gt_global_poses)): gt_local_poses.append( np.linalg.inv( np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i]))) ates = [] num_frames = gt_xyzs.shape[0] track_length = 3 for i in range(0, num_frames - 1): local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1])) gt_local_xyzs = np.array( dump_xyz(gt_local_poses[i:i + track_length - 1])) ates.append(compute_ate(gt_local_xyzs, local_xyzs)) ''' for i in range(0, num_frames - 2): local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1])) gt_local_xyzs = np.array(dump_xyz(gt_local_poses[i + 1:i + track_length])) ates.append(compute_ate(gt_local_xyzs, local_xyzs)) ''' print("\n Trajectory error: {:0.3f}, std: {:0.3f}\n".format( 
np.mean(ates), np.std(ates))) save_path = os.path.join(opt.load_weights_folder, "poses.npy") np.save(save_path, pred_poses) print("-> Predictions saved to", save_path)
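compute_ate above is the snippet-wise absolute trajectory error used in the monodepth2 odometry evaluation: shift the predicted snippet onto the ground truth, fit a single scale factor, and report the residual. A sketch of that computation:

import numpy as np

def compute_ate(gt_xyzs, pred_xyzs):
    """Snippet-wise trajectory error between two (M, 3) arrays of camera positions."""
    # align the starting points
    pred_xyzs = pred_xyzs + (gt_xyzs[0] - pred_xyzs[0])[None, :]
    # single least-squares scale factor between prediction and ground truth
    scale = np.sum(gt_xyzs * pred_xyzs) / np.sum(pred_xyzs ** 2)
    alignment_error = pred_xyzs * scale - gt_xyzs
    return np.sqrt(np.sum(alignment_error ** 2)) / gt_xyzs.shape[0]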
def evaluate(opt): """Evaluate odometry on the KITTI dataset """ assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) # Depth encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) encoder = networks.ResnetEncoder(opt.num_layers, False) depth_decoder = networks.DepthDecoder(encoder.num_ch_enc) model_dict = encoder.state_dict() encoder.load_state_dict( {k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() # Pose pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth") pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth") pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2) pose_encoder.load_state_dict(torch.load(pose_encoder_path)) pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) pose_encoder.cuda() pose_encoder.eval() pose_decoder.cuda() pose_decoder.eval() if opt.use_imu: imu_lstm = nn.LSTM(6, opt.lstm_hidden_size, opt.lstm_num_layers) imu_lstm.cuda() imu_lstm.eval() lstm_hs = None hidden_to_imu = torch.nn.Sequential( torch.nn.Linear(opt.lstm_hidden_size, 6), ) hidden_to_imu.cuda() hidden_to_imu.eval() if opt.pose_fuse: pose_fuse_mlp = torch.nn.Sequential( torch.nn.Linear(24, opt.pose_mlp_hidden_size), torch.nn.Sigmoid(), torch.nn.Linear(opt.pose_mlp_hidden_size, 6), ) pose_fuse_mlp.cuda() pose_fuse_mlp.eval() img_ext = '.png' if opt.png else '.jpg' pred_disps = [] scale_factors = [] kitty_odom = False if opt.eval_split.startswith("odom"): kitty_odom = True if kitty_odom: ids = [int(opt.eval_split.split("_")[1])] else: splits_dir = os.path.join(os.path.dirname(__file__), "splits") videonames = readlines( os.path.join(splits_dir, opt.eval_split, "test_video_list.txt")) ids = videonames for videoname in ids: if kitty_odom: filenames = readlines( os.path.join(splits_dir, opt.eval_split, "test_files_{:02d}.txt".format(videoname))) else: filenames = readlines( os.path.join(splits_dir, opt.eval_split, "test_files.txt")) if kitty_odom: dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width, [0, 1], 4, is_train=False, use_imu=False) dataloader = DataLoader(dataset, opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) else: if opt.use_imu: dataset = SequenceRawKittiDataset( opt.data_path, [videoname], filenames, 1, imu_data_path=opt.imu_data_path, img_ext=img_ext, frame_idxs=[0, 1], height=encoder_dict['height'], width=encoder_dict['width'], num_scales=4, is_train=False) dataloader = DataLoader(dataset, shuffle=False, num_workers=0) else: filenames = list( filter(lambda f: f.startswith(videoname), filenames)) dataset = KITTIRAWDataset(opt.data_path, filenames, opt.height, opt.width, [0, 1], 4, is_train=False, use_imu=False) dataloader = DataLoader(dataset, opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) # pred_poses = [np.eye(4).reshape(1, 4, 4)] pred_poses = [] imu_scale_factors = [] print("EVALUATING ", opt.model_name) print("-> Computing pose predictions") opt.frame_ids = [0, 1] # pose network only takes two frames as input with torch.no_grad(): for inputs in dataloader: for key, ipt in inputs.items(): inputs[key] = ipt.cuda() if opt.use_imu: inputs[key] = 
inputs[key].squeeze(0) input_color = inputs[("color", 0, 0)] feature = encoder(input_color) output = depth_decoder(feature) pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth) pred_disp = pred_disp.cpu()[:, 0].numpy() pred_disps.append(pred_disp) all_color_aug = torch.cat([ inputs[("color_aug", i, 0)] for i in sorted(opt.frame_ids) ], 1) features = [pose_encoder(all_color_aug)] axisangle, translation = pose_decoder(features) outputs = {} outputs[("cam_T_cam", 0, 1)] = transformation_from_parameters(axisangle[:, 0], translation[:, 0], invert=False) T = outputs[("cam_T_cam", 0, 1)] if opt.use_imu: outputs = predict_poses_from_imu2(opt, inputs, imu_lstm, lstm_hs, hidden_to_imu) T_better = outputs[("cam_T_cam_imu", 0, 1)] if opt.pose_fuse: fuse_poses(opt, outputs, pose_fuse_mlp) T_better = outputs[("cam_T_cam_fuse", 0, 1)] R, t = rot_translation_from_transformation(T) Rb, tb = rot_translation_from_transformation(T_better) imu_scale_factor = torch.sum(tb * t) / torch.sum(t**2) imu_scale_factors.append(imu_scale_factor.cpu().numpy()) # scale_factors.append(imu_scale_factors) T = T_better pred_poses.append(T.cpu().numpy()) pred_poses = np.concatenate(pred_poses) if opt.eval_split.startswith("odom"): gt_poses_path = os.path.join(opt.data_path, "poses", "{:02d}.txt".format(videoname)) else: gt_poses_path = os.path.join(opt.data_path, videoname, "oxts", "poses.txt") eval_pose(opt, pred_poses, gt_poses_path) scale_factors = {} if imu_scale_factors: scale_factors["IMU factor"] = imu_scale_factors pred_disps = np.concatenate(pred_disps) if not kitty_odom: eval_depth(opt, pred_disps, scale_factors)
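The imu_scale_factor above is the closed-form least-squares fit of a single scalar s minimizing ||s*t - t_ref||^2, used to put the up-to-scale network translation onto the metric scale of the IMU or fused pose. As a standalone helper:

import torch

def fit_translation_scale(t_net, t_ref):
    """Scalar s minimising ||s * t_net - t_ref||^2, i.e. s = <t_ref, t_net> / <t_net, t_net>."""
    return torch.sum(t_ref * t_net) / torch.sum(t_net ** 2)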
def evaluate(opt): """Evaluate odometry on the KITTI dataset """ conv_layer, data_lambda, intrinsics = get_params(opt) configs = load_csv(opt.test_data) dataset = CarlaDataset(configs, data_lambda, intrinsics, [0, 1], 4, is_train=False, is_cubemap=opt.mode is Mode.Cubemap, width=opt.width, height=opt.height) dataloader = DataLoader(dataset, 16, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) if opt.eval_model is None: opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) else: if opt.load_weights_folder is not None: raise ValueError( "Can't specify eval_model and load_weights_folder, they conflict" ) opt.eval_model = Path(opt.eval_model) models = Path(opt.eval_model) / "models" weights = [p for p in models.iterdir() if p.name.startswith("weights")] weights = [int(p.name.split("_")[1]) for p in weights] opt.load_weights_folder = models / f"weights_{max(weights)}" # assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth") pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth") pose_encoder = networks.ResnetEncoder(conv_layer, opt.num_layers, False, 2) pose_encoder.load_state_dict(un_mod(torch.load(pose_encoder_path))) pose_decoder = networks.PoseDecoder(conv_layer, pose_encoder.num_ch_enc, 1, 2) pose_decoder.load_state_dict(un_mod(torch.load(pose_decoder_path))) if opt.mode is Mode.Cubemap: cube_poses = CubePosesAndLoss(include_loss=False) cube_poses.cuda() cube_poses.eval() pose_encoder.cuda() pose_encoder.eval() pose_decoder.cuda() pose_decoder.eval() pred_poses = [] print("-> Computing pose predictions") opt.frame_ids = [0, 1] # pose network only takes two frames as input with torch.no_grad(): for inputs in dataloader: for key, ipt in inputs.items(): inputs[key] = ipt.cuda() all_color_aug = torch.cat( [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1) features = [pose_encoder(all_color_aug)] axisangle, translation = pose_decoder(features) cam_T_cam = transformation_from_parameters(axisangle[:, 0], translation[:, 0]) if opt.mode is Mode.Cubemap: cam_T_cam = cube_poses(cam_T_cam) pred_poses.append(cam_T_cam.cpu().numpy()) pred_poses = np.concatenate(pred_poses) ates = [] num_frames = pred_poses.shape[0] gt_poses = get_gt_poses(configs) for i in range(0, num_frames - 1): gt_pose = next(gt_poses) local_xyzs = np.array(dump_xyz(pred_poses[np.newaxis, i])) gt_local_xyzs = np.array(dump_xyz(gt_pose[np.newaxis, ...])) ates.append(compute_ate(gt_local_xyzs, local_xyzs)) print("\n Trajectory error: {:0.3f}, std: {:0.3f}\n".format( np.mean(ates), np.std(ates))) save_path = os.path.join(opt.load_weights_folder, "poses.npy") np.save(save_path, pred_poses) print("-> Predictions saved to", save_path)
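transformation_from_parameters converts the pose decoder's axis-angle and translation outputs into a 4x4 transform. A numpy sketch of the same construction (Rodrigues' rotation formula, with the optional inversion used when predicting poses for earlier frames); this is a reference implementation of the idea, not the repo's tensorised version.

import numpy as np

def pose_from_axisangle(axisangle, translation, invert=False):
    """Build a 4x4 transform from a 3-vector axis-angle and a 3-vector translation."""
    theta = np.linalg.norm(axisangle)
    if theta < 1e-8:
        R = np.eye(3)
    else:
        k = axisangle / theta
        K = np.array([[0, -k[2], k[1]],
                      [k[2], 0, -k[0]],
                      [-k[1], k[0], 0]])
        R = np.eye(3) + np.sin(theta) * K + (1 - np.cos(theta)) * (K @ K)
    T = np.eye(4)
    if invert:
        # inverse of (translate @ rotate)
        T[:3, :3] = R.T
        T[:3, 3] = -R.T @ translation
    else:
        T[:3, :3] = R
        T[:3, 3] = translation
    return T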
def __init__(self, options): self.opt = options self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.models = {} self.parameters_to_train = [] self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0]) if self.opt.use_stereo: self.opt.frame_ids.append("s") self.models["encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained") self.models["encoder"].to(self.device) self.parameters_to_train += list(self.models["encoder"].parameters()) self.models["depth"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales) self.models["depth"].to(self.device) self.parameters_to_train += list(self.models["depth"].parameters()) if self.use_pose_net: if self.opt.pose_model_type == "separate_resnet": self.models["pose_encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=self.num_pose_frames) self.models["pose_encoder"].to(self.device) self.parameters_to_train += list(self.models["pose_encoder"].parameters()) self.models["pose"] = networks.PoseDecoder( self.models["pose_encoder"].num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) elif self.opt.pose_model_type == "shared": self.models["pose"] = networks.PoseDecoder( self.models["encoder"].num_ch_enc, self.num_pose_frames) elif self.opt.pose_model_type == "posecnn": self.models["pose"] = networks.PoseCNN( self.num_input_frames if self.opt.pose_model_input == "all" else 2) self.models["pose"].to(self.device) self.parameters_to_train += list(self.models["pose"].parameters()) if self.opt.predictive_mask: # Our implementation of the predictive masking baseline has the the same architecture # as our depth decoder. We predict a separate mask for each source frame. 
self.models["predictive_mask"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales, num_output_channels=(len(self.opt.frame_ids) - 1)) self.models["predictive_mask"].to(self.device) self.parameters_to_train += list(self.models["predictive_mask"].parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) # data datasets_dict = {'kitti': KITTIRAWDataset, 'kitti_odom': KITTIOdomDataset, 'FLIR': FlirDataset, 'KAIST': KAIST_Dataset} self.dataset = datasets_dict[self.opt.dataset] thermal = False if self.opt.dataset == 'FLIR': train_filenames = [] train_files = os.listdir(os.path.join(self.opt.data_path, 'train/PreviewData/')) train_files.sort() train_filenames.extend(os.path.join(self.opt.data_path, 'train/PreviewData/') + file for file in train_files[1:-1]) video_files = os.listdir(os.path.join(self.opt.data_path, 'video/PreviewData/')) video_files.sort() train_filenames.extend(os.path.join(self.opt.data_path, 'video/PreviewData/') + file for file in video_files[1:-1]) val_filenames = [] val_files = os.listdir(os.path.join(self.opt.data_path, 'valid/PreviewData/')) val_files.sort() val_filenames.extend(os.path.join(self.opt.data_path, 'valid/PreviewData/') + file for file in val_files[1:-1]) thermal = True elif self.opt.dataset == 'KAIST': train_files = os.path.join(self.opt.data_path, 'training') train_filenames = [] campus_train = os.listdir(os.path.join(train_files, 'Campus/THERMAL/')) campus_train.sort() residential_train = os.listdir(os.path.join(train_files, 'Residential/THERMAL/')) residential_train.sort() urban_train = os.listdir(os.path.join(train_files, 'Urban/THERMAL/')) urban_train.sort() train_filenames.extend(os.path.join(train_files, 'Campus/THERMAL/') + file for file in campus_train[1:-1]) train_filenames.extend(os.path.join(train_files, 'Residential/THERMAL/') + file for file in residential_train[1:-1]) train_filenames.extend(os.path.join(train_files, 'Urban/THERMAL/') + file for file in urban_train[1:-1]) val_files = os.path.join(self.opt.data_path, 'testing') val_filenames = [] campus_val = os.listdir(os.path.join(val_files, 'Campus/THERMAL/')) campus_val.sort() residential_val = os.listdir(os.path.join(val_files, 'Residential/THERMAL/')) residential_val.sort() urban_val = os.listdir(os.path.join(val_files, 'Urban/THERMAL/')) urban_val.sort() val_filenames.extend(os.path.join(val_files, 'Campus/THERMAL/') + file for file in campus_val[1:-1]) val_filenames.extend(os.path.join(val_files, 'Residential/THERMAL/') + file for file in residential_val[1:-1]) val_filenames.extend(os.path.join(val_files, 'Urban/THERMAL/') + file for file in urban_val[1:-1]) thermal = True else: fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt") train_filenames = readlines(fpath.format("train")) val_filenames = readlines(fpath.format("val")) assert (self.opt.img_ext == '.png') or (self.opt.img_ext == '.jpg') or (self.opt.img_ext == '.jpeg'), "Please provide a correct image extension" img_ext = self.opt.img_ext num_train_samples = len(train_filenames) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset = 
self.dataset( self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=True, img_ext=img_ext, thermal=thermal) self.train_loader = DataLoader( train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) val_dataset = self.dataset( self.opt.data_path, val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=False, img_ext=img_ext, thermal = thermal) self.val_loader = DataLoader( val_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) # self.writers = {} # for mode in ["train", "val"]: # self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode)) if not self.opt.no_ssim: self.ssim = SSIM() self.ssim.to(self.device) self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.opt.height // (2 ** scale) w = self.opt.width // (2 ** scale) self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w) self.backproject_depth[scale].to(self.device) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) self.project_3d[scale].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"] if self.opt.dataset.startswith('kitti'): print("Using split:\n ", self.opt.split) else: print("Using dataset:\n ", self.opt.dataset) print("There are {:d} training items and {:d} validation items\n".format( len(train_dataset), len(val_dataset))) self.save_opts()
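BackprojectDepth and Project3D, built per scale above, implement the inverse warp behind the photometric loss. A compact, self-contained sketch of that geometry, assuming monodepth2-style 4x4 intrinsics K / inv_K and a target-to-source pose T; the function name is illustrative.

import torch
import torch.nn.functional as F

def warp_source_to_target(src_img, depth, K, inv_K, T):
    """src_img: B x 3 x H x W source image, depth: B x 1 x H x W target depth,
    K / inv_K: B x 4 x 4 intrinsics, T: B x 4 x 4 target-to-source camera transform."""
    b, _, h, w = depth.shape
    device = depth.device
    # homogeneous pixel grid (3 x H*W), shared across the batch
    ys, xs = torch.meshgrid(torch.arange(h, dtype=torch.float32, device=device),
                            torch.arange(w, dtype=torch.float32, device=device),
                            indexing="ij")
    ones = torch.ones(h * w, dtype=torch.float32, device=device)
    pix = torch.stack([xs.flatten(), ys.flatten(), ones]).unsqueeze(0).expand(b, -1, -1)
    # backproject: X = depth * K^-1 * pix, then make homogeneous
    cam_points = depth.view(b, 1, -1) * (inv_K[:, :3, :3] @ pix)
    cam_points = torch.cat([cam_points, torch.ones(b, 1, h * w, device=device)], 1)
    # transform into the source camera and project with K
    src_pix = (K @ T)[:, :3, :] @ cam_points
    src_pix = src_pix[:, :2] / (src_pix[:, 2:3] + 1e-7)
    # normalise to [-1, 1] and bilinearly sample the source image
    grid = src_pix.view(b, 2, h, w).permute(0, 2, 3, 1)
    grid = torch.stack([grid[..., 0] / (w - 1) * 2 - 1,
                        grid[..., 1] / (h - 1) * 2 - 1], dim=-1)
    return F.grid_sample(src_img, grid, padding_mode="border", align_corners=False)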
def __init__(self, options): self.opt = options self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) Path(self.log_path).mkdir(exist_ok=True, parents=True) (Path(self.log_path) / "command").open('w+').write(" ".join(sys.argv)) # checking height and width are multiples of 32 # assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" # assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.models = {} self.parameters_to_train = [] self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") self.parallel = not self.opt.no_cuda and torch.cuda.device_count() > 1 if self.parallel and self.opt.mode is Mode.Cubemap: assert self.opt.batch_size % torch.cuda.device_count() == 0, f"Cubemap batch size ({self.opt.batch_size})" \ f" must be evenly divisible by the number of" \ f" GPUs ({torch.cuda.device_count()})" self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0]) if self.opt.use_stereo: self.opt.frame_ids.append("s") conv_layer, data_lambda, intrinsics = get_params(options) self.intrinsics = intrinsics self.height = self.opt.height or self.intrinsics.height self.width = self.opt.width or self.intrinsics.width self.models["encoder"] = networks.ResnetEncoder( conv_layer, self.opt.num_layers, self.opt.weights_init == "pretrained") self.store_model("encoder") self.models["depth"] = networks.DepthDecoder( conv_layer, self.get_num_ch_enc(self.models["encoder"]), self.opt.scales) self.store_model("depth") if self.use_pose_net: # true if self.opt.pose_model_type == "separate_resnet": # true self.models["pose_encoder"] = networks.ResnetEncoder( conv_layer, self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=self.num_pose_frames) self.store_model("pose_encoder") self.models["pose"] = networks.PoseDecoder( conv_layer, self.get_num_ch_enc(self.models["pose_encoder"]), num_input_features=1, num_frames_to_predict_for=2) elif self.opt.pose_model_type == "shared": self.models["pose"] = networks.PoseDecoder( conv_layer, self.get_num_ch_enc(self.models["encoder"]), self.num_pose_frames) elif self.opt.pose_model_type == "posecnn": self.models["pose"] = networks.PoseCNN( conv_layer, self.num_input_frames if self.opt.pose_model_input == "all" else 2) self.store_model("pose") if self.opt.predictive_mask: # false assert self.opt.disable_automasking, \ "When using predictive_mask, please disable automasking with --disable_automasking" # Our implementation of the predictive masking baseline has the the same architecture # as our depth decoder. We predict a separate mask for each source frame. 
self.models["predictive_mask"] = networks.DepthDecoder( conv_layer, self.get_num_ch_enc(self.models["encoder"]), self.opt.scales, num_output_channels=(len(self.opt.frame_ids) - 1)) self.store_model("predictive_mask") self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print( "Training is using:\n ", f"{self.device}" + (f" on {torch.cuda.device_count()} GPUs" if self.parallel else "")) num_train_samples = len(load_csv(options.train_data)) * 1000 self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset, val_dataset = get_datasets(options, data_lambda, intrinsics) self.train_loader = DataLoader(train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_loader = DataLoader(val_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) self.writers = {} for mode in ["train", "val"]: self.writers[mode] = SummaryWriter( os.path.join(self.log_path, mode)) if not self.opt.no_ssim: self.ssim = self.wrap_model(SSIM()) # TODO can I parallelize? self.ssim.to(self.device) self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.height // (2**scale) w = self.width // (2**scale) # TODO should be able to paralalize self.backproject_depth[scale] = BackprojectDepth( self.opt.batch_size, h, w, options.mode) self.backproject_depth[scale].to(self.device) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w, options.mode) self.project_3d[scale].to(self.device) if options.mode is Mode.Cubemap: self.models["cube_pose_and_loss"] = self.wrap_model( CubePosesAndLoss()) self.models["cube_pose_and_loss"].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3" ] self.train_items = len(train_dataset) self.val_items = len(val_dataset) print("Using split:\n ", self.opt.split) print( "There are {:d} training items and {:d} validation items\n".format( self.train_items, self.val_items)) self.save_opts()
def __init__(self, options):
    self.opt = options
    self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

    # checking height and width are multiples of 32
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    self.models = {}
    self.parameters_to_train = []

    # choose the device; used together with .to(), and it must be set before any data is loaded
    self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

    self.num_scales = len(self.opt.scales)
    self.num_input_frames = len(self.opt.frame_ids)
    # number of frames fed to the pose network
    self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

    # error out if the frame ids do not start with 0
    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    # use_stereo trains on stereo pairs, otherwise monocular video is used;
    # the stereo-only case does not use extra frames
    self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

    if self.opt.use_stereo:
        self.opt.frame_ids.append("s")  # "s" as the last frame id marks the stereo frame

    # set up the networks: encoder and decoders
    self.models["encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, self.opt.weights_init == "pretrained",
        self.opt.BA2M, self.opt.CBAM, self.opt.BAM)
    self.models["encoder"].to(self.device)  # must happen before any data is loaded
    # collect this network's parameters for the optimiser
    self.parameters_to_train += list(self.models["encoder"].parameters())

    # num_ch_enc is defined inside ResnetEncoder
    self.models["depth"] = networks.DepthDecoder(
        self.models["encoder"].num_ch_enc, self.opt.scales)
    self.models["depth"].to(self.device)
    self.parameters_to_train += list(self.models["depth"].parameters())

    # pose network
    if self.use_pose_net:
        # decides whether the pose encoder is separate from the depth encoder or shared with it
        if self.opt.pose_model_type == "separate_resnet":
            self.models["pose_encoder"] = networks.ResnetEncoder(
                self.opt.num_layers, self.opt.weights_init == "pretrained",
                self.opt.BA2M, self.opt.CBAM, self.opt.BAM,
                num_input_images=self.num_pose_frames)
            self.models["pose_encoder"].to(self.device)
            self.parameters_to_train += list(self.models["pose_encoder"].parameters())

            # note: the pose decoder's parameters are not collected here yet
            self.models["pose"] = networks.PoseDecoder(
                self.models["pose_encoder"].num_ch_enc,
                num_input_features=1,
                num_frames_to_predict_for=2)

        elif self.opt.pose_model_type == "shared":
            self.models["pose"] = networks.PoseDecoder(
                self.models["encoder"].num_ch_enc, self.num_pose_frames)

        elif self.opt.pose_model_type == "posecnn":
            self.models["pose"] = networks.PoseCNN(
                self.num_input_frames if self.opt.pose_model_input == "all" else 2)

        self.models["pose"].to(self.device)
        # the pose decoder's parameters are collected here, once the decoder type is known
        self.parameters_to_train += list(self.models["pose"].parameters())

    # predictive-mask baseline (mutually exclusive with the paper's auto-masking)
    if self.opt.predictive_mask:
        assert self.opt.disable_automasking, \
            "When using predictive_mask, please disable automasking with --disable_automasking"

        # Our implementation of the predictive masking baseline has the same architecture
        # as our depth decoder. We predict a separate mask for each source frame.
        self.models["predictive_mask"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales,
            num_output_channels=(len(self.opt.frame_ids) - 1))
        self.models["predictive_mask"].to(self.device)
        self.parameters_to_train += list(self.models["predictive_mask"].parameters())

    # optimiser and learning-rate schedule:
    # new_lr = 0.1 * lr, applied every scheduler_step_size epochs
    self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate)
    self.model_lr_scheduler = optim.lr_scheduler.StepLR(
        self.model_optimizer, self.opt.scheduler_step_size, 0.1)

    # if resuming from a checkpoint, load the weights with load_model()
    if self.opt.load_weights_folder is not None:
        self.load_model()

    print("Training model named:\n ", self.opt.model_name)
    print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir)
    print("Training is using:\n ", self.device)

    # data
    datasets_dict = {
        "kitti": datasets.KITTIRAWDataset,
        "kitti_odom": datasets.KITTIOdomDataset
    }
    self.dataset = datasets_dict[self.opt.dataset]  # e.g. KITTIRAWDataset

    # os.path.dirname(__file__) is this script's directory; the chosen split decides
    # which file lists are used for training and validation
    fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt")

    train_filenames = readlines(fpath.format("train"))  # .format fills the {} placeholder in fpath
    val_filenames = readlines(fpath.format("val"))
    img_ext = '.png' if self.opt.png else '.jpg'

    # the file lists above give the names of the training and validation samples
    num_train_samples = len(train_filenames)  # total number of training samples
    # total number of optimisation steps; the parameters are updated once per batch
    self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

    # this call initialises the MonoDataset base class; the KITTI and KITTI odometry
    # datasets inherit from it
    train_dataset = self.dataset(self.opt.data_path, train_filenames, self.opt.height,
                                 self.opt.width, self.opt.frame_ids, 4,
                                 is_train=True, img_ext=img_ext)
    # DataLoader(dataset, batch_size, shuffle, num_workers, ...): shuffle controls whether
    # the data is reshuffled each epoch and num_workers sets the number of loader processes;
    # pin_memory keeps batches in page-locked memory, which is never swapped out to the
    # host's virtual memory
    self.train_loader = DataLoader(train_dataset, self.opt.batch_size, True,
                                   num_workers=self.opt.num_workers,
                                   pin_memory=True, drop_last=True)
    val_dataset = self.dataset(self.opt.data_path, val_filenames, self.opt.height,
                               self.opt.width, self.opt.frame_ids, 4,
                               is_train=False, img_ext=img_ext)
    self.val_loader = DataLoader(val_dataset, self.opt.batch_size, True,
                                 num_workers=self.opt.num_workers,
                                 pin_memory=True, drop_last=True)
    # keep an iterator over the validation loader so that a single validation batch can be
    # pulled on demand during training (see the sketch after this method)
    self.val_iter = iter(self.val_loader)

    self.writers = {}
    for mode in ["train", "val"]:
        self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode))

    if not self.opt.no_ssim:
        self.ssim = SSIM()  # SSIM is defined in the layers module
        self.ssim.to(self.device)

    self.backproject_depth = {}
    self.project_3d = {}
    for scale in self.opt.scales:
        h = self.opt.height // (2**scale)
        w = self.opt.width // (2**scale)

        # BackprojectDepth turns a depth map into a 3D point cloud
        self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w)
        self.backproject_depth[scale].to(self.device)

        self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
        self.project_3d[scale].to(self.device)

    self.depth_metric_names = [
        "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"
    ]

    print("Using split:\n ", self.opt.split)
    print("There are {:d} training items and {:d} validation items\n".format(
        len(train_dataset), len(val_dataset)))

    self.save_opts()  # save the options used for this run
def evaluate(opt): """Evaluates a pretrained model using a specified test set """ MIN_DEPTH = 1e-3 MAX_DEPTH = 80 opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) print("-> Loading weights from {}".format(opt.load_weights_folder)) if viewStereoMask: stereoMaskComputer = StereoMask() stereoMaskComputer.cuda() if viewSurfaceNormal: compsurfnorm = ComputeSurfaceNormal(height=opt.height, width=opt.width, batch_size=opt.batch_size) compsurfnorm.cuda() if viewTypeWiseRegularization: typeWReg = TypeWiseRegularization() typeWReg.cuda() if viewBorderWiseRegularization: borderWiseReg = BorderWiseRegularization(batchNum=opt.batch_size, width=opt.width, height=opt.height).cuda() if viewMonoMsak: monoMask = MonocularMask() monoMask.cuda() filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt")) encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) tensor23dPts = Tensor23dPts(height=opt.height, width=opt.width) if opt.use_stereo: opt.frame_ids.append("s") dataset = datasets.KITTIRAWDataset(opt.data_path, filenames,opt.height, opt.width, opt.frame_ids, 4, is_train=False, load_gt_semantics=opt.load_gt_semantics, load_gt_velodine=opt.load_gt_velodine) dataloader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=True) encoder = networks.ResnetEncoder(opt.num_layers, False) depth_decoder = networks.DepthDecoder(encoder.num_ch_enc) model_dict = encoder.state_dict() encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() dirpath = '/media/shengjie/other/sceneUnderstanding/semantic_regularized_unsupervised_depth_estimation/visualization' sv_path = os.path.join(dirpath, opt.model_name) index = 0 if viewMonoMsak: num_pose_frames = 2 posenet_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth") posenet_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth") posenet_encoder_dict = torch.load(posenet_encoder_path) posenet_decoder_dict = torch.load(posenet_decoder_path) posenet_encoder = networks.ResnetEncoder( opt.num_layers, opt.weights_init == "pretrained", num_input_images=num_pose_frames) posenet_decoder = networks.PoseDecoder( encoder.num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) posenet_encoder.load_state_dict({k: v for k, v in posenet_encoder_dict.items() if k in posenet_encoder_dict}) posenet_decoder.load_state_dict({k: v for k, v in posenet_decoder_dict.items() if k in posenet_decoder_dict}) posenet_encoder = posenet_encoder.cuda() posenet_decoder = posenet_decoder.cuda() if not os.path.exists(sv_path): os.makedirs(sv_path) with torch.no_grad(): for idx, inputs in enumerate(dataloader): for key, ipt in inputs.items(): if not(key == 'height' or key == 'width' or key == 'tag' or key == 'cts_meta'): inputs[key] = ipt.to(torch.device("cuda")) input_color = inputs[("color", 0, 0)] features = encoder(input_color) outputs = dict() outputs.update(depth_decoder(features)) dispMap = outputs[('disp', 0)] scaledDisp, depthMap = disp_to_depth(dispMap, opt.min_depth, opt.max_depth) foreGroundMask = torch.ones(scaledDisp.shape, device=torch.device("cuda")).byte() scaled_smeantic_label = 
F.interpolate(inputs[('semantic_label', 0)].cpu().float(), size=(scaledDisp.shape[2], scaledDisp.shape[3]), mode='nearest').cuda().byte() for m in foregroundType: foreGroundMask = foreGroundMask * (scaled_smeantic_label != m) foreGroundMask = (1 - foreGroundMask) foreGroundMask = foreGroundMask.float() if viewStereoMask: scale = 0 T = inputs["stereo_T"] real_scale_disp = scaledDisp * (torch.abs(inputs[("K", scale)][:, 0, 0] * T[:, 0, 3]).view(opt.batch_size, 1, 1, 1).expand_as(scaledDisp)) stereoMask = stereoMaskComputer.computeMask(real_scale_disp, T[:, 0, 3]) stereoSemanticalMask = stereoMaskComputer.computeSemanticalMask(stereoMask, foreGroundMask, T[:, 0, 3]) # stereoMask_fig = tensor2disp(stereoMask, ind=index, vmax=1) # stereoSemanticalMask_fig = tensor2disp(stereoSemanticalMask, ind=index, vmax=1) # foreGroundMask_fig = tensor2disp(foreGroundMask, ind=index, vmax=1) if viewSurfaceNormal: surnormMap_fig = compsurfnorm.visualize(depthMap=depthMap, invcamK=inputs['invcamK'], viewindex = index) surnormMap = compsurfnorm(depthMap=depthMap, invcamK=inputs['invcamK']) if viewTypeWiseRegularization: wallType = [2, 3, 4] # Building, wall, fence roadType = [0, 1, 9] # road, sidewalk, terrain permuType = [5, 7] # Pole, traffic sign chanWinSize = 5 wallMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8) roadMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8) permuMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8) for m in wallType: wallMask = wallMask * (scaled_smeantic_label != m) wallMask = 1 - wallMask wallMask = wallMask[:, :, 1:-1, 1:-1] for m in roadType: roadMask = roadMask * (scaled_smeantic_label != m) roadMask = 1 - roadMask roadMask = roadMask[:, :, 1:-1, 1:-1] for m in permuType: permuMask = permuMask * (scaled_smeantic_label != m) permuMask = 1 - permuMask permuMask = permuMask[:, :, 1:-1, 1:-1] BdErrFig, viewRdErrFig = typeWReg.visualize_regularizeBuildingRoad(surnormMap, wallMask, roadMask, dispMap, viewInd=index) padSize = int((chanWinSize - 1) / 2) permuMask = permuMask[:, :, padSize: -padSize, padSize: -padSize] surVarFig = typeWReg.visualize_regularizePoleSign(surnormMap, permuMask, dispMap, viewInd=index) if viewBorderWiseRegularization: wallType = [2, 3, 4] # Building, wall, fence roadType = [0, 1, 9] # road, sidewalk, terrain wallTypeMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8) roadTypeMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8) foreGroundMask = torch.ones((opt.batch_size, 1, opt.height, opt.width), device=torch.device("cuda"), dtype=torch.uint8) for m in wallType: wallTypeMask = wallTypeMask * (scaled_smeantic_label != m) wallTypeMask = (1 - wallTypeMask).float() for m in roadType: roadTypeMask = roadTypeMask * (scaled_smeantic_label != m) roadTypeMask = (1 - roadTypeMask).float() for m in foregroundType: foreGroundMask = foreGroundMask * (scaled_smeantic_label != m) foreGroundMask = (1 - foreGroundMask).float() borderWiseReg.visualize( realDepth=depthMap, dispAct=depthMap, foredgroundMask=foreGroundMask, wallTypeMask=wallTypeMask, groundTypeMask=roadTypeMask, intrinsic=inputs['realIn'], extrinsic=inputs['realEx'], semantic=scaled_smeantic_label, viewInd=0) if viewMonoMsak: extrinsics = computePose(inputs, opt, depthMap, posenet_encoder, posenet_decoder) depthMap_cur = 
depthMap depthMap_prev = computeDepthMap(inputs['color', -1, 0], encoder, depth_decoder, opt.min_depth, opt.max_depth) depthMap_next = computeDepthMap(inputs['color', 1, 0], encoder, depth_decoder, opt.min_depth, opt.max_depth) pts_cur = depth23dpts(depthMap_cur, inputs['intrinsic']) pts_next = depth23dpts(depthMap_prev, inputs['intrinsic'], extrinsics) pts_prev = depth23dpts(depthMap_next, inputs['intrinsic'], extrinsics) if opt.eval_stereo: real_scale_depth = depthMap * STEREO_SCALE_FACTOR elif opt.eval_mono: ratio = torch.mean(inputs['depth_gt'][inputs['depth_gt'] > 0.1]) / torch.mean(depthMap) real_scale_depth = depthMap * ratio gtmask = (inputs['depth_gt'] > 0).float() gtdepth = inputs['depth_gt'] velo = inputs['velo'] tensor23dPts.visualize3d( real_scale_depth, ind=index, intrinsic_in=inputs['realIn'], extrinsic_in=inputs['realEx'], gtmask_in=gtmask, gtdepth_in=gtdepth, semanticMap=scaled_smeantic_label, velo_in=velo, rgb_in=inputs[('color', 's', 0)], disp_in=outputs[('disp', 0)] ) suppressed_disp_Map = dispMap * (1 - stereoSemanticalMask) semantic_fig = tensor2semantic(inputs[('semantic_label', 0)], ind=index, isGt=True).resize([opt.width, opt.height], pil.NEAREST) disp_fig = tensor2disp(dispMap, ind = index) suppressed_disp_Map_fig = tensor2disp(suppressed_disp_Map, ind = index) rgb_fig = tensor2rgb(inputs[("color", 0, 0)], ind = index) combined_fig1 = pil.fromarray((np.array(semantic_fig) * 0.15 + np.array(disp_fig)[:,:,0:3] * 0.85).astype(np.uint8)) combined_fig2 = pil.fromarray( (np.array(rgb_fig) * 0.2 + np.array(disp_fig)[:, :, 0:3] * 0.8).astype(np.uint8)) combined_fig = pil.fromarray(np.concatenate([np.array(combined_fig1), np.array(combined_fig2), np.array(suppressed_disp_Map_fig)[:,:,0:3], np.array(surnormMap_fig)], axis=0)) combined_fig.save(os.path.join(sv_path, str(idx) + ".png")) print("save %s" % (str(idx) + ".png"))
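The repeated mask = mask * (label != m) followed by 1 - mask pattern above simply builds a binary mask for "label is one of these class ids". An equivalent helper, for reference; the example ids 2, 3, 4 are the building/wall/fence classes named in the comments above.

import torch

def class_mask(label_map, class_ids):
    """Float mask that is 1 where the semantic label equals any of class_ids.
    The product of (label != m) over all ids is 1 only where none of them match,
    so inverting it marks the matching pixels, the same logic as the loops above."""
    mask = torch.zeros_like(label_map, dtype=torch.bool)
    for m in class_ids:
        mask |= (label_map == m)
    return mask.float()

# wall_mask = class_mask(scaled_smeantic_label, [2, 3, 4])  # building, wall, fence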
def __init__(self, _host_frame, _target_frame): ''' initialize the randpattern based photometric residual wrapper :param _host_frame: numpy ndarray H x W x 3 image. :param _target_frame: numpy ndarray image, same dimension as above. ''' # load options options = MonodepthOptions() opts = options.parse() self.opt = opts self.num_input_frames = len(self.opt.frame_ids) # init model self.model_name = "mono_1024x320" download_model_if_doesnt_exist(self.model_name) self.encoder_path = os.path.join("models", self.model_name, "encoder.pth") self.depth_decoder_path = os.path.join("models", self.model_name, "depth.pth") self.pose_encoder_path = os.path.join("models", self.model_name, "pose_encoder.pth") self.pose_decoder_path = os.path.join("models", self.model_name, "pose.pth") # LOADING PRETRAINED MODEL self.encoder = networks.ResnetEncoder(18, False) self.depth_decoder = networks.DepthDecoder( num_ch_enc=self.encoder.num_ch_enc, scales=range(4)) self.pose_encoder = networks.ResnetEncoder(self.opt.num_layers, False, 2) # self.pose_encoder = networks.PoseCNN(self.num_input_frames if self.opt.pose_model_input == "all" else 2) self.pose_decoder = networks.PoseDecoder(self.pose_encoder.num_ch_enc, 1, 2) # self.pose_decoder = networks.PoseDecoder(self.pose_encoder.num_ch_enc, num_input_features=1, # num_frames_to_predict_for=2) self.loaded_dict_enc = torch.load(self.encoder_path, map_location='cpu') self.filtered_dict_enc = { k: v for k, v in self.loaded_dict_enc.items() if k in self.encoder.state_dict() } self.encoder.load_state_dict(self.filtered_dict_enc) self.loaded_dict_pose_enc = torch.load(self.pose_encoder_path, map_location='cpu') self.filtered_dict_pose_enc = { k: v for k, v in self.loaded_dict_pose_enc.items() if k in self.pose_encoder.state_dict() } self.pose_encoder.load_state_dict(self.filtered_dict_pose_enc) self.loaded_dict = torch.load(self.depth_decoder_path, map_location='cpu') self.depth_decoder.load_state_dict(self.loaded_dict) self.loaded_dict_pose = torch.load(self.pose_decoder_path, map_location='cpu') self.pose_decoder.load_state_dict(self.loaded_dict_pose) self.encoder.eval() self.depth_decoder.eval() self.pose_encoder.eval() self.pose_decoder.eval() self.isgood = [] # define frames self.host_frame = _host_frame self.target_frame = _target_frame self.host_frame_dx, self.host_frame_dy = image_gradients( self.host_frame) self.target_frame_dx, self.target_frame_dy = image_gradients( self.target_frame) # dso's pattern: self.residual_pattern = np.array([ [0, 0], [-2, 0], [2, 0], [-1, -1], [1, 1], [-1, 1], [1, -1], [0, 2], [0, -2], ])
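image_gradients is not defined in this snippet; below is a minimal forward-difference stand-in (an assumption about its behaviour, not the original helper) that works on the H x W x 3 numpy frames used here and keeps the input shape by leaving the last row/column at zero.

import numpy as np

def image_gradients(img):
    """Forward-difference gradients (dx, dy) of an H x W x C numpy image (assumed stand-in)."""
    img = img.astype(np.float32)
    dx = np.zeros_like(img)
    dy = np.zeros_like(img)
    dx[:, :-1] = img[:, 1:] - img[:, :-1]   # horizontal differences
    dy[:-1, :] = img[1:, :] - img[:-1, :]   # vertical differences
    return dx, dy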
def test_simple(args): """Function to predict for a single image or folder of images """ assert args.model_name is not None, \ "You must specify the --model_name parameter; see README.md for an example" if torch.cuda.is_available() and not args.no_cuda: device = torch.device("cuda") else: device = torch.device("cpu") download_model_if_doesnt_exist(args.model_name) model_path = os.path.join("models", args.model_name) print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") # LOADING PRETRAINED MODEL print(" Loading pretrained depth encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width = loaded_dict_enc['width'] filtered_dict_enc = { k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict() } encoder.load_state_dict(filtered_dict_enc) encoder.to(device) encoder.eval() print(" Loading pretrained depth decoder") depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) depth_decoder.to(device) depth_decoder.eval() # FINDING INPUT IMAGES if os.path.isfile(args.image_path): # Only testing on a single image paths = [args.image_path] output_directory = os.path.dirname(args.image_path) elif os.path.isdir(args.image_path): # Searching folder for images paths = glob.glob( os.path.join(args.image_path, '*.{}'.format(args.ext))) output_directory = args.image_path else: raise Exception("Can not find args.image_path: {}".format( args.image_path)) # don't try to predict disparity for a disparity image! 
paths = [img for img in paths if not img.endswith("_disp.jpg")] if len(paths) > 3: print(" Loading Pose network") pose_encoder_path = os.path.join(model_path, "pose_encoder.pth") pose_decoder_path = os.path.join(model_path, "pose.pth") pose_encoder = networks.ResnetEncoder(18, False, 2) pose_encoder.load_state_dict(torch.load(pose_encoder_path)) pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) pose_encoder.to(device) pose_encoder.eval() pose_decoder.to(device) pose_decoder.eval() # PREDICTING ON EACH IMAGE IN TURN with torch.no_grad(): print("-> Predicting disparities on {:d} test images".format( len(paths))) processed_images = [] for idx, image_path in enumerate(paths): # Load image and preprocess input_image = pil.open(image_path).convert('RGB') original_width, original_height = input_image.size input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS) input_image = transforms.ToTensor()(input_image).unsqueeze(0) # PREDICTION input_image = input_image.to(device) processed_images += [input_image] features = encoder(input_image) outputs = depth_decoder(features) disp = outputs[("disp", 0)] disp_resized = torch.nn.functional.interpolate( disp, (original_height, original_width), mode="bilinear", align_corners=False) # Saving numpy file output_name = os.path.splitext(os.path.basename(image_path))[0] name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name)) scaled_disp, _ = disp_to_depth(disp, 0.1, 100) np.save(name_dest_npy, scaled_disp.cpu().numpy()) # Saving colormapped depth image disp_resized_np = disp_resized.squeeze().cpu().numpy() vmax = np.percentile(disp_resized_np, 95) normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax) mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8) im = pil.fromarray(colormapped_im) name_dest_im = os.path.join(output_directory, "{}_disp.jpg".format(output_name)) im.save(name_dest_im) print(" Processed {:d} of {:d} images - saved prediction to {}". format(idx + 1, len(paths), name_dest_im)) if len(processed_images) > 3: pred_poses = [] rotations = [] translations = [] print("-> Predicting poses on {:d} test images".format( len(processed_images))) for idx, (a, b) in enumerate( zip(processed_images[:-1], processed_images[1:])): all_color_aug = torch.cat([a, b], 1) features = [pose_encoder(all_color_aug)] axisangle, translation = pose_decoder(features) rotations += [axisangle[:, 0].cpu().numpy()] translations += [translation[:, 0].cpu().numpy()] pred_poses.append( transformation_from_parameters( axisangle[:, 0], translation[:, 0]).cpu().numpy()) pred_poses = np.concatenate(pred_poses) save_path = os.path.join(args.image_path, "pred_poses.npy") np.save(save_path, pred_poses) print("-> Pose Predictions saved to", save_path) local_xyzs = np.array(dump_xyz(pred_poses)) save_path = os.path.join(args.image_path, "pred_xyzs.npy") np.save(save_path, local_xyzs) print("-> Predicted path saved to", save_path) save_path = os.path.join(args.image_path, "axisangle.npy") np.save(save_path, np.concatenate(rotations)) print("-> Predicted axis angles saved to", save_path) save_path = os.path.join(args.image_path, "translation.npy") np.save(save_path, np.concatenate(translations)) print("-> Predicted translations saved to", save_path) print('-> Done!')
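# Illustrative sketch (assumption): dump_xyz above converts the chain of predicted
# frame-to-frame poses into camera positions. An implementation consistent with how it is
# called here simply accumulates the 4x4 transforms and records the translation column:
import numpy as np

def dump_xyz_sketch(relative_poses):
    xyzs = []
    cam_to_world = np.eye(4)
    xyzs.append(cam_to_world[:3, 3].copy())
    for T in relative_poses:           # each T is a 4x4 relative transformation
        cam_to_world = cam_to_world @ T
        xyzs.append(cam_to_world[:3, 3].copy())
    return np.array(xyzs)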
def main(opt): """Evaluate odometry on the KITTI dataset """ assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) #assert opt.eval_split == "odom_9" or opt.eval_split == "odom_10", \ # "eval_split should be either odom_9 or odom_10" #sequence_id = int(opt.eval_split.split("_")[1]) #filenames = readlines( # os.path.join(os.path.dirname(__file__), "splits", "odom", # "test_files_{:02d}.txt".format(sequence_id))) # dataset = KITTIOdomDataset(opt.eval_pose_data_path, filenames, opt.height, opt.width, # [0, 1], 4, is_train=False) filenames = readlines(Path('./splits') / opt.split / 'test_files.txt') dataset = CustomMonoDataset(opt.dataset_path, filenames, opt.height, opt.width, [0, 1], 1, is_train=False) dataloader = DataLoader(dataset, opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) #model pose_encoder_path = Path(opt.load_weights_folder) / "pose_encoder.pth" pose_decoder_path = Path(opt.load_weights_folder) / "pose.pth" pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2) pose_encoder.load_state_dict(torch.load(pose_encoder_path)) pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) pose_encoder.cuda() pose_encoder.eval() pose_decoder.cuda() pose_decoder.eval() pred_poses = [] print("-> Computing pose predictions") opt.frame_ids = [0, 1] # pose network only takes two frames as input print("-> eval " + opt.split) for inputs in tqdm(dataloader): for key, ipt in inputs.items(): inputs[key] = ipt.cuda() all_color_aug = torch.cat( [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1) features = [pose_encoder(all_color_aug)] axisangle, translation = pose_decoder(features) pred_pose = transformation_from_parameters(axisangle[:, 0], translation[:, 0]) pred_pose = pred_pose.cpu().numpy() pred_poses.append(pred_pose) pred_poses = np.concatenate(pred_poses) length = pred_poses.shape[0] pred_poses.resize([length, 16]) pred_poses = pred_poses[:, :12] filename = opt.dump_name np.savetxt(filename, pred_poses, delimiter=' ', fmt='%1.8e') print("-> Predictions saved to", filename)
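# For reference, a sketch (assumption) of reading the file written above back into 4x4
# matrices: each row stores the first 12 entries of the row-major pose, i.e. the 3x4
# [R | t] block of the KITTI odometry format.
import numpy as np

def load_relative_poses(filename):
    rows = np.atleast_2d(np.loadtxt(filename))       # (N, 12)
    poses = np.tile(np.eye(4), (rows.shape[0], 1, 1))
    poses[:, :3, :4] = rows.reshape(-1, 3, 4)         # restore the [R | t] block
    return poses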
def __init__(self, options, joint_training=False): self.opt = options self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.models = {} self.parameters_to_train = [] self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0]) if self.opt.use_stereo: self.opt.frame_ids.append("s") config_file = "./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" cfg.merge_from_file(config_file) if joint_training: self.joint_training = True cfg.merge_from_list(options.opts) else: self.joint_training = False cfg.freeze() self.cfg = cfg # maskrcnn_path = "./e2e_mask_rcnn_R_50_FPN_1x.pth" maskrcnn_path = self.opt.maskrcnn_weights # maskrcnn_path = "./weights/encoder.pth" self.models["encoder"] = networks.ResnetEncoder( self.cfg, maskrcnn_path, joint_training=self.joint_training ) self.models["encoder"].to(self.device) if self.joint_training: self.parameters_to_train += list(self.models["encoder"].parameters()) self.models["depth"] = networks.DepthDecoder(scales=self.opt.scales) self.models["depth"].to(self.device) self.parameters_to_train += list(self.models["depth"].parameters()) if self.use_pose_net: self.models["pose"] = networks.PoseDecoder(self.num_pose_frames) self.models["pose"].to(self.device) self.parameters_to_train += list(self.models["pose"].parameters()) if self.opt.predictive_mask: assert self.opt.disable_automasking, \ "When using predictive_mask, please disable automasking with --disable_automasking" # Our implementation of the predictive masking baseline has the the same architecture # as our depth decoder. We predict a separate mask for each source frame. 
self.models["predictive_mask"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales, num_output_channels=(len(self.opt.frame_ids) - 1)) self.models["predictive_mask"].to(self.device) self.parameters_to_train += list(self.models["predictive_mask"].parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, self.opt.scheduler_gamma) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) # data datasets_dict = {"kitti": datasets.KITTIRAWDataset, "kitti_odom": datasets.KITTIOdomDataset} if self.opt.dataset != 'mixed': self.dataset = datasets_dict[self.opt.dataset] else: self.dataset = datasets.MixedDataset fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt") train_filenames = readlines(fpath.format("train")) val_filenames = readlines(fpath.format("val")) img_ext = '.png' if self.opt.png else '.jpg' num_train_samples = len(train_filenames) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset = self.dataset( self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, self.opt.step, self.num_scales, is_train=True, img_ext=img_ext) self.train_loader = DataLoader( train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) val_dataset = self.dataset( self.opt.data_path, val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, self.opt.step, self.num_scales, is_train=False, img_ext=img_ext) self.val_loader = DataLoader( val_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) self.writers = {} for mode in ["train", "val"]: self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode)) if not self.opt.no_ssim: self.ssim = SSIM() self.ssim.to(self.device) self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.opt.height // (2 ** scale) w = self.opt.width // (2 ** scale) self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w) self.backproject_depth[scale].to(self.device) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) self.project_3d[scale].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"] print("Using split:\n ", self.opt.split) print("There are {:d} training items and {:d} validation items\n".format( len(train_dataset), len(val_dataset))) self.save_opts()
def evaluate_pose(opt): """Evaluate odometry on the KITTI dataset """ assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) assert opt.eval_split == "odom_09" or opt.eval_split == "odom_10", \ "eval_split should be either odom_9 or odom_10" device = torch.device("cpu" if opt.no_cuda else "cuda") sequence_id = int(opt.eval_split.split("_")[-1]) if opt.pose_model_input == "pairs": opt.frame_ids = [1, 0] # pose network only takes two frames as input num_poses = 1 filenames = readlines( os.path.join( os.path.dirname(__file__), "splits", "odom", "test_files_{}_{:02d}.txt".format("pairs", sequence_id))) else: opt.frame_ids = [i for i in opt.frame_ids if i != "s"] num_poses = len(opt.frame_ids) - 1 filenames = readlines( os.path.join( os.path.dirname(__file__), "splits", "odom", "test_files_{}_{:02d}.txt".format("all" + str(num_poses + 1), sequence_id))) img_ext = '.png' if opt.png else '.jpg' dataset = datasets_dict[opt.eval_split](opt.data_path, filenames, opt.height, opt.width, opt.frame_ids, 4, is_train=False, img_ext=img_ext) dataloader = DataLoader(dataset, opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth") pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth") pose_encoder = networks.ResnetEncoder(opt.num_layers, False, num_poses + 1) pose_encoder.load_state_dict(torch.load(pose_encoder_path)) pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, num_poses, 1) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) pose_encoder.to(device) pose_encoder.eval() pose_decoder.to(device) pose_decoder.eval() pred_poses = [] flip_pred_poses = [] print("-> Computing pose predictions") with torch.no_grad(): for inputs in dataloader: for key, ipt in inputs.items(): inputs[key] = ipt.to(device) all_color_aug = torch.cat( [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1) if opt.post_process: # Left-Right Flip as Post-processing to further improve accuracy of pose estimation all_color_aug = torch.cat( (all_color_aug, torch.flip(all_color_aug, [3])), 0) features = pose_encoder(all_color_aug) axisangle, translation = pose_decoder(features) if opt.post_process: N = axisangle.shape[0] // 2 pred_poses.append( transformation_from_parameters( axisangle[:N].view(N * num_poses, 1, 3), translation[:N].view(N * num_poses, 1, 3), invert=True).cpu().numpy().reshape(N, num_poses, 4, 4)) flip_pred_poses.append( transformation_from_parameters( axisangle[N:].view(N * num_poses, 1, 3), translation[N:].view(N * num_poses, 1, 3), invert=True).cpu().numpy().reshape(N, num_poses, 4, 4)) else: N = axisangle.shape[0] pred_poses.append( transformation_from_parameters( axisangle.view(N * num_poses, 1, 3), translation.view(N * num_poses, 1, 3), invert=True).cpu().numpy().reshape(N, num_poses, 4, 4)) pred_poses = np.concatenate(pred_poses) if opt.post_process: flip_pred_poses = np.concatenate(flip_pred_poses) flip_pred_poses[:, :, 1:3, 0] *= -1 flip_pred_poses[:, :, 0, 1:] *= -1 pred_poses = average_poses(np.array([pred_poses, flip_pred_poses])) gt_poses_path = os.path.join(opt.data_path, "poses", "{:02d}.txt".format(sequence_id)) gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4) gt_global_poses = np.concatenate( (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1) gt_global_poses[:, 3, 3] = 1 gt_local_poses = [] for i in range(1, len(gt_global_poses)): gt_local_poses.append( 
np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i])) gt_local_poses = np.expand_dims(np.array(gt_local_poses), axis=1) ATEs = [] REs = [] num_frames = gt_global_poses.shape[0] track_length = 5 for i in range(0, num_frames - track_length): gt_odometry = local_poses_to_odometry(gt_local_poses[i:i + track_length - 1]) pred_odometry = local_poses_to_odometry(pred_poses[i:i + track_length - num_poses]) ATE, RE = compute_pose_error(gt_odometry, pred_odometry) ATEs.append(ATE) REs.append(RE) print("\n Trajectory error: \n" " ATE: {:0.4f}, std: {:0.4f} \n" " RE: {:0.4f}, std: {:0.4f} \n ".format(np.mean(ATEs), np.std(ATEs), np.mean(REs), np.std(REs))) # compute the global monocular visual odometry and save it global_pred_odometry = local_poses_to_odometry(pred_poses) save_filename = opt.eval_split if opt.post_process: save_filename = save_filename + "_pp" save_path = os.path.join(opt.load_weights_folder, save_filename + ".txt") np.savetxt(save_path, global_pred_odometry[:, :-1, :].reshape( global_pred_odometry.shape[0], -1), delimiter=' ', fmt='%1.8e') print("-> Predictions saved to", save_path)
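# Illustrative sketch (assumption): compute_pose_error above compares two short odometry
# snippets. A common definition is the translational RMSE (ATE) together with the mean
# rotation angle of the relative rotation error (RE); the exact helper used here may differ.
import numpy as np

def compute_pose_error_sketch(gt, pred):
    """gt, pred: arrays of shape (N, 4, 4) holding aligned poses."""
    ate = np.sqrt(np.mean(np.sum((gt[:, :3, 3] - pred[:, :3, 3]) ** 2, axis=1)))
    angles = []
    for g, p in zip(gt, pred):
        R_err = np.linalg.inv(g[:3, :3]) @ p[:3, :3]
        angles.append(np.arccos(np.clip((np.trace(R_err) - 1.0) / 2.0, -1.0, 1.0)))
    return ate, float(np.mean(angles))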
def __init__(self, options): self.opt = options self.debug = self.opt.debug print('DEBUG: ', self.debug) self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.models = {} self.parameters_to_train = [] self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = True self.models["encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained") self.models["encoder"].to(self.device) self.parameters_to_train += list(self.models["encoder"].parameters()) self.models["depth"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales) self.models["depth"].to(self.device) self.parameters_to_train += list(self.models["depth"].parameters()) if self.use_pose_net: if self.opt.pose_model_type == "separate_resnet": self.models["pose_encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=self.num_pose_frames) self.models["pose_encoder"].to(self.device) self.parameters_to_train += list( self.models["pose_encoder"].parameters()) self.models["pose"] = networks.PoseDecoder( self.models["pose_encoder"].num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) elif self.opt.pose_model_type == "shared": self.models["pose"] = networks.PoseDecoder( self.models["encoder"].num_ch_enc, self.num_pose_frames) elif self.opt.pose_model_type == "posecnn": self.models["pose"] = networks.PoseCNN( self.num_input_frames if self.opt.pose_model_input == "all" else 2) self.models["pose"].to(self.device) self.parameters_to_train += list(self.models["pose"].parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.MultiStepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) print("Training is using frames: \n ", self.opt.frame_ids_to_train) # data datasets_dict = {"nyu": datasets.NYUDataset} self.dataset = datasets_dict[self.opt.dataset] train_filenames = readlines('./splits/nyu_train_0_10_20_30_40.txt') num_train_samples = len(train_filenames) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset = self.dataset(self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 1, is_train=True, segment_path=self.opt.segment_path, return_segment=True, shared_dict=shared_dict) self.train_loader = DataLoader(train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) # validation filenames = readlines('./splits/nyu_test.txt') # filenames = [filename.replace("/p300/Code/self_depth/monodepth2/nyuv2/nyu_official", # self.opt.val_path) for filename in filenames] val_dataset = datasets.NYUDataset(self.opt.val_path, filenames, self.opt.height, self.opt.width, [0], 1, is_train=False, return_segment=False) 
self.val_dataloader = DataLoader(val_dataset, 1, shuffle=False, num_workers=2) self.writers = {} for mode in ["train", "val"]: self.writers[mode] = SummaryWriter( os.path.join(self.log_path, mode)) self.ssim_sparse = SSIM_sparse() self.ssim_sparse.to(self.device) self.backproject_depth = {} for scale in self.opt.scales: h = self.opt.height // (2**scale) w = self.opt.width // (2**scale) self.backproject_depth[scale] = BackprojectDepth( self.opt.batch_size, h, w) self.backproject_depth[scale].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3" ] print("Using split:\n ", self.opt.split) print( "There are {:d} training items and {:d} validation items\n".format( len(train_dataset), -1)) self.save_opts()
def __init__(self, options): self.opt = options self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.models = {} self.parameters_to_train = [] self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0]) if self.opt.use_stereo: self.opt.frame_ids.append("s") self.models["encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained") self.models["encoder"].to(self.device) self.parameters_to_train += list(self.models["encoder"].parameters()) self.models["depth"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales) self.models["depth"].to(self.device) self.parameters_to_train += list(self.models["depth"].parameters()) if self.use_pose_net: if self.opt.pose_model_type == "separate_resnet": self.models["pose_encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=self.num_pose_frames) self.models["pose_encoder"].to(self.device) self.parameters_to_train += list(self.models["pose_encoder"].parameters()) self.models["pose"] = networks.PoseDecoder( self.models["pose_encoder"].num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) elif self.opt.pose_model_type == "shared": self.models["pose"] = networks.PoseDecoder( self.models["encoder"].num_ch_enc, self.num_pose_frames) elif self.opt.pose_model_type == "posecnn": self.models["pose"] = networks.PoseCNN( self.num_input_frames if self.opt.pose_model_input == "all" else 2) self.models["pose"].to(self.device) self.parameters_to_train += list(self.models["pose"].parameters()) if self.opt.predictive_mask: # Our implementation of the predictive masking baseline has the the same architecture # as our depth decoder. We predict a separate mask for each source frame. 
self.models["predictive_mask"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales, num_output_channels=(len(self.opt.frame_ids) - 1)) self.models["predictive_mask"].to(self.device) self.parameters_to_train += list(self.models["predictive_mask"].parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) # dataset options datasets_dict = {'kitti': KITTIRAWDataset, 'kitti_odom': KITTIOdomDataset, 'FLIR': FlirDataset, 'KAIST': KAIST_Dataset, 'CREOL': CreolDataset, 'all_thermal_data': [FlirDataset, KAIST_Dataset, CreolDataset]} assert (self.opt.img_ext == '.png') or (self.opt.img_ext == '.jpg') or ( self.opt.img_ext == '.jpeg'), "Please provide a correct image extension" img_ext = self.opt.img_ext self.dataset = datasets_dict[self.opt.dataset] if self.opt.dataset != 'all_thermal_data': train_filenames, val_filenames, thermal = get_filenames(self.opt.dataset, self.opt.data_path, self.opt.split) num_train_samples = len(train_filenames) num_val_samples = len(val_filenames) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset = self.dataset( self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=True, img_ext=img_ext, thermal=thermal) self.train_loader = DataLoader( train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) val_dataset = self.dataset( self.opt.data_path, val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=False, img_ext=img_ext, thermal = thermal) self.val_loader = DataLoader( val_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) else: datasets = ['FLIR', 'KAIST', 'CREOL'] data_paths = ['/groups/mshah/data/FLIR/pre_dat/', '/groups/mshah/data/KAIST_multispectral/', '../robert_video/'] train_datasets = [] val_datasets = [] num_train_samples = 0 num_val_samples = 0 for i, dataset in enumerate(self.dataset): train_filenames, val_filenames, thermal = get_filenames(datasets[i], data_paths[i], self.opt.split) print(datasets[i] + ' train: ' + data_paths[i] + ' - ' + str(len(train_filenames))) print(datasets[i] + ' val: ' + data_paths[i] + ' - ' + str(len(val_filenames))) num_train_samples += len(train_filenames) num_val_samples += len(val_filenames) train_datasets.append(dataset( data_paths[i], train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=True, img_ext=img_ext, thermal=thermal)) val_datasets.append(dataset( data_paths[i], val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=False, img_ext=img_ext, thermal=thermal)) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs self.train_loader = DataLoader( ConcatDataset(train_datasets), self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_loader = DataLoader( ConcatDataset(val_datasets), self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) # self.writers = {} 
# for mode in ["train", "val"]: # self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode)) if not self.opt.no_ssim: self.ssim = SSIM() self.ssim.to(self.device) self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.opt.height // (2 ** scale) w = self.opt.width // (2 ** scale) self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w) self.backproject_depth[scale].to(self.device) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) self.project_3d[scale].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"] if self.opt.dataset.startswith('kitti'): print("Using split:\n ", self.opt.split) else: print("Using dataset:\n ", self.opt.dataset) print("There are {:d} training items and {:d} validation items\n".format( num_train_samples, num_val_samples)) self.save_opts()
def __init__(self, options): self.opt = options self.seed_everything() # create dirs for logs and predictions if do not exist self.log_path = self.opt.log_dir if not os.path.exists(self.log_path): os.mkdir(self.log_path) preds_dir = os.path.join(self.log_path, "preds") if not os.path.exists(preds_dir): os.mkdir(preds_dir) # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" # we don't expect anyone running this on cpu.. self.device = torch.device("cuda") # model initialization self.models = {} self.parameters_to_train = [] self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.models["encoder"] = networks.ResnetEncoder( self.opt.num_layers, True) self.models["depth"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales) self.models["pose_encoder"] = networks.ResnetEncoder( self.opt.num_layers, True, num_input_images=self.num_input_frames) self.models["pose"] = networks.PoseDecoder( self.models["pose_encoder"].num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) for _, m in self.models.items(): m.to(self.device) self.parameters_to_train += list(m.parameters()) self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) self.ssim = SSIM() self.ssim.to(self.device) self.backproject_depth = BackprojectDepth( self.opt.batch_size * self.num_scales, self.opt.height, self.opt.width) self.backproject_depth.to(self.device) self.project_3d = Project3D( self.opt.batch_size * (self.num_input_frames - 1) * self.num_scales, self.opt.height, self.opt.width) self.project_3d.to(self.device) # save adaptation parameters to the log dir self.save_opts()
def test_depth_pose(args): """Function to predict depth and pose """ assert args.model_name is not None, \ "You must specify the --model_name parameter; see README.md for an example" if torch.cuda.is_available() and not args.no_cuda: device = torch.device("cuda") else: device = torch.device("cpu") download_model_if_doesnt_exist(args.model_name) model_path = os.path.join("models", args.model_name) print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") pose_encoder_path = os.path.join(model_path, "pose_encoder.pth") pose_decoder_path = os.path.join(model_path, "pose.pth") # LOADING PRETRAINED MODEL print(" Loading pretrained depth encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) print(" Loading pretrained pose encoder") pose_encoder = networks.ResnetEncoder(18, False, 2) loaded_dict_pose_enc = torch.load(pose_encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width = loaded_dict_enc['width'] filtered_dict_enc = { k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict() } encoder.load_state_dict(filtered_dict_enc) pose_encoder.load_state_dict(loaded_dict_pose_enc) encoder.to(device) pose_encoder.to(device) encoder.eval() pose_encoder.eval() print(" Loading pretrained depth decoder") depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) print(" Loading pretrained pose decoder") pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) loaded_dict_pose = torch.load(pose_decoder_path, map_location=device) pose_decoder.load_state_dict(loaded_dict_pose) depth_decoder.to(device) pose_decoder.to(device) depth_decoder.eval() pose_decoder.eval() print("-> Predicting on test images") pred_depths = [] pred_poses = [] backproject_depth = BackprojectDepth(1, feed_height, feed_width) backproject_depth.to(device) project_3d = Project3D(1, feed_height, feed_width) project_3d.to(device) K = np.array( [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32) K[0, :] *= feed_width K[1, :] *= feed_height inv_K = np.linalg.pinv(K) K = torch.from_numpy(K) K = K.unsqueeze(0).to(device) inv_K = torch.from_numpy(inv_K) inv_K = inv_K.unsqueeze(0).to(device) # PREDICTING ON EACH IMAGE IN TURN with torch.no_grad(): for i in range(107): # Load image and preprocess image_0_path = './kitti_data/01/{:010d}.jpg'.format(i) input_image_0 = Image.open(image_0_path).convert('RGB') original_width, original_height = input_image_0.size input_image_0 = input_image_0.resize((feed_width, feed_height), Image.LANCZOS) input_image_0 = transforms.ToTensor()(input_image_0).unsqueeze(0) image_1_path = './kitti_data/01/{:010d}.jpg'.format(i + 1) input_image_1 = Image.open(image_1_path).convert('RGB') input_image_1 = input_image_1.resize((feed_width, feed_height), Image.LANCZOS) input_image_1 = transforms.ToTensor()(input_image_1).unsqueeze(0) # PREDICTION for depth input_image_0 = input_image_0.to(device) features = encoder(input_image_0) outputs = depth_decoder(features) disp = outputs[("disp", 0)] #disp_resized = torch.nn.functional.interpolate( # disp, (original_height, original_width), mode="bilinear", align_corners=False) _, pred_depth = disp_to_depth(disp, 0.1, 100) pred_depth = 
pred_depth.cpu()[:, 0].numpy() pred_depths.append(pred_depth[0]) print(" Predict Depth {:d}".format(i)) # PREDICTION for pose input_image_1 = input_image_1.to(device) input_image_pose = torch.cat([input_image_0, input_image_1], 1) features_pose = pose_encoder(input_image_pose) features_pose = [features_pose] axisangle, translation = pose_decoder(features_pose) pred_pose = transformation_from_parameters(axisangle[:, 0], translation[:, 0]) pred_poses.append(pred_pose.cpu()[0].numpy()) print(" Predict Pose {:d}".format(i)) print(pred_pose) # WARPED image if RECONSTRUCTION: print(" Reconstruct image {:d}".format(i)) # pred_depth is a numpy array at this point; convert it back to a (1, 1, H, W) tensor on the device before backprojecting depth_tensor = torch.from_numpy(pred_depth).unsqueeze(1).to(device) cam_points = backproject_depth(depth_tensor, inv_K) pix_coords = project_3d(cam_points, K, pred_pose) reconstruct_image_0 = torch.nn.functional.grid_sample( input_image_1, pix_coords, padding_mode="border") print(" Saving reconstructed image...") reconstruct_image_0 = torch.nn.functional.interpolate( reconstruct_image_0, (original_height, original_width), mode="bilinear", align_corners=False) reconstruct_image_0_np = reconstruct_image_0.squeeze().cpu().numpy() reconstruct_image_0_np = (reconstruct_image_0_np * 255).astype( np.uint8) reconstruct_image_0_np = np.concatenate([ np.expand_dims(reconstruct_image_0_np[i], 2) for i in range(3) ], 2) im = Image.fromarray(reconstruct_image_0_np, mode='RGB') name_dest_im = os.path.join("kitti_data/01", "warped", "{:010d}_warped.jpg".format(i)) im.save(name_dest_im) print("...") np.save('kitti_data/pred_depth_01.npy', np.array(pred_depths)) np.save('kitti_data/pred_pose_01.npy', np.array(pred_poses)) print('-> Done!')
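# For reference, disp_to_depth used above follows the monodepth2 convention (a minimal
# sketch, assuming the stock implementation): the network's sigmoid disparity is mapped
# linearly between 1/max_depth and 1/min_depth and then inverted.
def disp_to_depth_sketch(disp, min_depth, max_depth):
    min_disp = 1.0 / max_depth
    max_disp = 1.0 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp  # disp is expected in [0, 1]
    depth = 1.0 / scaled_disp                              # depth in [min_depth, max_depth]
    return scaled_disp, depth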
def evaluate(opt): """Evaluate odometry on the KITTI dataset """ MIN_DEPTH = 1e-3 MAX_DEPTH = 80 K = np.array( [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) assert opt.eval_split == "odom_9" or opt.eval_split == "odom_10" or opt.eval_split == "odom_0", \ "eval_split should be either odom_9 or odom_10" sequence_id = int(opt.eval_split.split("_")[1]) filenames = readlines( os.path.join(os.path.dirname(__file__), "splits", "odom", "test_files_{:02d}.txt".format(sequence_id))) dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width, [0, 1], 4, is_train=False) dataloader = DataLoader(dataset, opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth") pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth") depth_encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") depth_decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2) pose_encoder.load_state_dict(torch.load(pose_encoder_path)) depth_encoder = networks.ResnetEncoder(opt.num_layers, False) depth_encoder_dict = torch.load(depth_encoder_path) model_dict = depth_encoder.state_dict() depth_encoder.load_state_dict( {k: v for k, v in depth_encoder_dict.items() if k in model_dict}) pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) depth_decoder = networks.DepthDecoder(depth_encoder.num_ch_enc) depth_decoder.load_state_dict(torch.load(depth_decoder_path)) pose_encoder.cuda() pose_encoder.eval() pose_decoder.cuda() pose_decoder.eval() depth_encoder.cuda() depth_encoder.eval() depth_decoder.cuda() depth_decoder.eval() pred_poses = [] pred_disps = [] print("-> Computing pose predictions") opt.frame_ids = [0, 1] # pose network only takes two frames as input with torch.no_grad(): for inputs in dataloader: input_color = inputs[("color", 0, 0)].cuda() depth_output = depth_decoder(depth_encoder(input_color)) pred_disp, _ = disp_to_depth(depth_output[("disp", 0)], opt.min_depth, opt.max_depth) pred_disp = pred_disp.cpu()[:, 0].numpy() pred_disps.append(pred_disp) for key, ipt in inputs.items(): inputs[key] = ipt.cuda() all_color_aug = torch.cat( [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1) features = [pose_encoder(all_color_aug)] axisangle, translation = pose_decoder(features) pred_poses.append( transformation_from_parameters(axisangle[:, 0], translation[:, 0]).cpu().numpy()) pred_poses = np.concatenate(pred_poses) pred_disps = np.concatenate(pred_disps) pred_poses_scaled = [] ratios_d = [] gt_norms_div = [] gt_norms = [] pred_norms = [] td_divs_dgc = [] poses_pred = [] for i in range(pred_poses.shape[0]): pred_pose = pred_poses[i] pred_disp = pred_disps[i + 1] pred_depth = 1 / pred_disp scale_recovery = ScaleRecovery(1, 192, 640, K).cuda() pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).cuda() ratio = scale_recovery(pred_depth).cpu().item() pred_pose_scaled = pred_pose[:3, 3] * ratio poses_pred.append(pred_pose[:3, 3]) pred_poses_scaled.append(pred_pose_scaled) ratios_d.append(ratio) gt_poses_path = os.path.join(opt.data_path, "poses", "{:02d}.txt".format(sequence_id)) gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4) gt_global_poses = np.concatenate( (gt_global_poses, 
np.zeros((gt_global_poses.shape[0], 1, 4))), 1) gt_global_poses[:, 3, 3] = 1 gt_xyzs = gt_global_poses[:, :3, 3] gt_local_poses = [] for i in range(1, len(gt_global_poses)): gt_local_poses.append( np.linalg.inv( np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i]))) ates = [] num_frames = gt_xyzs.shape[0] track_length = 5 for i in range(0, num_frames - 1): local_xyzs = np.array( dump_xyz(pred_poses_scaled[i:i + track_length - 1])) gt_local_xyzs = np.array( dump_xyz(gt_local_poses[i:i + track_length - 1])) gt_norm_div = np.linalg.norm(gt_local_xyzs) / np.linalg.norm( local_xyzs) ates.append(compute_ate(gt_local_xyzs, local_xyzs)) gt_norms_div.append(gt_norm_div) gt_norms.append(np.linalg.norm(gt_local_xyzs)) print("\n Trajectory error: {:0.3f}, std: {:0.3f}\n".format( np.mean(ates), np.std(ates))) save_path = os.path.join(os.path.dirname(__file__), "poses_scaled{:02d}.npy".format(sequence_id)) np.save(save_path, pred_poses_scaled) save_path = os.path.join(os.path.dirname(__file__), "poses_gt{:02d}.npy".format(sequence_id)) np.save(save_path, gt_xyzs) save_path = os.path.join(os.path.dirname(__file__), "poses_pred{:02d}.npy".format(sequence_id)) np.save(save_path, poses_pred) save_path = os.path.join(os.path.dirname(__file__), "gt_norms{:02d}.npy".format(sequence_id)) np.save(save_path, gt_norms) save_path = os.path.join(os.path.dirname(__file__), "gt_norms_div{:02d}.npy".format(sequence_id)) np.save(save_path, gt_norms_div) save_path = os.path.join(os.path.dirname(__file__), "ratios_d{:02d}.npy".format(sequence_id)) np.save(save_path, ratios_d) save_path = os.path.join(os.path.dirname(__file__), "pred_norms{:02d}.npy".format(sequence_id)) np.save(save_path, pred_norms) print("-> Predictions saved to", save_path)
def evaluate(opt): """Evaluate odometry on the AirSim dataset """ MIN_DEPTH = 1e-3 MAX_DEPTH = 80 K = np.array( [[0.5, 0, 0.5, 0], [0, 1.656, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) filenames = readlines( os.path.join(os.path.dirname(__file__), "splits", opt.eval_split, "test_files.txt")) dataset = AirSimDataset(opt.data_path, filenames, opt.height, opt.width, [0, 1], 4, is_train=False) dataloader = DataLoader(dataset, opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth") pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth") depth_encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") depth_decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2) pose_encoder.load_state_dict(torch.load(pose_encoder_path)) depth_encoder = networks.ResnetEncoder(opt.num_layers, False) depth_encoder_dict = torch.load(depth_encoder_path) model_dict = depth_encoder.state_dict() depth_encoder.load_state_dict( {k: v for k, v in depth_encoder_dict.items() if k in model_dict}) pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) depth_decoder = networks.DepthDecoder(depth_encoder.num_ch_enc) depth_decoder.load_state_dict(torch.load(depth_decoder_path)) pose_encoder.cuda() pose_encoder.eval() pose_decoder.cuda() pose_decoder.eval() depth_encoder.cuda() depth_encoder.eval() depth_decoder.cuda() depth_decoder.eval() pred_poses = [] pred_disps = [] print("-> Computing pose predictions") opt.frame_ids = [0, 1] # pose network only takes two frames as input with torch.no_grad(): for inputs in dataloader: input_color = inputs[("color", 0, 0)].cuda() depth_output = depth_decoder(depth_encoder(input_color)) pred_disp, _ = disp_to_depth(depth_output[("disp", 0)], opt.min_depth, opt.max_depth) pred_disp = pred_disp.cpu()[:, 0].numpy() pred_disps.append(pred_disp) for key, ipt in inputs.items(): inputs[key] = ipt.cuda() all_color_aug = torch.cat( [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1) features = [pose_encoder(all_color_aug)] axisangle, translation = pose_decoder(features) pred_poses.append( transformation_from_parameters(axisangle[:, 0], translation[:, 0]).cpu().numpy()) pred_poses = np.concatenate(pred_poses) pred_disps = np.concatenate(pred_disps) gt_norms_div = [] gt_norms = [] pred_norms = [] trans_pred = pred_poses[:, :3, 3] gt_poses_path = os.path.join(opt.data_path, "poses.txt") gt_local_poses = read_pose(gt_poses_path) num_frames = gt_local_poses.shape[0] for i in range(num_frames): local_xyzs = pred_poses[i, :3, 3] gt_local_xyzs = gt_local_poses[i, :3, 3] gt_norm_div = np.linalg.norm(gt_local_xyzs) / np.linalg.norm( local_xyzs) gt_norms_div.append(gt_norm_div) save_path = os.path.join(os.path.dirname(__file__), "gt_norms_div_AirSim.npy") np.save(save_path, gt_norms_div) print("-> Predictions saved to", save_path)
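# Illustrative sketch (assumption): the per-frame norm ratios saved above can be used to
# put the predicted translations on a metric scale, e.g. with one robust global factor:
import numpy as np

def rescale_translations(pred_poses, gt_norms_div):
    """pred_poses: (N, 4, 4) relative poses; gt_norms_div: per-frame ||t_gt|| / ||t_pred||."""
    scale = np.median(gt_norms_div)   # single scale for the whole sequence
    scaled = pred_poses.copy()
    scaled[:, :3, 3] *= scale
    return scaled, scale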
def main_with_masks(args): """Function to predict for a single image or folder of images """ print(args.dataset_path) if torch.cuda.is_available() and not args.no_cuda: device = torch.device("cuda") else: device = torch.device("cpu") out_path = Path(args.out_path) out_path.mkdir_p() dirs = {} for mask in args.results: dirs[mask] = (out_path / mask) (out_path / mask).mkdir_p() print('-> split:{}'.format(args.split)) print('-> save to {}'.format(args.out_path)) if args.split in ['custom', 'custom_lite', 'eigen', 'eigen_zhou']: feed_height = 192 feed_width = 640 min_depth = 0.1 max_depth = 80 full_height = 375 full_width = 1242 dataset = KITTIRAWDataset elif args.split in ["visdrone", "visdrone_lite"]: feed_width = 352 feed_height = 192 min_depth = 0.1 max_depth = 255 dataset = VSDataset elif args.split in ['mc', 'mc_lite']: feed_height = 288 feed_width = 384 min_depth = 0.1 max_depth = 255 dataset = MCDataset feed_height = 192 feed_width = 640 backproject_depth = BackprojectDepth(1, feed_height, feed_width).to(device) project_3d = Project3D(1, feed_height, feed_width) photometric_error = PhotometricError() txt_files = args.txt_files #data test_path = Path(args.wk_root) / "splits" / args.split / txt_files test_filenames = readlines(test_path) if args.as_name_sort: # sort filenames into sequence order test_filenames.sort() #check filenames: i = 0 for i, item in enumerate(test_filenames): #item = test_filenames[i] if args.split in ['eigen', 'custom', 'custom_lite', 'eigen_zhou']: dirname, frame, lr = test_filenames[i].split() files = (Path(args.dataset_path) / dirname / 'image_02/data').files() files.sort() min = int(files[0].stem) max = int(files[-1].stem) if int(frame) + args.frame_ids[0] <= min or int( frame) + args.frame_ids[-1] >= max: test_filenames[i] = '' if args.split in ['mc', 'mc_lite']: # already filtered when the split was built, but checked again here block, trajactory, color, frame = test_filenames[i].split('/') files = (Path(args.dataset_path) / block / trajactory / color).files() files.sort() min = int(files[0].stem) max = int(files[-1].stem) if int(frame) + args.frame_ids[0] <= min or int( frame) + args.frame_ids[-1] >= max: test_filenames[i] = '' pass if args.split in ['visdrone', 'visdrone_lite']: # already filtered when the split was built, but checked again here dirname, frame = test_filenames[i].split('/') files = (Path(args.dataset_path) / dirname).files() files.sort() min = int(files[0].stem) max = int(files[-1].stem) if int(frame) + args.frame_ids[0] <= min or int( frame) + args.frame_ids[-1] >= max: test_filenames[i] = '' while '' in test_filenames: test_filenames.remove('') test_dataset = dataset( # KITTIRAWData args.dataset_path, test_filenames, feed_height, feed_width, args.frame_ids, 1, is_train=False, img_ext=args.ext) test_loader = DataLoader( # train_datasets:KITTIRAWDataset dataset=test_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True, drop_last=False) print('->items num: {}'.format(len(test_loader))) #layers #download_model_if_doesnt_exist(args.model_path,args.model_name) model_path = Path(args.model_path) / args.model_name if not model_path.exists(): print(model_path + " does not exist") print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") #1 LOADING PRETRAINED MODEL #1.1 encoder print(" Loading pretrained encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width =
loaded_dict_enc['width'] filtered_dict_enc = { k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict() } encoder.load_state_dict(filtered_dict_enc) encoder.to(device) encoder.eval() #1.2 decoder print(" Loading pretrained decoder") depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4)) loaded_dict = torch.load(depth_decoder_path, map_location=device) depth_decoder.load_state_dict(loaded_dict) depth_decoder.to(device) depth_decoder.eval() #paths pose_encoder_path = Path(model_path) / "pose_encoder.pth" pose_decoder_path = Path(model_path) / 'pose.pth' # 2.1 pose encoder print(" Loading pretrained pose encoder") pose_encoder = networks.ResnetEncoder(18, False, 2) pose_encoder.load_state_dict(torch.load(pose_encoder_path)) pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) pose_decoder.load_state_dict(torch.load(pose_decoder_path)) pose_encoder.to(device) pose_encoder.eval() # 2.2 pose decoder print(" Loading pretrained decoder") pose_decoder = networks.PoseDecoder(num_ch_enc=pose_encoder.num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) pose_loaded_dict = torch.load(pose_decoder_path, map_location=device) pose_decoder.load_state_dict(pose_loaded_dict) pose_decoder.to(device) pose_decoder.eval() source_scale = 0 scale = 0 for batch_idx, inputs in tqdm(enumerate(test_loader)): for key, ipt in inputs.items(): inputs[key] = ipt.to(device) features = encoder(inputs[("color", 0, 0)]) # a list from 0 to 4 outputs = depth_decoder(features) # dict , 4 disptensor disp = outputs[("disp", 0)] # has a same size with input #disp_resized = torch.nn.functional.interpolate(disp, (full_height, full_width), mode="bilinear", align_corners=False) _, depth = disp_to_depth(disp, min_depth, max_depth) for f_i in [args.frame_ids[0], args.frame_ids[-1]]: if f_i < 0: pose_inputs = [ inputs[("color", f_i, 0)], inputs[("color", 0, 0)] ] else: pose_inputs = [ inputs[("color", 0, 0)], inputs[("color", f_i, 0)] ] pose_inputs = torch.cat(pose_inputs, 1) features = pose_encoder(pose_inputs) axisangle, translation = pose_decoder([features]) outputs[("cam_T_cam", 0, f_i)] = transformation_from_parameters( axisangle[:, 0], translation[:, 0], invert=(f_i < 0)) # b44 T = outputs[("cam_T_cam", 0, f_i)] cam_points = backproject_depth(depth, inputs[("inv_K", 0)]) # D@K_inv pix_coords = project_3d(cam_points, inputs[("K", 0)], T) # K@D@K_inv outputs[("sample", f_i, 0)] = pix_coords # rigid_flow outputs[("color", f_i, 0)] = F.grid_sample(inputs[("color", f_i, 0)], outputs[("sample", f_i, 0)], padding_mode="border") # output"color" 就是i-warped # add a depth warp outputs[("color_identity", f_i, 0)] = inputs[("color", f_i, 0)] target = inputs[("color", 0, 0)] reprojection_losses = [] for frame_id in [args.frame_ids[0], args.frame_ids[-1]]: pred = outputs[("color", frame_id, 0)] reprojection_losses.append(photometric_error.run(pred, target)) reprojection_losses = torch.cat(reprojection_losses, 1) identity_reprojection_losses = [] for frame_id in [args.frame_ids[0], args.frame_ids[-1]]: pred = inputs[("color", frame_id, source_scale)] identity_reprojection_losses.append( photometric_error.run(pred, target)) identity_reprojection_losses = torch.cat(identity_reprojection_losses, 1) erro_maps = torch.cat( (identity_reprojection_losses, reprojection_losses), dim=1) # b4hw identical_mask = IdenticalMask(erro_maps) identical_mask = identical_mask[0].detach().cpu().numpy() save_name = test_filenames[batch_idx].replace('/', '_') save_name = save_name.replace('l', '') 
save_name = save_name.replace('r', '') save_name = save_name.replace(' ', '') if "identical_mask" in args.results: plt.imsave(dirs['identical_mask'] / "{}.png".format(save_name), identical_mask) if "depth" in args.results: # Saving colormapped depth image disp_np = disp[0, 0].detach().cpu().numpy() vmax = np.percentile(disp_np, 95) plt.imsave(dirs['depth'] / "{}.png".format(save_name), disp_np, cmap='magma', vmax=vmax) if "mean_mask" in args.results: mean_mask = MeanMask(erro_maps) mean_mask = mean_mask[0].detach().cpu().numpy() plt.imsave(dirs['mean_mask'] / "{}.png".format(save_name), mean_mask, cmap='bone') if "identical_mask" in args.results: identical_mask = IdenticalMask(erro_maps) identical_mask = identical_mask[0].detach().cpu().numpy() plt.imsave(dirs['identical_mask'] / "{}.png".format(save_name), identical_mask, cmap='bone') if "var_mask" in args.results: var_mask = VarMask(erro_maps) var_mask = var_mask[0].detach().cpu().numpy() plt.imsave(dirs["var_mask"] / "{}.png".format(save_name), var_mask, cmap='bone') if "final_mask" in args.results: identical_mask = IdenticalMask(erro_maps) mean_mask = MeanMask(erro_maps) var_mask = VarMask(erro_maps) final_mask = float8or(mean_mask * identical_mask, var_mask) final_mask = final_mask[0].detach().cpu().numpy() plt.imsave(dirs["final_mask"] / "{}.png".format(save_name), final_mask, cmap='bone')
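# Illustrative sketch (assumption): float8or above is taken to act as an element-wise OR
# over {0, 1} float masks. A minimal stand-in with that behaviour; the actual helper in
# this codebase may be implemented differently.
import torch

def float_or(a, b):
    """Element-wise OR for float masks containing 0/1 values."""
    return torch.clamp(a + b, max=1.0)

# usage, mirroring the final_mask composition above:
# final_mask = float_or(mean_mask * identical_mask, var_mask)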
def __init__(self, options): self.opt = options self.refine = options.refine or options.inv_refine self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) self.crop_mode = options.crop_mode # checking height and width are multiples of 32 assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" self.models = {} self.parameters_to_train = [] self.parameters_to_train_refine = [] self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") self.num_scales = len(self.opt.scales) self.num_input_frames = len(self.opt.frame_ids) self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0]) if self.opt.use_stereo: self.opt.frame_ids.append("s") if self.refine: self.refine_stage = list(range(options.refine_stage)) if len(self.refine_stage) > 4: self.crop_h = [96, 128, 160, 192, 192] self.crop_w = [192, 256, 384, 448, 640] else: self.crop_h = [96, 128, 160, 192] self.crop_w = [192, 256, 384, 640] if self.opt.refine_model == 's': self.models["mid_refine"] = networks.Simple_Propagate( self.crop_h, self.crop_w, self.crop_mode) elif self.opt.refine_model == 'i': self.models["mid_refine"] = networks.Iterative_Propagate_old( self.crop_h, self.crop_w, self.crop_mode) for param in self.models["mid_refine"].parameters(): param.requeires_grad = False self.models["mid_refine"].to(self.device) self.models["encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=1) self.models["encoder"].to(self.device) self.parameters_to_train += list(self.models["encoder"].parameters()) self.models["depth"] = networks.DepthDecoder( self.models["encoder"].num_ch_enc, self.opt.scales, refine=self.refine) self.models["depth"].to(self.device) self.parameters_to_train += list(self.models["depth"].parameters()) if self.use_pose_net: if self.opt.pose_model_type == "separate_resnet": self.models["pose_encoder"] = networks.ResnetEncoder( self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=self.num_pose_frames) self.models["pose_encoder"].to(self.device) self.parameters_to_train += list( self.models["pose_encoder"].parameters()) self.models["pose"] = networks.PoseDecoder( self.models["pose_encoder"].num_ch_enc, num_input_features=1, num_frames_to_predict_for=2) elif self.opt.pose_model_type == "shared": self.models["pose"] = networks.PoseDecoder( self.models["encoder"].num_ch_enc, self.num_pose_frames) elif self.opt.pose_model_type == "posecnn": self.models["pose"] = networks.PoseCNN( self.num_input_frames if self.opt.pose_model_input == "all" else 2) self.models["pose"].to(self.device) self.parameters_to_train += list(self.models["pose"].parameters()) parameters_to_train = self.parameters_to_train self.model_optimizer = optim.Adam(parameters_to_train, self.opt.learning_rate) self.model_lr_scheduler = optim.lr_scheduler.StepLR( self.model_optimizer, self.opt.scheduler_step_size, 0.1) if self.opt.load_weights_folder is not None: self.load_model() if self.refine: self.models["encoder_nograd"] = copy.deepcopy( self.models["encoder"]) for param in self.models["encoder_nograd"].parameters(): param.requeires_grad = False self.models["encoder_nograd"].to(self.device) self.models["depth_nograd"] = copy.deepcopy(self.models["depth"]) for param in self.models["depth_nograd"].parameters(): 
param.requeires_grad = False self.models["depth_nograd"].to(self.device) print("Training model named:\n ", self.opt.model_name) print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) print("Training is using:\n ", self.device) # data datasets_dict = { "kitti": datasets.KITTIRAWDataset, "kitti_odom": datasets.KITTIOdomDataset, "kitti_depth": datasets.KITTIDepthDataset } self.dataset = datasets_dict[self.opt.dataset] fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files_p.txt") train_filenames = readlines(fpath.format("train")) val_filenames = readlines(fpath.format("val")) img_ext = '.png' if self.opt.png else '.jpg' num_train_samples = len(train_filenames) self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs train_dataset = self.dataset(self.opt.data_path, train_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=True, img_ext=img_ext, refine=False, crop_mode=self.crop_mode) self.train_loader = DataLoader(train_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) val_dataset = self.dataset(self.opt.data_path, val_filenames, self.opt.height, self.opt.width, self.opt.frame_ids, 4, is_train=False, img_ext=img_ext, refine=False, crop_mode=self.crop_mode) self.val_loader = DataLoader(val_dataset, self.opt.batch_size, True, num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) self.val_iter = iter(self.val_loader) self.writers = {} for mode in ["train", "val"]: self.writers[mode] = SummaryWriter( os.path.join(self.log_path, mode)) if not self.opt.no_ssim: self.ssim = SSIM() self.ssim.to(self.device) self.backproject_depth = {} self.project_3d = {} for scale in self.opt.scales: h = self.opt.height // (2**scale) w = self.opt.width // (2**scale) self.backproject_depth[scale] = BackprojectDepth( self.opt.batch_size, h, w) self.backproject_depth[scale].to(self.device) self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) self.project_3d[scale].to(self.device) self.depth_metric_names = [ "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3" ] print("Using split:\n ", self.opt.split) print( "There are {:d} training items and {:d} validation items\n".format( len(train_dataset), len(val_dataset))) self.save_opts()
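# Illustrative sketch (assumption): the encoder_nograd / depth_nograd copies above are
# meant to be frozen snapshots used during refinement. The usual PyTorch idiom for
# creating such a frozen copy is:
import copy
import torch.nn as nn

def make_frozen_copy(module: nn.Module) -> nn.Module:
    frozen = copy.deepcopy(module)
    for param in frozen.parameters():
        param.requires_grad = False   # exclude the copy from autograd
    return frozen.eval()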
def __init__(self, options):
    self.opt = options
    self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

    # checking height and width are multiples of 32
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    self.models = {}
    self.toolLayers = {}
    self.parameters_to_train = []

    self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

    self.num_scales = len(self.opt.scales)
    self.num_input_frames = len(self.opt.frame_ids)
    self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

    if self.opt.use_stereo:
        self.opt.frame_ids.append("s")

    self.models["encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, self.opt.weights_init == "pretrained")
    self.models["encoder"].to(self.device)
    self.parameters_to_train += list(self.models["encoder"].parameters())

    self.models["depth"] = networks.DepthDecoder(
        self.models["encoder"].num_ch_enc, self.opt.scales)
    self.models["depth"].to(self.device)
    self.parameters_to_train += list(self.models["depth"].parameters())

    if self.use_pose_net:
        if self.opt.pose_model_type == "separate_resnet":
            self.models["pose_encoder"] = networks.ResnetEncoder(
                self.opt.num_layers,
                self.opt.weights_init == "pretrained",
                num_input_images=self.num_pose_frames)
            self.models["pose_encoder"].to(self.device)
            self.parameters_to_train += list(
                self.models["pose_encoder"].parameters())
            self.models["pose"] = networks.PoseDecoder(
                self.models["pose_encoder"].num_ch_enc,
                num_input_features=1,
                num_frames_to_predict_for=2)
        elif self.opt.pose_model_type == "shared":
            self.models["pose"] = networks.PoseDecoder(
                self.models["encoder"].num_ch_enc, self.num_pose_frames)
        elif self.opt.pose_model_type == "posecnn":
            self.models["pose"] = networks.PoseCNN(
                self.num_input_frames if self.opt.pose_model_input == "all" else 2)
        self.models["pose"].to(self.device)
        self.parameters_to_train += list(self.models["pose"].parameters())

    if self.opt.predictive_mask:
        # Our implementation of the predictive masking baseline has the same architecture
        # as our depth decoder. We predict a separate mask for each source frame.
        self.models["predictive_mask"] = networks.DepthDecoder(
            self.models["encoder"].num_ch_enc, self.opt.scales,
            num_output_channels=(len(self.opt.frame_ids) - 1))
        self.models["predictive_mask"].to(self.device)
        self.parameters_to_train += list(
            self.models["predictive_mask"].parameters())

    # semantic class ids treated as foreground (Cityscapes trainId convention)
    self.foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18]

    if self.opt.stereo_mask:
        self.toolLayers['compute_stereo_mask'] = StereoMask().cuda()

    if self.opt.typeWiseRegularization:
        self.toolLayers['compsurfnorm'] = ComputeSurfaceNormal(
            height=self.opt.height, width=self.opt.width,
            batch_size=self.opt.batch_size).cuda()
        self.toolLayers['typeWReg'] = TypeWiseRegularization().cuda()
        self.wallType = [2, 3, 4]  # Building, wall, fence
        self.roadType = [0, 1, 9]  # road, sidewalk, terrain
        self.permuType = [5, 7]  # Pole, traffic sign
        self.skyType = 10
        self.chanWinSize = 5

    if self.opt.borderWiseRegularization:
        self.wallType = [2, 3, 4]  # Building, wall, fence
        self.roadType = [0, 1, 9]  # road, sidewalk, terrain
        self.toolLayers['borderWiseReg'] = BorderWiseRegularization(
            batchNum=self.opt.batch_size, width=self.opt.width,
            height=self.opt.height).cuda()

    self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate)
    self.model_lr_scheduler = optim.lr_scheduler.StepLR(
        self.model_optimizer, self.opt.scheduler_step_size, 0.1)

    if self.opt.load_weights_folder is not None:
        self.load_model()

    print("Training model named:\n ", self.opt.model_name)
    print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir)
    print("Training is using:\n ", self.device)

    # data
    datasets_dict = {
        "kitti": datasets.KITTIRAWDataset,
        "kitti_odom": datasets.KITTIOdomDataset
    }
    self.dataset = datasets_dict[self.opt.dataset]

    fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt")
    train_filenames = readlines(fpath.format("train"))
    val_filenames = readlines(fpath.format("val"))
    img_ext = '.png'

    num_train_samples = len(train_filenames)
    self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

    train_dataset = self.dataset(
        self.opt.data_path, train_filenames, self.opt.height, self.opt.width,
        self.opt.frame_ids, 4, is_train=True, img_ext=img_ext)
    self.train_loader = DataLoader(
        train_dataset, self.opt.batch_size, True,
        num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)
    val_dataset = self.dataset(
        self.opt.data_path, val_filenames, self.opt.height, self.opt.width,
        self.opt.frame_ids, 4, is_train=False, img_ext=img_ext)
    self.val_loader = DataLoader(
        val_dataset, self.opt.batch_size, True,
        num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)
    self.val_iter = iter(self.val_loader)

    self.writers = {}
    for mode in ["train", "val"]:
        self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode))

    if not self.opt.no_ssim:
        self.ssim = SSIM()
        self.ssim.to(self.device)

    self.backproject_depth = {}
    self.project_3d = {}
    for scale in self.opt.scales:
        h = self.opt.height // (2 ** scale)
        w = self.opt.width // (2 ** scale)
        self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w)
        self.backproject_depth[scale].to(self.device)
        self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
        self.project_3d[scale].to(self.device)

    self.depth_metric_names = [
        "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms",
        "da/a1", "da/a2", "da/a3"]

    print("Using split:\n ", self.opt.split)
    print("There are {:d} training items and {:d} validation items\n".format(
        len(train_dataset), len(val_dataset)))

    self.save_opts()
def evaluate(opt):
    """Evaluate odometry on the KITTI dataset
    """
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)

    assert opt.eval_split == "odom_9" or opt.eval_split == "odom_10", \
        "eval_split should be either odom_9 or odom_10"

    sequence_id = int(opt.eval_split.split("_")[1])

    filenames = readlines(
        os.path.join(os.path.dirname(__file__), "splits", "odom",
                     "test_files_{:02d}.txt".format(sequence_id)))

    dataset = KITTIOdomDataset(opt.data_path, filenames, opt.height, opt.width,
                               [0, 1], 4, is_train=False)
    dataloader = DataLoader(dataset, opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True,
                            drop_last=False)

    pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth")
    pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth")

    pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()

    pred_poses = []

    print("-> Computing pose predictions")

    opt.frame_ids = [0, 1]  # pose network only takes two frames as input

    with torch.no_grad():
        for inputs in dataloader:
            for key, ipt in inputs.items():
                inputs[key] = ipt.cuda()

            all_color_aug = torch.cat(
                [inputs[("color_aug", i, 0)] for i in opt.frame_ids], 1)

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)

            pred_poses.append(
                transformation_from_parameters(
                    axisangle[:, 0], translation[:, 0]).cpu().numpy())

    pred_poses = np.concatenate(pred_poses)

    gt_poses_path = os.path.join(opt.data_path, "poses", "{:02d}.txt".format(sequence_id))
    gt_global_poses = np.loadtxt(gt_poses_path).reshape(-1, 3, 4)
    gt_global_poses = np.concatenate(
        (gt_global_poses, np.zeros((gt_global_poses.shape[0], 1, 4))), 1)
    gt_global_poses[:, 3, 3] = 1
    gt_xyzs = gt_global_poses[:, :3, 3]

    gt_local_poses = []
    for i in range(1, len(gt_global_poses)):
        gt_local_poses.append(
            np.linalg.inv(np.dot(np.linalg.inv(gt_global_poses[i - 1]), gt_global_poses[i])))

    ates = []
    num_frames = gt_xyzs.shape[0]
    track_length = 5
    for i in range(0, num_frames - 1):
        local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1]))
        gt_local_xyzs = np.array(dump_xyz(gt_local_poses[i:i + track_length - 1]))
        ates.append(compute_ate(gt_local_xyzs, local_xyzs))

    print("\n Trajectory error: {:0.3f}, std: {:0.3f}\n".format(np.mean(ates), np.std(ates)))

    save_path = os.path.join(opt.load_weights_folder, "poses.npy")
    np.save(save_path, pred_poses)
    print("-> Predictions saved to", save_path)
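# evaluate() above calls dump_xyz() and compute_ate(), which are not defined in
# this excerpt. The sketch below follows the monodepth2-style odometry evaluation
# (align the first frame by offset, then fit a least-squares scale); if this
# repository ships its own versions, those take precedence.
import numpy as np


def dump_xyz(source_to_target_transformations):
    """Chain relative 4x4 poses and return the resulting camera positions."""
    xyzs = []
    cam_to_world = np.eye(4)
    xyzs.append(cam_to_world[:3, 3])
    for source_to_target in source_to_target_transformations:
        cam_to_world = np.dot(cam_to_world, source_to_target)
        xyzs.append(cam_to_world[:3, 3])
    return xyzs


def compute_ate(gtruth_xyz, pred_xyz_o):
    """Absolute trajectory error (RMSE) over a short snippet after offset + scale alignment."""
    offset = gtruth_xyz[0] - pred_xyz_o[0]
    pred_xyz = pred_xyz_o + offset[None, :]
    # least-squares scale factor between prediction and ground truth
    scale = np.sum(gtruth_xyz * pred_xyz) / np.sum(pred_xyz ** 2)
    alignment_error = pred_xyz * scale - gtruth_xyz
    return np.sqrt(np.sum(alignment_error ** 2)) / gtruth_xyz.shape[0]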
def __init__(self, options):
    self.opt = options
    self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name)

    # checking height and width are multiples of 32
    assert self.opt.height % 32 == 0, "'height' must be a multiple of 32"
    assert self.opt.width % 32 == 0, "'width' must be a multiple of 32"

    self.models = {}
    self.parameters_to_train = []

    self.device = torch.device("cpu" if self.opt.no_cuda else "cuda")

    self.num_scales = len(self.opt.scales)
    self.num_input_frames = len(self.opt.frame_ids)
    self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames

    assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0"

    self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0])

    self.models["encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, self.opt.weights_init == "pretrained")
    self.models["encoder"].to(self.device)
    self.parameters_to_train += list(self.models["encoder"].parameters())

    self.models["depth"] = networks.DepthDecoder(
        self.models["encoder"].num_ch_enc, self.opt.scales)
    self.models["depth"].to(self.device)
    self.parameters_to_train += list(self.models["depth"].parameters())

    self.models["pose_encoder"] = networks.ResnetEncoder(
        self.opt.num_layers, self.opt.weights_init == "pretrained",
        num_input_images=self.num_pose_frames)
    self.models["pose_encoder"].to(self.device)
    self.parameters_to_train += list(self.models["pose_encoder"].parameters())

    self.models["pose"] = networks.PoseDecoder(
        self.models["pose_encoder"].num_ch_enc,
        num_input_features=1,
        num_frames_to_predict_for=2)
    self.models["pose"].to(self.device)
    self.parameters_to_train += list(self.models["pose"].parameters())

    self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate)

    print("Training model named:\n ", self.opt.model_name)
    print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir)
    print("Training is using:\n ", self.device)

    # data
    datasets_dict = {"kitti": datasets.KITTIRAWDataset}
    self.dataset = datasets_dict[self.opt.dataset]

    # note: "subset.txt" contains no "{}" placeholder, so the format() calls below
    # are no-ops and the train and val lists both come from the same subset file
    fpath = os.path.join(os.path.dirname(__file__), "splits", "subset.txt")
    train_filenames = readlines(fpath.format("train"))
    val_filenames = readlines(fpath.format("val"))
    img_ext = '.png' if self.opt.png else '.jpg'

    num_train_samples = len(train_filenames)
    self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs

    # is_train=False here, so the dataset's random colour/flip augmentation is not applied
    train_dataset = self.dataset(
        self.opt.data_path, train_filenames, self.opt.height, self.opt.width,
        self.opt.frame_ids, 4, is_train=False, img_ext=img_ext)
    self.train_loader = DataLoader(
        train_dataset, self.opt.batch_size, True,
        num_workers=self.opt.num_workers, pin_memory=True, drop_last=True)

    self.ssim = SSIM()
    self.ssim.to(self.device)

    self.backproject_depth = {}
    self.project_3d = {}
    for scale in self.opt.scales:
        h = self.opt.height // (2 ** scale)
        w = self.opt.width // (2 ** scale)
        self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w)
        self.backproject_depth[scale].to(self.device)
        self.project_3d[scale] = Project3D(self.opt.batch_size, h, w)
        self.project_3d[scale].to(self.device)

    self.depth_metric_names = [
        "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms",
        "da/a1", "da/a2", "da/a3"]

    print("Using split:\n ", self.opt.split)
    print("There are {:d} training items\n".format(len(train_dataset)))
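# The subset trainer above builds SSIM, BackprojectDepth and Project3D, but the
# loss that uses them lies outside this excerpt. Below is a minimal sketch of the
# usual monodepth2-style photometric error; the 0.85 / 0.15 weighting follows the
# original monodepth2 recipe and is an assumption here, not something confirmed
# by this file.
import torch


def compute_reprojection_loss(ssim, pred, target):
    """Per-pixel photometric error: weighted mix of SSIM and L1, shape [B, 1, H, W]."""
    abs_diff = torch.abs(target - pred)
    l1_loss = abs_diff.mean(1, True)              # mean over channels, keep dim
    ssim_loss = ssim(pred, target).mean(1, True)  # SSIM module built in the trainer above
    return 0.85 * ssim_loss + 0.15 * l1_loss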