def get_inference_input_dict_ros(self, info, points):
    assert self.anchor_cache is not None
    assert self.target_assigner is not None
    assert self.voxel_generator is not None
    assert self.config is not None
    assert self.built is True
    rect = info['calib/R0_rect']
    P2 = info['calib/P2']
    Trv2c = info['calib/Tr_velo_to_cam']
    input_cfg = self.config.eval_input_reader
    model_cfg = self.config.model.second
    input_dict = {
        'points': points,
        'rect': rect,
        'Trv2c': Trv2c,
        'P2': P2,
        'image_shape': np.array(info["img_shape"], dtype=np.int32),
        # 'image_idx': info['image_idx'],
        # 'image_path': info['img_path'],
        # 'pointcloud_num_features': num_point_features,
    }
    out_size_factor = model_cfg.rpn.layer_strides[0] // model_cfg.rpn.upsample_strides[0]
    example = prep_pointcloud(
        input_dict=input_dict,
        root_path=str(self.root_path),
        voxel_generator=self.voxel_generator,
        target_assigner=self.target_assigner,
        max_voxels=input_cfg.max_number_of_voxels,
        class_names=self.target_assigner.classes,
        training=False,
        create_targets=False,
        shuffle_points=input_cfg.shuffle_points,
        generate_bev=False,
        without_reflectivity=model_cfg.without_reflectivity,
        num_point_features=model_cfg.num_point_features,
        anchor_area_threshold=input_cfg.anchor_area_threshold,
        anchor_cache=self.anchor_cache,
        out_size_factor=out_size_factor,
        out_dtype=np.float32)
    # example["image_idx"] = info['image_idx']
    example["image_shape"] = input_dict["image_shape"]
    example["points"] = points
    if "anchors_mask" in example:
        example["anchors_mask"] = example["anchors_mask"].astype(np.uint8)
    #############
    # convert example to batched example
    #############
    example = merge_second_batch([example])
    return example
def get_inference_input_dict(self, points):
    assert self.anchor_cache is not None
    assert self.target_assigner is not None
    assert self.voxel_generator is not None
    assert self.config is not None
    assert self.built is True
    input_cfg = self.config.eval_input_reader
    model_cfg = self.config.model.second
    input_dict = {
        'points': points,
    }
    out_size_factor = model_cfg.rpn.layer_strides[0] // model_cfg.rpn.upsample_strides[0]
    example = prep_pointcloud(
        input_dict=input_dict,
        root_path=str(self.root_path),
        voxel_generator=self.voxel_generator,
        target_assigner=self.target_assigner,
        max_voxels=input_cfg.max_number_of_voxels,
        class_names=list(input_cfg.class_names),
        training=False,
        create_targets=False,
        shuffle_points=input_cfg.shuffle_points,
        generate_bev=False,
        without_reflectivity=model_cfg.without_reflectivity,
        num_point_features=model_cfg.num_point_features,
        anchor_area_threshold=input_cfg.anchor_area_threshold,
        anchor_cache=self.anchor_cache,
        out_size_factor=out_size_factor,
        out_dtype=np.float32)
    # example["image_idx"] = info['image_idx']
    # example["image_shape"] = input_dict["image_shape"]
    example["points"] = points
    if "anchors_mask" in example:
        example["anchors_mask"] = example["anchors_mask"].astype(np.uint8)
    #############
    # convert example to batched example
    #############
    example = merge_second_batch([example])
    return example
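# Minimal usage sketch for get_inference_input_dict above. It assumes the
# enclosing inference object also exposes a built `net` attribute and that
# torch / example_convert_to_torch are available as in the rest of this code;
# the helper name and the `inference_ctx` / `device` parameters are
# illustrative, not part of the original module.
def run_inference(inference_ctx, points, device="cuda:0"):
    # build a batched example from raw lidar points and run the network once
    example = inference_ctx.get_inference_input_dict(points)
    example = example_convert_to_torch(example, device=device)
    with torch.no_grad():
        return inference_ctx.net(example)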
def test(config_path=args.config_path,
         model_dir=args.model_dir,
         result_path=None,
         create_folder=False,
         pickle_result=True,
         include_roadmap=False,
         device=1):
    """Evaluate a VoxelNet model specified by a config file."""
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)

    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    batch_size = 1
    class_names = list(input_cfg.class_names)

    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    grid_size = voxel_generator.grid_size

    ######################
    # BUILD TARGET ASSIGNER
    ######################
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)

    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               include_roadmap)
    net.cuda().eval()
    print("num trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)
    # torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    torchplus.train.restore(args.model_path, net)
    # torchplus.train.restore("./ped_models_56/voxelnet-275130.tckpt", net)

    out_size_factor = model_cfg.rpn.layer_strides[0] / model_cfg.rpn.upsample_strides[0]
    print(out_size_factor)
    # out_size_factor *= model_cfg.middle_feature_extractor.downsample_factor
    out_size_factor = int(out_size_factor)
    feature_map_size = grid_size[:2] // out_size_factor
    feature_map_size = [*feature_map_size, 1][::-1]
    print(feature_map_size)
    ret = target_assigner.generate_anchors(feature_map_size)
    # anchors_dict = target_assigner.generate_anchors_dict(feature_map_size)
    anchors = ret["anchors"]
    anchors = anchors.reshape([-1, 7])
    matched_thresholds = ret["matched_thresholds"]
    unmatched_thresholds = ret["unmatched_thresholds"]
    anchors_bv = box_np_ops.rbbox2d_to_near_bbox(anchors[:, [0, 1, 3, 4, 6]])
    anchor_cache = {
        "anchors": anchors,
        "anchors_bv": anchors_bv,
        "matched_thresholds": matched_thresholds,
        "unmatched_thresholds": unmatched_thresholds,
        # "anchors_dict": anchors_dict,
    }

    am = ArgoverseMap()
    dt_annos = []
    root_dir = os.path.join('./../../argodataset/argoverse-tracking/', args.set)
    argoverse_loader = ArgoverseTrackingLoader(root_dir)
    prog_cnt = 0
    for seq in range(len(argoverse_loader)):
        argoverse_data = argoverse_loader[seq]
        nlf = argoverse_data.num_lidar_frame
        for frame in range(nlf):
            prog_cnt += 1
            if prog_cnt % 50 == 0:
                print(prog_cnt)
            points = argoverse_data.get_lidar(frame)
            roi_pts = copy.deepcopy(points)
            city_name = argoverse_data.city_name
            city_to_egovehicle_se3 = argoverse_data.get_pose(frame)
            '''
            roi_pts = city_to_egovehicle_se3.transform_point_cloud(roi_pts)  # put into city coords
            # non roi
            roi_pts_flag = am.remove_non_roi_points(roi_pts, city_name)  # remove non-driveable region
            roi_pts = roi_pts[roi_pts_flag]
            roi_pts = am.remove_ground_surface(roi_pts, city_name)  # remove ground surface
            # convert city to lidar co-ordinates
            roi_pts = city_to_egovehicle_se3.inverse_transform_point_cloud(roi_pts)
            '''
            if args.include_roi or args.dr_area or not args.include_road_points:
                roi_pts = city_to_egovehicle_se3.transform_point_cloud(
                    roi_pts)  # put into city coords
            if args.include_roi:
                roi_pts_flag = am.remove_non_roi_points(
                    roi_pts, city_name)  # remove non-ROI region
                roi_pts = roi_pts[roi_pts_flag]
            if not args.include_roi and args.dr_area:
                roi_pts_flag = am.remove_non_driveable_area_points(
                    roi_pts, city_name)  # remove non-driveable region
                roi_pts = roi_pts[roi_pts_flag]
            if not args.include_road_points:
                roi_pts = am.remove_ground_surface(
                    roi_pts, city_name)  # remove ground surface
            # convert city back to lidar co-ordinates
            if args.include_roi or args.dr_area or not args.include_road_points:
                roi_pts = city_to_egovehicle_se3.inverse_transform_point_cloud(
                    roi_pts)
            roi_pts[:, 2] = roi_pts[:, 2] - 1.73
            pts_x, pts_y, pts_z = roi_pts[:, 0], roi_pts[:, 1], roi_pts[:, 2]
            input_dict = {
                'points': roi_pts,
                'pointcloud_num_features': 3,
            }
            out_size_factor = model_cfg.rpn.layer_strides[0] // model_cfg.rpn.upsample_strides[0]
            example = prep_pointcloud(
                input_dict=input_dict,
                root_path=None,
                voxel_generator=voxel_generator,
                target_assigner=target_assigner,
                max_voxels=input_cfg.max_number_of_voxels,
                class_names=list(input_cfg.class_names),
                training=False,
                create_targets=False,
                shuffle_points=input_cfg.shuffle_points,
                generate_bev=False,
                without_reflectivity=model_cfg.without_reflectivity,
                num_point_features=model_cfg.num_point_features,
                anchor_area_threshold=input_cfg.anchor_area_threshold,
                anchor_cache=anchor_cache,
                out_size_factor=out_size_factor,
                out_dtype=np.float32)
            if "anchors_mask" in example:
                example["anchors_mask"] = example["anchors_mask"].astype(np.uint8)
            example["image_idx"] = str(seq) + "_" + str(frame)
            example["image_shape"] = np.array([400, 400], dtype=np.int32)
            example["road_map"] = None
            example["include_roadmap"] = False
            example["points"] = roi_pts
            # torch.save(example, "./network_input_examples/" + info)
            example = merge_second_batch([example])
            example_torch = example_convert_to_torch(example, device=args.device)
            try:
                result_annos = predict_kitti_to_anno(
                    net, example_torch, input_cfg.class_names,
                    model_cfg.post_center_limit_range, model_cfg.lidar_input)
            except Exception:
                print(seq, frame)
                continue
            dt_annos += result_annos

    if pickle_result:
        sdi = args.save_path.rfind('/')
        save_dir = args.save_path[:sdi]
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        with open(args.save_path, 'wb') as f:
            pickle.dump(dt_annos, f)
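# Small companion sketch: reading back the detections that test() pickles to
# args.save_path. The helper name is illustrative; pass whatever path was used
# as save_path when the file was written.
def load_pickled_detections(save_path):
    import pickle
    with open(save_path, 'rb') as f:
        return pickle.load(f)  # list of per-frame KITTI-style annotation dicts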
def main(config_path,
         lc_horizon,
         num_examples,
         model_dir,
         ckpt_path=None,
         **kwargs):
    """pickle_result is no longer supported. If you want to generate KITTI label
    files, please use kitti_anno_to_label_file and convert_detection_to_kitti_annos
    in second.data.kitti_dataset.
    """
    assert len(kwargs) == 0
    model_dir = str(Path(model_dir).resolve())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if isinstance(config_path, str):
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        # directly provide a config object. this is usually used
        # when you want to eval with several different parameters in
        # one script.
        config = config_path

    input_cfg = config.eval_input_reader
    input_cfg.cum_lc_wrapper.lc_horizon = lc_horizon
    model_cfg = config.model.second
    train_cfg = config.train_config

    net = build_network(model_cfg, measure_time=False).to(device)
    if train_cfg.enable_mixed_precision:
        net.half()
        print("half inference!")
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator

    if ckpt_path is None:
        assert model_dir is not None
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    batch_size = 1
    eval_dataset = input_reader_builder.build(
        input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner,
        net=net)

    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    net.eval()
    t = time.time()
    detections = []
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start((len(eval_dataset) + batch_size - 1) // batch_size)
    prep_example_times = []
    prep_times = []
    t2 = time.time()

    times = []
    for scene_id in trange(num_examples):
        idx = eval_dataset.scene_id_and_step_to_idx(scene_id, lc_horizon)
        torch.cuda.synchronize()
        b_ex_time = time.time()
        example = eval_dataset[idx]
        example = merge_second_batch([example])
        example = example_convert_to_torch(example, float_dtype)
        with torch.no_grad():
            detections = net(example)
        torch.cuda.synchronize()
        e_ex_time = time.time()
        del example, detections
        times.append(e_ex_time - b_ex_time)

    times = np.array(times)
    mean = times.mean()
    interval = 1.96 * times.std() / np.sqrt(len(times))  # 95% confidence interval
    return mean, interval
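# Example invocation sketch for main() above (both paths are placeholders).
# The returned interval is the 95% confidence interval 1.96 * std / sqrt(n)
# computed inside main() over the per-example wall-clock times.
mean_t, ci95 = main(
    config_path="/path/to/pipeline.config",  # placeholder
    lc_horizon=0,
    num_examples=100,
    model_dir="/path/to/model_dir")          # placeholder
print("mean inference time: %.1f ms +/- %.1f ms (95%% CI)"
      % (mean_t * 1e3, ci95 * 1e3))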
def get_inference_input_dict(self, info, points):
    assert self.anchor_cache is not None
    assert self.target_assigner is not None
    assert self.voxel_generator is not None
    assert self.config is not None
    assert self.built is True
    kitti.convert_to_kitti_info_version2(info)
    pc_info = info["point_cloud"]
    image_info = info["image"]
    calib = info["calib"]
    rect = calib['R0_rect']
    Trv2c = calib['Tr_velo_to_cam']
    P2 = calib['P2']
    input_cfg = self.config.eval_input_reader
    model_cfg = self.config.model.second
    input_dict = {
        'points': points,
        "calib": {
            'rect': rect,
            'Trv2c': Trv2c,
            'P2': P2,
        },
        "image": {
            'image_shape': np.array(image_info["image_shape"], dtype=np.int32),
            'image_idx': image_info['image_idx'],
            'image_path': image_info['image_path'],
        },
    }
    out_size_factor = np.prod(model_cfg.rpn.layer_strides)
    if len(model_cfg.rpn.upsample_strides) > 0:
        out_size_factor /= model_cfg.rpn.upsample_strides[-1]
    out_size_factor *= model_cfg.middle_feature_extractor.downsample_factor
    out_size_factor = int(out_size_factor)
    example = prep_pointcloud(
        input_dict=input_dict,
        root_path=str(self.root_path),
        voxel_generator=self.voxel_generator,
        target_assigner=self.target_assigner,
        max_voxels=input_cfg.max_number_of_voxels,
        class_names=self.target_assigner.classes,
        training=False,
        create_targets=False,
        shuffle_points=input_cfg.shuffle_points,
        generate_bev=False,
        without_reflectivity=model_cfg.without_reflectivity,
        num_point_features=model_cfg.num_point_features,
        anchor_area_threshold=input_cfg.anchor_area_threshold,
        anchor_cache=self.anchor_cache,
        out_size_factor=out_size_factor,
        out_dtype=np.float32)
    example["metadata"] = {}
    if "image" in info:
        example["metadata"]["image"] = input_dict["image"]
    if "anchors_mask" in example:
        example["anchors_mask"] = example["anchors_mask"].astype(np.uint8)
    #############
    # convert example to batched example
    #############
    example = merge_second_batch([example])
    return example
def get_inference_input_dict(self, info, points):
    # assert self.anchor_cache is not None
    # assert self.target_assigner is not None
    # assert self.voxel_generator is not None
    assert self.fv_generator is not None
    assert self.config is not None
    assert self.built is True
    rect = info['calib/R0_rect']
    P2 = info['calib/P2']
    Trv2c = info['calib/Tr_velo_to_cam']
    input_cfg = self.config.eval_input_reader
    model_cfg = self.config.model.second
    root_path = '/home/js/data/KITTI/object'
    input_dict = {
        'points': points,
        'rect': rect,
        'Trv2c': Trv2c,
        'P2': P2,
        'image_shape': np.array(info["img_shape"], dtype=np.int32),
        'image_idx': info['image_idx'],
        'image_path': root_path + '/' + info['img_path'],
        # 'pointcloud_num_features': num_point_features,
    }
    if 'annos' in info:
        annos = info['annos']
        # we need other objects to avoid collision when sampling
        annos = kitti.remove_dontcare(annos)
        loc = annos["location"]
        dims = annos["dimensions"]
        rots = annos["rotation_y"]
        # alpha = annos["alpha"]
        gt_names = annos["name"]
        # print(gt_names, len(loc))
        gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
                                  axis=1).astype(np.float32)
        # gt_boxes = np.concatenate(
        #     [loc, dims, alpha[..., np.newaxis]], axis=1).astype(np.float32)
        # gt_boxes = box_np_ops.box_camera_to_lidar(gt_boxes, rect, Trv2c)
        difficulty = annos["difficulty"]
        input_dict.update({
            'gt_boxes': gt_boxes,
            'gt_names': gt_names,
            'difficulty': difficulty,
        })
        if 'group_ids' in annos:
            input_dict['group_ids'] = annos["group_ids"]
    out_size_factor = model_cfg.rpn.layer_strides[0] // model_cfg.rpn.upsample_strides[0]
    print("RGB_embedding: ", self.RGB_embedding)
    example = prep_pointcloud(
        input_dict=input_dict,
        root_path=str(self.root_path),
        # voxel_generator=self.voxel_generator,
        fv_generator=self.fv_generator,
        target_assigner=self.target_assigner,
        max_voxels=input_cfg.max_number_of_voxels,
        class_names=list(input_cfg.class_names),
        training=False,
        create_targets=False,
        shuffle_points=input_cfg.shuffle_points,
        generate_bev=False,
        remove_outside_points=False,
        without_reflectivity=model_cfg.without_reflectivity,
        num_point_features=model_cfg.num_point_features,
        anchor_area_threshold=input_cfg.anchor_area_threshold,
        anchor_cache=self.anchor_cache,
        out_size_factor=out_size_factor,
        out_dtype=np.float32,
        num_classes=model_cfg.num_class,
        RGB_embedding=self.RGB_embedding)
    example["image_idx"] = info['image_idx']
    example["image_shape"] = input_dict["image_shape"]
    example["points"] = points
    if "anchors_mask" in example:
        example["anchors_mask"] = example["anchors_mask"].astype(np.uint8)
    #############
    # convert example to batched example
    #############
    example = merge_second_batch([example])
    return example
def slicing_forward(self, dataset_iter, deadline):
    self.measure_time_start('Pre-stage-1', False)
    self.measure_time_start('PFE')
    # self.measure_time_start('PillarGen')
    try:
        if self._repeat_example:
            example = merge_second_batch([
                self._data_loader.dataset[self._repeat_example_idx]
            ])  # id 19
        else:
            example = next(dataset_iter)
    except StopIteration:
        print("Woaaaah, that is unexpected! Check dataset iter!")
        return None, None, None
    num_voxels = example["num_voxels"][0][0]  # batch size 1
    example = example_convert_to_torch(example, self._float_dtype)
    # self.measure_time_end('PillarGen')
    torch.backends.cudnn.benchmark = False
    io_dict = self._net.forward_pfn(example)
    torch.backends.cudnn.benchmark = self._cudnn_benchmarking
    self.measure_time_end('PFE')

    # with torch.cuda.stream(self._other_cuda_stream):
    # Calculate anchor mask
    stg0_sum = torch.sum(io_dict['stage0'], 1, keepdim=True)
    sum_mask = torch.nn.functional.max_pool2d(
        stg0_sum, 15, stride=int(self._stg0_pred_scale_rate),
        padding=7).type(torch.bool)
    example['anchors_mask'] = sum_mask.expand(
        self._box_preds_size[:-1]).contiguous()
    sum_del_mask = torch.unsqueeze(torch.logical_not(sum_mask), -1)
    sum_del_mask = sum_del_mask.expand(self._cls_preds_size).contiguous()

    # self.measure_time_start("RPN-total")
    # Returns possible batch sizes for each stage
    # self.measure_time_start('RPN-stage-1')
    self._net.forward_rpn_stage(io_dict)
    self._net.forward_rpn_cls_preds(io_dict)
    self.measure_time_end('RPN-stage-1')

    # Calculate sum of class scores within each slice,
    # but only use class scores positioned close to pillar locations.
    # Use stg0 to create the pillar mask which will be used for slicing.
    # Apply sigmoid and mask values below the NMS threshold
    cls_scores = torch.sigmoid(io_dict["cls_preds"])
    cls_scores_del = cls_scores <= self._nms_score_threshold
    # torch.cuda.default_stream().wait_stream(self._other_cuda_stream)
    cls_scores_del = torch.logical_or(cls_scores_del, sum_del_mask)
    cls_scores.masked_scatter_(cls_scores_del, self._pred_zeroer)
    if not self._net._encode_background_as_zeros:
        cls_scores = cls_scores[..., 1:].contiguous()
    cls_scores = torch.sum(cls_scores, [0, 1, 2, 4])  # reduce to H
    csa = self.slice_with_ranges(cls_scores.cpu(), self._cls_scr_ranges)
    # LOOKS LIKE IT IS SYNCHED AT THIS POINT

    if not self._merge_preds:
        anchors = example['anchors'].view(self._box_preds_size)
        aa = self.slice_preds_with_ranges(anchors, self._preds_slc_ranges)
        ama = self.slice_with_ranges(example['anchors_mask'],
                                     self._preds_slc_ranges)

    slice_io_dicts = []
    for i in range(self._num_slices):
        slice_io_dicts.append({})
        if not self._merge_preds:
            slice_io_dicts[-1]['anchors'] = aa[i]
            slice_io_dicts[-1]['anchors_mask'] = ama[i]

    # Get the cls mask of each slice, also the overlapped regions explicitly.
    # stg1 class scores will be enough for everything
    cls_scr_sums = torch.empty(2 * len(slice_io_dicts) - 1,
                               dtype=cls_scores.dtype,
                               device='cpu')
    for i, cs in enumerate(csa):
        cls_scr_sums[i] = torch.sum(cs)
    zerocuk_tensor = cls_scr_sums.new_zeros((1, ))
    slice_io_dicts[0]['cls_scores'] = torch.cat(
        (zerocuk_tensor, cls_scr_sums[:2]))
    slice_io_dicts[-1]['cls_scores'] = torch.cat(
        (cls_scr_sums[-2:], zerocuk_tensor))
    for i, io_d in zip(range(1, len(cls_scr_sums) - 2, 2),
                       slice_io_dicts[1:-1]):
        io_d['cls_scores'] = cls_scr_sums[i:i + 3]

    # I DON'T NEED TO CALL SYNC BECAUSE IT IS ALREADY SYNCED
    # FROM WHAT I SAW, BUT DO IT ANYWAY, NO BIG LOSS
    torch.cuda.synchronize()

    # Now decide the slice forwarding pattern.
    # This algorithm takes 0.5 ms
    slices_to_exec = self.sched_slices(slice_io_dicts, deadline)
    stg2_slices, stg3_slices = slices_to_exec
    stg_seq = [1]
    self.measure_time_end('Pre-stage-1', False)
    self.measure_time_start('Post-stage-1', False)

    data_sliced = False
    if len(stg2_slices) == self._num_slices:
        # Since we are going to execute all slices,
        # don't do slicing and run the whole stage
        self.measure_time_start("RPN-stage-2")
        self._net.forward_rpn_stage(io_dict)
        self.measure_time_end(f"RPN-stage-2")
    elif len(stg2_slices) > 0:
        data_sliced = True
        # Slice the tensors
        sa = self.slice_with_ranges(io_dict["stage1"], self._stg1_slc_ranges)
        ua = self.slice_with_ranges(io_dict["up1"], self._up1_slc_ranges)
        cpa = self.slice_preds_with_ranges(io_dict["cls_preds"],
                                           self._preds_slc_ranges)
        for i in range(self._num_slices):
            slice_io_dicts[i]["stages_executed"] = 1
            slice_io_dicts[i]["stage1"] = sa[i]
            slice_io_dicts[i]["up1"] = ua[i]
            slice_io_dicts[i]["backbone_out"] = ua[i]
            slice_io_dicts[i]["cls_preds"] = cpa[i]

        # We have slices to exec through stage 2,
        # batch the chosen slices
        batch_io_dict = {
            "stages_executed": 1,
        }
        batch_io_dict["stage1"] = torch.cat(
            [slice_io_dicts[s]["stage1"] for s in stg2_slices])
        # batch_io_dict["up1"] = torch.cat(
        #     [slice_io_dicts[s]["up1"] for s in stg2_slices])
        self.measure_time_start("RPN-stage-2")
        self._net.forward_rpn_stage(batch_io_dict)
        self.measure_time_end(f"RPN-stage-2")

        # Scatter the results anyway
        # if len(stg3_slices) < len(stg2_slices):
        stg2_chunks = torch.chunk(batch_io_dict["stage2"], len(stg2_slices))
        up2_chunks = torch.chunk(batch_io_dict["up2"], len(stg2_slices))
        for i, s in enumerate(stg2_slices):
            slice_io_dicts[s]["stage2"] = stg2_chunks[i]
            slice_io_dicts[s]["up2"] = up2_chunks[i]
            slice_io_dicts[s]["stages_executed"] = 2
    stg_seq.extend([2] * len(stg2_slices))

    if len(stg3_slices) == self._num_slices:
        # data_sliced will always be false at this point
        # since stage2 slices will also be equal to _num_slices
        self.measure_time_start("RPN-stage-3")
        self._net.forward_rpn_stage(io_dict)
        self.measure_time_end(f"RPN-stage-3")
    elif len(stg3_slices) > 0:
        # that means stg2_slices was also > 0
        data_sliced = True
        if len(stg2_slices) == self._num_slices:
            # Slice the tensors if they were not sliced during stage 2
            sa = self.slice_with_ranges(io_dict["stage2"], self._stg2_slc_ranges)
            ua1 = self.slice_with_ranges(io_dict["up1"], self._up1_slc_ranges)
            ua2 = self.slice_with_ranges(io_dict["up2"], self._up2_slc_ranges)
            for i in range(self._num_slices):
                slice_io_dicts[i]["stage2"] = sa[i]
                slice_io_dicts[i]["up1"] = ua1[i]
                slice_io_dicts[i]["up2"] = ua2[i]
                slice_io_dicts[i]["stages_executed"] = 2
        batch_io_dict = {
            "stages_executed": 2,
        }
        # We have slices to exec through stage 3,
        # batch the chosen slices
        batch_io_dict["stage2"] = torch.cat(
            [slice_io_dicts[s]["stage2"] for s in stg3_slices])
        # batch_io_dict["up2"] = torch.cat(
        #     [slice_io_dicts[s]["up2"] for s in stg3_slices])
        self.measure_time_start("RPN-stage-3")
        self._net.forward_rpn_stage(batch_io_dict)
        self.measure_time_end(f"RPN-stage-3")

        # Scatter the results
        up3_chunks = torch.chunk(batch_io_dict["up3"], len(stg3_slices))
        for i, s in enumerate(stg3_slices):
            slice_io_dicts[s]["up3"] = up3_chunks[i]
            slice_io_dicts[s]["stages_executed"] = 3
    stg_seq.extend([3] * len(stg3_slices))

    self.measure_time_start("RPN-finalize")
    if not data_sliced:
        # No slicing was used
        if io_dict['stages_executed'] == 1:
            self._net.forward_rpn_rem_preds(io_dict)
        else:
            self._net.forward_rpn_all_preds(io_dict)
        preds_dict = io_dict
    else:
        # We used slicing, now we need to merge the slices
        # after the detection heads.
        # This part can be batched too, but it is okay to stay like this.
        # Another optimization could be using cuda streams
        for io_d in slice_io_dicts:
            if io_d["stages_executed"] == 1:
                # stage 1 slices already have cls preds
                io_d['backbone_out'] = io_d['backbone_out'].contiguous()
                self._net.forward_rpn_rem_preds(io_d)
            else:
                self._net.forward_rpn_all_preds(io_d)

        if self._merge_preds:
            # If two overlapped regions went through the same number of
            # stages, get half of the overlapped region from each
            # neighbor io dict.
            # Otherwise, select the one with more stages executed
            preds_dict = {}
            for k, v in self._pred_dict_copy.items():
                preds_dict[k] = v.clone().detach()
            # every slice has a big middle range and two (or one)
            # small overlap ranges
            slc_r = self._cls_scr_ranges[0]
            for k in preds_dict.keys():
                preds_dict[k][..., :slc_r[1], :] = \
                    slice_io_dicts[0][k][..., :slc_r[1], :]
            for i in range(len(slice_io_dicts) - 1):
                io_d1, io_d2 = slice_io_dicts[i], slice_io_dicts[i + 1]
                se1, se2 = io_d1["stages_executed"], io_d2["stages_executed"]
                ovl_r = self._cls_scr_ranges[i * 2 + 1]
                ovl_len = ovl_r[1] - ovl_r[0]
                for k in preds_dict.keys():
                    if se1 > se2:
                        preds_dict[k][..., ovl_r[0]:ovl_r[1], :] = \
                            io_d1[k][..., -ovl_len:, :]
                    elif se1 < se2:
                        preds_dict[k][..., ovl_r[0]:ovl_r[1], :] = \
                            io_d2[k][..., :ovl_len, :]
                    else:
                        mid = ovl_len // 2
                        preds_dict[k][..., ovl_r[0]:(ovl_r[0] + mid), :] = \
                            io_d1[k][..., -ovl_len:(-ovl_len + mid), :]
                        preds_dict[k][..., (ovl_r[0] + mid):ovl_r[1], :] = \
                            io_d2[k][..., mid:ovl_len, :]
                    slc_r = self._cls_scr_ranges[i * 2 + 2]
                    slc_len = slc_r[1] - slc_r[0]
                    preds_dict[k][..., slc_r[0]:slc_r[1], :] = \
                        io_d2[k][..., ovl_len:(ovl_len + slc_len), :]
            for k, v in preds_dict.items():
                preds_dict[k] = v.contiguous()
    self.measure_time_end("RPN-finalize")
    # self.measure_time_end("RPN-total")

    # ASSUME BATCH SIZE 1
    # Predict has high execution time variance, I wonder why.
    # IDEA: Use anchor mask to predict prediction time.
    # Actually, I can just use the number of pillars as well
    self.measure_time_start('Predict')
    torch.backends.cudnn.benchmark = False
    if self._merge_preds:
        # DEBUG
        # self.plot_amask_and_save(example['anchors_mask'], f"merged_{self._sample_idx}")
        # self.plot_cls_scores_and_save(preds_dict['cls_preds'], example['anchors_mask'],
        #                               f"merged_{self._sample_idx}")
        # DEBUG END
        det = self._net.predict(example, preds_dict)
    else:
        # I can use batching for prediction.
        # Exclude stage 1 slices having a class score sum of 0
        selected_slices = []
        for i, s in enumerate(slice_io_dicts):
            if s['stages_executed'] > 1 or torch.sum(s['cls_scores']) > .0:
                selected_slices.append(s)
        det = self.create_empty_det_dict(example['metadata'][0])
        if len(selected_slices) > 0:
            batch_pred_dict = {}
            for k in self._pred_dict_copy.keys():
                batch_pred_dict[k] = torch.cat(
                    [s[k] for s in selected_slices])
            for k in ['anchors', 'anchors_mask']:
                example[k] = torch.cat([s[k] for s in selected_slices])
            example['metadata'] = []
            slice_dets = self._net.predict(example, batch_pred_dict)

            # remove slices that have no detections
            slice_dets_final = []
            for sd in slice_dets:
                if sd['box3d_lidar'].shape[0] > 0:
                    slice_dets_final.append(sd)

            if len(slice_dets_final) > 0:
                # merge final slice detections
                for k in det.keys():
                    if k != 'metadata':
                        det[k] = torch.cat(
                            [d[k] for d in slice_dets_final])
                # print('3D bounding boxes before:')
                # for box in det['box3d_lidar']:
                #     print(box)

                # Now we need to remove duplicated overlapped predictions
                # if they exist. We have to do it because we executed NMS
                # twice on overlapped regions
                mask_indexes = []
                centers = det['box3d_lidar'][:, :2].cpu()
                scores = det['scores'].cpu()
                for i in range(centers.shape[0]):
                    diffs = torch.linalg.norm(centers - centers[i], dim=1)
                    sel = True
                    for j, d in enumerate(diffs):
                        if d > 0 and d < 2. and scores[i] < scores[j]:
                            # distance below 2 meter threshold
                            sel = False
                            print("Discard 3d bbox at", centers[i],
                                  'in image', det['metadata']['image_idx'])
                            break
                    if sel:
                        mask_indexes.append(i)
                for k, v in det.items():
                    if k != 'metadata':
                        det[k] = det[k][mask_indexes]
        det = [det]  # batch size 1
    torch.backends.cudnn.benchmark = self._cudnn_benchmarking
    self.measure_time_end('Predict')
    torch.cuda.synchronize()
    self.measure_time_end('Post-stage-1', False)

    return det, stg_seq, num_voxels
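# Standalone sketch of the duplicate suppression done at the end of
# slicing_forward(): because NMS runs independently per slice, a box in an
# overlapped region can be detected twice, so among detections whose BEV
# centers lie within 2 m of each other only the higher-scoring one is kept.
# The helper name and its tensor-only interface are illustrative, not part of
# the original module.
import torch

def suppress_overlap_duplicates(centers, scores, dist_thresh=2.0):
    # centers: (N, 2) BEV box centers, scores: (N,) detection scores
    keep = []
    for i in range(centers.shape[0]):
        diffs = torch.linalg.norm(centers - centers[i], dim=1)
        dominated = any(0 < d < dist_thresh and scores[i] < scores[j]
                        for j, d in enumerate(diffs))
        if not dominated:
            keep.append(i)
    return keep  # indices of detections to retain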
def no_slicing_forward(self, dataset_iter, deadline):
    self.measure_time_start('Pre-stage-1')
    self.measure_time_start('PFE')
    # self.measure_time_start('PillarGen')
    try:
        if self._repeat_example:
            example = merge_second_batch(
                [self._data_loader.dataset[self._repeat_example_idx]])
        else:
            example = next(dataset_iter)
    except StopIteration:
        print("Woaaaah, that is unexpected! Check dataset iter!")
        return None, None, None
    num_voxels = example["num_voxels"][0][0]  # batch size 1
    if self._method == 3:  # imprecise
        # num_stgs = self.num_stages_to_exec(deadline, num_voxels)
        print('ERROR! imprecise no-slice mode is not supported')
    else:
        num_stgs = self._method + 1
    example = example_convert_to_torch(example, self._float_dtype)
    torch.backends.cudnn.benchmark = False
    io_dict = self._net.forward_pfn(example)
    torch.backends.cudnn.benchmark = self._cudnn_benchmarking

    # Calculate anchor mask
    stg0_sum = torch.sum(io_dict['stage0'], 1, keepdim=True)
    sum_mask = torch.nn.functional.max_pool2d(
        stg0_sum, 15, stride=int(self._stg0_pred_scale_rate),
        padding=7).type(torch.bool)
    example['anchors_mask'] = sum_mask.expand(
        self._box_preds_size[:-1]).contiguous()
    self.measure_time_end('PFE')

    # self.measure_time_start('RPN-total')
    self.measure_time_start('RPN-stage-1')
    self._net.forward_rpn_stage(io_dict)
    self.measure_time_end('RPN-stage-1')
    stg_seq = [1]
    self.measure_time_end('Pre-stage-1')
    self.measure_time_start('Post-stage-1')

    if num_stgs >= 2:
        self.measure_time_start('RPN-stage-2')
        self._net.forward_rpn_stage(io_dict)
        self.measure_time_end('RPN-stage-2')
        stg_seq.append(2)

    if num_stgs == 3:
        self.measure_time_start('RPN-stage-3')
        self._net.forward_rpn_stage(io_dict)
        self.measure_time_end('RPN-stage-3')
        stg_seq.append(3)

    self.measure_time_start('RPN-finalize')
    self._net.forward_rpn_all_preds(io_dict)
    self.measure_time_end('RPN-finalize')
    # self.measure_time_end('RPN-total')

    self.measure_time_start('Predict')
    # torch.cuda.nvtx.range_push('Predict')
    torch.backends.cudnn.benchmark = False
    det = self._net.predict(example, io_dict)
    torch.backends.cudnn.benchmark = self._cudnn_benchmarking
    # torch.cuda.nvtx.range_pop()
    self.measure_time_end('Predict')
    torch.cuda.synchronize()
    self.measure_time_end('Post-stage-1')

    return det, stg_seq, num_voxels