class SingleStageDetector:
    """Single-stage 3D detector that augments point features with image features.

    The constructor creates the input placeholders and every sub-builder
    (anchors, encoder/decoder, post-processor, loss, heads, backbone layers)
    from the global ``cfg``.  ``model_forward`` then builds the graph and
    either the loss (training) or the decoded detections (testing).
    Results are accumulated in ``self.output`` and ``self.labels``; the
    forward methods themselves return ``None``.
    """

    def __init__(self, batch_size, is_training):
        """Create placeholders and all sub-builders.

        Args:
            batch_size: batch size used for placeholders, heads and for the
                point-to-pixel index computation.
            is_training: selects the training or testing branch in
                ``model_forward`` and is forwarded to heads and layers.
        """
        self.batch_size = batch_size
        self.is_training = is_training

        # placeholders
        self.placeholders_builder = PlaceHolders(self.batch_size)
        self.placeholders_builder.get_placeholders()
        self.placeholders = self.placeholders_builder.placeholders

        self.cls_list = cfg.DATASET.KITTI.CLS_LIST
        # Class indices are 1-based.
        self.cls2idx = {cls: i + 1 for i, cls in enumerate(self.cls_list)}
        self.idx2cls = {i + 1: cls for i, cls in enumerate(self.cls_list)}

        # anchor_builder
        self.anchor_builder = Anchors(0, self.cls_list)

        # encoder_decoder
        self.encoder_decoder = EncoderDecoder(0)

        # postprocessor
        self.postprocessor = PostProcessor(0, len(self.cls_list))

        # loss builder
        self.loss_builder = LossBuilder(0)
        self.corner_loss = cfg.MODEL.FIRST_STAGE.CORNER_LOSS

        # head builder; the IoU loss is enabled as soon as one head is 'IoU'
        head_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.HEAD
        self.heads = [
            HeadBuilder(self.batch_size, self.anchor_builder.anchors_num, 0,
                        single_head_cfg, is_training)
            for single_head_cfg in head_cfg
        ]
        self.iou_loss = any(head.layer_type == 'IoU' for head in self.heads)

        # target assigner (first stage)
        self.target_assigner = TargetAssigner(0)

        # layer builder; the vote loss is enabled when a 'Vote_Layer' exists
        layer_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.ARCHITECTURE
        self.layers = [
            LayerBuilder(i, self.is_training, layer_cfg)
            for i in range(len(layer_cfg))
        ]
        self.vote_loss = any(
            layer.layer_type == 'Vote_Layer' for layer in self.layers)

        self.attr_velo_loss = cfg.MODEL.FIRST_STAGE.PREDICT_ATTRIBUTE_AND_VELOCITY

        self.__init_dict()

    def __init_dict(self):
        """Create the empty ``self.output`` / ``self.labels`` accumulators.

        Every entry is a list that the layers, heads, target assigner and
        post-processor append to.  Key order is kept because
        ``self.prediction_keys`` is handed to ``merge_head_prediction``.
        """
        output_keys = (
            # sampled xyz/feature
            maps_dict.KEY_OUTPUT_XYZ,
            maps_dict.KEY_OUTPUT_FEATURE,
            # generated anchors
            maps_dict.KEY_ANCHORS_3D,
            # vote output
            maps_dict.PRED_VOTE_OFFSET,
            maps_dict.PRED_VOTE_BASE,
            # det output
            maps_dict.PRED_CLS,
            maps_dict.PRED_OFFSET,
            maps_dict.PRED_ANGLE_CLS,
            maps_dict.PRED_ANGLE_RES,
            maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS,
            maps_dict.PRED_ATTRIBUTE,
            maps_dict.PRED_VELOCITY,
            # iou output
            maps_dict.PRED_IOU_3D_VALUE,
            # final result
            maps_dict.PRED_3D_BBOX,
            maps_dict.PRED_3D_SCORE,
            maps_dict.PRED_3D_CLS_CATEGORY,
            maps_dict.PRED_3D_ATTRIBUTE,
            maps_dict.PRED_3D_VELOCITY,
        )
        self.output = {key: [] for key in output_keys}
        self.prediction_keys = self.output.keys()

        label_keys = (
            maps_dict.GT_CLS,
            maps_dict.GT_OFFSET,
            maps_dict.GT_ANGLE_CLS,
            maps_dict.GT_ANGLE_RES,
            maps_dict.GT_ATTRIBUTE,
            maps_dict.GT_VELOCITY,
            maps_dict.GT_BOXES_ANCHORS_3D,
            maps_dict.GT_IOU_3D_VALUE,
            maps_dict.GT_PMASK,
            maps_dict.GT_NMASK,
            maps_dict.CORNER_LOSS_GT_BOXES_CORNERS,
        )
        self.labels = {key: [] for key in label_keys}

    def build_img_extractor(self, img_input):
        """Build the VGG-pyramid image branch and its 1x1 bottleneck.

        Args:
            img_input: image tensor handed to the extractor's
                ``preprocess_input``.

        Returns:
            The output of the 128-channel 1x1 ``slim.conv2d`` bottleneck
            (also stored as ``self.img_bottleneck``).
        """
        self._img_pixel_size = np.asarray([360, 1200])
        VGG_config = namedtuple(
            'VGG_config',
            'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(
            VGG_config(vgg_conv1=[2, 32],
                       vgg_conv2=[2, 64],
                       vgg_conv3=[3, 128],
                       vgg_conv4=[3, 256],
                       l2_weight_decay=0.0005))

        self._img_preprocessed = self._img_feature_extractor.preprocess_input(
            img_input, self._img_pixel_size)

        self.img_feature_maps, self.img_end_points = \
            self._img_feature_extractor.build(self._img_preprocessed,
                                              self._img_pixel_size,
                                              self.is_training)

        self.img_bottleneck = slim.conv2d(
            self.img_feature_maps,
            128, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            normalizer_params={'is_training': self.is_training})

        return self.img_bottleneck

    @staticmethod
    def _pixel_gather_indices(pts2d, batch_size, num_point):
        """Return ``tf.gather_nd`` indices (batch, y, x) for projected points.

        ``pts2d`` is flattened batch-major, so row ``k`` of the flattened
        tensor belongs to sample ``k // num_point``.  The batch column is
        therefore built by repeating each batch id ``num_point`` times.
        Tiling ``range(batch_size)`` instead would interleave the ids and
        pair points with the wrong image for ``batch_size > 1``.

        Args:
            pts2d: int32 tensor of pixel coordinates, (B,N,2) in (x,y) order.
            batch_size: B.
            num_point: N.

        Returns:
            int32 tensor of shape (B*N,3) ordered (batch, y, x).
        """
        batch_idx = tf.reshape(
            tf.tile(tf.expand_dims(tf.range(0, batch_size), axis=-1),
                    [1, num_point]),
            [batch_size * num_point, 1])  # (B*N,1)
        flat_xy = tf.reshape(pts2d, [batch_size * num_point, 2])
        indices = tf.concat([batch_idx, flat_xy], axis=-1)  # (B*N,3)
        # image's shape is (y,x), so swap the two coordinate columns
        return tf.gather(indices, [0, 2, 1], axis=-1)

    def network_forward(self, point_cloud, bn_decay, img_input):
        """Run backbone layers and heads on the point cloud plus image features.

        Args:
            point_cloud: tensor whose last axis holds xyz in the first three
                channels and extra point features afterwards.
            bn_decay: batch-norm decay forwarded to layers and heads.
            img_input: image tensor for ``build_img_extractor``.
        """
        l0_xyz = tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3])
        l0_points = tf.slice(point_cloud, [0, 0, 3], [-1, -1, -1])
        num_point = l0_xyz.get_shape().as_list()[1]

        img_feature_maps = self.build_img_extractor(img_input)

        # Project every point into the image and look up its feature vector.
        # NOTE(review): projected pixels are not clipped to the feature-map
        # bounds here - confirm that the input points always fall inside.
        pts2d = projection.tf_rect_to_image(
            l0_xyz, self.placeholders[maps_dict.PL_CALIB_P2])
        pts2d = tf.cast(pts2d, tf.int32)  # (B,N,2)
        indices = self._pixel_gather_indices(pts2d, self.batch_size, num_point)
        point_img_feats = tf.reshape(
            tf.gather_nd(img_feature_maps, indices),  # (B*N,C)
            [self.batch_size, num_point, -1])  # (B,N,C)

        xyz_list, feature_list = [l0_xyz], [l0_points]
        fps_idx_list, point_img_feats_list = [None], [point_img_feats]
        for layer in self.layers:
            xyz_list, feature_list, fps_idx_list, point_img_feats_list = \
                layer.build_layer(xyz_list, feature_list, fps_idx_list,
                                  bn_decay, self.output, point_img_feats_list)

        # Heads append to self.output; merge what they added afterwards.
        cur_head_start_idx = len(self.output[maps_dict.KEY_OUTPUT_XYZ])
        for head in self.heads:
            head.build_layer(xyz_list, feature_list, bn_decay, self.output)
        merge_head_prediction(cur_head_start_idx, self.output,
                              self.prediction_keys)

    def model_forward(self, bn_decay=None):
        """Build the full model: network, anchors, then loss or decoding.

        Args:
            bn_decay: batch-norm decay forwarded to ``network_forward``.
        """
        points_input_det = self.placeholders[maps_dict.PL_POINTS_INPUT]
        img_input_det = self.placeholders[maps_dict.PL_IMG_INPUT]

        # forward the point cloud
        self.network_forward(points_input_det, bn_decay, img_input_det)

        # generate anchors
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][-1]
        anchors = self.anchor_builder.generate(
            base_xyz)  # [bs, pts_num, 1/cls_num, 7]
        self.output[maps_dict.KEY_ANCHORS_3D].append(anchors)

        if self.is_training:  # training mode
            self.train_forward(-1, anchors)
        else:  # testing mode
            self.test_forward(-1, anchors)

    def train_forward(self, index, anchors):
        """Assign targets, fill ``self.labels`` and build the loss.

        Args:
            index: position in the ``self.output`` lists to read predictions
                from.
            anchors: anchors generated for the base points.
        """
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][index]
        pred_offset = self.output[maps_dict.PRED_OFFSET][index]
        pred_angle_res = self.output[maps_dict.PRED_ANGLE_RES][index]

        gt_boxes_3d = self.placeholders[maps_dict.PL_LABEL_BOXES_3D]
        gt_classes = self.placeholders[maps_dict.PL_LABEL_CLASSES]
        gt_angle_cls = self.placeholders[maps_dict.PL_ANGLE_CLS]
        gt_angle_res = self.placeholders[maps_dict.PL_ANGLE_RESIDUAL]

        # Attribute / velocity labels are optional.
        gt_attributes = (self.placeholders[maps_dict.PL_LABEL_ATTRIBUTES]
                         if maps_dict.PL_LABEL_ATTRIBUTES in self.placeholders
                         else None)
        gt_velocity = (self.placeholders[maps_dict.PL_LABEL_VELOCITY]
                       if maps_dict.PL_LABEL_VELOCITY in self.placeholders
                       else None)

        (_, assigned_pmask, assigned_nmask, assigned_gt_boxes_3d,
         assigned_gt_labels, assigned_gt_angle_cls, assigned_gt_angle_res,
         assigned_gt_velocity, assigned_gt_attribute) = \
            self.target_assigner.assign(base_xyz, anchors, gt_boxes_3d,
                                        gt_classes, gt_angle_cls, gt_angle_res,
                                        gt_velocity, gt_attributes)

        # encode offset (replaces the angle targets returned by the assigner)
        assigned_gt_offset, assigned_gt_angle_cls, assigned_gt_angle_res = \
            self.encoder_decoder.encode(base_xyz, assigned_gt_boxes_3d, anchors)

        # corner_loss: decode with the ground-truth angle class (one-hot)
        corner_loss_angle_cls = tf.cast(
            tf.one_hot(assigned_gt_angle_cls,
                       depth=cfg.MODEL.ANGLE_CLS_NUM,
                       on_value=1,
                       off_value=0,
                       axis=-1), tf.float32)  # bs, pts_num, cls_num, -1
        pred_anchors_3d = self.encoder_decoder.decode(
            base_xyz, pred_offset, corner_loss_angle_cls, pred_angle_res,
            self.is_training, anchors)  # [bs, points_num, cls_num, 7]
        pred_corners = transfer_box3d_to_corners(
            pred_anchors_3d)  # [bs, points_num, cls_num, 8, 3]
        gt_corners = transfer_box3d_to_corners(
            assigned_gt_boxes_3d)  # [bs, points_num, cls_num,8,3]
        self.output[maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS].append(
            pred_corners)
        self.labels[maps_dict.CORNER_LOSS_GT_BOXES_CORNERS].append(gt_corners)

        self.labels[maps_dict.GT_CLS].append(assigned_gt_labels)
        self.labels[maps_dict.GT_BOXES_ANCHORS_3D].append(assigned_gt_boxes_3d)
        self.labels[maps_dict.GT_OFFSET].append(assigned_gt_offset)
        self.labels[maps_dict.GT_ANGLE_CLS].append(assigned_gt_angle_cls)
        self.labels[maps_dict.GT_ANGLE_RES].append(assigned_gt_angle_res)
        self.labels[maps_dict.GT_ATTRIBUTE].append(assigned_gt_attribute)
        self.labels[maps_dict.GT_VELOCITY].append(assigned_gt_velocity)
        self.labels[maps_dict.GT_PMASK].append(assigned_pmask)
        self.labels[maps_dict.GT_NMASK].append(assigned_nmask)

        self.loss_builder.forward(index, self.labels, self.output,
                                  self.placeholders, self.corner_loss,
                                  self.vote_loss, self.attr_velo_loss,
                                  self.iou_loss)

    def test_forward(self, index, anchors):
        """Decode predictions and run the post-processor.

        Args:
            index: position in the ``self.output`` lists to read predictions
                from.
            anchors: anchors generated for the base points.
        """
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][index]

        pred_cls = self.output[
            maps_dict.PRED_CLS][index]  # [bs, points_num, cls_num + 1/0]
        pred_offset = self.output[maps_dict.PRED_OFFSET][index]
        pred_angle_cls = self.output[maps_dict.PRED_ANGLE_CLS][index]
        pred_angle_res = self.output[maps_dict.PRED_ANGLE_RES][index]

        # decode predictions
        pred_anchors_3d = self.encoder_decoder.decode(
            base_xyz, pred_offset, pred_angle_cls, pred_angle_res,
            self.is_training, anchors)  # [bs, points_num, cls_num, 7]

        # decode classification
        if cfg.MODEL.FIRST_STAGE.CLS_ACTIVATION == 'Softmax':
            # softmax; channel 0 is sliced away
            pred_score = tf.nn.softmax(pred_cls)
            pred_score = tf.slice(pred_score, [0, 0, 1], [-1, -1, -1])
        else:
            # sigmoid
            pred_score = tf.nn.sigmoid(pred_cls)

        # using IoU branch proposed by sparse-to-dense
        if self.iou_loss:
            pred_iou = self.output[maps_dict.PRED_IOU_3D_VALUE][index]
            pred_score = pred_score * pred_iou

        # Attribute / velocity predictions only exist if a head produced them.
        attribute_preds = self.output[maps_dict.PRED_ATTRIBUTE]
        pred_attribute = attribute_preds[index] if attribute_preds else None

        velocity_preds = self.output[maps_dict.PRED_VELOCITY]
        pred_velocity = velocity_preds[index] if velocity_preds else None

        self.postprocessor.forward(pred_anchors_3d, pred_score, self.output,
                                   pred_attribute, pred_velocity)
class SingleStageDetector(tf.keras.Model):
    """Single-stage 3D detector implemented as a ``tf.keras.Model``.

    Inputs are read from a ``data`` mapping passed to ``model_forward``
    rather than from placeholders.  ``model_forward`` returns the loss in
    training mode and the post-processor result in testing mode.
    """

    def __init__(self, batch_size, is_training):
        """Create all sub-builders from the global ``cfg``.

        Args:
            batch_size: batch size forwarded to the head builders.
            is_training: selects the training or testing branch in
                ``model_forward`` and is forwarded to heads and layers.
        """
        super(SingleStageDetector, self).__init__()
        self.batch_size = batch_size
        self.is_training = is_training

        self.cls_list = cfg.DATASET.KITTI.CLS_LIST
        # Class indices are 1-based.
        self.cls2idx = {cls: i + 1 for i, cls in enumerate(self.cls_list)}
        self.idx2cls = {i + 1: cls for i, cls in enumerate(self.cls_list)}

        # anchor_builder
        self.anchor_builder = Anchors(0, self.cls_list)

        # encoder_decoder
        self.encoder_decoder = EncoderDecoder(0)

        # postprocessor
        self.postprocessor = PostProcessor(0, len(self.cls_list))

        # loss builder
        self.loss_builder = LossBuilder(0)
        self.corner_loss = cfg.MODEL.FIRST_STAGE.CORNER_LOSS

        # head builder; the IoU loss is enabled as soon as one head is 'IoU'
        head_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.HEAD
        self.heads = [
            HeadBuilder(self.batch_size, self.anchor_builder.anchors_num, 0,
                        single_head_cfg, is_training)
            for single_head_cfg in head_cfg
        ]
        self.iou_loss = any(head.layer_type == 'IoU' for head in self.heads)

        # target assigner (first stage)
        self.target_assigner = TargetAssigner(0)

        # layer builder; the vote loss is enabled when a 'Vote_Layer' exists.
        # NOTE(review): stored as `layer_list` (and the accumulator below as
        # `output_data`) presumably to avoid the `layers` / `output`
        # attributes of tf.keras.Model - confirm before renaming.
        layer_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.ARCHITECTURE
        self.layer_list = [
            LayerBuilder(i, self.is_training, layer_cfg)
            for i in range(len(layer_cfg))
        ]
        self.vote_loss = any(
            layer.layer_type == 'Vote_Layer' for layer in self.layer_list)

        self.attr_velo_loss = cfg.MODEL.FIRST_STAGE.PREDICT_ATTRIBUTE_AND_VELOCITY

        self.__init_dict()

    def __init_dict(self):
        """(Re)create the empty ``output_data`` / ``labels`` accumulators.

        Called from ``__init__`` and again at the start of every
        ``model_forward`` so results from a previous call do not pile up.
        Key order is kept because ``self.prediction_keys`` is handed to
        ``merge_head_prediction``.
        """
        output_keys = (
            # sampled xyz/feature
            maps_dict.KEY_OUTPUT_XYZ,
            maps_dict.KEY_OUTPUT_FEATURE,
            # generated anchors
            maps_dict.KEY_ANCHORS_3D,
            # vote output
            maps_dict.PRED_VOTE_OFFSET,
            maps_dict.PRED_VOTE_BASE,
            # det output
            maps_dict.PRED_CLS,
            maps_dict.PRED_OFFSET,
            maps_dict.PRED_ANGLE_CLS,
            maps_dict.PRED_ANGLE_RES,
            maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS,
            maps_dict.PRED_ATTRIBUTE,
            maps_dict.PRED_VELOCITY,
            # iou output
            maps_dict.PRED_IOU_3D_VALUE,
            # final result
            maps_dict.PRED_3D_BBOX,
            maps_dict.PRED_3D_SCORE,
            maps_dict.PRED_3D_CLS_CATEGORY,
            maps_dict.PRED_3D_ATTRIBUTE,
            maps_dict.PRED_3D_VELOCITY,
        )
        self.output_data = {key: [] for key in output_keys}
        self.prediction_keys = self.output_data.keys()

        label_keys = (
            maps_dict.GT_CLS,
            maps_dict.GT_OFFSET,
            maps_dict.GT_ANGLE_CLS,
            maps_dict.GT_ANGLE_RES,
            maps_dict.GT_ATTRIBUTE,
            maps_dict.GT_VELOCITY,
            maps_dict.GT_BOXES_ANCHORS_3D,
            maps_dict.GT_IOU_3D_VALUE,
            maps_dict.GT_PMASK,
            maps_dict.GT_NMASK,
            maps_dict.CORNER_LOSS_GT_BOXES_CORNERS,
        )
        self.labels = {key: [] for key in label_keys}

    def network_forward(self, point_cloud, bn_decay):
        """Run the backbone layers and heads on the point cloud.

        Args:
            point_cloud: tensor whose last axis holds xyz in the first three
                channels and extra point features afterwards.
            bn_decay: batch-norm decay forwarded to layers and heads.
        """
        l0_xyz = tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3])
        l0_points = tf.slice(point_cloud, [0, 0, 3], [-1, -1, -1])

        xyz_list, feature_list, fps_idx_list = [l0_xyz], [l0_points], [None]
        for layer in self.layer_list:
            xyz_list, feature_list, fps_idx_list = layer.build_layer(
                xyz_list, feature_list, fps_idx_list, bn_decay,
                self.output_data)

        # Heads append to self.output_data; merge what they added afterwards.
        cur_head_start_idx = len(self.output_data[maps_dict.KEY_OUTPUT_XYZ])
        for head in self.heads:
            head.build_layer(xyz_list, feature_list, bn_decay,
                             self.output_data)
        merge_head_prediction(cur_head_start_idx, self.output_data,
                              self.prediction_keys)

    def model_forward(self, data, bn_decay=None):
        """Run the full model on one batch.

        Args:
            data: mapping holding the inputs (and, for training, the labels)
                under the ``maps_dict.PL_*`` keys.
            bn_decay: batch-norm decay forwarded to ``network_forward``.

        Returns:
            The loss builder result in training mode, otherwise the
            post-processor result.
        """
        self.__init_dict()
        points_input_det = data[maps_dict.PL_POINTS_INPUT]

        # forward the point cloud
        self.network_forward(points_input_det, bn_decay)

        # generate anchors
        base_xyz = self.output_data[maps_dict.KEY_OUTPUT_XYZ][-1]
        anchors = self.anchor_builder.generate(
            base_xyz)  # [bs, pts_num, 1/cls_num, 7]
        self.output_data[maps_dict.KEY_ANCHORS_3D].append(anchors)

        if self.is_training:  # training mode
            return self.train_forward(-1, anchors, data)
        # testing mode
        return self.test_forward(-1, anchors, data)

    def train_forward(self, index, anchors, data):
        """Assign targets, fill ``self.labels`` and compute the loss.

        Args:
            index: position in the ``self.output_data`` lists to read
                predictions from.
            anchors: anchors generated for the base points.
            data: mapping holding the ground-truth tensors.

        Returns:
            Whatever ``self.loss_builder.forward`` returns.
        """
        base_xyz = self.output_data[maps_dict.KEY_OUTPUT_XYZ][index]
        pred_offset = self.output_data[maps_dict.PRED_OFFSET][index]
        pred_angle_res = self.output_data[maps_dict.PRED_ANGLE_RES][index]

        gt_boxes_3d = data[maps_dict.PL_LABEL_BOXES_3D]
        gt_classes = data[maps_dict.PL_LABEL_CLASSES]
        gt_angle_cls = data[maps_dict.PL_ANGLE_CLS]
        gt_angle_res = data[maps_dict.PL_ANGLE_RESIDUAL]

        # Attribute / velocity labels are optional.
        gt_attributes = (data[maps_dict.PL_LABEL_ATTRIBUTES]
                         if maps_dict.PL_LABEL_ATTRIBUTES in data else None)
        gt_velocity = (data[maps_dict.PL_LABEL_VELOCITY]
                       if maps_dict.PL_LABEL_VELOCITY in data else None)

        (_, assigned_pmask, assigned_nmask, assigned_gt_boxes_3d,
         assigned_gt_labels, assigned_gt_angle_cls, assigned_gt_angle_res,
         assigned_gt_velocity, assigned_gt_attribute) = \
            self.target_assigner.assign(base_xyz, anchors, gt_boxes_3d,
                                        gt_classes, gt_angle_cls, gt_angle_res,
                                        gt_velocity, gt_attributes)

        # encode offset (replaces the angle targets returned by the assigner)
        assigned_gt_offset, assigned_gt_angle_cls, assigned_gt_angle_res = \
            self.encoder_decoder.encode(base_xyz, assigned_gt_boxes_3d, anchors)

        # corner_loss: decode with the ground-truth angle class (one-hot)
        corner_loss_angle_cls = tf.cast(
            tf.one_hot(assigned_gt_angle_cls,
                       depth=cfg.MODEL.ANGLE_CLS_NUM,
                       on_value=1,
                       off_value=0,
                       axis=-1), tf.float32)  # bs, pts_num, cls_num, -1
        pred_anchors_3d = self.encoder_decoder.decode(
            base_xyz, pred_offset, corner_loss_angle_cls, pred_angle_res,
            self.is_training, anchors)  # [bs, points_num, cls_num, 7]
        pred_corners = transfer_box3d_to_corners(
            pred_anchors_3d)  # [bs, points_num, cls_num, 8, 3]
        gt_corners = transfer_box3d_to_corners(
            assigned_gt_boxes_3d)  # [bs, points_num, cls_num,8,3]
        self.output_data[maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS].append(
            pred_corners)
        self.labels[maps_dict.CORNER_LOSS_GT_BOXES_CORNERS].append(gt_corners)

        self.labels[maps_dict.GT_CLS].append(assigned_gt_labels)
        self.labels[maps_dict.GT_BOXES_ANCHORS_3D].append(assigned_gt_boxes_3d)
        self.labels[maps_dict.GT_OFFSET].append(assigned_gt_offset)
        self.labels[maps_dict.GT_ANGLE_CLS].append(assigned_gt_angle_cls)
        self.labels[maps_dict.GT_ANGLE_RES].append(assigned_gt_angle_res)
        self.labels[maps_dict.GT_ATTRIBUTE].append(assigned_gt_attribute)
        self.labels[maps_dict.GT_VELOCITY].append(assigned_gt_velocity)
        self.labels[maps_dict.GT_PMASK].append(assigned_pmask)
        self.labels[maps_dict.GT_NMASK].append(assigned_nmask)

        return self.loss_builder.forward(index, self.labels, self.output_data,
                                         data, self.corner_loss,
                                         self.vote_loss, self.attr_velo_loss,
                                         self.iou_loss)

    def test_forward(self, index, anchors, data):
        """Decode predictions and run the post-processor.

        Args:
            index: position in the ``self.output_data`` lists to read
                predictions from.
            anchors: anchors generated for the base points.
            data: input mapping; not read by this method.

        Returns:
            Whatever ``self.postprocessor.forward`` returns.
        """
        base_xyz = self.output_data[maps_dict.KEY_OUTPUT_XYZ][index]

        # [bs, points_num, cls_num + 1/0]
        pred_cls = self.output_data[maps_dict.PRED_CLS][index]
        pred_offset = self.output_data[maps_dict.PRED_OFFSET][index]
        pred_angle_cls = self.output_data[maps_dict.PRED_ANGLE_CLS][index]
        pred_angle_res = self.output_data[maps_dict.PRED_ANGLE_RES][index]

        # decode predictions
        pred_anchors_3d = self.encoder_decoder.decode(
            base_xyz, pred_offset, pred_angle_cls, pred_angle_res,
            self.is_training, anchors)  # [bs, points_num, cls_num, 7]

        # decode classification
        if cfg.MODEL.FIRST_STAGE.CLS_ACTIVATION == 'Softmax':
            # softmax; channel 0 is sliced away
            pred_score = tf.nn.softmax(pred_cls)
            pred_score = tf.slice(pred_score, [0, 0, 1], [-1, -1, -1])
        else:
            # sigmoid
            pred_score = tf.nn.sigmoid(pred_cls)

        # using IoU branch proposed by sparse-to-dense
        if self.iou_loss:
            pred_iou = self.output_data[maps_dict.PRED_IOU_3D_VALUE][index]
            pred_score = pred_score * pred_iou

        # Attribute / velocity predictions only exist if a head produced them.
        attribute_preds = self.output_data[maps_dict.PRED_ATTRIBUTE]
        pred_attribute = attribute_preds[index] if attribute_preds else None

        velocity_preds = self.output_data[maps_dict.PRED_VELOCITY]
        pred_velocity = velocity_preds[index] if velocity_preds else None

        return self.postprocessor.forward(pred_anchors_3d, pred_score,
                                          self.output_data, pred_attribute,
                                          pred_velocity)
class SingleStageDetector:
    """Single-stage 3D detector guided by an image segmentation map.

    Points are projected into a full-image segmentation map; points that
    land on the configured class are gathered and, together with
    feature-propagated point features, handed to every backbone layer.
    Results are accumulated in ``self.output`` and ``self.labels``; the
    forward methods themselves return ``None``.
    """

    # Image size (rows, cols) the segmentation map and image are shaped to.
    _IMG_HEIGHT = 360
    _IMG_WIDTH = 1200

    # Number of segmentation-selected points gathered per sample.
    _IMG_SEG_NPOINTS = 256

    # First configured class -> (segmentation label id, pooling size).
    _SEG_CLASS_SETTINGS = {
        'Car': (1, (5.0, 1.7, 5.0)),
        'Pedestrian': (2, (1.2, 1.8, 1.2)),
        'Cyclist': (3, (1.8, 1.8, 1.8)),
    }

    def __init__(self, batch_size, is_training):
        """Create placeholders and all sub-builders.

        Args:
            batch_size: batch size used for placeholders, heads and for the
                point-to-pixel index computation.
            is_training: selects the training or testing branch in
                ``model_forward`` and is forwarded to heads and layers.
        """
        self.batch_size = batch_size
        self.is_training = is_training

        # placeholders
        self.placeholders_builder = PlaceHolders(self.batch_size)
        self.placeholders_builder.get_placeholders()
        self.placeholders = self.placeholders_builder.placeholders

        self.cls_list = cfg.DATASET.KITTI.CLS_LIST
        # Class indices are 1-based.
        self.cls2idx = {cls: i + 1 for i, cls in enumerate(self.cls_list)}
        self.idx2cls = {i + 1: cls for i, cls in enumerate(self.cls_list)}

        # anchor_builder
        self.anchor_builder = Anchors(0, self.cls_list)

        # encoder_decoder
        self.encoder_decoder = EncoderDecoder(0)

        # postprocessor
        self.postprocessor = PostProcessor(0, len(self.cls_list))

        # loss builder
        self.loss_builder = LossBuilder(0)
        self.corner_loss = cfg.MODEL.FIRST_STAGE.CORNER_LOSS

        # head builder; the IoU loss is enabled as soon as one head is 'IoU'
        head_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.HEAD
        self.heads = [
            HeadBuilder(self.batch_size, self.anchor_builder.anchors_num, 0,
                        single_head_cfg, is_training)
            for single_head_cfg in head_cfg
        ]
        self.iou_loss = any(head.layer_type == 'IoU' for head in self.heads)

        # target assigner (first stage)
        self.target_assigner = TargetAssigner(0)

        # layer builder; the vote loss is enabled when a 'Vote_Layer' exists
        layer_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.ARCHITECTURE
        self.layers = [
            LayerBuilder(i, self.is_training, layer_cfg)
            for i in range(len(layer_cfg))
        ]
        self.vote_loss = any(
            layer.layer_type == 'Vote_Layer' for layer in self.layers)

        self.attr_velo_loss = cfg.MODEL.FIRST_STAGE.PREDICT_ATTRIBUTE_AND_VELOCITY

        self.__init_dict()

    def __init_dict(self):
        """Create the empty ``self.output`` / ``self.labels`` accumulators.

        Key order is kept because ``self.prediction_keys`` is handed to
        ``merge_head_prediction``.
        """
        output_keys = (
            # sampled xyz/feature
            maps_dict.KEY_OUTPUT_XYZ,
            maps_dict.KEY_OUTPUT_FEATURE,
            # generated anchors
            maps_dict.KEY_ANCHORS_3D,
            # vote output
            maps_dict.PRED_VOTE_OFFSET,
            maps_dict.PRED_VOTE_BASE,
            # det output
            maps_dict.PRED_CLS,
            maps_dict.PRED_OFFSET,
            maps_dict.PRED_ANGLE_CLS,
            maps_dict.PRED_ANGLE_RES,
            maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS,
            maps_dict.PRED_ATTRIBUTE,
            maps_dict.PRED_VELOCITY,
            # iou output
            maps_dict.PRED_IOU_3D_VALUE,
            # final result
            maps_dict.PRED_3D_BBOX,
            maps_dict.PRED_3D_SCORE,
            maps_dict.PRED_3D_CLS_CATEGORY,
            maps_dict.PRED_3D_ATTRIBUTE,
            maps_dict.PRED_3D_VELOCITY,
            # NOTE: initialised here but nothing in this class appends to it.
            maps_dict.PRED_POINT_SEG,
        )
        self.output = {key: [] for key in output_keys}
        self.prediction_keys = self.output.keys()

        label_keys = (
            maps_dict.GT_CLS,
            maps_dict.GT_OFFSET,
            maps_dict.GT_ANGLE_CLS,
            maps_dict.GT_ANGLE_RES,
            maps_dict.GT_ATTRIBUTE,
            maps_dict.GT_VELOCITY,
            maps_dict.GT_BOXES_ANCHORS_3D,
            maps_dict.GT_IOU_3D_VALUE,
            maps_dict.GT_PMASK,
            maps_dict.GT_NMASK,
            maps_dict.CORNER_LOSS_GT_BOXES_CORNERS,
            maps_dict.PL_LABEL_SEMSEGS,
        )
        self.labels = {key: [] for key in label_keys}

    @staticmethod
    def _pixel_gather_indices(pts2d, batch_size, num_point):
        """Return ``tf.gather_nd`` indices (batch, y, x) for projected points.

        ``pts2d`` is flattened batch-major, so row ``k`` of the flattened
        tensor belongs to sample ``k // num_point``.  The batch column is
        therefore built by repeating each batch id ``num_point`` times.
        Tiling ``range(batch_size)`` instead would interleave the ids and
        pair points with the wrong image for ``batch_size > 1``.

        Args:
            pts2d: int32 tensor of pixel coordinates, (B,N,2) in (x,y) order.
            batch_size: B.
            num_point: N.

        Returns:
            int32 tensor of shape (B*N,3) ordered (batch, y, x).
        """
        batch_idx = tf.reshape(
            tf.tile(tf.expand_dims(tf.range(0, batch_size), axis=-1),
                    [1, num_point]),
            [batch_size * num_point, 1])  # (B*N,1)
        flat_xy = tf.reshape(pts2d, [batch_size * num_point, 2])
        indices = tf.concat([batch_idx, flat_xy], axis=-1)  # (B*N,3)
        # image's shape is (y,x), so swap the two coordinate columns
        return tf.gather(indices, [0, 2, 1], axis=-1)

    def _seg_class_settings(self):
        """Return (segmentation label id, pooling size) for ``cls_list[0]``.

        Raises:
            ValueError: if the first configured class has no entry in
                ``_SEG_CLASS_SETTINGS``.
        """
        first_cls = self.cls_list[0]
        try:
            cls_int, pooling_size = self._SEG_CLASS_SETTINGS[first_cls]
        except KeyError:
            raise ValueError(
                'Unsupported class %r for segmentation-guided pooling; '
                'expected one of %s' %
                (first_cls, sorted(self._SEG_CLASS_SETTINGS)))
        # Hand out a fresh list so callers cannot mutate the class table.
        return cls_int, list(pooling_size)

    def network_forward(self, point_cloud, bn_decay, img_input, img_full_seg):
        """Run backbone layers and heads with segmentation-selected points.

        Args:
            point_cloud: tensor whose last axis holds xyz in the first three
                channels and extra point features afterwards.
            bn_decay: batch-norm decay forwarded to layers and heads.
            img_input: image tensor; resized to 360x1200 before use.
            img_full_seg: segmentation map, reshaped to (B, 360, 1200, 1).

        Raises:
            ValueError: if ``cls_list[0]`` is not a supported class.
        """
        l0_xyz = tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3])
        l0_points = tf.slice(point_cloud, [0, 0, 3], [-1, -1, -1])
        # NOTE(review): the static batch dimension is used for reshapes and
        # set_shape below while self.batch_size is used for the indices -
        # the two are assumed to agree.
        batch_size, num_point = l0_xyz.get_shape().as_list()[:2]
        img_full_seg = tf.reshape(
            img_full_seg, [batch_size, self._IMG_HEIGHT, self._IMG_WIDTH, 1])

        # Project every point into the image and read its segmentation label.
        # NOTE(review): projected pixels are not clipped to the image bounds
        # here - confirm that the input points always fall inside.
        pts2d = projection.tf_rect_to_image(
            l0_xyz, self.placeholders[maps_dict.PL_CALIB_P2])
        pts2d = tf.cast(pts2d, tf.int32)  # (B,N,2)
        indices = self._pixel_gather_indices(pts2d, self.batch_size, num_point)
        img_full_seg = tf.reshape(
            tf.gather_nd(img_full_seg, indices),  # (B*N,C)
            [self.batch_size, num_point, -1])  # (B,N,C)

        cls_int, pooling_size = self._seg_class_settings()
        img_seg_npoints = self._IMG_SEG_NPOINTS

        # Gather the points whose projected label equals the target class.
        mask = tf.equal(img_full_seg, cls_int)
        mask = tf.reshape(mask, [self.batch_size, num_point])
        img_seg_masked, indices = tf_gather_object_pc(img_full_seg,
                                                      mask,
                                                      npoints=img_seg_npoints)
        img_seg_masked.set_shape([batch_size, img_seg_npoints, 1])
        img_seg_point_cloud = tf.gather_nd(l0_xyz, indices)
        img_seg_point_cloud.set_shape([batch_size, img_seg_npoints, 3])

        img_input = tf.image.resize_images(
            img_input, [self._IMG_HEIGHT, self._IMG_WIDTH],
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
            align_corners=True)

        xyz_list, feature_list, fps_idx_list = [l0_xyz], [l0_points], [None]
        point_seg_net = None
        for layer in self.layers:
            if layer.layer_type == 'Vote_Layer':
                # Propagate features back to the input points, then keep only
                # the segmentation-selected ones.
                l3_points = pointnet_fp_module(xyz_list[2],
                                               xyz_list[4],
                                               feature_list[2],
                                               feature_list[4], [256],
                                               layer.is_training,
                                               bn_decay,
                                               scope='fa_layer1')
                l2_points = pointnet_fp_module(xyz_list[1],
                                               xyz_list[2],
                                               feature_list[1],
                                               l3_points, [256],
                                               layer.is_training,
                                               bn_decay,
                                               scope='fa_layer2')
                l1_points = pointnet_fp_module(xyz_list[0],
                                               xyz_list[1],
                                               feature_list[0],
                                               l2_points, [256],
                                               layer.is_training,
                                               bn_decay,
                                               scope='fa_layer3')
                point_seg_net = tf.gather_nd(l1_points, indices)
                point_seg_net.set_shape([batch_size, img_seg_npoints, 256])

            xyz_list, feature_list, fps_idx_list = layer.build_layer(
                xyz_list, feature_list, fps_idx_list, bn_decay, self.output,
                self.placeholders[maps_dict.PL_CALIB_P2], img_input,
                img_seg_point_cloud, point_seg_net, pooling_size)

        # Heads append to self.output; merge what they added afterwards.
        cur_head_start_idx = len(self.output[maps_dict.KEY_OUTPUT_XYZ])
        for head in self.heads:
            head.build_layer(xyz_list, feature_list, bn_decay, self.output)
        merge_head_prediction(cur_head_start_idx, self.output,
                              self.prediction_keys)

    def model_forward(self, bn_decay=None):
        """Build the full model: network, anchors, then loss or decoding.

        Args:
            bn_decay: batch-norm decay forwarded to ``network_forward``.
        """
        points_input_det = self.placeholders[maps_dict.PL_POINTS_INPUT]
        img_input_det = self.placeholders[maps_dict.PL_IMG_INPUT]
        img_full_seg = self.placeholders[maps_dict.PL_IMG_FULL_SEG_INPUT]

        # forward the point cloud
        self.network_forward(points_input_det, bn_decay, img_input_det,
                             img_full_seg)

        # generate anchors
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][-1]
        anchors = self.anchor_builder.generate(
            base_xyz)  # [bs, pts_num, 1/cls_num, 7]
        self.output[maps_dict.KEY_ANCHORS_3D].append(anchors)

        if self.is_training:  # training mode
            self.train_forward(-1, anchors)
        else:  # testing mode
            self.test_forward(-1, anchors)

    def train_forward(self, index, anchors):
        """Assign targets, fill ``self.labels`` and build the loss.

        Args:
            index: position in the ``self.output`` lists to read predictions
                from.
            anchors: anchors generated for the base points.
        """
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][index]
        pred_offset = self.output[maps_dict.PRED_OFFSET][index]
        pred_angle_res = self.output[maps_dict.PRED_ANGLE_RES][index]

        gt_boxes_3d = self.placeholders[maps_dict.PL_LABEL_BOXES_3D]
        gt_classes = self.placeholders[maps_dict.PL_LABEL_CLASSES]
        gt_angle_cls = self.placeholders[maps_dict.PL_ANGLE_CLS]
        gt_angle_res = self.placeholders[maps_dict.PL_ANGLE_RESIDUAL]

        # Attribute / velocity labels are optional.
        gt_attributes = (self.placeholders[maps_dict.PL_LABEL_ATTRIBUTES]
                         if maps_dict.PL_LABEL_ATTRIBUTES in self.placeholders
                         else None)
        gt_velocity = (self.placeholders[maps_dict.PL_LABEL_VELOCITY]
                       if maps_dict.PL_LABEL_VELOCITY in self.placeholders
                       else None)

        (_, assigned_pmask, assigned_nmask, assigned_gt_boxes_3d,
         assigned_gt_labels, assigned_gt_angle_cls, assigned_gt_angle_res,
         assigned_gt_velocity, assigned_gt_attribute) = \
            self.target_assigner.assign(base_xyz, anchors, gt_boxes_3d,
                                        gt_classes, gt_angle_cls, gt_angle_res,
                                        gt_velocity, gt_attributes)

        # encode offset (replaces the angle targets returned by the assigner)
        assigned_gt_offset, assigned_gt_angle_cls, assigned_gt_angle_res = \
            self.encoder_decoder.encode(base_xyz, assigned_gt_boxes_3d, anchors)

        # corner_loss: decode with the ground-truth angle class (one-hot)
        corner_loss_angle_cls = tf.cast(
            tf.one_hot(assigned_gt_angle_cls,
                       depth=cfg.MODEL.ANGLE_CLS_NUM,
                       on_value=1,
                       off_value=0,
                       axis=-1), tf.float32)  # bs, pts_num, cls_num, -1
        pred_anchors_3d = self.encoder_decoder.decode(
            base_xyz, pred_offset, corner_loss_angle_cls, pred_angle_res,
            self.is_training, anchors)  # [bs, points_num, cls_num, 7]
        pred_corners = transfer_box3d_to_corners(
            pred_anchors_3d)  # [bs, points_num, cls_num, 8, 3]
        gt_corners = transfer_box3d_to_corners(
            assigned_gt_boxes_3d)  # [bs, points_num, cls_num,8,3]
        self.output[maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS].append(
            pred_corners)
        self.labels[maps_dict.CORNER_LOSS_GT_BOXES_CORNERS].append(gt_corners)

        self.labels[maps_dict.GT_CLS].append(assigned_gt_labels)
        self.labels[maps_dict.GT_BOXES_ANCHORS_3D].append(assigned_gt_boxes_3d)
        self.labels[maps_dict.GT_OFFSET].append(assigned_gt_offset)
        self.labels[maps_dict.GT_ANGLE_CLS].append(assigned_gt_angle_cls)
        self.labels[maps_dict.GT_ANGLE_RES].append(assigned_gt_angle_res)
        self.labels[maps_dict.GT_ATTRIBUTE].append(assigned_gt_attribute)
        self.labels[maps_dict.GT_VELOCITY].append(assigned_gt_velocity)
        self.labels[maps_dict.GT_PMASK].append(assigned_pmask)
        self.labels[maps_dict.GT_NMASK].append(assigned_nmask)

        self.loss_builder.forward(index, self.labels, self.output,
                                  self.placeholders, self.corner_loss,
                                  self.vote_loss, self.attr_velo_loss,
                                  self.iou_loss)

    def test_forward(self, index, anchors):
        """Decode predictions and run the post-processor.

        Args:
            index: position in the ``self.output`` lists to read predictions
                from.
            anchors: anchors generated for the base points.
        """
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][index]

        pred_cls = self.output[
            maps_dict.PRED_CLS][index]  # [bs, points_num, cls_num + 1/0]
        pred_offset = self.output[maps_dict.PRED_OFFSET][index]
        pred_angle_cls = self.output[maps_dict.PRED_ANGLE_CLS][index]
        pred_angle_res = self.output[maps_dict.PRED_ANGLE_RES][index]

        # decode predictions
        pred_anchors_3d = self.encoder_decoder.decode(
            base_xyz, pred_offset, pred_angle_cls, pred_angle_res,
            self.is_training, anchors)  # [bs, points_num, cls_num, 7]

        # decode classification
        if cfg.MODEL.FIRST_STAGE.CLS_ACTIVATION == 'Softmax':
            # softmax; channel 0 is sliced away
            pred_score = tf.nn.softmax(pred_cls)
            pred_score = tf.slice(pred_score, [0, 0, 1], [-1, -1, -1])
        else:
            # sigmoid
            pred_score = tf.nn.sigmoid(pred_cls)

        # using IoU branch proposed by sparse-to-dense
        if self.iou_loss:
            pred_iou = self.output[maps_dict.PRED_IOU_3D_VALUE][index]
            pred_score = pred_score * pred_iou

        # Attribute / velocity predictions only exist if a head produced them.
        attribute_preds = self.output[maps_dict.PRED_ATTRIBUTE]
        pred_attribute = attribute_preds[index] if attribute_preds else None

        velocity_preds = self.output[maps_dict.PRED_VELOCITY]
        pred_velocity = velocity_preds[index] if velocity_preds else None

        self.postprocessor.forward(pred_anchors_3d, pred_score, self.output,
                                   pred_attribute, pred_velocity)