Example #1
# Common imports for these examples (TensorFlow 1.x). Repository-specific
# helpers (tf_util, projection, the pointnet_* modules, BoxEncoder, ImgVggPyr,
# huber_loss/focal_loss and the NUM_* constants) are assumed to be in scope.
from collections import namedtuple

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
class RCNN(object):
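    """Second-stage refinement network (RCNN): classifies each proposal and
    regresses bin-based center/heading and size-cluster residuals from
    PointNet features fused with image ROI features."""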
    def __init__(self, batch_size, num_point, num_channel=133, bn_decay=None, is_training=True):
        self.batch_size = batch_size
        self.num_point = num_point
        self.num_channel = num_channel
        self.bn_decay = bn_decay
        self.is_training = is_training
        self.end_points = {}
        self.placeholders = self.get_placeholders()
        self.box_encoder = BoxEncoder(CENTER_SEARCH_RANGE, NUM_CENTER_BIN, HEADING_SEARCH_RANGE, NUM_HEADING_BIN)
        self.build()

    def get_placeholders(self):
        batch_size = self.batch_size
        num_point = self.num_point
        num_channel = self.num_channel
        return {
            'pointclouds': tf.placeholder(tf.float32, shape=(batch_size, num_point, num_channel)),
            'proposal_boxes': tf.placeholder(tf.float32, shape=(batch_size, 7)),
            'class_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'center_bin_x_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'center_bin_z_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'center_x_res_labels': tf.placeholder(tf.float32, shape=(batch_size,)),
            'center_z_res_labels': tf.placeholder(tf.float32, shape=(batch_size,)),
            'center_y_res_labels': tf.placeholder(tf.float32, shape=(batch_size,)),
            'heading_bin_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'heading_res_labels': tf.placeholder(tf.float32, shape=(batch_size,)),
            'size_class_labels': tf.placeholder(tf.int32, shape=(batch_size,)),
            'size_res_labels': tf.placeholder(tf.float32, shape=(batch_size, 3)),
            'gt_box_of_prop': tf.placeholder(tf.float32, shape=(batch_size, 8, 3)),
            'img_inputs': tf.placeholder(tf.float32, shape=(batch_size, 360, 1200, 3)),
            'calib': tf.placeholder(tf.float32, shape=(batch_size, 3, 4)),
            'train_regression': tf.placeholder(tf.bool, shape=(batch_size,)),
            'img_seg_map': tf.placeholder(tf.float32, shape=(batch_size, 360, 1200, 4)),
            'is_training_pl': tf.placeholder(tf.bool, shape=())
        }

    def build_img_extractor(self):
        self._img_pixel_size = np.asarray([360, 1200])
        VGG_config = namedtuple('VGG_config', 'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(VGG_config(**{
            'vgg_conv1': [2, 32],
            'vgg_conv2': [2, 64],
            'vgg_conv3': [3, 128],
            'vgg_conv4': [3, 256],
            'l2_weight_decay': 0.0005
        }))
        self._img_preprocessed = \
            self._img_feature_extractor.preprocess_input(
                self.placeholders['img_inputs'], self._img_pixel_size)
        self.img_feature_maps, self.img_end_points = \
            self._img_feature_extractor.build(
                self._img_preprocessed,
                self._img_pixel_size,
                self.is_training)
        #return self.img_feature_maps
        self.img_bottleneck = slim.conv2d(
            self.img_feature_maps,
            1, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            normalizer_params={
                'is_training': self.is_training})
        #tf.summary.image('img_feature', tf.reduce_max(self.img_bottleneck,axis=-1,keepdims=True),max_outputs=3)
        return self.img_bottleneck

    def build(self):
        point_cloud = self.placeholders['pointclouds']
        is_training = self.placeholders['is_training_pl']
        batch_size = self.batch_size
        # image
        '''
        img_bottleneck = self.build_img_extractor()
        box2d_corners, box2d_corners_norm = projection.tf_project_to_image_space(
            self.placeholders['proposal_boxes'],
            self.placeholders['calib'], self._img_pixel_size)
        img_rois = tf.image.crop_and_resize(
            img_bottleneck,
            box2d_corners_norm,
            tf.range(0, batch_size),
            [16,16])
        '''
        seg_softmax = self.placeholders['img_seg_map']
        seg_pred = tf.expand_dims(tf.argmax(seg_softmax, axis=-1), axis=-1)
        self._img_pixel_size = np.asarray([360, 1200])
        box2d_corners, box2d_corners_norm = projection.tf_project_to_image_space(
            self.placeholders['proposal_boxes'],
            self.placeholders['calib'], self._img_pixel_size)
        # y1, x1, y2, x2
        box2d_corners_norm_reorder = tf.stack([
            tf.gather(box2d_corners_norm, 1, axis=-1),
            tf.gather(box2d_corners_norm, 0, axis=-1),
            tf.gather(box2d_corners_norm, 3, axis=-1),
            tf.gather(box2d_corners_norm, 2, axis=-1),
        ], axis=-1)
        img_rois = tf.image.crop_and_resize(
            seg_softmax,
            #seg_pred,
            box2d_corners_norm_reorder,
            tf.range(0, batch_size),
            [16,16])
        self.end_points['img_rois'] = img_rois
        self.end_points['box2d_corners_norm_reorder'] = box2d_corners_norm_reorder

        l0_xyz = tf.slice(point_cloud, [0,0,0], [-1,-1,3])
        l0_points = tf.slice(point_cloud, [0,0,3], [-1,-1,self.num_channel-3])
        # Set abstraction layers
        l1_xyz, l1_points, _ = pointnet_sa_module(l0_xyz, l0_points,
            npoint=128, radius=0.2, nsample=64, mlp=[128,128,128],
            mlp2=None, group_all=False, is_training=is_training, bn_decay=self.bn_decay,
            scope='rcnn-sa1', bn=True)
        l2_xyz, l2_points, _ = pointnet_sa_module(l1_xyz, l1_points,
            npoint=64, radius=0.4, nsample=64, mlp=[128,128,256],
            mlp2=None, group_all=False, is_training=is_training, bn_decay=self.bn_decay,
            scope='rcnn-sa2', bn=True)
        l3_xyz, l3_points, _ = pointnet_sa_module(l2_xyz, l2_points,
            npoint=64, radius=0.4, nsample=64, mlp=[256,256,512],
            mlp2=None, group_all=True, is_training=is_training, bn_decay=self.bn_decay,
            scope='rcnn-sa3', bn=True)

        point_feats = tf.reshape(l3_points, [batch_size, -1])
        img_feats = tf.reshape(img_rois, [batch_size, -1])
        feats = tf.concat([point_feats, img_feats], axis=-1)
        #tf.summary.scalar('img_features', tf.reduce_mean(img_feats))
        #tf.summary.scalar('point_features', tf.reduce_mean(point_feats))

        # Classification
        cls_net = tf_util.fully_connected(img_feats, 256, bn=True, is_training=is_training, scope='rcnn-cls-fc1', bn_decay=self.bn_decay)
        #cls_net = tf_util.fully_connected(point_feats, 256, bn=True, is_training=is_training, scope='rcnn-cls-fc1', bn_decay=self.bn_decay)
        #cls_net = tf_util.fully_connected(feats, 256, bn=True, is_training=is_training, scope='rcnn-cls-fc1', bn_decay=self.bn_decay)
        cls_net = tf_util.dropout(cls_net, keep_prob=0.5, is_training=is_training, scope='rcnn-cls-dp1')
        cls_net = tf_util.fully_connected(cls_net, 256, bn=True, is_training=is_training, scope='rcnn-cls-fc2', bn_decay=self.bn_decay)
        cls_net = tf_util.dropout(cls_net, keep_prob=0.5, is_training=is_training, scope='rcnn-cls-dp2')
        cls_net = tf_util.fully_connected(cls_net, NUM_OBJ_CLASSES, activation_fn=None, scope='rcnn-cls-fc3')
        self.end_points['cls_logits'] = cls_net

        # Box estimation
        cls_label_pred = tf.argmax(tf.nn.softmax(cls_net), axis=1)
        one_hot_pred = tf.one_hot(cls_label_pred, NUM_OBJ_CLASSES)
        one_hot_gt = tf.one_hot(self.placeholders['class_labels'], NUM_OBJ_CLASSES)
        one_hot_vec = tf.cond(is_training, lambda: one_hot_gt, lambda: one_hot_pred)
        est_input = tf.concat([point_feats, one_hot_vec], axis=1)
        net = tf_util.fully_connected(est_input, 512, bn=True,
            is_training=is_training, scope='rcnn-est-fc1', bn_decay=self.bn_decay)
        net = tf_util.fully_connected(net, 256, bn=True,
            is_training=is_training, scope='rcnn-est-fc2', bn_decay=self.bn_decay)
        net = tf_util.fully_connected(net, 512, bn=True,
            is_training=is_training, scope='rcnn-est-fc3', bn_decay=self.bn_decay)
        # The first NUM_CENTER_BIN*2*2: CENTER_BIN class scores and bin residuals for (x,z)
        # next 1: center residual for y
        # next NUM_HEADING_BIN*2: heading bin class scores and residuals
        # next NUM_SIZE_CLUSTER*4: size cluster class scores and residuals (l,w,h)
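        # e.g. with NUM_CENTER_BIN=12, NUM_HEADING_BIN=12, NUM_SIZE_CLUSTER=8
        # (illustrative values), the output width is 12*2*2 + 1 + 12*2 + 8*4 = 105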
        output = tf_util.fully_connected(net,
            NUM_CENTER_BIN*2*2+1+NUM_HEADING_BIN*2+NUM_SIZE_CLUSTER*4,
            activation_fn=None, scope='rcnn-est-out')
        self.parse_output_to_tensors(output)
        self.get_output_boxes()

    def parse_output_to_tensors(self, output):
        ''' Parse batch output to separate tensors (added to end_points)'''
        batch_size = self.batch_size
        # objectness and center
        #end_points['objectness'] = tf.slice(output, [0,0,0], [-1,-1,2])
        center_x_scores = tf.slice(output, [0,0], [-1,NUM_CENTER_BIN])
        center_x_residuals_normalized = tf.slice(output, [0,NUM_CENTER_BIN],
            [-1,NUM_CENTER_BIN])
        self.end_points['center_x_scores'] = center_x_scores # (B,NUM_CENTER_BIN)
        self.end_points['center_x_residuals_normalized'] = \
            center_x_residuals_normalized # (B,NUM_CENTER_BIN)
        center_z_scores = tf.slice(output, [0,NUM_CENTER_BIN*2], [-1,NUM_CENTER_BIN])
        center_z_residuals_normalized = tf.slice(output, [0,NUM_CENTER_BIN*3],
            [-1,NUM_CENTER_BIN])
        self.end_points['center_z_scores'] = center_z_scores # (B,NUM_CENTER_BIN)
        self.end_points['center_z_residuals_normalized'] = \
            center_z_residuals_normalized # (B,NUM_CENTER_BIN)
        self.end_points['center_y_residuals'] = tf.slice(output, [0,NUM_CENTER_BIN*4], [-1,1])
        # heading
        heading_scores = tf.slice(output, [0,NUM_CENTER_BIN*4+1], [-1,NUM_HEADING_BIN])
        heading_residuals_normalized = tf.slice(output, [0,NUM_CENTER_BIN*4+1+NUM_HEADING_BIN],
            [-1,NUM_HEADING_BIN])
        self.end_points['heading_scores'] = heading_scores # (B,NUM_HEADING_BIN)
        self.end_points['heading_residuals_normalized'] = heading_residuals_normalized # (B,NUM_HEADING_BIN)
        # end_points['heading_residuals'] = \
        #     heading_residuals_normalized * (np.pi/NUM_HEADING_BIN) # BxNUM_HEADING_BIN
        # size
        size_scores = tf.slice(output, [0,NUM_CENTER_BIN*4+1+NUM_HEADING_BIN*2],
            [-1,NUM_SIZE_CLUSTER]) # BxNUM_SIZE_CLUSTER
        size_residuals_normalized = tf.slice(output,
            [0,NUM_CENTER_BIN*4+1+NUM_HEADING_BIN*2+NUM_SIZE_CLUSTER], [-1,NUM_SIZE_CLUSTER*3])
        size_residuals_normalized = tf.reshape(size_residuals_normalized,
            [batch_size, NUM_SIZE_CLUSTER, 3])
        self.end_points['size_scores'] = size_scores
        self.end_points['size_residuals_normalized'] = size_residuals_normalized
        # end_points['size_residuals'] = size_residuals_normalized * \
        #     tf.expand_dims(tf.constant(type_mean_size, dtype=tf.float32), 0)
        return self.end_points

    def get_output_boxes(self):
        end_points = {}
        # adapt the dimension
        for k in ['center_x_scores', 'center_x_residuals_normalized',
            'center_z_scores', 'center_z_residuals_normalized',
            'center_y_residuals', 'heading_scores', 'heading_residuals_normalized',
            'size_scores', 'size_residuals_normalized']:
            end_points[k] = tf.expand_dims(self.end_points[k], axis=1)
        box_center, box_angle, box_size = self.box_encoder.tf_decode(end_points)
        box_center = tf.squeeze(box_center, axis=1)
        box_center = box_center + tf.slice(self.placeholders['proposal_boxes'], [0,0], [-1,3])
        box_angle = tf.squeeze(box_angle, axis=1)
        box_angle += tf.gather(self.placeholders['proposal_boxes'], 6, axis=-1) # restore absolute angle
        box_size = tf.squeeze(box_size, axis=1)
        self.end_points['box_center'] = box_center
        self.end_points['box_angle'] = box_angle
        self.end_points['box_size'] = box_size
        corners_3d = get_box3d_corners_helper(box_center, box_angle, box_size)
        self.end_points['box_corners'] = corners_3d
        # box score
        seg_scores = tf.reduce_max(tf.nn.softmax(self.end_points['cls_logits']), axis=-1) # (B,)
        bin_x_scores = tf.reduce_max(tf.nn.softmax(self.end_points['center_x_scores']), axis=-1) # (B,)
        bin_z_scores = tf.reduce_max(tf.nn.softmax(self.end_points['center_z_scores']), axis=-1) # (B,)
        heading_scores = tf.reduce_max(tf.nn.softmax(self.end_points['heading_scores']), axis=-1) # (B,)
        size_scores = tf.reduce_max(tf.nn.softmax(self.end_points['size_scores']), axis=-1) # (B,)
        # confidence = seg_scores + bin_x_scores + bin_z_scores + heading_scores + size_scores
        confidence = seg_scores * bin_x_scores * bin_z_scores * heading_scores * size_scores
        self.end_points['box_score'] = confidence
        return corners_3d

    def get_loss(self):
        end_points = self.end_points
        cls_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(\
            logits=end_points['cls_logits'], labels=self.placeholders['class_labels']))
        tf.summary.scalar('classification loss', cls_loss)
        # is_obj_mask = tf.to_float(tf.not_equal(self.placeholders['class_labels'], 0))
        train_reg_mask = tf.to_float(self.placeholders['train_regression'])
        center_x_cls_loss = tf.reduce_mean(train_reg_mask*tf.nn.sparse_softmax_cross_entropy_with_logits(\
           logits=end_points['center_x_scores'], labels=self.placeholders['center_bin_x_labels']))
        center_z_cls_loss = tf.reduce_mean(train_reg_mask*tf.nn.sparse_softmax_cross_entropy_with_logits(\
           logits=end_points['center_z_scores'], labels=self.placeholders['center_bin_z_labels']))
        bin_x_onehot = tf.one_hot(self.placeholders['center_bin_x_labels'],
            depth=NUM_CENTER_BIN,
            on_value=1, off_value=0, axis=-1) # BxNUM_CENTER_BIN
        # NOTICE: labels['center_x_residuals'] is already normalized
        center_x_residuals_normalized = tf.reduce_sum(end_points['center_x_residuals_normalized']*tf.to_float(bin_x_onehot), axis=-1) # B
        center_x_residuals_dist = tf.norm(self.placeholders['center_x_res_labels'] - center_x_residuals_normalized, axis=-1)
        center_x_res_loss = huber_loss(train_reg_mask*center_x_residuals_dist, delta=1.0)
        bin_z_onehot = tf.one_hot(self.placeholders['center_bin_z_labels'],
            depth=NUM_CENTER_BIN,
            on_value=1, off_value=0, axis=-1) # BxNUM_CENTER_BIN
        center_z_residuals_normalized = tf.reduce_sum(end_points['center_z_residuals_normalized']*tf.to_float(bin_z_onehot), axis=-1) # B
        center_z_residuals_dist = tf.norm(self.placeholders['center_z_res_labels'] - center_z_residuals_normalized, axis=-1)
        center_z_res_loss = huber_loss(train_reg_mask*center_z_residuals_dist, delta=1.0)
        # y is directly regressed
        center_y_residuals_dist = tf.norm(self.placeholders['center_y_res_labels'] - tf.gather(end_points['center_y_residuals'], 0, axis=-1), axis=-1)
        center_y_res_loss = huber_loss(train_reg_mask*center_y_residuals_dist, delta=1.0)
        tf.summary.scalar('center_x class loss', center_x_cls_loss)
        tf.summary.scalar('center_z class loss', center_z_cls_loss)
        tf.summary.scalar('center_x residual loss', center_x_res_loss)
        tf.summary.scalar('center_y residual loss', center_y_res_loss)
        tf.summary.scalar('center_z residual loss', center_z_res_loss)
        # Heading loss
        heading_class_loss = tf.reduce_mean( \
            train_reg_mask*tf.nn.sparse_softmax_cross_entropy_with_logits( \
            logits=end_points['heading_scores'], labels=self.placeholders['heading_bin_labels']))
        hcls_onehot = tf.one_hot(self.placeholders['heading_bin_labels'],
            depth=NUM_HEADING_BIN,
            on_value=1, off_value=0, axis=-1) # BxNUM_HEADING_BIN
        heading_residual_normalized_label = self.placeholders['heading_res_labels']
        heading_res_dist = tf.norm(tf.reduce_sum( \
            end_points['heading_residuals_normalized']*tf.to_float(hcls_onehot), axis=-1) - \
            heading_residual_normalized_label)
        heading_res_loss = huber_loss(train_reg_mask*heading_res_dist, delta=1.0)
        tf.summary.scalar('heading class loss', heading_class_loss)
        tf.summary.scalar('heading residual loss', heading_res_loss)
        # Size loss
        size_class_loss = tf.reduce_mean( \
            train_reg_mask*tf.nn.sparse_softmax_cross_entropy_with_logits( \
            logits=end_points['size_scores'], labels=self.placeholders['size_class_labels']))

        scls_onehot = tf.one_hot(self.placeholders['size_class_labels'],
            depth=NUM_SIZE_CLUSTER,
            on_value=1, off_value=0, axis=-1) # BxNUM_SIZE_CLUSTER
        scls_onehot_tiled = tf.tile(tf.expand_dims( \
            tf.to_float(scls_onehot), -1), [1,1,3]) # BxNUM_SIZE_CLUSTERx3
        predicted_size_residual_normalized = tf.reduce_sum( \
            end_points['size_residuals_normalized']*scls_onehot_tiled, axis=1) # Bx3

        size_residual_label_normalized = self.placeholders['size_res_labels'] # Bx3

        size_dist = tf.norm(size_residual_label_normalized - predicted_size_residual_normalized, axis=-1)
        size_res_loss = huber_loss(train_reg_mask*size_dist, delta=1.0)
        tf.summary.scalar('size class loss', size_class_loss)
        tf.summary.scalar('size residual loss', size_res_loss)

        obj_cls_weight = 1
        cls_weight = 1
        res_weight = 1
        total_loss = obj_cls_weight * cls_loss + \
            cls_weight * (center_x_cls_loss + center_z_cls_loss + heading_class_loss + size_class_loss) + \
            res_weight * (center_x_res_loss + center_z_res_loss + center_y_res_loss + heading_res_loss + size_res_loss)

        loss_endpoints = {
            #'size_class_loss': size_class_loss,
            'size_res_loss': size_res_loss,
            #'heading_class_loss': heading_class_loss,
            #'heading_res_loss': heading_res_loss,
            #'center_x_cls_loss': center_x_cls_loss,
            #'center_z_cls_loss': center_z_cls_loss,
            #'center_x_res_loss': center_x_res_loss,
            #'center_z_res_loss': center_z_res_loss,
            #'center_y_res_loss': center_y_res_loss,
            #'mask_loss': cls_loss
            #'mean_size_label': mean_size_label,
            'size_residuals_normalized': end_points['size_residuals_normalized']
        }
        return total_loss, loss_endpoints
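
A minimal usage sketch for this example; it assumes the repository constants (NUM_OBJ_CLASSES, CENTER_SEARCH_RANGE, NUM_CENTER_BIN, ...) and helper modules are importable, and the batch size, point count and learning rate below are illustrative rather than taken from the original code.

# Hypothetical driver (TF 1.x graph mode); all sizes here are illustrative.
with tf.Graph().as_default():
    model = RCNN(batch_size=32, num_point=512, num_channel=133)
    total_loss, loss_endpoints = model.get_loss()
    train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss)
    # A tf.Session would then feed model.placeholders['pointclouds'],
    # 'proposal_boxes', the *_labels placeholders and 'is_training_pl'
    # for every batch and run train_op.
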
Example #2
class RPN(object):
    """docstring for RPN."""
    def __init__(self,
                 batch_size,
                 num_point,
                 num_channel=4,
                 bn_decay=None,
                 is_training=True):
        self.batch_size = batch_size
        self.num_point = num_point
        self.num_channel = num_channel
        self.bn_decay = bn_decay
        self.is_training = is_training
        self.end_points = {}
        self.box_encoder = BoxEncoder(CENTER_SEARCH_RANGE, NUM_CENTER_BIN,
                                      HEADING_SEARCH_RANGE, NUM_HEADING_BIN)
        self.placeholders = self.get_placeholders()
        self.build()

    def get_placeholders(self):
        batch_size = self.batch_size
        num_point = self.num_point
        return {
            'pointclouds':
            tf.placeholder(tf.float32,
                           shape=(batch_size, num_point, self.num_channel)),
            'img_inputs':
            tf.placeholder(tf.float32, shape=(batch_size, 360, 1200, 3)),
            'calib':
            tf.placeholder(tf.float32, shape=(batch_size, 3, 4)),
            'seg_labels':
            tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'center_bin_x_labels':
            tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'center_bin_z_labels':
            tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'center_x_residuals_labels':
            tf.placeholder(tf.float32, shape=(batch_size, num_point)),
            'center_z_residuals_labels':
            tf.placeholder(tf.float32, shape=(batch_size, num_point)),
            'center_y_residuals_labels':
            tf.placeholder(tf.float32, shape=(batch_size, num_point)),
            'heading_bin_labels':
            tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'heading_residuals_labels':
            tf.placeholder(tf.float32, shape=(batch_size, num_point)),
            'size_class_labels':
            tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'size_residuals_labels':
            tf.placeholder(tf.float32, shape=(batch_size, num_point, 3)),
            'gt_boxes':
            tf.placeholder(tf.float32, shape=(batch_size, None, 8, 3)),
            'gt_box_of_point':
            tf.placeholder(tf.float32, shape=(batch_size, num_point, 8, 3)),
            'img_seg_softmax':
            tf.placeholder(tf.float32,
                           shape=(batch_size, num_point, NUM_SEG_CLASSES)),
            'is_training_pl':
            tf.placeholder(tf.bool, shape=())
        }

    def parse_output_to_tensors(self, output, end_points):
        ''' Parse batch output to separate tensors (added to end_points)
        Input:
            output: TF tensor in shape (B,N,NUM_CENTER_BIN*2*2+1+NUM_HEADING_BIN*2+NUM_SIZE_CLUSTER*4)
            end_points: dict
        Output:
            end_points: dict (updated)
        '''
        batch_size = output.get_shape()[0].value
        npoints = output.get_shape()[1].value
        # objectness and center
        #end_points['objectness'] = tf.slice(output, [0,0,0], [-1,-1,2])
        center_x_scores = tf.slice(output, [0, 0, 0], [-1, -1, NUM_CENTER_BIN])
        center_x_residuals_normalized = tf.slice(output,
                                                 [0, 0, NUM_CENTER_BIN],
                                                 [-1, -1, NUM_CENTER_BIN])
        end_points['center_x_scores'] = center_x_scores  # (B,N,NUM_CENTER_BIN)
        end_points['center_x_residuals_normalized'] = \
            center_x_residuals_normalized # (B,N,NUM_CENTER_BIN)
        center_z_scores = tf.slice(output, [0, 0, NUM_CENTER_BIN * 2],
                                   [-1, -1, NUM_CENTER_BIN])
        center_z_residuals_normalized = tf.slice(output,
                                                 [0, 0, NUM_CENTER_BIN * 3],
                                                 [-1, -1, NUM_CENTER_BIN])
        end_points['center_z_scores'] = center_z_scores  # (B,N,NUM_CENTER_BIN)
        end_points['center_z_residuals_normalized'] = \
            center_z_residuals_normalized # (B,N,NUM_CENTER_BIN)
        end_points['center_y_residuals'] = tf.slice(output,
                                                    [0, 0, NUM_CENTER_BIN * 4],
                                                    [-1, -1, 1])
        # heading
        heading_scores = tf.slice(output, [0, 0, NUM_CENTER_BIN * 4 + 1],
                                  [-1, -1, NUM_HEADING_BIN])
        heading_residuals_normalized = tf.slice(
            output, [0, 0, NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN],
            [-1, -1, NUM_HEADING_BIN])
        end_points['heading_scores'] = heading_scores  # (B,N,NUM_HEADING_BIN)
        end_points[
            'heading_residuals_normalized'] = heading_residuals_normalized  # (B,N,NUM_HEADING_BIN)
        # end_points['heading_residuals'] = \
        #     heading_residuals_normalized * (np.pi/NUM_HEADING_BIN) # BxNUM_HEADING_BIN
        # size
        size_scores = tf.slice(
            output, [0, 0, NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN * 2],
            [-1, -1, NUM_SIZE_CLUSTER])  # (B,N,NUM_SIZE_CLUSTER)
        size_residuals_normalized = tf.slice(output, [
            0, 0,
            NUM_CENTER_BIN * 4 + 1 + NUM_HEADING_BIN * 2 + NUM_SIZE_CLUSTER
        ], [-1, -1, NUM_SIZE_CLUSTER * 3])
        size_residuals_normalized = tf.reshape(
            size_residuals_normalized,
            [batch_size, npoints, NUM_SIZE_CLUSTER, 3])
        end_points['size_scores'] = size_scores
        end_points['size_residuals_normalized'] = size_residuals_normalized
        # end_points['size_residuals'] = size_residuals_normalized * \
        #     tf.expand_dims(tf.constant(type_mean_size, dtype=tf.float32), 0)
        box_center, box_angle, box_size = self.box_encoder.tf_decode(
            end_points)
        box_center = box_center + end_points['fg_points_xyz']
        box_num = batch_size * npoints
        corners_3d = get_box3d_corners_helper(
            tf.reshape(box_center, [box_num, 3]),
            tf.reshape(box_angle, [box_num]),
            tf.reshape(box_size, [box_num, 3]))
        end_points['proposal_boxes'] = tf.reshape(corners_3d,
                                                  [batch_size, npoints, 8, 3])
        return end_points

    def build_img_extractor(self):
        self._img_pixel_size = np.asarray([360, 1200])
        VGG_config = namedtuple(
            'VGG_config',
            'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(
            VGG_config(
                **{
                    'vgg_conv1': [2, 32],
                    'vgg_conv2': [2, 64],
                    'vgg_conv3': [3, 128],
                    'vgg_conv4': [3, 256],
                    'l2_weight_decay': 0.0005
                }))
        self._img_preprocessed = \
            self._img_feature_extractor.preprocess_input(
                self.placeholders['img_inputs'], self._img_pixel_size)
        self.img_feature_maps, self.img_end_points = \
            self._img_feature_extractor.build(
                self._img_preprocessed,
                self._img_pixel_size,
                self.is_training)
        #return self.img_feature_maps
        self.img_bottleneck = slim.conv2d(
            self.img_feature_maps,
            128,
            [1, 1],
            #2, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            #normalizer_fn=None,
            normalizer_params={'is_training': self.is_training})
        return self.img_bottleneck

    def get_segmentation_net(self, point_cloud, is_training, bn_decay,
                             end_points):
        ''' 3D instance segmentation PointNet v2 network.
        Input:
            point_cloud: TF tensor in shape (B,N,4)
                frustum point clouds with XYZ and intensity in point channels
                XYZs are in frustum coordinate
            is_training: TF boolean scalar
            bn_decay: TF float scalar
            end_points: dict
        Output:
            logits: TF tensor in shape (B,N,2), scores for bkg/clutter and object
            end_points: dict
        '''
        l0_xyz = tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3])
        l0_points = tf.slice(point_cloud, [0, 0, 3], [-1, -1, NUM_CHANNEL - 3])

        # Set abstraction layers
        l1_xyz, l1_points = pointnet_sa_module_msg(
            l0_xyz,
            l0_points,
            4096, [0.1, 0.5], [16, 32], [[16, 16, 32], [32, 32, 64]],
            is_training,
            bn_decay,
            scope='layer1',
            bn=True)
        l2_xyz, l2_points = pointnet_sa_module_msg(
            l1_xyz,
            l1_points,
            1024, [0.5, 1.0], [16, 32], [[64, 64, 128], [64, 96, 128]],
            is_training,
            bn_decay,
            scope='layer2',
            bn=True)
        l3_xyz, l3_points = pointnet_sa_module_msg(
            l2_xyz,
            l2_points,
            256, [1.0, 2.0], [16, 32], [[128, 196, 256], [128, 196, 256]],
            is_training,
            bn_decay,
            scope='layer3',
            bn=True)
        l4_xyz, l4_points = pointnet_sa_module_msg(
            l3_xyz,
            l3_points,
            64, [2.0, 4.0], [16, 32], [[256, 256, 512], [256, 384, 512]],
            is_training,
            bn_decay,
            scope='layer4',
            bn=True)

        # Feature Propagation layers
        l3_points = pointnet_fp_module(l3_xyz,
                                       l4_xyz,
                                       l3_points,
                                       l4_points, [512, 512],
                                       is_training,
                                       bn_decay,
                                       scope='fa_layer2',
                                       bn=True)
        l2_points = pointnet_fp_module(l2_xyz,
                                       l3_xyz,
                                       l2_points,
                                       l3_points, [512, 512],
                                       is_training,
                                       bn_decay,
                                       scope='fa_layer3',
                                       bn=True)
        l1_points = pointnet_fp_module(l1_xyz,
                                       l2_xyz,
                                       l1_points,
                                       l2_points, [256, 256],
                                       is_training,
                                       bn_decay,
                                       scope='fa_layer4',
                                       bn=True)
        l0_points = pointnet_fp_module(l0_xyz,
                                       l1_xyz,
                                       tf.concat([l0_xyz, l0_points], axis=-1),
                                       l1_points, [128, 128],
                                       is_training,
                                       bn_decay,
                                       scope='fa_layer5',
                                       bn=True)
        end_points['point_feats'] = tf.concat([l0_xyz, l0_points],
                                              axis=-1)  # (B, N, 3+C1)
        end_points['point_feats_fuse'] = tf.concat(
            [end_points['point_feats'], end_points['point_img_feats']],
            axis=-1)  # (B, N, 3+C1+C2)
        semantic_features = tf.concat(
            [l0_points, end_points['point_img_feats']],
            axis=-1)  # (B, N, C1+C2)
        #end_points['point_feats_fuse'] = end_points['point_feats']
        #semantic_features = l0_points
        # FC layers
        net = tf_util.dropout(semantic_features,
                              keep_prob=0.5,
                              is_training=is_training,
                              scope='dp0')
        net = tf_util.conv1d(net,
                             128,
                             1,
                             padding='VALID',
                             bn=True,
                             is_training=is_training,
                             scope='conv1d-fc1',
                             bn_decay=bn_decay)
        net = tf_util.dropout(net,
                              keep_prob=0.7,
                              is_training=is_training,
                              scope='dp1')
        logits = tf_util.conv1d(net,
                                NUM_SEG_CLASSES,
                                1,
                                padding='VALID',
                                activation_fn=None,
                                scope='conv1d-fc2')
        end_points['foreground_logits'] = logits

        return end_points

    def reduce_proposals(self, end_points):
        '''Use NMS to reduce the number of proposals'''
        batch_size = end_points['fg_points_xyz'].shape[0]
        # confidence
        fg_logits = tf.gather_nd(end_points['foreground_logits'],
                                 end_points['fg_point_indices'])  # (B,M)
        seg_scores = tf.reduce_max(tf.nn.softmax(fg_logits), axis=-1)  # (B,M)
        bin_x_scores = tf.reduce_max(tf.nn.softmax(
            end_points['center_x_scores']),
                                     axis=-1)  # (B,M)
        bin_z_scores = tf.reduce_max(tf.nn.softmax(
            end_points['center_z_scores']),
                                     axis=-1)  # (B,M)
        heading_scores = tf.reduce_max(tf.nn.softmax(
            end_points['heading_scores']),
                                       axis=-1)  # (B,M)
        size_scores = tf.reduce_max(tf.nn.softmax(end_points['size_scores']),
                                    axis=-1)  # (B,M)
        # confidence = seg_scores + bin_x_scores + bin_z_scores + heading_scores + size_scores
        confidence = seg_scores * bin_x_scores * bin_z_scores * heading_scores * size_scores
        confidence.set_shape([batch_size, NUM_FG_POINT])
        end_points['proposal_scores'] = confidence
        # BEV boxes
        boxes_3d = end_points['proposal_boxes']  # (B,M,8,3)
        corners_min = tf.gather(tf.reduce_min(boxes_3d, axis=2), [0, 2],
                                axis=-1)
        corners_max = tf.gather(tf.reduce_max(boxes_3d, axis=2), [0, 2],
                                axis=-1)  # (B,M,2) x,z
        boxes_bev = tf.concat([corners_min, corners_max], axis=-1)  # (B,M,4)
        boxes_bev.set_shape([batch_size, NUM_FG_POINT, 4])

        confidence_unpack = tf.unstack(confidence, axis=0)
        boxes_bev_unpack = tf.unstack(boxes_bev, axis=0)
        #boxes_3d_unpack = tf.unstack(end_points['proposal_boxes'], axis=0)
        #boxes_3d_list = []
        batch_nms_indices = []
        for i in range(len(confidence_unpack)):
            nms_indices = tf.image.non_max_suppression(boxes_bev_unpack[i],
                                                       confidence_unpack[i],
                                                       300)  # at most 300
            #boxes_3d_list.append(tf.gather(boxes_3d_unpack[i], nms_indices))
            nms_indices = tf.pad(
                nms_indices, [[0, NUM_FG_POINT - tf.shape(nms_indices)[0]]],
                mode='CONSTANT',
                constant_values=-1)
            batch_nms_indices.append(nms_indices)
        end_points['nms_indices'] = tf.stack(batch_nms_indices, axis=0)
        return end_points

    def get_region_proposal_net(self, point_feats, is_training, bn_decay,
                                end_points):
        batch_size = point_feats.get_shape()[0].value
        npoints = point_feats.get_shape()[1].value
        point_feats = tf.slice(point_feats, [0, 0, 3],
                               [-1, -1, -1])  # (B, N, D)
        net = tf.reshape(point_feats, [batch_size * npoints, -1])
        # Fully connected layers
        net = tf_util.fully_connected(net,
                                      256,
                                      bn=True,
                                      is_training=is_training,
                                      scope='rp-fc0',
                                      bn_decay=bn_decay)
        #net = tf_util.dropout(net, keep_prob=0.7,
        #    is_training=is_training, scope='rp-dp0')
        net = tf_util.fully_connected(net,
                                      256,
                                      bn=True,
                                      is_training=is_training,
                                      scope='rp-fc1',
                                      bn_decay=bn_decay)
        #net = tf_util.dropout(net, keep_prob=0.7,
        #    is_training=is_training, scope='rp-dp1')
        net = tf_util.fully_connected(net,
                                      512,
                                      bn=True,
                                      is_training=is_training,
                                      scope='rp-fc2',
                                      bn_decay=bn_decay)
        #net = tf_util.dropout(net, keep_prob=0.7,
        #    is_training=is_training, scope='rp-dp2')
        # The first NUM_CENTER_BIN*2*2: CENTER_BIN class scores and bin residuals for (x,z)
        # next 1: center residual for y
        # next NUM_HEADING_BIN*2: heading bin class scores and residuals
        # next NUM_SIZE_CLUSTER*4: size cluster class scores and residuals (l,w,h)
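        # e.g. with NUM_CENTER_BIN=12, NUM_HEADING_BIN=12, NUM_SIZE_CLUSTER=8
        # (illustrative values), each point's proposal vector has
        # 12*2*2 + 1 + 12*2 + 8*4 = 105 channels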
        output = tf_util.fully_connected(net,
                                         NUM_CENTER_BIN * 2 * 2 + 1 +
                                         NUM_HEADING_BIN * 2 +
                                         NUM_SIZE_CLUSTER * 4,
                                         activation_fn=None,
                                         scope='rp-fc3')
        end_points['proposals'] = output
        return output

    def build(self):
        point_cloud = self.placeholders['pointclouds']
        is_training = self.placeholders['is_training_pl']
        mask_label = self.placeholders['seg_labels']
        bn_decay = self.bn_decay
        end_points = self.end_points

        #with tf.device('/gpu:0'):
        img_feature_maps = self.build_img_extractor()  # (B,360,1200,C)
        pts2d = projection.tf_rect_to_image(
            tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3]),
            self.placeholders['calib'])
        pts2d = tf.cast(pts2d, tf.int32)  #(B,N,2)
        indices = tf.concat(
            [
                tf.expand_dims(tf.tile(tf.range(0, self.batch_size),
                                       [self.num_point]),
                               axis=-1),  # (B*N, 1)
                tf.reshape(pts2d, [self.batch_size * self.num_point, 2])
            ],
            axis=-1)  # (B*N,3)
        indices = tf.gather(indices, [0, 2, 1],
                            axis=-1)  # reorder to (batch, y, x): images are indexed row-first
        end_points['point_img_feats'] = tf.reshape(
            tf.gather_nd(img_feature_maps, indices),  # (B*N,C)
            [self.batch_size, self.num_point, -1])  # (B,N,C)

        end_points = self.get_segmentation_net(point_cloud, is_training,
                                               bn_decay, end_points)

        #with tf.device('/gpu:1'):
        #seg_softmax = tf.nn.softmax(end_points['foreground_logits'], axis=-1) + self.placeholders['img_seg_softmax']
        seg_softmax = tf.nn.softmax(end_points['foreground_logits'], axis=-1)
        seg_logits = tf.cond(is_training,
                             lambda: tf.one_hot(mask_label, NUM_SEG_CLASSES),
                             lambda: seg_softmax)
        #end_points['point_feats_fuse'] = tf.concat([end_points['point_feats_fuse'], seg_logits], axis=-1)
        # fg_point_feats includes xyz
        fg_point_feats, end_points = point_cloud_masking(
            end_points['point_feats'], seg_logits, end_points,
            xyz_only=False)  # BxNUM_FG_POINTxD
        proposals = self.get_region_proposal_net(fg_point_feats, is_training,
                                                 bn_decay, end_points)
        proposals_reshaped = tf.reshape(proposals,
                                        [self.batch_size, NUM_FG_POINT, -1])
        # Parse output to 3D box parameters
        end_points = self.parse_output_to_tensors(proposals_reshaped,
                                                  end_points)
        end_points = self.reduce_proposals(end_points)
        # for iou eval
        end_points['gt_box_of_point'] = tf.gather_nd(
            self.placeholders['gt_box_of_point'],
            end_points['fg_point_indices'])
        end_points['gt_box_of_point'].set_shape(
            [self.batch_size, NUM_FG_POINT, 8, 3])
        return end_points

    def get_seg_loss(self):
        pls = self.placeholders
        end_points = self.end_points
        batch_size = self.batch_size
        # 3D Segmentation loss
        mask_loss = focal_loss(
            end_points['foreground_logits'],
            tf.one_hot(pls['seg_labels'], NUM_SEG_CLASSES, axis=-1))
        tf.summary.scalar('mask loss', mask_loss)
        return mask_loss, {}

    def get_loss(self):
        pls = self.placeholders
        end_points = self.end_points
        batch_size = self.batch_size
        # 3D Segmentation loss
        mask_loss = focal_loss(
            end_points['foreground_logits'],
            tf.one_hot(pls['seg_labels'], NUM_SEG_CLASSES, axis=-1))
        tf.summary.scalar('mask loss', mask_loss)
        #return mask_loss, {}
        # gather box estimation labels of foreground points
        labels_fg = {}
        for k in pls.keys():
            if k not in [
                    'center_bin_x_labels',
                    'center_bin_z_labels',
                    'center_x_residuals_labels',
                    'center_z_residuals_labels',
                    'center_y_residuals_labels',
                    'heading_bin_labels',
                    'heading_residuals_labels',
                    'size_class_labels',
                    'size_residuals_labels',
            ]:
                continue
            labels_fg[k] = tf.gather_nd(pls[k], end_points['fg_point_indices'])
            if k == 'size_residuals_labels':
                labels_fg[k].set_shape([batch_size, NUM_FG_POINT, 3])
            else:
                labels_fg[k].set_shape([batch_size, NUM_FG_POINT])
        # Center loss
        center_x_cls_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(\
           logits=end_points['center_x_scores'], labels=labels_fg['center_bin_x_labels']))
        center_z_cls_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(\
           logits=end_points['center_z_scores'], labels=labels_fg['center_bin_z_labels']))
        bin_x_onehot = tf.one_hot(labels_fg['center_bin_x_labels'],
                                  depth=NUM_CENTER_BIN,
                                  on_value=1,
                                  off_value=0,
                                  axis=-1)  # BxNxNUM_CENTER_BIN
        # NOTICE: labels['center_x_residuals'] is already normalized
        center_x_residuals_normalized = tf.reduce_sum(
            end_points['center_x_residuals_normalized'] *
            tf.to_float(bin_x_onehot),
            axis=2)  # BxN
        center_x_residuals_dist = tf.norm(
            labels_fg['center_x_residuals_labels'] -
            center_x_residuals_normalized,
            axis=-1)
        center_x_res_loss = huber_loss(center_x_residuals_dist, delta=2.0)
        bin_z_onehot = tf.one_hot(labels_fg['center_bin_z_labels'],
                                  depth=NUM_CENTER_BIN,
                                  on_value=1,
                                  off_value=0,
                                  axis=-1)  # BxNxNUM_CENTER_BIN
        center_z_residuals_normalized = tf.reduce_sum(
            end_points['center_z_residuals_normalized'] *
            tf.to_float(bin_z_onehot),
            axis=2)  # BxN
        center_z_residuals_dist = tf.norm(
            labels_fg['center_z_residuals_labels'] -
            center_z_residuals_normalized,
            axis=-1)
        center_z_res_loss = huber_loss(center_z_residuals_dist, delta=2.0)
        # y is directly regressed
        center_y_residuals_dist = tf.norm(
            labels_fg['center_y_residuals_labels'] -
            tf.gather(end_points['center_y_residuals'], 0, axis=-1),
            axis=-1)
        center_y_res_loss = huber_loss(center_y_residuals_dist, delta=2.0)
        tf.summary.scalar('center_x class loss', center_x_cls_loss)
        tf.summary.scalar('center_z class loss', center_z_cls_loss)
        tf.summary.scalar('center_x residual loss', center_x_res_loss)
        tf.summary.scalar('center_y residual loss', center_y_res_loss)
        tf.summary.scalar('center_z residual loss', center_z_res_loss)
        # Heading loss
        heading_class_loss = tf.reduce_mean( \
            tf.nn.sparse_softmax_cross_entropy_with_logits( \
            logits=end_points['heading_scores'], labels=labels_fg['heading_bin_labels']))
        hcls_onehot = tf.one_hot(labels_fg['heading_bin_labels'],
                                 depth=NUM_HEADING_BIN,
                                 on_value=1,
                                 off_value=0,
                                 axis=-1)  # BxNxNUM_HEADING_BIN
        heading_residual_normalized_label = labels_fg[
            'heading_residuals_labels']
        heading_res_dist = tf.norm(heading_residual_normalized_label - tf.reduce_sum( \
            end_points['heading_residuals_normalized']*tf.to_float(hcls_onehot), axis=2))
        heading_res_loss = huber_loss(heading_res_dist, delta=1.0)
        tf.summary.scalar('heading class loss', heading_class_loss)
        tf.summary.scalar('heading residual loss', heading_res_loss)
        # Size loss
        size_class_loss = tf.reduce_mean( \
            tf.nn.sparse_softmax_cross_entropy_with_logits( \
            logits=end_points['size_scores'], labels=labels_fg['size_class_labels']))

        scls_onehot = tf.one_hot(labels_fg['size_class_labels'],
                                 depth=NUM_SIZE_CLUSTER,
                                 on_value=1,
                                 off_value=0,
                                 axis=-1)  # BxNxNUM_SIZE_CLUSTER
        scls_onehot_tiled = tf.tile(tf.expand_dims( \
            tf.to_float(scls_onehot), -1), [1,1,1,3]) # BxNxNUM_SIZE_CLUSTERx3
        predicted_size_residual_normalized = tf.reduce_sum( \
            end_points['size_residuals_normalized']*scls_onehot_tiled, axis=2) # BxNx3

        size_residual_label_normalized = labels_fg[
            'size_residuals_labels']  # BxNx3

        size_dist = tf.norm(size_residual_label_normalized -
                            predicted_size_residual_normalized,
                            axis=-1)
        size_res_loss = huber_loss(size_dist, delta=1.0)
        tf.summary.scalar('size class loss', size_class_loss)
        tf.summary.scalar('size residual loss', size_res_loss)

        seg_weight = 0.1
        cls_weight = 10
        res_weight = 10
        total_loss = seg_weight * mask_loss + \
            cls_weight * (center_x_cls_loss + center_z_cls_loss + heading_class_loss + size_class_loss) + \
            res_weight * (center_x_res_loss + center_z_res_loss + center_y_res_loss + heading_res_loss + size_res_loss)
        loss_endpoints = {
            'size_class_loss': size_class_loss,
            'size_res_loss': size_res_loss,
            'heading_class_loss': heading_class_loss,
            'heading_res_loss': heading_res_loss,
            'center_x_cls_loss': center_x_cls_loss,
            'center_z_cls_loss': center_z_cls_loss,
            'center_x_res_loss': center_x_res_loss,
            'center_z_res_loss': center_z_res_loss,
            'center_y_res_loss': center_y_res_loss,
            'mask_loss': mask_loss
        }

        return total_loss, loss_endpoints
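
A usage sketch in the same spirit, assuming the repository constants (NUM_SEG_CLASSES, NUM_FG_POINT, NUM_CENTER_BIN, ...) and helpers are in scope; sizes are illustrative.

# Hypothetical driver; graph construction only.
with tf.Graph().as_default():
    rpn = RPN(batch_size=4, num_point=16384, num_channel=4)
    total_loss, loss_endpoints = rpn.get_loss()
    proposals = rpn.end_points['proposal_boxes']  # (B, NUM_FG_POINT, 8, 3)
    keep = rpn.end_points['nms_indices']          # (B, NUM_FG_POINT), -1 padded
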
Example #3
class SingleStageDetector:
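    """Single-stage 3D detector: a point-cloud backbone fused with image
    features feeds detection heads over generated anchors; targets and losses
    are built in training mode, decoding and post-processing in test mode."""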
    def __init__(self, batch_size, is_training):
        self.batch_size = batch_size
        self.is_training = is_training

        # placeholders
        self.placeholders_builder = PlaceHolders(self.batch_size) 
        self.placeholders_builder.get_placeholders()
        self.placeholders = self.placeholders_builder.placeholders

        self.cls_list = cfg.DATASET.KITTI.CLS_LIST
        self.cls2idx = {cls: i + 1 for i, cls in enumerate(self.cls_list)}
        self.idx2cls = {i + 1: cls for i, cls in enumerate(self.cls_list)}

        # anchor_builder
        self.anchor_builder = Anchors(0, self.cls_list)

        # encoder_decoder
        self.encoder_decoder = EncoderDecoder(0)

        # postprocessor
        self.postprocessor = PostProcessor(0, len(self.cls_list))

        # loss builder
        self.loss_builder = LossBuilder(0)

        self.corner_loss = cfg.MODEL.FIRST_STAGE.CORNER_LOSS

        # head builder
        self.iou_loss = False
        self.heads = []
        head_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.HEAD
        for cur_head_cfg in head_cfg:
            self.heads.append(HeadBuilder(self.batch_size,
                self.anchor_builder.anchors_num, 0, cur_head_cfg, is_training))
            if self.heads[-1].layer_type == 'IoU':
                self.iou_loss = True

        # target assigner
        self.target_assigner = TargetAssigner(0) # first stage

        self.vote_loss = False
        # layer builder
        layer_cfg = cfg.MODEL.NETWORK.FIRST_STAGE.ARCHITECTURE
        layers = []
        for i in range(len(layer_cfg)):
            layers.append(LayerBuilder(i, self.is_training, layer_cfg))
            if layers[-1].layer_type == 'Vote_Layer':
                self.vote_loss = True
        self.layers = layers

        self.attr_velo_loss = cfg.MODEL.FIRST_STAGE.PREDICT_ATTRIBUTE_AND_VELOCITY 

        self.__init_dict()

    def __init_dict(self):
        self.output = dict()
        # sampled xyz/feature
        self.output[maps_dict.KEY_OUTPUT_XYZ] = []
        self.output[maps_dict.KEY_OUTPUT_FEATURE] = []
        # generated anchors
        self.output[maps_dict.KEY_ANCHORS_3D] = [] # generated anchors
        # vote output
        self.output[maps_dict.PRED_VOTE_OFFSET] = []
        self.output[maps_dict.PRED_VOTE_BASE] = []
        # det output
        self.output[maps_dict.PRED_CLS] = []
        self.output[maps_dict.PRED_OFFSET] = []
        self.output[maps_dict.PRED_ANGLE_CLS] = []
        self.output[maps_dict.PRED_ANGLE_RES] = []
        self.output[maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS] = []
        self.output[maps_dict.PRED_ATTRIBUTE] = []
        self.output[maps_dict.PRED_VELOCITY] = []
        # iou output
        self.output[maps_dict.PRED_IOU_3D_VALUE] = []
        # final result
        self.output[maps_dict.PRED_3D_BBOX] = []
        self.output[maps_dict.PRED_3D_SCORE] = []
        self.output[maps_dict.PRED_3D_CLS_CATEGORY] = []
        self.output[maps_dict.PRED_3D_ATTRIBUTE] = []
        self.output[maps_dict.PRED_3D_VELOCITY] = []

        self.prediction_keys = self.output.keys()
        
        self.labels = dict()
        self.labels[maps_dict.GT_CLS] = []
        self.labels[maps_dict.GT_OFFSET] = []
        self.labels[maps_dict.GT_ANGLE_CLS] = []
        self.labels[maps_dict.GT_ANGLE_RES] = []
        self.labels[maps_dict.GT_ATTRIBUTE] = []
        self.labels[maps_dict.GT_VELOCITY] = []
        self.labels[maps_dict.GT_BOXES_ANCHORS_3D] = []
        self.labels[maps_dict.GT_IOU_3D_VALUE] = []

        self.labels[maps_dict.GT_PMASK] = []
        self.labels[maps_dict.GT_NMASK] = []
        self.labels[maps_dict.CORNER_LOSS_GT_BOXES_CORNERS] = []


    def build_img_extractor(self, img_input):
        self._img_pixel_size = np.asarray([360, 1200])
        VGG_config = namedtuple('VGG_config', 'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(VGG_config(**{
            'vgg_conv1': [2, 32],
            'vgg_conv2': [2, 64],
            'vgg_conv3': [3, 128],
            'vgg_conv4': [3, 256],
            'l2_weight_decay': 0.0005
        }))
        self._img_preprocessed = \
            self._img_feature_extractor.preprocess_input(img_input, self._img_pixel_size)
        # self._img_preprocessed = img_input
        self.img_feature_maps, self.img_end_points = \
            self._img_feature_extractor.build(
                self._img_preprocessed,
                self._img_pixel_size,
                self.is_training)

        #return self.img_feature_maps
        self.img_bottleneck = slim.conv2d(
            self.img_feature_maps,
            128, [1, 1],
            #2, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            #normalizer_fn=None,
            normalizer_params={
                'is_training': self.is_training})


        return self.img_bottleneck

    def network_forward(self, point_cloud, bn_decay, img_input):
        l0_xyz = tf.slice(point_cloud, [0,0,0], [-1,-1,3])
        l0_points = tf.slice(point_cloud, [0,0,3], [-1,-1,-1])

        num_point = l0_xyz.get_shape().as_list()[1]

        img_feature_maps = self.build_img_extractor(img_input)
        pts2d = projection.tf_rect_to_image(tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3]),
                                            self.placeholders[maps_dict.PL_CALIB_P2])
        pts2d = tf.cast(pts2d, tf.int32)  # (B,N,2)
        indices = tf.concat([
            tf.expand_dims(tf.tile(tf.range(0, self.batch_size), [num_point]), axis=-1),  # (B*N, 1)
            tf.reshape(pts2d, [self.batch_size * num_point, 2])
        ], axis=-1)  # (B*N,3)
        indices = tf.gather(indices, [0, 2, 1], axis=-1)  # reorder to (batch, y, x): images are indexed row-first
        point_img_feats = tf.reshape(tf.gather_nd(img_feature_maps, indices),  # (B*N,C)
                                     [self.batch_size, num_point, -1])  # (B,N,C)

        xyz_list, feature_list, fps_idx_list, point_img_feats_list = \
            [l0_xyz], [l0_points], [None], [point_img_feats]

        for layer in self.layers:
            xyz_list, feature_list, fps_idx_list, point_img_feats_list = \
                layer.build_layer(xyz_list, feature_list, fps_idx_list,
                                  bn_decay, self.output, point_img_feats_list)

        cur_head_start_idx = len(self.output[maps_dict.KEY_OUTPUT_XYZ])
        for head in self.heads:
            head.build_layer(xyz_list, feature_list, bn_decay, self.output)
        merge_head_prediction(cur_head_start_idx, self.output, self.prediction_keys)


    def model_forward(self, bn_decay=None):
        points_input_det = self.placeholders[maps_dict.PL_POINTS_INPUT]
        img_input_det = self.placeholders[maps_dict.PL_IMG_INPUT]

        # forward the point cloud
        self.network_forward(points_input_det, bn_decay, img_input_det)
 
        # generate anchors
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][-1]
        anchors = self.anchor_builder.generate(base_xyz) # [bs, pts_num, 1/cls_num, 7]
        self.output[maps_dict.KEY_ANCHORS_3D].append(anchors)

        if self.is_training: # training mode
            self.train_forward(-1, anchors) 
        else: # testing mode
            self.test_forward(-1, anchors)


    def train_forward(self, index, anchors):
        """
        Calculating loss
        """
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][index]
        pred_offset = self.output[maps_dict.PRED_OFFSET][index]
        pred_angle_cls = self.output[maps_dict.PRED_ANGLE_CLS][index]
        pred_angle_res = self.output[maps_dict.PRED_ANGLE_RES][index]

        gt_boxes_3d = self.placeholders[maps_dict.PL_LABEL_BOXES_3D]
        gt_classes = self.placeholders[maps_dict.PL_LABEL_CLASSES]
        gt_angle_cls = self.placeholders[maps_dict.PL_ANGLE_CLS]
        gt_angle_res = self.placeholders[maps_dict.PL_ANGLE_RESIDUAL]

        if maps_dict.PL_LABEL_ATTRIBUTES in self.placeholders:
            gt_attributes = self.placeholders[maps_dict.PL_LABEL_ATTRIBUTES]
        else:
            gt_attributes = None

        if maps_dict.PL_LABEL_VELOCITY in self.placeholders:
            gt_velocity = self.placeholders[maps_dict.PL_LABEL_VELOCITY]
        else:
            gt_velocity = None

        returned_list = self.target_assigner.assign(
            base_xyz, anchors, gt_boxes_3d, gt_classes, gt_angle_cls,
            gt_angle_res, gt_velocity, gt_attributes)

        (assigned_idx, assigned_pmask, assigned_nmask, assigned_gt_boxes_3d,
         assigned_gt_labels, assigned_gt_angle_cls, assigned_gt_angle_res,
         assigned_gt_velocity, assigned_gt_attribute) = returned_list

        # encode offset
        assigned_gt_offset, assigned_gt_angle_cls, assigned_gt_angle_res = self.encoder_decoder.encode(base_xyz, assigned_gt_boxes_3d, anchors)

        # corner_loss
        corner_loss_angle_cls = tf.cast(tf.one_hot(assigned_gt_angle_cls, depth=cfg.MODEL.ANGLE_CLS_NUM, on_value=1, off_value=0, axis=-1), tf.float32) # bs, pts_num, cls_num, -1
        pred_anchors_3d = self.encoder_decoder.decode(base_xyz, pred_offset, corner_loss_angle_cls, pred_angle_res, self.is_training, anchors) # [bs, points_num, cls_num, 7]
        pred_corners = transfer_box3d_to_corners(pred_anchors_3d) # [bs, points_num, cls_num, 8, 3] 
        gt_corners = transfer_box3d_to_corners(assigned_gt_boxes_3d) # [bs, points_num, cls_num,8,3]
        self.output[maps_dict.CORNER_LOSS_PRED_BOXES_CORNERS].append(pred_corners)
        self.labels[maps_dict.CORNER_LOSS_GT_BOXES_CORNERS].append(gt_corners)
        

        self.labels[maps_dict.GT_CLS].append(assigned_gt_labels)
        self.labels[maps_dict.GT_BOXES_ANCHORS_3D].append(assigned_gt_boxes_3d)
        self.labels[maps_dict.GT_OFFSET].append(assigned_gt_offset)
        self.labels[maps_dict.GT_ANGLE_CLS].append(assigned_gt_angle_cls)
        self.labels[maps_dict.GT_ANGLE_RES].append(assigned_gt_angle_res)
        self.labels[maps_dict.GT_ATTRIBUTE].append(assigned_gt_attribute)
        self.labels[maps_dict.GT_VELOCITY].append(assigned_gt_velocity)
        self.labels[maps_dict.GT_PMASK].append(assigned_pmask)
        self.labels[maps_dict.GT_NMASK].append(assigned_nmask)

        self.loss_builder.forward(index, self.labels, self.output, self.placeholders, self.corner_loss, self.vote_loss, self.attr_velo_loss, self.iou_loss)


    def test_forward(self, index, anchors):
        base_xyz = self.output[maps_dict.KEY_OUTPUT_XYZ][index]

        pred_cls = self.output[maps_dict.PRED_CLS][index] # [bs, points_num, cls_num + 1/0]
        pred_offset = self.output[maps_dict.PRED_OFFSET][index]
        pred_angle_cls = self.output[maps_dict.PRED_ANGLE_CLS][index]
        pred_angle_res = self.output[maps_dict.PRED_ANGLE_RES][index]

        # decode predictions
        pred_anchors_3d = self.encoder_decoder.decode(base_xyz, pred_offset, pred_angle_cls, pred_angle_res, self.is_training, anchors) # [bs, points_num, cls_num, 7]
        
        # decode classification
        if cfg.MODEL.FIRST_STAGE.CLS_ACTIVATION == 'Softmax':
            # softmax 
            pred_score = tf.nn.softmax(pred_cls)
            pred_score = tf.slice(pred_score, [0, 0, 1], [-1, -1, -1])
        else: # sigmoid
            pred_score = tf.nn.sigmoid(pred_cls)

        # using IoU branch proposed by sparse-to-dense
        if self.iou_loss:
            pred_iou = self.output[maps_dict.PRED_IOU_3D_VALUE][index]
            pred_score = pred_score * pred_iou

        if len(self.output[maps_dict.PRED_ATTRIBUTE]) == 0:
            pred_attribute = None
        else:
            pred_attribute = self.output[maps_dict.PRED_ATTRIBUTE][index]

        if len(self.output[maps_dict.PRED_VELOCITY]) == 0:
            pred_velocity = None
        else:
            pred_velocity = self.output[maps_dict.PRED_VELOCITY][index]

        self.postprocessor.forward(pred_anchors_3d, pred_score, self.output, pred_attribute, pred_velocity)
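
A sketch of how this detector might be driven, assuming the repository's cfg, maps_dict and builder classes are loaded; the batch size is illustrative.

# Hypothetical driver; builds the graph and, in training mode, the losses.
detector = SingleStageDetector(batch_size=2, is_training=True)
detector.model_forward(bn_decay=None)
# With is_training=False, model_forward() instead runs test_forward() and the
# decoded boxes/scores end up in detector.output under maps_dict.PRED_3D_BBOX,
# maps_dict.PRED_3D_SCORE, etc.
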
Example #4
class ImgSegNet(object):
    """docstring for ImgSegNet."""
    def __init__(self,
                 batch_size,
                 num_point,
                 num_channel=4,
                 bn_decay=None,
                 is_training=True):
        self.batch_size = batch_size
        self.num_point = num_point
        self.num_channel = num_channel
        self.bn_decay = bn_decay
        self.is_training = is_training
        self.end_points = {}
        self.placeholders = self.get_placeholders()
        self.build()

    def get_placeholders(self):
        batch_size = self.batch_size
        num_point = self.num_point
        return {
            'pointclouds':
            tf.placeholder(tf.float32,
                           shape=(batch_size, num_point, self.num_channel)),
            'img_inputs':
            tf.placeholder(tf.float32, shape=(batch_size, 360, 1200, 3)),
            'calib':
            tf.placeholder(tf.float32, shape=(batch_size, 3, 4)),
            'seg_labels':
            tf.placeholder(tf.int32, shape=(batch_size, num_point)),
            'is_training_pl':
            tf.placeholder(tf.bool, shape=())
        }

    def build(self):
        point_cloud = self.placeholders['pointclouds']
        self._img_pixel_size = np.asarray([360, 1200])
        bn_decay = self.bn_decay
        is_training = self.placeholders['is_training_pl']
        VGG_config = namedtuple(
            'VGG_config',
            'vgg_conv1 vgg_conv2 vgg_conv3 vgg_conv4 l2_weight_decay')
        self._img_feature_extractor = ImgVggPyr(
            VGG_config(
                **{
                    'vgg_conv1': [2, 32],
                    'vgg_conv2': [2, 64],
                    'vgg_conv3': [3, 128],
                    'vgg_conv4': [3, 256],
                    'l2_weight_decay': 0.0005
                }))
        self._img_preprocessed = \
            self._img_feature_extractor.preprocess_input(
                self.placeholders['img_inputs'], self._img_pixel_size)
        self.img_feature_maps, self.img_end_points = \
            self._img_feature_extractor.build(
                self._img_preprocessed,
                self._img_pixel_size,
                self.is_training)
        '''
        self.seg_logits = slim.conv2d(
            self.img_feature_maps,
            NUM_SEG_CLASSES, [1, 1],
            scope='bottleneck',
            normalizer_fn=slim.batch_norm,
            #normalizer_fn=None,
            normalizer_params={
                'is_training': self.is_training})
        '''

        pts2d = projection.tf_rect_to_image(
            tf.slice(point_cloud, [0, 0, 0], [-1, -1, 3]),
            self.placeholders['calib'])
        pts2d = tf.cast(pts2d, tf.int32)  #(B,N,2)
        indices = tf.concat(
            [
                tf.expand_dims(tf.tile(tf.range(0, self.batch_size),
                                       [self.num_point]),
                               axis=-1),  # (B*N, 1)
                tf.reshape(pts2d, [self.batch_size * self.num_point, 2])
            ],
            axis=-1)  # (B*N,3)
        indices = tf.gather(indices, [0, 2, 1],
                            axis=-1)  # reorder to (batch, y, x): images are indexed row-first
        self.end_points['point_img_feats'] = tf.reshape(
            tf.gather_nd(self.img_feature_maps, indices),  # (B*N,C)
            [self.batch_size, self.num_point, -1])  # (B,N,C)

        net = tf_util.conv1d(self.end_points['point_img_feats'],
                             128,
                             1,
                             padding='VALID',
                             bn=True,
                             is_training=is_training,
                             scope='img-seg-conv1d-fc1',
                             bn_decay=bn_decay)
        net = tf_util.dropout(net,
                              keep_prob=0.7,
                              is_training=is_training,
                              scope='img-seg-dp1')
        logits = tf_util.conv1d(net,
                                NUM_SEG_CLASSES,
                                1,
                                padding='VALID',
                                activation_fn=None,
                                scope='img-seg-conv1d-fc2')
        self.end_points['foreground_logits'] = logits

    def get_seg_softmax(self):
        img_seg_softmax = tf.nn.softmax(self.end_points['foreground_logits'],
                                        axis=-1)
        return img_seg_softmax

    def get_loss(self):
        pls = self.placeholders
        end_points = self.end_points
        batch_size = self.batch_size
        # 3D Segmentation loss
        mask_loss = focal_loss(
            end_points['foreground_logits'],
            tf.one_hot(pls['seg_labels'], NUM_SEG_CLASSES, axis=-1))
        tf.summary.scalar('mask loss', mask_loss)
        return mask_loss
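
Usage sketch with the same caveats (repository helpers and NUM_SEG_CLASSES in scope; sizes illustrative).

# Hypothetical driver for the image-based point segmentation network.
with tf.Graph().as_default():
    seg_net = ImgSegNet(batch_size=4, num_point=16384)
    img_seg_softmax = seg_net.get_seg_softmax()  # (B, N, NUM_SEG_CLASSES)
    mask_loss = seg_net.get_loss()               # focal loss on point labels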