def inference(self, inputs, is_training):
    """Build the encoder-decoder graph and return the per-point logits.

    Each encoder layer calls ``self.gt_res_block``, which returns a pair of
    tensors. Both are sub-sampled with ``inputs['sub_idx'][i]``: the first one
    is fed to the next encoder layer, the second one is stored for the decoder
    skip connections.

    Args:
        inputs: mapping that provides 'features' plus the per-layer sequences
            'xyz', 'neigh_idx', 'sub_idx' and 'interp_idx'.
        is_training: training flag forwarded to batch-norm, dropout and the
            ``helper_tf_util`` layers.

    Returns:
        Output of the final 'fc' layer with axis 2 squeezed out.
    """
    layer_dims = self.config.d_out

    # Stem: dense projection to 8 channels -> batch-norm -> leaky ReLU, then
    # an extra axis is inserted at position 2 for the 2-D conv helpers.
    stem = tf.layers.dense(inputs['features'], 8, activation=None, name='fc0')
    stem = tf.layers.batch_normalization(
        stem, axis=-1, momentum=0.99, epsilon=1e-6, training=is_training)
    feature = tf.expand_dims(tf.nn.leaky_relu(stem), axis=2)

    # ----------------------------- Encoder -----------------------------
    skip_feats = []
    for layer in range(self.config.num_layers):
        # TODO(ES): check 'self.dilated_res_block' and convert to our
        # implementation, GT_res_blocks.
        s_block, f_block = self.gt_res_block(
            feature,
            inputs['xyz'][layer],
            inputs['neigh_idx'][layer],
            layer_dims[layer],
            'Encoder_layer_' + str(layer),
            is_training,
            layer)
        f_down = self.random_sample(f_block, inputs['sub_idx'][layer])
        s_down = self.random_sample(s_block, inputs['sub_idx'][layer])
        feature = s_down
        if layer == 0:
            # The un-sampled output of the first layer is kept as well, so
            # the list ends up with num_layers + 1 entries.
            skip_feats.append(f_block)
        skip_feats.append(f_down)

    # Channel-preserving 1x1 conv between encoder and decoder.
    bottleneck = skip_feats[-1]
    feature = helper_tf_util.conv2d(
        bottleneck, bottleneck.get_shape()[3].value, [1, 1], 'decoder_0',
        [1, 1], 'VALID', True, is_training)

    # ----------------------------- Decoder -----------------------------
    for step in range(self.config.num_layers):
        skip = skip_feats[-step - 2]
        upsampled = self.nearest_interpolation(
            feature, inputs['interp_idx'][-step - 1])
        merged = tf.concat([skip, upsampled], axis=3)
        # After each step `feature` holds the most recent decoder output.
        feature = helper_tf_util.conv2d_transpose(
            merged, skip.get_shape()[-1].value, [1, 1],
            'Decoder_layer_' + str(step), [1, 1], 'VALID',
            bn=True, is_training=is_training)

    # ------------------------ Classification head ------------------------
    head = helper_tf_util.conv2d(
        feature, 64, [1, 1], 'fc1', [1, 1], 'VALID', True, is_training)
    head = helper_tf_util.conv2d(
        head, 32, [1, 1], 'fc2', [1, 1], 'VALID', True, is_training)
    head = helper_tf_util.dropout(
        head, keep_prob=0.5, is_training=is_training, scope='dp1')
    scores = helper_tf_util.conv2d(
        head, self.config.num_classes, [1, 1], 'fc', [1, 1], 'VALID', False,
        is_training, activation_fn=None)
    return tf.squeeze(scores, [2])
def inference(self, inputs, is_training):
    """Build the encoder-only graph and return the head output.

    The encoder stacks ``self.dilated_res_block`` layers, each followed by
    ``self.random_sample``. No decoder is applied: the fully connected head
    reads the last (coarsest) encoder output directly.

    Args:
        inputs: mapping that provides 'features' plus the per-layer sequences
            'xyz', 'neigh_idx' and 'sub_idx'.
        is_training: training flag forwarded to batch-norm, dropout and the
            ``helper_tf_util`` layers.

    Returns:
        Output of the final 'fc' layer (``self.num_fgbg_attributes``
        channels) with axis 2 squeezed out.
    """
    layer_dims = self.config.d_out

    # Stem: dense projection to 8 channels -> batch-norm -> leaky ReLU, then
    # an extra axis is inserted at position 2 for the 2-D conv helpers.
    stem = tf.layers.dense(inputs['features'], 8, activation=None, name='fc0')
    stem = tf.layers.batch_normalization(
        stem, axis=-1, momentum=0.99, epsilon=1e-6, training=is_training)
    feature = tf.expand_dims(tf.nn.leaky_relu(stem), axis=2)

    # ----------------------------- Encoder -----------------------------
    encoder_feats = []
    for layer in range(self.config.num_layers):
        block_out = self.dilated_res_block(
            feature,
            inputs['xyz'][layer],
            inputs['neigh_idx'][layer],
            layer_dims[layer],
            'Encoder_layer_' + str(layer),
            is_training)
        down = self.random_sample(block_out, inputs['sub_idx'][layer])
        feature = down
        if layer == 0:
            encoder_feats.append(block_out)
        encoder_feats.append(down)

    coarsest = encoder_feats[-1]

    # NOTE(review): the result of this 'decoder_0' conv is never read below.
    # The call is kept because it presumably still registers the layer (and
    # its variables) in the graph - confirm before removing it.
    feature = helper_tf_util.conv2d(
        coarsest, coarsest.get_shape()[3].value, [1, 1], 'decoder_0',
        [1, 1], 'VALID', True, is_training)

    # NOTE: an earlier, disabled variant built three separate heads on the
    # coarsest encoder output (bounding boxes, foreground/background and
    # classification), each as fc 64 -> fc 32 -> dropout -> output, and
    # returned all three. Another disabled variant sized the single head
    # below with `self.num_output_attributes`.

    # ------------------------------- Head -------------------------------
    head = helper_tf_util.conv2d(
        coarsest, 64, [1, 1], 'fc1', [1, 1], 'VALID', True, is_training)
    head = helper_tf_util.conv2d(
        head, 32, [1, 1], 'fc2', [1, 1], 'VALID', True, is_training)
    head = helper_tf_util.dropout(
        head, keep_prob=0.5, is_training=is_training, scope='dp1')
    scores = helper_tf_util.conv2d(
        head, self.num_fgbg_attributes, [1, 1], 'fc', [1, 1], 'VALID', False,
        is_training, activation_fn=None)
    return tf.squeeze(scores, [2])
def inference(self, inputs, is_training):
    """Build the encoder-decoder segmentation graph (forward pass).

    Encoder: ``self.dilated_res_block`` followed by ``self.random_sample``
    at every layer. Decoder: ``self.nearest_interpolation`` followed by a
    transposed 1x1 conv over the concatenation with the matching encoder
    feature (skip connection).

    Args:
        inputs: mapping that provides 'features' plus the per-layer sequences
            'xyz', 'neigh_idx', 'sub_idx' and 'interp_idx'.
        is_training: training flag forwarded to batch-norm, dropout and the
            ``helper_tf_util`` layers.

    Returns:
        Segmentation logits: the output of the final 'fc' layer with axis 2
        squeezed out (presumably (B, N, num_classes) - confirm with caller).
    """
    layer_dims = self.config.d_out

    # Stem: dense projection to 8 channels -> batch-norm -> leaky ReLU.
    stem = tf.layers.dense(inputs['features'], 8, activation=None, name='fc0')
    stem = tf.layers.batch_normalization(
        stem, axis=-1, momentum=0.99, epsilon=1e-6, training=is_training)
    # Extra axis at position 2 so the Conv2D-based helpers can be used.
    feature = tf.expand_dims(tf.nn.leaky_relu(stem), axis=2)

    # ----------------------------- Encoder -----------------------------
    # Ends up with num_layers + 1 entries: the un-sampled output of the
    # first layer plus the sub-sampled output of every layer.
    encoder_feats = []
    for layer in range(self.config.num_layers):
        block_out = self.dilated_res_block(
            feature,
            inputs['xyz'][layer],
            inputs['neigh_idx'][layer],
            layer_dims[layer],
            'Encoder_layer_' + str(layer),
            is_training)
        # Down-sample the layer output with the given indices.
        down = self.random_sample(block_out, inputs['sub_idx'][layer])
        feature = down
        if layer == 0:
            encoder_feats.append(block_out)
        encoder_feats.append(down)

    # Transition: point-wise conv that keeps the channel count unchanged.
    coarsest = encoder_feats[-1]
    feature = helper_tf_util.conv2d(
        coarsest, coarsest.get_shape()[3].value, [1, 1], 'decoder_0',
        [1, 1], 'VALID', True, is_training)

    # ----------------------------- Decoder -----------------------------
    for step in range(self.config.num_layers):
        skip = encoder_feats[-step - 2]
        # Up-sample using the interpolation indices, coarsest level first.
        upsampled = self.nearest_interpolation(
            feature, inputs['interp_idx'][-step - 1])
        # Skip connection: concatenate along the channel axis and reduce
        # back to the encoder feature's channel count.
        merged = tf.concat([skip, upsampled], axis=3)
        feature = helper_tf_util.conv2d_transpose(
            merged, skip.get_shape()[-1].value, [1, 1],
            'Decoder_layer_' + str(step), [1, 1], 'VALID',
            bn=True, is_training=is_training)

    # ------------------------ Classification head ------------------------
    # fc 64 -> fc 32 -> dropout -> fc num_classes (no activation).
    head = helper_tf_util.conv2d(
        feature, 64, [1, 1], 'fc1', [1, 1], 'VALID', True, is_training)
    head = helper_tf_util.conv2d(
        head, 32, [1, 1], 'fc2', [1, 1], 'VALID', True, is_training)
    head = helper_tf_util.dropout(
        head, keep_prob=0.5, is_training=is_training, scope='dp1')
    scores = helper_tf_util.conv2d(
        head, self.config.num_classes, [1, 1], 'fc', [1, 1], 'VALID', False,
        is_training, activation_fn=None)
    return tf.squeeze(scores, [2])
def inference(self, inputs, is_training):
    """Build the multi-resolution encoder / adaptive-fusion decoder graph.

    Neighbour, sub-sampling and interpolation indices are computed inside
    the graph (``DP.knn_search`` via ``tf.py_func`` and ``sampling``) from
    the first three feature channels, which are used as point coordinates.
    One decoder chain is built per encoder resolution; the resulting
    full-size feature maps are blended with point-wise softmax weights.

    Args:
        inputs: mapping that provides 'features'.
        is_training: boolean tensor; selects the batch size passed to
            ``sampling`` and is forwarded to the layers.

    Returns:
        Tuple ``(f_out, new_xyz_list, xyz_list)``: the 'fc' output with
        axis 2 squeezed out, the second return value of every
        ``self.bilateral_context_block`` call, and the coordinates fed to
        each encoder layer.
    """
    cfg = self.config
    layer_dims = cfg.d_out
    ratios = cfg.sub_sampling_ratio
    num_neighbours = cfg.k_n

    raw_features = inputs['features']
    cur_xyz = raw_features[:, :, :3]

    stem = tf.layers.dense(raw_features, 8, activation=None, name='fc0')
    stem = tf.layers.batch_normalization(
        stem, axis=-1, momentum=0.99, epsilon=1e-6, training=is_training)
    feature = tf.expand_dims(tf.nn.leaky_relu(stem), axis=2)

    # ----------------------------- Encoder -----------------------------
    encoder_feats = []
    upsample_idx = []
    new_xyz_list = []
    xyz_list = []
    points_left = cfg.num_points
    for layer in range(cfg.num_layers):
        # Sampling step (labelled "Farthest Point Sampling" by the original
        # author; the actual strategy lives in `sampling`).
        neigh_idx = tf.py_func(
            DP.knn_search, [cur_xyz, cur_xyz, num_neighbours], tf.int32)
        points_left = points_left // ratios[layer]
        # Both branches are traced immediately by tf.cond, so the closures
        # see the current values of the loop variables.
        sub_xyz, sub_idx = tf.cond(
            tf.equal(is_training, tf.constant(True)),
            lambda: sampling(
                self.config.batch_size, points_left, cur_xyz, neigh_idx),
            lambda: sampling(
                self.config.val_batch_size, points_left, cur_xyz, neigh_idx))
        interp_idx = tf.py_func(
            DP.knn_search, [sub_xyz, cur_xyz, 1], tf.int32)
        upsample_idx.append(interp_idx)

        # Bilateral context encoding.
        block_out, block_xyz = self.bilateral_context_block(
            feature, cur_xyz, neigh_idx, layer_dims[layer],
            'Encoder_layer_' + str(layer), is_training)
        down = self.random_sample(block_out, sub_idx)
        feature = down
        if layer == 0:
            encoder_feats.append(block_out)
        encoder_feats.append(down)
        xyz_list.append(cur_xyz)
        new_xyz_list.append(block_xyz)
        cur_xyz = sub_xyz

    # ------------------- Decoder: adaptive fusion module -------------------
    full_res_maps = []   # full-sized feature maps, one per start resolution
    fusion_logits = []   # point-wise fusion weights before the softmax
    for n in range(cfg.num_layers):
        start = encoder_feats[-1 - n]
        feature = helper_tf_util.conv2d(
            start, start.get_shape()[3].value, [1, 1], 'decoder_0' + str(n),
            [1, 1], 'VALID', True, is_training)
        # Up-sample all the way back to full resolution. The inner loop
        # always runs at least once, so `feature` ends as its last output.
        for j in range(cfg.num_layers - n):
            skip = encoder_feats[-j - 2 - n]
            upsampled = self.nearest_interpolation(
                feature, upsample_idx[-j - 1 - n])
            merged = tf.concat([skip, upsampled], axis=3)
            feature = helper_tf_util.conv2d_transpose(
                merged, skip.get_shape()[-1].value, [1, 1],
                'Decoder_layer_' + str(n) + '_' + str(j), [1, 1], 'VALID',
                bn=True, is_training=is_training)
        # Collect the full-sized map up-sampled from this resolution.
        full_res_maps.append(feature)
        # Summarise point-level information into a single channel.
        fusion_logits.append(helper_tf_util.conv2d(
            feature, 1, [1, 1], 'Decoder_weight_' + str(n), [1, 1], 'VALID',
            bn=False, activation_fn=None))

    # Regress the fusion parameters: softmax across the decoder chains.
    fusion_weights = tf.concat(fusion_logits, axis=-1)
    fusion_weights = tf.nn.softmax(fusion_weights, axis=-1)

    # Adaptively fuse the maps by computing their weighted sum.
    fused = tf.zeros_like(full_res_maps[-1])
    for idx, fmap in enumerate(full_res_maps):
        weight = tf.expand_dims(fusion_weights[:, :, :, idx], axis=-1)
        weight = tf.tile(weight, [1, 1, 1, fmap.get_shape()[-1].value])
        fused = fused + weight * fmap

    # ------------------------ Classification head ------------------------
    head = helper_tf_util.conv2d(
        fused, 64, [1, 1], 'fc1', [1, 1], 'VALID', True, is_training)
    head = helper_tf_util.conv2d(
        head, 32, [1, 1], 'fc2', [1, 1], 'VALID', True, is_training)
    head = helper_tf_util.dropout(
        head, keep_prob=0.5, is_training=is_training, scope='dp1')
    scores = helper_tf_util.conv2d(
        head, self.config.num_classes, [1, 1], 'fc', [1, 1], 'VALID', False,
        is_training, activation_fn=None)
    f_out = tf.squeeze(scores, [2])
    return f_out, new_xyz_list, xyz_list
def inference(self, inputs, is_training):
    """Build the encoder-query graph (forward pass) and return the logits.

    The encoder stacks ``self.dilated_res_block`` + ``self.random_sample``
    layers. The query network then interpolates features from every
    sub-sampled encoder level at a set of query points and classifies the
    concatenated features with 1-D convolutions. During training only the
    points selected by ``self.weak_label_masks`` are queried; otherwise all
    of ``self.points`` are (per the original notes, to avoid GPU OOM).

    Args:
        inputs: mapping that provides 'features' plus the per-layer sequences
            'xyz', 'neigh_idx' and 'sub_idx'.
        is_training: boolean tensor used as the ``tf.cond`` predicate and
            forwarded to the layers.

    Returns:
        Tuple ``(logits, weak_points_labels)``. When training, ``logits`` has
        axis 1 squeezed out; otherwise it is reshaped to 2-D with the class
        axis last. ``weak_points_labels`` holds ``self.labels`` gathered at
        the weakly labelled positions.
    """
    layer_dims = self.config.d_out  # NOTE(review): original notes say block channels are doubled - confirm
    num_layers = self.config.num_layers

    # The dense + batch-norm stem is disabled in this variant: the raw
    # features only get an extra axis at position 2 for the Conv2D helpers.
    feature = tf.expand_dims(inputs['features'], axis=2)

    # ----------------------------- Encoder -----------------------------
    # Ends up with num_layers + 1 entries: the un-sampled output of the
    # first layer plus the sub-sampled output of every layer.
    encoder_feats = []
    for layer in range(num_layers):
        block_out = self.dilated_res_block(
            feature,
            inputs['xyz'][layer],
            inputs['neigh_idx'][layer],
            layer_dims[layer],
            'Encoder_layer_' + str(layer),
            is_training)
        down = self.random_sample(block_out, inputs['sub_idx'][layer])
        feature = down
        if layer == 0:
            encoder_feats.append(block_out)
        encoder_feats.append(down)

    # -------------------------- Query network --------------------------
    # Positions where the weak-label mask equals 1; they are used to gather
    # the weakly labelled points and their labels.
    # (tf.boolean_mask on the same mask would be an alternative.)
    weak_pos = tf.where(tf.equal(self.weak_label_masks, 1))
    weak_points = tf.gather_nd(self.points, weak_pos)
    weak_points_labels = tf.gather_nd(self.labels, weak_pos)
    # First index column: which batch element each weak point belongs to.
    batch_of_point = weak_pos[:, 0]

    queried = []
    for level in range(num_layers):
        # Offset by one: entry 0 is the un-sampled first-layer output /
        # the original point set.
        support_xyz_all = inputs['xyz'][level + 1]
        support_feat_all = encoder_feats[level + 1]

        # Training: one query point per weak label, with the support set of
        # its own batch element gathered alongside.
        # Otherwise: every point of the batch is a query.
        query_xyz = tf.cond(
            is_training,
            lambda: tf.reshape(
                weak_points, (tf.shape(weak_points)[0], 1, 3)),
            lambda: self.points)
        support_xyz = tf.cond(
            is_training,
            lambda: tf.gather(support_xyz_all, batch_of_point, axis=0),
            lambda: support_xyz_all)
        support_feat = tf.cond(
            is_training,
            lambda: tf.gather(
                tf.squeeze(support_feat_all, axis=2), batch_of_point, axis=0),
            lambda: tf.squeeze(support_feat_all, axis=2))

        queried.append(self.three_nearest_interpolation(
            query_xyz, support_xyz, support_feat))

    # Concatenate the features queried at every level along the last axis.
    head = tf.concat(queried, axis=-1)

    # Classifier: three conv1d layers with batch-norm, dropout, then a
    # final conv1d without batch-norm or activation.
    fc_widths = [256, 128, 64, self.config.num_classes]
    for width, scope in zip(fc_widths[:3], ('fc1', 'fc2', 'fc3')):
        head = helper_tf_util.conv1d(
            head, width, 1, scope, 1, 'VALID', True, is_training)
    head = helper_tf_util.dropout(
        head, keep_prob=0.5, is_training=is_training, scope='dp1')
    raw_logits = helper_tf_util.conv1d(
        head, fc_widths[-1], 1, 'fc4', 1, 'VALID', False, is_training,
        activation_fn=None)

    # Training: drop the singleton query axis. Otherwise: flatten all
    # leading axes so that the class axis stays last.
    logits = tf.cond(
        is_training,
        lambda: tf.squeeze(raw_logits, [1]),
        lambda: tf.reshape(raw_logits, [-1, tf.shape(raw_logits)[-1]]))
    return logits, weak_points_labels