def phi_subnet(x, is_training, upsample):
    """
    Arguments:
        x: a float tensor with shape [b, h, w, c].
        is_training: a boolean.
        upsample: an integer.
    Returns:
        a float tensor with shape [b, upsample * h, upsample * w, depth].
    """
    # two conv + batch-norm-relu stages
    for stage in (1, 2):
        x = conv2d_same(x, DEPTH, kernel_size=3, name='conv%d' % stage)
        x = batch_norm_relu(x, is_training, name='bn%d' % stage)

    # tf.image.resize_bilinear expects NHWC, so temporarily
    # move channels last when the model runs channels-first
    needs_transpose = DATA_FORMAT == 'channels_first'
    if needs_transpose:
        x = tf.transpose(x, [0, 2, 3, 1])

    spatial = tf.shape(x)
    target_size = [upsample * spatial[1], upsample * spatial[2]]
    x = tf.image.resize_bilinear(x, target_size)

    if needs_transpose:
        x = tf.transpose(x, [0, 3, 1, 2])
    return x
def class_net(x, is_training, depth, level, num_anchors_per_location):
    """
    Classification head for one FPN level: four 3x3 conv + batch-norm-relu
    stages followed by a conv that predicts per-anchor foreground logits.

    Arguments:
        x: a float tensor with shape [batch_size, depth, height, width].
        is_training: a boolean.
        depth, level, num_anchors_per_location: integers.
    Returns:
        a float tensor with shape [batch_size, num_anchors_per_location, height, width].
    """
    # fix: import hoisted from mid-function to the top of the scope;
    # kept local because the module's import block is outside this view
    import math

    for i in range(4):
        x = conv2d_same(x, depth, kernel_size=3, name='conv3x3_%d' % i)
        # batch-norm names include the level so weights are not shared across levels
        x = batch_norm_relu(x, is_training, name='batch_norm_%d_for_level_%d' % (i, level))

    p = 0.01  # prior probability of foreground
    # note that sigmoid(-log((1 - p) / p)) = p, so the bias makes the
    # untrained network predict probability p for every anchor
    logits = tf.layers.conv2d(
        x, num_anchors_per_location,
        kernel_size=(3, 3), padding='same',
        bias_initializer=tf.constant_initializer(-math.log((1.0 - p) / p)),
        kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        data_format=DATA_FORMAT, name='logits'
    )
    return logits
def box_net(x, is_training, depth, level, num_anchors_per_location):
    """
    Box-regression head for one FPN level.

    Arguments:
        x: a float tensor with shape [batch_size, depth, height, width].
        is_training: a boolean.
        depth, level, num_anchors_per_location: integers.
    Returns:
        a float tensor with shape [batch_size, 4 * num_anchors_per_location, height, width].
    """
    # four shared conv + batch-norm-relu stages
    for i in range(4):
        x = conv2d_same(x, depth, kernel_size=3, name=f'conv3x3_{i}')
        x = batch_norm_relu(x, is_training, name=f'batch_norm_{i}_for_level_{level}')

    # 4 regression targets per anchor
    num_outputs = 4 * num_anchors_per_location
    encoded_boxes = tf.layers.conv2d(
        x, num_outputs,
        kernel_size=(3, 3), padding='same',
        bias_initializer=tf.zeros_initializer(),
        kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        data_format=DATA_FORMAT, name='encoded_boxes'
    )
    return encoded_boxes
def feature_pyramid_network(features, is_training, depth, min_level=3, add_coarse_features=True, scope='fpn'):
    """
    For person detector subnetwork we use min_level=3 and add_coarse_features=True
    (like in the original retinanet paper). For keypoint detector subnetwork we use
    min_level=2 and add_coarse_features=False (like in the original multiposenet paper).

    Arguments:
        features: a dict with four float tensors.
            It must have keys ['c2', 'c3', 'c4', 'c5'].
            Where a number in a name means that a feature has stride `2 ** number`.
        is_training: a boolean.
        depth: an integer.
        min_level: an integer, minimal feature stride
            that will be used is `2 ** min_level`.
            Possible values are [2, 3, 4, 5]
        add_coarse_features: a boolean, whether to add features with strides 64 and 128.
        scope: a string.
    Returns:
        a dict with float tensors.
    """
    with tf.variable_scope(scope):
        # start the top-down pathway from c5
        x = conv2d_same(features['c5'], depth, kernel_size=1, name='lateral5')
        outputs = {'p5': conv2d_same(x, depth, kernel_size=3, name='p5')}

        if add_coarse_features:
            # strides 64 and 128, computed directly from c5
            p6 = conv2d_same(features['c5'], depth, kernel_size=3, stride=2, name='p6')
            outputs['p6'] = p6
            p7_input = batch_norm_relu(p6, is_training, name='pre_p7_bn')
            outputs['p7'] = conv2d_same(p7_input, depth, kernel_size=3, stride=2, name='p7')

        # top-down pathway: upsample and merge with lateral connections
        for level in range(4, min_level - 1, -1):
            lateral = conv2d_same(features[f'c{level}'], depth, kernel_size=1, name=f'lateral{level}')
            x = nearest_neighbor_upsample(x) + lateral
            outputs[f'p{level}'] = conv2d_same(x, depth, kernel_size=3, name=f'p{level}')

        return outputs
def __init__(self, backbone_features, is_training, params):
    """
    Builds the keypoint subnetwork: a small FPN over the backbone, per-level
    phi subnets upsampled to a common resolution, and a final conv that
    predicts NUM_KEYPOINTS heatmaps plus one extra (background/segmentation) channel.

    Arguments:
        backbone_features: a dict with float tensors.
            It contains keys ['c2', 'c3', 'c4', 'c5'].
        is_training: a boolean.
        params: a dict.
    """
    # fix: import hoisted from mid-method to the top of the scope
    import math

    self.enriched_features = feature_pyramid_network(
        backbone_features, is_training, depth=DEPTH, min_level=2,
        add_coarse_features=False, scope='keypoint_fpn')
    # it is a dict with keys ['p2', 'p3', 'p4', 'p5']
    normalized_enriched_features = {
        n: batch_norm_relu(x, is_training, name=f'{n}_batch_norm')
        for n, x in self.enriched_features.items()
    }

    # upsample each level back to the stride-4 resolution of p2
    upsampled_features = []
    for level in range(2, 6):
        with tf.variable_scope(f'phi_subnet_{level}'):
            x = normalized_enriched_features[f'p{level}']
            y = phi_subnet(x, is_training, upsample=2**(level - 2))
        upsampled_features.append(y)

    upsampled_features = tf.concat(
        upsampled_features,
        axis=1 if DATA_FORMAT == 'channels_first' else 3)

    x = conv2d_same(upsampled_features, 64, kernel_size=3, name='final_conv3x3')
    x = batch_norm_relu(x, is_training, name='final_bn')

    p = 0.01  # probability of a keypoint
    # sigmoid(-log((1 - p) / p)) = p
    value = -math.log((1.0 - p) / p)
    # fix: use NUM_KEYPOINTS instead of the hard-coded 17 so the bias vector
    # always has the same length as the conv's NUM_KEYPOINTS + 1 output
    # channels (the last channel keeps a zero bias)
    keypoints_bias = NUM_KEYPOINTS * [value]
    bias_initializer = tf.constant_initializer(keypoints_bias + [0.0])

    self.heatmaps = tf.layers.conv2d(
        x, NUM_KEYPOINTS + 1, kernel_size=1, padding='same',
        bias_initializer=bias_initializer,
        kernel_initializer=tf.random_normal_initializer(stddev=1e-4),
        data_format=DATA_FORMAT, name='heatmaps')

    if DATA_FORMAT == 'channels_first':
        # expose outputs in NHWC regardless of the internal data format
        self.heatmaps = tf.transpose(self.heatmaps, [0, 2, 3, 1])
        self.enriched_features = {
            n: tf.transpose(x, [0, 2, 3, 1])
            for n, x in self.enriched_features.items()
        }