def interp(w, i, channel_dim):
    '''
    Input:
        w: A 4D block tensor of shape (n, h, w, c)
        i: A list of 3-tuples [(x_1, y_1, z_1), (x_2, y_2, z_2), ...],
           each having type (int, float, float)

        The 4D block represents a batch of 3D image feature volumes with c
        channels. The input i is a list of points to index into w via
        interpolation. Direct indexing is not possible due to y_1 and z_1
        being float values.
    Output:
        A list of the values:
            [
                w[x_1, y_1, z_1, :]
                w[x_2, y_2, z_2, :]
                ...
                w[x_k, y_k, z_k, :]
            ]
        of the same length == len(i)
    '''
    w_as_vector = tf.reshape(w, [-1, channel_dim])  # gather expects w to be 1-d
    upper_l = tf.to_int32(tf_concat(
        1, [i[:, 0:1], tf.floor(i[:, 1:2]), tf.floor(i[:, 2:3])]))
    upper_r = tf.to_int32(tf_concat(
        1, [i[:, 0:1], tf.floor(i[:, 1:2]), tf.ceil(i[:, 2:3])]))
    lower_l = tf.to_int32(tf_concat(
        1, [i[:, 0:1], tf.ceil(i[:, 1:2]), tf.floor(i[:, 2:3])]))
    lower_r = tf.to_int32(tf_concat(
        1, [i[:, 0:1], tf.ceil(i[:, 1:2]), tf.ceil(i[:, 2:3])]))

    upper_l_idx = to_idx(upper_l, tf.shape(w))
    upper_r_idx = to_idx(upper_r, tf.shape(w))
    lower_l_idx = to_idx(lower_l, tf.shape(w))
    lower_r_idx = to_idx(lower_r, tf.shape(w))

    upper_l_value = tf.gather(w_as_vector, upper_l_idx)
    upper_r_value = tf.gather(w_as_vector, upper_r_idx)
    lower_l_value = tf.gather(w_as_vector, lower_l_idx)
    lower_r_value = tf.gather(w_as_vector, lower_r_idx)

    alpha_lr = tf.expand_dims(i[:, 2] - tf.floor(i[:, 2]), 1)
    alpha_ud = tf.expand_dims(i[:, 1] - tf.floor(i[:, 1]), 1)

    upper_value = (1 - alpha_lr) * upper_l_value + alpha_lr * upper_r_value
    lower_value = (1 - alpha_lr) * lower_l_value + alpha_lr * lower_r_value
    value = (1 - alpha_ud) * upper_value + alpha_ud * lower_value
    return value
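# For reference: interp() above is plain bilinear blending of the four
# integer-indexed neighbors. Below is a minimal NumPy sketch of the same math
# for a single point -- a hypothetical helper for illustration only, not used
# by this module.
def _interp_np_reference(w, n, y, x):
    """Bilinear lookup of w[n, y, x, :] for float y, x (illustrative only)."""
    import numpy as np
    y0, x0 = int(np.floor(y)), int(np.floor(x))
    y1, x1 = int(np.ceil(y)), int(np.ceil(x))
    a_x = x - np.floor(x)  # left/right weight, matches alpha_lr above
    a_y = y - np.floor(y)  # up/down weight, matches alpha_ud above
    upper = (1 - a_x) * w[n, y0, x0, :] + a_x * w[n, y0, x1, :]
    lower = (1 - a_x) * w[n, y1, x0, :] + a_x * w[n, y1, x1, :]
    return (1 - a_y) * upper + a_y * lower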
def to_x1y1x2y2(box):
    w = tf.maximum(box[:, 2:3], 1)
    h = tf.maximum(box[:, 3:4], 1)
    x1 = box[:, 0:1] - w / 2
    x2 = box[:, 0:1] + w / 2
    y1 = box[:, 1:2] - h / 2
    y2 = box[:, 1:2] + h / 2
    return tf_concat(1, [x1, y1, x2, y2])
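# Worked example (values illustrative): a center-format box
# (cx, cy, w, h) = (10, 20, 4, 6) maps to corner format as
#   x1 = 10 - 4/2 = 8,  x2 = 10 + 4/2 = 12,
#   y1 = 20 - 6/2 = 17, y2 = 20 + 6/2 = 23,
# so to_x1y1x2y2(tf.constant([[10., 20., 4., 6.]])) evaluates to
# [[8., 17., 12., 23.]]. Widths and heights below 1 are clamped to 1 px by
# the tf.maximum calls above.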
def bilinear_select(H, pred_boxes, early_feat, early_feat_channels,
                    w_offset, h_offset):
    '''
    Function used for rezooming high level feature maps. Uses bilinear
    interpolation to select all channels at index (x, y) for a high level
    feature map, where x and y are floats.
    '''
    grid_size = H['grid_width'] * H['grid_height']
    outer_size = grid_size * H['batch_size']

    fine_stride = 8.  # pixels per 60x80 grid cell in 480x640 image
    coarse_stride = H['region_size']  # pixels per 15x20 grid cell in 480x640 image

    batch_ids = []
    x_offsets = []
    y_offsets = []
    for n in range(H['batch_size']):
        for i in range(H['grid_height']):
            for j in range(H['grid_width']):
                for k in range(H['rnn_len']):
                    batch_ids.append([n])
                    x_offsets.append([coarse_stride / 2. + coarse_stride * j])
                    y_offsets.append([coarse_stride / 2. + coarse_stride * i])

    batch_ids = tf.constant(batch_ids)
    x_offsets = tf.constant(x_offsets)
    y_offsets = tf.constant(y_offsets)

    pred_boxes_r = tf.reshape(pred_boxes, [outer_size * H['rnn_len'], 4])
    scale_factor = coarse_stride / fine_stride  # scale difference between 15x20 and 60x80 features

    pred_x_center = (pred_boxes_r[:, 0:1] + w_offset * pred_boxes_r[:, 2:3] +
                     x_offsets) / fine_stride
    pred_x_center_clip = tf.clip_by_value(pred_x_center, 0,
                                          scale_factor * H['grid_width'] - 1)
    pred_y_center = (pred_boxes_r[:, 1:2] + h_offset * pred_boxes_r[:, 3:4] +
                     y_offsets) / fine_stride
    pred_y_center_clip = tf.clip_by_value(pred_y_center, 0,
                                          scale_factor * H['grid_height'] - 1)

    interp_indices = tf_concat(
        1, [tf.to_float(batch_ids), pred_y_center_clip, pred_x_center_clip])
    return interp_indices
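# Worked example of the coordinate mapping above, assuming the strides named
# in the comments (H['region_size'] == 32, fine_stride == 8., so
# scale_factor == 4): the center of coarse cell (i, j) = (3, 5) sits at pixel
# offsets y_off = 32/2 + 32*3 = 112 and x_off = 32/2 + 32*5 = 176. With a
# predicted x displacement of 2.5 px, the fine-map x index is
# (2.5 + 176) / 8 = 22.3125, which interp() resolves by blending fine cells
# 22 and 23. The clips keep indices inside [0, scale_factor*grid_width - 1]
# and [0, scale_factor*grid_height - 1].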
def rezoom(H, pred_boxes, early_feat, early_feat_channels, w_offsets,
           h_offsets):
    '''
    Rezoom into a feature map at multiple interpolation points in a grid.

    If the predicted object center is at X, len(w_offsets) == 3, and
    len(h_offsets) == 5, the rezoom grid will look as follows:

        [o o o]
        [o o o]
        [o X o]
        [o o o]
        [o o o]

    Where each letter indexes into the feature map with bilinear interpolation
    '''
    grid_size = H['grid_width'] * H['grid_height']
    outer_size = grid_size * H['batch_size']
    indices = []
    for w_offset in w_offsets:
        for h_offset in h_offsets:
            indices.append(train_utils.bilinear_select(
                H, pred_boxes, early_feat, early_feat_channels,
                w_offset, h_offset))

    interp_indices = tf_concat(0, indices)
    rezoom_features = train_utils.interp(early_feat, interp_indices,
                                         early_feat_channels)
    rezoom_features_r = tf.reshape(
        rezoom_features,
        [len(w_offsets) * len(h_offsets), outer_size, H['rnn_len'],
         early_feat_channels])
    rezoom_features_t = tf.transpose(rezoom_features_r, [1, 2, 0, 3])
    return tf.reshape(
        rezoom_features_t,
        [outer_size, H['rnn_len'],
         len(w_offsets) * len(h_offsets) * early_feat_channels])
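# Shape walk-through for the reshuffle above (illustrative sizes: 3 w-offsets
# x 5 h-offsets = 15 grid points, outer_size = batch_size * grid cells,
# R = rnn_len, C = early_feat_channels):
#   interp() output:  [15 * outer_size * R, C]  (grid points stacked on axis 0)
#   after reshape:    [15, outer_size, R, C]
#   after transpose:  [outer_size, R, 15, C]
#   final reshape:    [outer_size, R, 15 * C]   (one flat vector per box)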
def build_forward(H, x, phase, reuse):
    '''
    Construct the forward model
    '''
    grid_size = H['grid_width'] * H['grid_height']
    outer_size = grid_size * H['batch_size']
    input_mean = 117.
    x -= input_mean
    cnn, early_feat = googlenet_load.model(x, H, reuse)
    early_feat_channels = H['early_feat_channels']
    early_feat = early_feat[:, :, :, :early_feat_channels]

    if H['deconv']:
        size = 3
        stride = 2
        pool_size = 5
        with tf.variable_scope("deconv", reuse=reuse):
            w = tf.get_variable(
                'conv_pool_w',
                shape=[size, size, H['later_feat_channels'],
                       H['later_feat_channels']],
                initializer=tf.random_normal_initializer(stddev=0.01))
            cnn_s = tf.nn.conv2d(cnn, w, strides=[1, stride, stride, 1],
                                 padding='SAME')
            cnn_s_pool = tf.nn.avg_pool(cnn_s[:, :, :, :256],
                                        ksize=[1, pool_size, pool_size, 1],
                                        strides=[1, 1, 1, 1],
                                        padding='SAME')
            cnn_s_with_pool = tf_concat(3, [cnn_s_pool, cnn_s[:, :, :, 256:]])
            cnn_deconv = deconv(
                cnn_s_with_pool,
                output_shape=[H['batch_size'], H['grid_height'],
                              H['grid_width'], 256],
                channels=[H['later_feat_channels'], 256])
            cnn = tf_concat(3, (cnn_deconv, cnn[:, :, :, 256:]))
    elif H['avg_pool_size'] > 1:
        pool_size = H['avg_pool_size']
        cnn1 = cnn[:, :, :, :700]
        cnn2 = cnn[:, :, :, 700:]
        cnn2 = tf.nn.avg_pool(cnn2, ksize=[1, pool_size, pool_size, 1],
                              strides=[1, 1, 1, 1], padding='SAME')
        cnn = tf_concat(3, [cnn1, cnn2])

    cnn = tf.reshape(cnn,
                     [H['batch_size'] * H['grid_width'] * H['grid_height'],
                      H['later_feat_channels']])
    initializer = tf.random_uniform_initializer(-0.1, 0.1)
    with tf.variable_scope('decoder', reuse=reuse, initializer=initializer):
        scale_down = 0.01
        lstm_input = tf.reshape(
            cnn * scale_down,
            (H['batch_size'] * grid_size, H['later_feat_channels']))
        if H['use_lstm']:
            lstm_outputs = build_lstm_inner(H, lstm_input)
        else:
            lstm_outputs = build_overfeat_inner(H, lstm_input)

        pred_boxes = []
        pred_logits = []
        for k in range(H['rnn_len']):
            output = lstm_outputs[k]
            if phase == 'train':
                output = tf.nn.dropout(output, 0.5)
            box_weights = tf.get_variable('box_ip%d' % k,
                                          shape=(H['lstm_size'], 4))
            conf_weights = tf.get_variable(
                'conf_ip%d' % k, shape=(H['lstm_size'], H['num_classes']))

            pred_boxes_step = tf.reshape(
                tf.matmul(output, box_weights) * 50, [outer_size, 1, 4])
            pred_boxes.append(pred_boxes_step)
            pred_logits.append(
                tf.reshape(tf.matmul(output, conf_weights),
                           [outer_size, 1, H['num_classes']]))

        pred_boxes = tf_concat(1, pred_boxes)
        pred_logits = tf_concat(1, pred_logits)
        pred_logits_squash = tf.reshape(
            pred_logits, [outer_size * H['rnn_len'], H['num_classes']])
        pred_confidences_squash = tf.nn.softmax(pred_logits_squash)
        pred_confidences = tf.reshape(
            pred_confidences_squash,
            [outer_size, H['rnn_len'], H['num_classes']])

        if H['use_rezoom']:
            pred_confs_deltas = []
            pred_boxes_deltas = []
            w_offsets = H['rezoom_w_coords']
            h_offsets = H['rezoom_h_coords']
            num_offsets = len(w_offsets) * len(h_offsets)
            rezoom_features = rezoom(H, pred_boxes, early_feat,
                                     early_feat_channels, w_offsets,
                                     h_offsets)
            if phase == 'train':
                rezoom_features = tf.nn.dropout(rezoom_features, 0.5)
            for k in range(H['rnn_len']):
                delta_features = tf_concat(
                    1, [lstm_outputs[k], rezoom_features[:, k, :] / 1000.])
                dim = 128
                delta_weights1 = tf.get_variable(
                    'delta_ip1%d' % k,
                    shape=[H['lstm_size'] + early_feat_channels * num_offsets,
                           dim])
                # TODO: add dropout here?
                ip1 = tf.nn.relu(tf.matmul(delta_features, delta_weights1))
                if phase == 'train':
                    ip1 = tf.nn.dropout(ip1, 0.5)
                delta_confs_weights = tf.get_variable(
                    'delta_ip2%d' % k, shape=[dim, H['num_classes']])
                if H['reregress']:
                    delta_boxes_weights = tf.get_variable(
                        'delta_ip_boxes%d' % k, shape=[dim, 4])
                    pred_boxes_deltas.append(
                        tf.reshape(
                            tf.matmul(ip1, delta_boxes_weights) * 5,
                            [outer_size, 1, 4]))
                scale = H.get('rezoom_conf_scale', 50)
                pred_confs_deltas.append(
                    tf.reshape(
                        tf.matmul(ip1, delta_confs_weights) * scale,
                        [outer_size, 1, H['num_classes']]))
            pred_confs_deltas = tf_concat(1, pred_confs_deltas)
            if H['reregress']:
                pred_boxes_deltas = tf_concat(1, pred_boxes_deltas)
            return (pred_boxes, pred_logits, pred_confidences,
                    pred_confs_deltas, pred_boxes_deltas)

    return pred_boxes, pred_logits, pred_confidences
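# build_forward() is driven entirely by the hyperparameter dict H. A
# hypothetical minimal configuration is sketched below, with sizes chosen to
# match the 480x640 / 15x20-grid comments in this file. Illustrative values
# only; the project's real configuration files may use different settings.
#
# H_example = {
#     'batch_size': 1,
#     'grid_height': 15, 'grid_width': 20,  # 32 px cells over a 480x640 image
#     'region_size': 32,
#     'rnn_len': 1,                         # boxes predicted per grid cell
#     'num_classes': 2,
#     'lstm_size': 500,
#     'early_feat_channels': 256,
#     'later_feat_channels': 832,
#     'use_lstm': True, 'deconv': False, 'avg_pool_size': 1,
#     'use_rezoom': True, 'reregress': True,
#     'rezoom_w_coords': [-0.25, 0.25],
#     'rezoom_h_coords': [-0.25, 0.25],
# }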
def inception_v1_base(inputs, final_endpoint='Mixed_5c', scope='InceptionV1'):
    """Defines the Inception V1 base architecture.

    This architecture is defined in:
        Going deeper with convolutions
        Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
        Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
        http://arxiv.org/pdf/1409.4842v1.pdf.

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        final_endpoint: specifies the endpoint to construct the network up to.
            It can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3',
            'Conv2d_2b_1x1', 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b',
            'Mixed_3c', 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d',
            'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', 'Mixed_5c']
        scope: Optional variable_scope.

    Returns:
        A tuple (net, end_points), where net is the activation at
        final_endpoint and end_points is a dictionary from components of the
        network to the corresponding activation.

    Raises:
        ValueError: if final_endpoint is not set to one of the predefined
            values.
    """
    end_points = {}
    with tf.variable_scope(scope, 'InceptionV1', [inputs]):
        with slim.arg_scope([slim.conv2d, slim.fully_connected],
                            weights_initializer=trunc_normal(0.01)):
            with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                                stride=1, padding='SAME'):
                end_point = 'Conv2d_1a_7x7'
                net = slim.conv2d(inputs, 64, [7, 7], stride=2,
                                  scope=end_point)
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'MaxPool_2a_3x3'
                net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Conv2d_2b_1x1'
                net = slim.conv2d(net, 64, [1, 1], scope=end_point)
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Conv2d_2c_3x3'
                net = slim.conv2d(net, 192, [3, 3], scope=end_point)
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'MaxPool_3a_3x3'
                net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_3b'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 64, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 96, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 128, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 16, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_2 = slim.conv2d(branch_2, 32, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 32, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_3c'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 128, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 128, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 192, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 32, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_2 = slim.conv2d(branch_2, 96, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 64, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'MaxPool_4a_3x3'
                net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_4b'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 192, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 96, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 208, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 16, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_2 = slim.conv2d(branch_2, 48, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 64, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_4c'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 160, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 112, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 224, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 24, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_2 = slim.conv2d(branch_2, 64, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 64, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_4d'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 128, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 128, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 256, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 24, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_2 = slim.conv2d(branch_2, 64, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 64, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_4e'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 112, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 144, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 288, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 32, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_2 = slim.conv2d(branch_2, 64, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 64, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_4f'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 256, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 160, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 320, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 32, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_2 = slim.conv2d(branch_2, 128, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 128, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'MaxPool_5a_2x2'
                net = slim.max_pool2d(net, [2, 2], stride=2, scope=end_point)
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_5b'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 256, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 160, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 320, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 32, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        # scope name kept as-is for checkpoint compatibility
                        branch_2 = slim.conv2d(branch_2, 128, [3, 3],
                                               scope='Conv2d_0a_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 128, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

                end_point = 'Mixed_5c'
                with tf.variable_scope(end_point):
                    with tf.variable_scope('Branch_0'):
                        branch_0 = slim.conv2d(net, 384, [1, 1],
                                               scope='Conv2d_0a_1x1')
                    with tf.variable_scope('Branch_1'):
                        branch_1 = slim.conv2d(net, 192, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_1 = slim.conv2d(branch_1, 384, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_2'):
                        branch_2 = slim.conv2d(net, 48, [1, 1],
                                               scope='Conv2d_0a_1x1')
                        branch_2 = slim.conv2d(branch_2, 128, [3, 3],
                                               scope='Conv2d_0b_3x3')
                    with tf.variable_scope('Branch_3'):
                        branch_3 = slim.max_pool2d(net, [3, 3],
                                                   scope='MaxPool_0a_3x3')
                        branch_3 = slim.conv2d(branch_3, 128, [1, 1],
                                               scope='Conv2d_0b_1x1')
                    net = tf_concat(3, [branch_0, branch_1, branch_2,
                                        branch_3])
                end_points[end_point] = net
                if final_endpoint == end_point:
                    return net, end_points

        raise ValueError('Unknown final endpoint %s' % final_endpoint)
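# Minimal usage sketch (illustrative): build the trunk up to a chosen
# endpoint and read intermediate activations from the returned dictionary.
#
# images = tf.placeholder(tf.float32, [None, 480, 640, 3])
# net, end_points = inception_v1_base(images, final_endpoint='Mixed_4e')
# early = end_points['Mixed_3b']  # higher-resolution, shallower features
# late = end_points['Mixed_4e']   # coarser, deeper features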