def _run():
  """Builds the grasp network forward pass.

  The grasp image is run through a conv/pool stack. The grasp parameters
  are embedded with fully connected layers and added to the image features,
  followed by more conv/pool layers, optional goal features, fully connected
  layers and the final logit layer.

  Every name other than the locals assigned here (`self`, `images`,
  `grasp_params`, `end_points`, `tile_batch`, ...) is taken from the
  enclosing scope.

  Side effects: appends intermediate tensors to `self.activation_layers`
  and stores named tensors in `end_points`.

  Returns:
    A `(logits, end_points)` tuple.
  """
  # Layers that must stay purely linear: their batch norm / ReLU (if any) is
  # applied explicitly afterwards.
  linear_only = dict(
      activation_fn=None, normalizer_fn=None, normalizer_params=None)

  with slim.arg_scope([slim.dropout], is_training=is_training):
    with slim.arg_scope(
        [slim.conv2d, slim.fully_connected],
        weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
        weights_regularizer=slim.l2_regularizer(self._l2_regularization),
        activation_fn=tf.nn.relu,
        trainable=is_training):
      with slim.arg_scope(
          [slim.conv2d, slim.max_pool2d], stride=1, padding='SAME'):
        with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            normalizer_fn=slim.batch_norm,
            normalizer_params=batch_norm):
          _, grasp_image = images
          net = slim.conv2d(
              grasp_image, 64, [6, 6], stride=2, scope='conv1_1',
              **linear_only)
          # Old checkpoints (such as those used for tests) have no scale
          # parameter on the stand-alone batch norm ops (the ones that are
          # not attached to a conv), so enabling scale through arg_scope
          # alone would break the tests. scale=False is therefore set
          # explicitly on these ops for now. Future users are encouraged to
          # drop scale=False so that batch_norm parameters are consistent
          # throughout the network.
          net = tf.nn.relu(slim.batch_norm(net, scale=False))
          net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool1')
          self.activation_layers.append(net)

          for layer_idx in range(2, 2 + self.num_convs[0]):
            net = slim.conv2d(net, 64, [5, 5], scope='conv%d' % layer_idx)
            self.activation_layers.append(net)

          net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool2')
          end_points['pool2'] = net
          self.activation_layers.append(net)
          logging.debug('pool2')
          logging.debug(net.get_shape())

          # Collect (scope name, tensor) pairs, one per grasp parameter block.
          if grasp_param_names is None:
            named_blocks = [('fcgrasp', grasp_params)]
          else:
            named_blocks = []
            # Note: variables must be created in a deterministic order,
            # otherwise some workers will look for variables on the wrong
            # parameter servers; hence the sorted iteration.
            for block_name in sorted(grasp_param_names):
              offset, size = grasp_param_names[block_name]
              named_blocks.append(
                  (block_name,
                   tf.slice(grasp_params, [0, offset], [-1, size])))

          grasp_param_tensors = [
              slim.fully_connected(block, 256, scope=block_name, **linear_only)
              for block_name, block in named_blocks
          ]
          fcgrasp = tf.add_n(grasp_param_tensors)
          # Same scale=False caveat as for the batch norm after conv1_1.
          fcgrasp = tf.nn.relu(slim.batch_norm(fcgrasp, scale=False))
          fcgrasp = slim.fully_connected(fcgrasp, 64, scope='fcgrasp2')
          context = tf.reshape(fcgrasp, [-1, 1, 1, 64])
          end_points['fcgrasp'] = fcgrasp

          # Tile the image embedding action_batch_size times to align with
          # the expanded action dimension; the same image is used with all
          # the actions in an action batch.
          # net pre expansion should be [batch, *, *, *]
          # net post expansion should be [batch x action_batch, *, *, *]
          if tile_batch:
            net = contrib_seq2seq.tile_batch(net, self._action_batch_size)
          net = tf.add(net, context)
          logging.debug('net post add %s', net)
          end_points['vsum'] = net
          self.activation_layers.append(net)
          logging.debug('vsum')
          logging.debug(net.get_shape())

          for layer_idx in range(2 + sum(self.num_convs[:1]),
                                 2 + sum(self.num_convs[:2])):
            net = slim.conv2d(net, 64, [3, 3], scope='conv%d' % layer_idx)
            logging.debug('conv%d', layer_idx)
            self.activation_layers.append(net)
            logging.debug(net.get_shape())

          net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool3')
          logging.debug('pool3')
          logging.debug(net.get_shape())
          self.activation_layers.append(net)

          for layer_idx in range(2 + sum(self.num_convs[:2]),
                                 2 + sum(self.num_convs[:3])):
            net = slim.conv2d(
                net, 64, [3, 3], scope='conv%d' % layer_idx, padding='VALID')
            self.activation_layers.append(net)
          logging.debug('final conv')
          logging.debug(net.get_shape())
          end_points['final_conv'] = net

          batch_size = tf.shape(net)[0]
          if goal_spatial_fn is not None:
            goal_spatial = goal_spatial_fn()
            # Tile goal to match net batch size (e.g. CEM).
            goal_batch_size = tf.shape(goal_spatial)[0]
            goal_spatial = tf.tile(
                goal_spatial, [batch_size//goal_batch_size, 1, 1, 1])
            # Merging features in style of Fang 2017.
            net = tf.concat([net, goal_spatial], axis=3)
          net = slim.flatten(net, scope='flatten')

          if goal_vector_fn is not None:
            goal_vector = goal_vector_fn()
            goal_batch_size = tf.shape(goal_vector)[0]
            goal_vector = tf.tile(
                goal_vector, [batch_size//goal_batch_size, 1])
            net = tf.concat([net, goal_vector], axis=1)

          for layer_idx in range(self.hid_layers):
            net = slim.fully_connected(net, 64, scope='fc%d' % layer_idx)

          logit_scope = (
              'logit_%d' % num_classes if num_classes > 1 else 'logit')
          logits = slim.fully_connected(
              net, num_classes, scope=logit_scope, **linear_only)
          end_points['logits'] = logits

          to_probabilities = tf.nn.softmax if softmax else tf.nn.sigmoid
          predictions = to_probabilities(logits)
          if tile_batch:
            # Fold the tiled action batch back into its own dimension.
            tiled_shape = [-1, self._action_batch_size]
            if num_classes > 1:
              tiled_shape.append(num_classes)
            predictions = tf.reshape(predictions, tiled_shape)
          end_points['predictions'] = predictions
          return logits, end_points
def yolo_v3(inputs, num_classes, is_training=False, data_format='NCHW', reuse=False, with_spp=False):
    """
    Creates YOLO v3 model.

    :param inputs: a 4-D tensor of size [batch_size, height, width, channels].
        Dimension batch_size may be undefined. The channel order is RGB.
    :param num_classes: number of predicted classes.
    :param is_training: whether is training or not.
    :param data_format: data format NCHW or NHWC.
    :param reuse: whether or not the network and its variables should be reused.
    :param with_spp: whether or not is using spp layer.
    :return: the three detection outputs (``detect_1``, ``detect_2``,
        ``detect_3``) concatenated along axis 1, exposed under the op name
        ``detections``.
    """
    # Height/width of the NHWC input; the detection layers need it later on.
    img_size = inputs.get_shape().as_list()[1:3]

    # transpose the inputs to NCHW
    if data_format == 'NCHW':
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    # normalize values to range [0..1]
    inputs = inputs / 255

    # set batch norm params
    batch_norm_params = {
        'decay': _BATCH_NORM_DECAY,
        'epsilon': _BATCH_NORM_EPSILON,
        'scale': True,
        'is_training': is_training,
        'fused': None,  # Use fused batch norm if possible.
    }

    # Axis along which feature maps are concatenated with the routed tensors.
    channel_axis = 1 if data_format == 'NCHW' else 3

    # Set activation_fn and parameters for conv2d, batch_norm.
    with slim.arg_scope([slim.conv2d, slim.batch_norm, _fixed_padding],
                        data_format=data_format, reuse=reuse):
        with slim.arg_scope([slim.conv2d],
                            normalizer_fn=slim.batch_norm,
                            normalizer_params=batch_norm_params,
                            biases_initializer=None,
                            activation_fn=lambda x: tf.nn.leaky_relu(
                                x, alpha=_LEAKY_RELU)):
            with tf.variable_scope('darknet-53'):
                route_1, route_2, inputs = darknet53(inputs)

            with tf.variable_scope('yolo-v3'):
                # First (coarsest) detection scale; the only one using SPP.
                route, inputs = _yolo_block(inputs, 512, data_format, with_spp)
                detect_1 = _detection_layer(
                    inputs, num_classes, _ANCHORS[6:9], img_size, data_format)
                detect_1 = tf.identity(detect_1, name='detect_1')

                # Second scale: upsample to route_2's size and merge with it.
                inputs = _conv2d_fixed_padding(route, 256, 1)
                upsample_size = route_2.get_shape().as_list()
                inputs = _upsample(inputs, upsample_size, data_format)
                inputs = tf.concat([inputs, route_2], axis=channel_axis)

                # data_format is forwarded explicitly so this block follows
                # the caller's layout instead of the helper's default.
                route, inputs = _yolo_block(inputs, 256, data_format)
                detect_2 = _detection_layer(
                    inputs, num_classes, _ANCHORS[3:6], img_size, data_format)
                detect_2 = tf.identity(detect_2, name='detect_2')

                # Third scale: upsample to route_1's size and merge with it.
                inputs = _conv2d_fixed_padding(route, 128, 1)
                upsample_size = route_1.get_shape().as_list()
                inputs = _upsample(inputs, upsample_size, data_format)
                inputs = tf.concat([inputs, route_1], axis=channel_axis)

                _, inputs = _yolo_block(inputs, 128, data_format)
                detect_3 = _detection_layer(
                    inputs, num_classes, _ANCHORS[0:3], img_size, data_format)
                detect_3 = tf.identity(detect_3, name='detect_3')

                detections = tf.concat([detect_1, detect_2, detect_3], axis=1)
                detections = tf.identity(detections, name='detections')
                return detections
def overfeat(inputs, num_classes=1000, is_training=True, dropout_keep_prob=0.5, spatial_squeeze=True, scope='overfeat', global_pool=False):
  """Builds the OverFeat network.

  Architecture taken from:
    OverFeat: Integrated Recognition, Localization and Detection using
    Convolutional Networks
    Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus and
    Yann LeCun, 2014
    http://arxiv.org/abs/1312.6229

  Note: every fully connected layer is expressed as a conv2d layer. For
        classification, resize the input to 231x231. For fully convolutional
        use, pass spatial_squeeze=False.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer
      is omitted and the features feeding it are returned instead.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether to squeeze the spatial dimensions of the logits.
      Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    global_pool: Optional boolean flag. If True, the input to the
      classification layer is average-pooled to 1x1 for any input size.
      (This is not part of the original OverFeat.)

  Returns:
    net: the output of the logits layer (if num_classes is a non-zero
      integer), or the non-dropped-out input to the logits layer (if
      num_classes is 0 or None).
    end_points: a dict of tensors with intermediate activations.
  """
  with tf.variable_scope(scope, 'overfeat', [inputs]) as var_scope:
    collection_name = var_scope.original_name_scope + '_end_points'
    tracked_layers = [slim.conv2d, slim.fully_connected, slim.max_pool2d]
    # Outputs of conv2d, fully_connected and max_pool2d are gathered into
    # the end-points collection.
    with slim.arg_scope(tracked_layers, outputs_collections=collection_name):
      net = slim.conv2d(
          inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
      net = slim.max_pool2d(net, [2, 2], scope='pool1')
      net = slim.conv2d(net, 256, [5, 5], padding='VALID', scope='conv2')
      net = slim.max_pool2d(net, [2, 2], scope='pool2')
      net = slim.conv2d(net, 512, [3, 3], scope='conv3')
      net = slim.conv2d(net, 1024, [3, 3], scope='conv4')
      net = slim.conv2d(net, 1024, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [2, 2], scope='pool5')

      # The "fully connected" head, implemented with conv2d.
      with slim.arg_scope(
          [slim.conv2d],
          weights_initializer=trunc_normal(0.005),
          biases_initializer=tf.constant_initializer(0.1)):
        net = slim.conv2d(net, 3072, [6, 6], padding='VALID', scope='fc6')
        net = slim.dropout(
            net, dropout_keep_prob, is_training=is_training, scope='dropout6')
        net = slim.conv2d(net, 4096, [1, 1], scope='fc7')

        # Turn the collection gathered so far into a dict of end points.
        end_points = slim.utils.convert_collection_to_dict(collection_name)

        if global_pool:
          net = tf.reduce_mean(
              net, [1, 2], keep_dims=True, name='global_pool')
          end_points['global_pool'] = net

        if num_classes:
          net = slim.dropout(
              net, dropout_keep_prob, is_training=is_training,
              scope='dropout7')
          net = slim.conv2d(
              net,
              num_classes,
              [1, 1],
              activation_fn=None,
              normalizer_fn=None,
              biases_initializer=tf.zeros_initializer(),
              scope='fc8')
          if spatial_squeeze:
            net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
          end_points[var_scope.name + '/fc8'] = net
      return net, end_points
def _nopad_mixed_6_block(net, depth, inner_depth, scope):
  """Builds the four-branch block shared by Mixed_6b, Mixed_6c and Mixed_6d.

  Must be called inside the caller's `slim.arg_scope` so that the conv and
  pooling layers pick up the stride/padding defaults set there.

  Args:
    net: input tensor of the block.
    depth: callable mapping a nominal channel count to the actual one.
    inner_depth: nominal channel count of the intermediate convolutions in
      Branch_1 and Branch_2.
    scope: variable scope name of the block.

  Returns:
    The channel-wise (axis 3) concatenation of the four branches, with the
    larger branches border-trimmed via `_trim_border_px`.
  """
  with tf.variable_scope(scope):
    with tf.variable_scope('Branch_0'):
      branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
    with tf.variable_scope('Branch_1'):
      branch_1 = slim.conv2d(
          net, depth(inner_depth), [1, 1], scope='Conv2d_0a_1x1')
      branch_1 = slim.conv2d(
          branch_1, depth(inner_depth), [1, 5], scope='Conv2d_0b_1x7')
      branch_1 = slim.conv2d(
          branch_1, depth(192), [5, 1], scope='Conv2d_0c_7x1')
    with tf.variable_scope('Branch_2'):
      branch_2 = slim.conv2d(
          net, depth(inner_depth), [1, 1], scope='Conv2d_0a_1x1')
      branch_2 = slim.conv2d(
          branch_2, depth(inner_depth), [5, 1], scope='Conv2d_0b_7x1')
      branch_2 = slim.conv2d(
          branch_2, depth(inner_depth), [1, 5], scope='Conv2d_0c_1x7')
      branch_2 = slim.conv2d(
          branch_2, depth(inner_depth), [5, 1], scope='Conv2d_0d_7x1')
      branch_2 = slim.conv2d(
          branch_2, depth(192), [1, 5], scope='Conv2d_0e_1x7')
    with tf.variable_scope('Branch_3'):
      branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
      branch_3 = slim.conv2d(
          branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
    return tf.concat(
        [
            _trim_border_px(branch_0, 4),
            _trim_border_px(branch_1, 2),
            branch_2,
            _trim_border_px(branch_3, 3),
        ], 3)


def nopad_inception_v3_base_129(inputs, min_depth=16, depth_multiplier=1.0, num_final_1x1_conv=0, scope=None):
  """Constructs a no padding Inception v3 network from inputs.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels]. Must be
      floating point. If a pretrained checkpoint is used, pixel values should
      be the same as during training.
    min_depth: Minimum depth value (number of channels) for all convolution
      ops. Enforced when depth_multiplier < 1, and not an active constraint
      when depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels) for
      all convolution ops. The value must be greater than zero. Typical usage
      will be to set this value in (0, 1) to reduce the number of parameters
      or computation cost of the model.
    num_final_1x1_conv: Int, number of final 1x1 conv layers. Each one is
      applied on top of the previous output and recorded in end_points.
    scope: Optional variable_scope.

  Returns:
    tensor_out: output tensor.
    end_points: a set of activations for external use, for example summaries
      or losses.

  Raises:
    ValueError: if depth_multiplier <= 0
  """
  # end_points will collect relevant activations for external use, for example
  # summaries or losses.
  end_points = {}

  if depth_multiplier <= 0:
    raise ValueError('depth_multiplier is not greater than zero.')

  def depth(d):
    """Scales a nominal channel count, never going below min_depth."""
    return max(int(d * depth_multiplier), min_depth)

  with tf.variable_scope(scope, 'NopadInceptionV3', [inputs]):
    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                        stride=1, padding='VALID'):
      # 129 x 129 x 3
      end_point = 'Conv2d_1a_3x3'
      net = slim.conv2d(inputs, depth(32), [3, 3], scope=end_point)
      end_points[end_point] = net
      # 127 x 127 x 32
      end_point = 'Conv2d_2a_3x3'
      net = slim.conv2d(net, depth(32), [3, 3], scope=end_point)
      end_points[end_point] = net
      # 125 x 125 x 32
      end_point = 'Conv2d_2b_3x3'
      net = slim.conv2d(net, depth(64), [3, 3], scope=end_point)
      end_points[end_point] = net
      # 123 x 123 x 64
      end_point = 'MaxPool_3a_3x3'
      net = slim.max_pool2d(net, [3, 3], scope=end_point)
      end_points[end_point] = net
      # 121 x 121 x 64
      end_point = 'Conv2d_3b_1x1'
      net = slim.conv2d(net, depth(80), [1, 1], scope=end_point)
      end_points[end_point] = net
      # 121 x 121 x 80
      end_point = 'Conv2d_4a_3x3'
      net = slim.conv2d(net, depth(192), [3, 3], scope=end_point)
      end_points[end_point] = net
      # 119 x 119 x 192
      end_point = 'MaxPool_5a_3x3'
      net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
      end_points[end_point] = net
      # 59 x 59 x 192

    # Inception blocks
    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                        stride=1, padding='VALID'):
      # Mixed_5b: 55 x 55 x 256
      end_point = 'Mixed_5b'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(
              net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(
              net, depth(48), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(
              branch_1, depth(64), [5, 5], scope='Conv2d_0b_5x5')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(
              net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(
              branch_2, depth(96), [3, 3], scope='Conv2d_0b_3x3')
          branch_2 = slim.conv2d(
              branch_2, depth(96), [3, 3], scope='Conv2d_0c_3x3')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(
              branch_3, depth(32), [1, 1], scope='Conv2d_0b_1x1')
        net = tf.concat(
            [
                _trim_border_px(branch_0, 2),  # branch_0: 59 x 59 x 64
                branch_1,  # branch_1: 55 x 55 x 64
                branch_2,  # branch_2: 55 x 55 x 96
                _trim_border_px(branch_3, 1)  # branch_3: 57 x 57 x 32
            ], 3)
      end_points[end_point] = net

      # Mixed_5c: 51 x 51 x 288
      end_point = 'Mixed_5c'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(
              net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(
              net, depth(48), [1, 1], scope='Conv2d_0b_1x1')
          branch_1 = slim.conv2d(
              branch_1, depth(64), [5, 5], scope='Conv_1_0c_5x5')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(
              net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(
              branch_2, depth(96), [3, 3], scope='Conv2d_0b_3x3')
          branch_2 = slim.conv2d(
              branch_2, depth(96), [3, 3], scope='Conv2d_0c_3x3')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(
              branch_3, depth(64), [1, 1], scope='Conv2d_0b_1x1')
        net = tf.concat(
            [
                _trim_border_px(branch_0, 2),  # branch_0: 55 x 55 x 64
                branch_1,  # branch_1: 51 x 51 x 64
                branch_2,  # branch_2: 51 x 51 x 96
                _trim_border_px(branch_3, 1)  # branch_3: 53 x 53 x 64
            ], 3)
      end_points[end_point] = net

      # Mixed_6a: 25 x 25 x 768
      end_point = 'Mixed_6a'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(
              net, depth(384), [3, 3], stride=2, padding='VALID',
              scope='Conv2d_1a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(
              net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(
              branch_1, depth(96), [3, 3], stride=2, padding='VALID',
              scope='Conv2d_1a_1x1')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.max_pool2d(
              net, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3')
        net = tf.concat(
            [
                branch_0,  # branch_0: 25 x 25 x 384
                branch_1,  # branch_1: 25 x 25 x 96
                branch_2,  # branch_2: 25 x 25 x 288
            ], 3)
      end_points[end_point] = net

      # Mixed_6b: 17 x 17 x 768
      end_point = 'Mixed_6b'
      net = _nopad_mixed_6_block(net, depth, 128, end_point)
      end_points[end_point] = net

      # Mixed_6c: 9 x 9 x 768
      end_point = 'Mixed_6c'
      net = _nopad_mixed_6_block(net, depth, 160, end_point)
      end_points[end_point] = net

      # Mixed_6d: 1 x 1
      end_point = 'Mixed_6d'
      net = _nopad_mixed_6_block(net, depth, 192, end_point)
      end_points[end_point] = net

      for i in range(num_final_1x1_conv):
        end_point = 'Final_Conv2d_{}_1x1'.format(i)
        # The conv output must be fed forward; previously it was discarded,
        # so these layers never affected the returned tensor.
        net = slim.conv2d(net, depth(256), [1, 1], scope=end_point)
        end_points[end_point] = net
    return net, end_points
def decoder(encoded, scales, styles, texture_only=False, style_size=8, image_size=(112,112), keep_prob=1.0, phase_train=True, weight_decay=0.0, reuse=None, scope='Decoder'):
    """Renders an image from `encoded` and optionally warps it.

    Three sub-scopes are built: 'StyleController' maps `styles` to the
    `gamma`/`beta` modulation tensors, 'Decoder' renders `images_rendered`
    from `encoded`, and 'WarpController' predicts control points and
    displacements that are passed to `sparse_image_warp`.

    Args:
        encoded: tensor fed to both the decoder and the warp controller.
        scales: reshaped to [-1, 1] and multiplied into the predicted
            displacements.
        styles: style tensor; when None it is sampled from a normal
            distribution of shape (batch_size, style_size).
        texture_only: when True, return right after rendering.
        style_size: size of the sampled style vector.
        image_size: (height, width); used to centre the initial control
            points.
        keep_prob: not referenced in this function body.
        phase_train: forwarded as is_training to dropout and batch norm.
        weight_decay: the l2 regularizer strength is 0.5 * weight_decay.
        reuse: forwarded to the outer variable scope.
        scope: name of the outer variable scope.

    Returns:
        `images_rendered` when texture_only is True, otherwise the tuple
        `(images_transformed, images_rendered, ldmark_pred, ldmark_diff)`.
    """
    def static_dims(tensor):
        # Static shape as a plain list, for the progress printouts below.
        return [dim.value for dim in tensor.shape]

    weighted_layers = [slim.conv2d, slim.conv2d_transpose, slim.fully_connected]

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        with slim.arg_scope(
                weighted_layers,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.compat.v1.keras.initializers.VarianceScaling(scale=2.0),
                weights_regularizer=tf.keras.regularizers.l2(0.5 * (weight_decay))):
            with slim.arg_scope([slim.dropout, slim.batch_norm], is_training=phase_train):
                with slim.arg_scope([slim.fully_connected],
                                    normalizer_fn=layer_norm, normalizer_params=None):
                    print('{} input shape:'.format(scope), static_dims(encoded))

                    batch_size = tf.shape(input=encoded)[0]
                    h, w = tuple(image_size)
                    k = 64
                    style_channels = 4*k

                    with tf.compat.v1.variable_scope('StyleController'):
                        if styles is None:
                            styles = tf.random.normal((batch_size, style_size))

                        net = tf.identity(styles, name='input_style')

                        net = slim.fully_connected(net, 128, scope='fc2')
                        print('module fc2 shape:', static_dims(net))
                        net = slim.fully_connected(net, 128, scope='fc3')
                        print('module fc3 shape:', static_dims(net))

                        gamma = slim.fully_connected(
                            net, style_channels, activation_fn=None,
                            normalizer_fn=None, scope='fc4')
                        gamma = tf.reshape(
                            gamma, [-1, 1, 1, style_channels], name='gamma')
                        print('gamma shape:', static_dims(gamma))

                        beta = slim.fully_connected(
                            net, style_channels, activation_fn=None,
                            normalizer_fn=None, scope='fc5')
                        beta = tf.reshape(
                            beta, [-1, 1, 1, style_channels], name='beta')
                        print('beta shape:', static_dims(beta))

                    with tf.compat.v1.variable_scope('Decoder'):
                        print('-- Decoder')
                        net = encoded

                        def adain(x):
                            # Instance-normalise, then modulate with the
                            # style-dependent gamma and beta.
                            return gamma * instance_norm(x, center=False, scale=False) + beta

                        with slim.arg_scope([slim.conv2d_transpose, slim.conv2d],
                                            normalizer_fn=adain, normalizer_params=None):
                            for i in range(3):
                                net_ = conv(net, style_channels, 3,
                                            scope='res{}_0'.format(i))
                                net += conv(net_, style_channels, 3,
                                            activation_fn=None,
                                            biases_initializer=None,
                                            scope='res{}_1'.format(i))
                                print('module res{} shape:'.format(i), static_dims(net))

                        with slim.arg_scope(weighted_layers,
                                            normalizer_fn=layer_norm, normalizer_params=None):
                            net = upscale2d(net, 2)
                            net = conv(net, 2*k, 5, pad=2, scope='deconv1_1')
                            print('module deconv1 shape:', static_dims(net))

                            net = upscale2d(net, 2)
                            net = conv(net, k, 5, pad=2, scope='deconv2_1')

                        net = conv(
                            net, 3, 7, pad=3, activation_fn=None, normalizer_fn=None,
                            weights_initializer=tf.compat.v1.constant_initializer(0.0),
                            scope='conv_image')
                        images_rendered = tf.nn.tanh(net, name='images_rendered')
                        print('images_rendered shape:', static_dims(images_rendered))

                    if texture_only:
                        return images_rendered

                    with tf.compat.v1.variable_scope('WarpController'):
                        print('-- WarpController')

                        net = encoded
                        warp_input = tf.identity(images_rendered, name='warp_input')

                        net = slim.flatten(net)
                        net = slim.fully_connected(net, 128, scope='fc1')
                        print('module fc1 shape:', static_dims(net))

                        num_ldmark = 16

                        # Predict the control points. Their initial mean is
                        # random noise around the image centre.
                        noise = np.random.normal(0,50, (num_ldmark,2))
                        centre = np.array([[0.5*h,0.5*w]])
                        ldmark_mean = (noise + centre).flatten()
                        ldmark_mean = tf.Variable(
                            ldmark_mean.astype(np.float32), name='ldmark_mean')
                        print('ldmark_mean shape:', static_dims(ldmark_mean))

                        ldmark_pred = slim.fully_connected(
                            net, num_ldmark*2,
                            weights_initializer=tf.compat.v1.truncated_normal_initializer(stddev=1.0),
                            normalizer_fn=None, activation_fn=None,
                            biases_initializer=None, scope='fc_ldmark')
                        ldmark_pred = ldmark_pred + ldmark_mean
                        print('ldmark_pred shape:', static_dims(ldmark_pred))
                        ldmark_pred = tf.identity(ldmark_pred, name='ldmark_pred')

                        # Predict the displacements
                        ldmark_diff = slim.fully_connected(
                            net, num_ldmark*2, normalizer_fn=None,
                            activation_fn=None, scope='fc_diff')
                        print('ldmark_diff shape:', static_dims(ldmark_diff))
                        ldmark_diff = tf.identity(ldmark_diff, name='ldmark_diff')
                        ldmark_diff = tf.identity(
                            tf.reshape(scales,[-1,1]) * ldmark_diff,
                            name='ldmark_diff_scaled')

                        point_shape = [-1, num_ldmark, 2]
                        src_pts = tf.reshape(ldmark_pred, point_shape)
                        dst_pts = tf.reshape(ldmark_pred + ldmark_diff, point_shape)

                        # Not consumed below; kept so the graph is unchanged.
                        diff_norm = tf.reduce_mean(
                            input_tensor=tf.norm(tensor=src_pts-dst_pts, axis=[1,2]))

                        images_transformed, dense_flow = sparse_image_warp(
                            warp_input, src_pts, dst_pts,
                            regularization_weight = 1e-6, num_boundary_points=0)
                        dense_flow = tf.identity(dense_flow, name='dense_flow')

                        return images_transformed, images_rendered, ldmark_pred, ldmark_diff
def inception_resnet_v1(inputs, is_training=True, dropout_keep_prob=0.8, bottleneck_layer_size=128, reuse=None, scope='InceptionResnetV1'):
    """Creates the Inception Resnet V1 model.

    Args:
      inputs: a 4-D tensor of size [batch_size, height, width, 3].
      is_training: whether is training or not.
      dropout_keep_prob: float, the fraction to keep before final layer.
      bottleneck_layer_size: number of outputs of the final 'Bottleneck'
        fully connected layer.
      reuse: whether or not the network and its variables should be reused. To
        be able to reuse 'scope' must be given.
      scope: Optional variable_scope.

    Returns:
      net: the output of the 'Bottleneck' fully connected layer.
      end_points: the set of end_points from the inception model.
    """
    # Stem layers as (scope name, layer fn, positional args, keyword args).
    # The size comment above each entry is carried over from the original
    # listing.
    stem_layers = (
        # 149 x 149 x 32
        ('Conv2d_1a_3x3', slim.conv2d, (32, 3),
         {'stride': 2, 'padding': 'VALID'}),
        # 147 x 147 x 32
        ('Conv2d_2a_3x3', slim.conv2d, (32, 3), {'padding': 'VALID'}),
        # 147 x 147 x 64
        ('Conv2d_2b_3x3', slim.conv2d, (64, 3), {}),
        # 73 x 73 x 64
        ('MaxPool_3a_3x3', slim.max_pool2d, (3,),
         {'stride': 2, 'padding': 'VALID'}),
        # 73 x 73 x 80
        ('Conv2d_3b_1x1', slim.conv2d, (80, 1), {'padding': 'VALID'}),
        # 71 x 71 x 192
        ('Conv2d_4a_3x3', slim.conv2d, (192, 3), {'padding': 'VALID'}),
        # 35 x 35 x 256
        ('Conv2d_4b_3x3', slim.conv2d, (256, 3),
         {'stride': 2, 'padding': 'VALID'}),
    )

    end_points = {}

    with tf.variable_scope(scope, 'InceptionResnetV1', [inputs], reuse=reuse):
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            with slim.arg_scope(
                    [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                    stride=1, padding='SAME'):
                net = inputs
                for layer_name, layer_fn, args, kwargs in stem_layers:
                    net = layer_fn(net, *args, scope=layer_name, **kwargs)
                    end_points[layer_name] = net

                # 5 x Inception-resnet-A
                net = slim.repeat(net, 5, block35, scale=0.17)
                end_points['Mixed_5a'] = net

                # Reduction-A
                with tf.variable_scope('Mixed_6a'):
                    net = reduction_a(net, 192, 192, 256, 384)
                end_points['Mixed_6a'] = net

                # 10 x Inception-Resnet-B
                net = slim.repeat(net, 10, block17, scale=0.10)
                end_points['Mixed_6b'] = net

                # Reduction-B
                with tf.variable_scope('Mixed_7a'):
                    net = reduction_b(net)
                end_points['Mixed_7a'] = net

                # 5 x Inception-Resnet-C
                net = slim.repeat(net, 5, block8, scale=0.20)
                end_points['Mixed_8a'] = net

                net = block8(net, activation_fn=None)
                end_points['Mixed_8b'] = net

                with tf.variable_scope('Logits'):
                    end_points['PrePool'] = net
                    # Average-pool over the full spatial extent of `net`.
                    #pylint: disable=no-member
                    pool_size = net.get_shape()[1:3]
                    net = slim.avg_pool2d(net, pool_size, padding='VALID',
                                          scope='AvgPool_1a_8x8')
                    net = slim.flatten(net)

                    net = slim.dropout(net, dropout_keep_prob,
                                       is_training=is_training,
                                       scope='Dropout')

                    end_points['PreLogitsFlatten'] = net

                net = slim.fully_connected(net, bottleneck_layer_size,
                                           activation_fn=None,
                                           scope='Bottleneck', reuse=False)

    return net, end_points
def resnet_v1_200(inputs, num_classes=None, is_training=True, global_pool=True, output_stride=None, spatial_squeeze=True, reuse=None, scope='resnet_v1_200'):
    """ResNet-200 model of [2]. See resnet_v1() for arg and return description."""
    # Each unit is a (depth, depth_bottleneck, stride) tuple; the last unit of
    # blocks 1-3 uses stride 2.
    blocks = [
        resnet_utils.Block(
            'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]),
        resnet_utils.Block(
            'block2', bottleneck, [(512, 128, 1)] * 23 + [(512, 128, 2)]),
        resnet_utils.Block(
            'block3', bottleneck, [(1024, 256, 1)] * 35 + [(1024, 256, 2)]),
        resnet_utils.Block(
            'block4', bottleneck, [(2048, 512, 1)] * 3)]
    return resnet_v1(inputs, blocks, num_classes, is_training,
                     global_pool=global_pool, output_stride=output_stride,
                     include_root_block=True, spatial_squeeze=spatial_squeeze,
                     reuse=reuse, scope=scope)
resnet_v1_200.default_image_size = resnet_v1.default_image_size


if __name__ == '__main__':
    # Smoke test: build ResNet-50 on a 224x224 RGB placeholder. The Python
    # variable is called `inputs` so the builtin `input` is not shadowed; the
    # placeholder's graph name is still 'input'.
    inputs = tf.compat.v1.placeholder(
        tf.float32, shape=(None, 224, 224, 3), name='input')
    with slim.arg_scope(resnet_arg_scope()):
        logits = resnet_v1_50(inputs)
def resnet_v2(inputs, blocks, num_classes=None, is_training=True, global_pool=True, output_stride=None, include_root_block=True, spatial_squeeze=True, reuse=None, scope=None):
    """Builds a ResNet v2 style network and exposes intermediate outputs.

    Args:
      inputs: input tensor of the network.
      blocks: block descriptions passed to `resnet_utils.stack_blocks_dense`.
      num_classes: number of classes for the 1x1 'logits' conv; when None the
        logits layer and the 'predictions' end point are omitted.
      is_training: forwarded to the batch norm layers.
      global_pool: when True, averages over axes 1 and 2 (keeping them) before
        the logits layer.
      output_stride: requested output stride or None. When the root block is
        included it must be a multiple of 4.
      include_root_block: when True, prepends the 'conv1' / 'pool1' root
        block.
      spatial_squeeze: when True, axes 1 and 2 are squeezed out of the
        returned logits; when False the unsqueezed tensor is returned.
      reuse: forwarded to the variable scope.
      scope: optional variable scope name (defaults to 'resnet_v2').

    Returns:
      A tuple `(logits, end_points, output0, output1)` where `output0` is the
      tensor after 'postnorm' and `output1` the tensor after the optional
      global pooling.

    Raises:
      ValueError: if the root block is included and `output_stride` is not a
        multiple of 4.
    """
    with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with slim.arg_scope([slim.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError(
                                'The output_stride needs to be a multiple of 4.'
                            )
                        # Floor division keeps the stride an int on Python 3;
                        # divisibility by 4 was checked just above.
                        # NOTE(review): conv1 uses stride 1 and pool1 stride 2
                        # below; confirm that dividing by 4 is still intended.
                        output_stride //= 4
                    # The root conv is linear: no activation, no normalizer.
                    with slim.arg_scope([slim.conv2d],
                                        activation_fn=None,
                                        normalizer_fn=None):
                        net = resnet_utils.conv2d_same(
                            net, 64, 6, stride=1, scope='conv1')
                    net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool1')
                net = resnet_utils.stack_blocks_dense(
                    net, blocks, output_stride)
                net = slim.batch_norm(
                    net, activation_fn=tf.nn.relu, scope='postnorm')
                output0 = net
                if global_pool:
                    net = tf.reduce_mean(
                        net, [1, 2], name='pool5', keep_dims=True)
                output1 = net
                if num_classes is not None:
                    net = slim.conv2d(net, num_classes, [1, 1],
                                      activation_fn=None,
                                      normalizer_fn=None, scope='logits')
                # Always bind `logits`; it used to be undefined (and the
                # return raised) when spatial_squeeze was False.
                logits = net
                if spatial_squeeze:
                    logits = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
                end_points = slim.utils.convert_collection_to_dict(
                    end_points_collection)
                if num_classes is not None:
                    end_points['predictions'] = slim.softmax(
                        logits, scope='predictions')
                return logits, end_points, output0, output1
def model_fn(features, labels, mode, params, config):
    """Builds the acoustic model.

    Creates four sub-networks under the variable scopes 'onsets', 'offsets',
    'velocity' and 'frame', registers their training losses, assembles the
    prediction and metric dictionaries, and (in training mode) builds image
    and scalar summaries plus the train op.

    Args:
      features: Object exposing `length`, `spec` and `sequence_id`.
      labels: Object exposing `onsets`, `offsets`, `velocities`, `labels`,
        `label_weights` and `note_sequence`. `note_sequence`, `labels` and
        `onsets` are read in every mode, so this cannot be None even outside
        training.
      mode: A `tf.estimator.ModeKeys` value; only TRAIN enables the loss,
        summary and optimizer construction.
      params: Hyperparameter object (used below as `hparams`).
      config: Unused; deleted immediately.

    Returns:
      A `tf.estimator.EstimatorSpec` carrying predictions, loss, train op
      and eval metric ops. `loss` and `train_op` are None unless training.

    Raises:
      ValueError: If `hparams.stop_activation_gradient` is set while
        `hparams.activation_loss` is not.
    """
    del config
    hparams = params
    length = features.length
    spec = features.spec
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    if is_training:
        onset_labels = labels.onsets
        offset_labels = labels.offsets
        velocity_labels = labels.velocities
        frame_labels = labels.labels
        frame_label_weights = labels.label_weights
    if hparams.stop_activation_gradient and not hparams.activation_loss:
        raise ValueError(
            'If stop_activation_gradient is true, activation_loss must be true.'
        )
    # Per-head, un-reduced losses keyed by head name; only filled in training.
    losses = {}
    with slim.arg_scope([slim.batch_norm, slim.dropout],
                        is_training=is_training):
        with tf.variable_scope('onsets'):
            onset_outputs = acoustic_model(spec,
                                           hparams,
                                           lstm_units=hparams.onset_lstm_units,
                                           lengths=length)
            onset_probs = slim.fully_connected(onset_outputs,
                                               constants.MIDI_PITCHES,
                                               activation_fn=tf.sigmoid,
                                               scope='onset_probs')
            # onset_probs_flat is used during inference.
            onset_probs_flat = flatten_maybe_padded_sequences(
                onset_probs, length)
            if is_training:
                onset_labels_flat = flatten_maybe_padded_sequences(
                    onset_labels, length)
                onset_losses = tf_utils.log_loss(onset_labels_flat,
                                                 onset_probs_flat)
                tf.losses.add_loss(tf.reduce_mean(onset_losses))
                losses['onset'] = onset_losses
        with tf.variable_scope('offsets'):
            offset_outputs = acoustic_model(
                spec,
                hparams,
                lstm_units=hparams.offset_lstm_units,
                lengths=length)
            offset_probs = slim.fully_connected(offset_outputs,
                                                constants.MIDI_PITCHES,
                                                activation_fn=tf.sigmoid,
                                                scope='offset_probs')
            # offset_probs_flat is used during inference.
            offset_probs_flat = flatten_maybe_padded_sequences(
                offset_probs, length)
            if is_training:
                offset_labels_flat = flatten_maybe_padded_sequences(
                    offset_labels, length)
                offset_losses = tf_utils.log_loss(offset_labels_flat,
                                                  offset_probs_flat)
                tf.losses.add_loss(tf.reduce_mean(offset_losses))
                losses['offset'] = offset_losses
        with tf.variable_scope('velocity'):
            velocity_outputs = acoustic_model(
                spec,
                hparams,
                lstm_units=hparams.velocity_lstm_units,
                lengths=length)
            # Linear output (no activation): velocities are regressed.
            velocity_values = slim.fully_connected(velocity_outputs,
                                                   constants.MIDI_PITCHES,
                                                   activation_fn=None,
                                                   scope='onset_velocities')
            velocity_values_flat = flatten_maybe_padded_sequences(
                velocity_values, length)
            if is_training:
                velocity_labels_flat = flatten_maybe_padded_sequences(
                    velocity_labels, length)
                # Squared error masked by the onset labels, so only entries
                # with a non-zero onset label contribute.
                velocity_loss = tf.reduce_sum(
                    onset_labels_flat *
                    tf.square(velocity_labels_flat - velocity_values_flat),
                    axis=1)
                tf.losses.add_loss(tf.reduce_mean(velocity_loss))
                losses['velocity'] = velocity_loss
        with tf.variable_scope('frame'):
            if not hparams.share_conv_features:
                # TODO(eriche): this is broken when hparams.frame_lstm_units > 0
                activation_outputs = acoustic_model(
                    spec,
                    hparams,
                    lstm_units=hparams.frame_lstm_units,
                    lengths=length)
                activation_probs = slim.fully_connected(
                    activation_outputs,
                    constants.MIDI_PITCHES,
                    activation_fn=tf.sigmoid,
                    scope='activation_probs')
            else:
                # Reuse the onset stack's features instead of building a
                # separate acoustic model for the frame head.
                activation_probs = slim.fully_connected(
                    onset_outputs,
                    constants.MIDI_PITCHES,
                    activation_fn=tf.sigmoid,
                    scope='activation_probs')
            # Concatenate onset, activation and offset probabilities along
            # axis 2, optionally cutting the gradient into each head.
            probs = []
            if hparams.stop_onset_gradient:
                probs.append(tf.stop_gradient(onset_probs))
            else:
                probs.append(onset_probs)
            if hparams.stop_activation_gradient:
                probs.append(tf.stop_gradient(activation_probs))
            else:
                probs.append(activation_probs)
            if hparams.stop_offset_gradient:
                probs.append(tf.stop_gradient(offset_probs))
            else:
                probs.append(offset_probs)
            combined_probs = tf.concat(probs, 2)
            if hparams.combined_lstm_units > 0:
                outputs = lstm_layer(
                    combined_probs,
                    hparams.combined_lstm_units,
                    lengths=length if hparams.use_lengths else None,
                    stack_size=hparams.combined_rnn_stack_size,
                    use_cudnn=hparams.use_cudnn,
                    bidirectional=hparams.bidirectional)
            else:
                outputs = combined_probs
            frame_probs = slim.fully_connected(outputs,
                                               constants.MIDI_PITCHES,
                                               activation_fn=tf.sigmoid,
                                               scope='frame_probs')
        frame_probs_flat = flatten_maybe_padded_sequences(frame_probs, length)
        if is_training:
            frame_labels_flat = flatten_maybe_padded_sequences(
                frame_labels, length)
            frame_label_weights_flat = flatten_maybe_padded_sequences(
                frame_label_weights, length)
            if hparams.weight_frame_and_activation_loss:
                frame_loss_weights = frame_label_weights_flat
            else:
                frame_loss_weights = None
            frame_losses = tf_utils.log_loss(frame_labels_flat,
                                             frame_probs_flat,
                                             weights=frame_loss_weights)
            tf.losses.add_loss(tf.reduce_mean(frame_losses))
            losses['frame'] = frame_losses
            if hparams.activation_loss:
                if hparams.weight_frame_and_activation_loss:
                    # NOTE(review): this uses the unflattened
                    # frame_label_weights while the labels and probabilities
                    # passed below are flattened; the frame loss above uses
                    # frame_label_weights_flat. Confirm this is intended.
                    activation_loss_weights = frame_label_weights
                else:
                    activation_loss_weights = None
                activation_losses = tf_utils.log_loss(
                    frame_labels_flat,
                    flatten_maybe_padded_sequences(activation_probs, length),
                    weights=activation_loss_weights)
                tf.losses.add_loss(tf.reduce_mean(activation_losses))
                losses['activation'] = activation_losses
    # Threshold the flattened probabilities into boolean predictions.
    frame_predictions = frame_probs_flat > hparams.predict_frame_threshold
    onset_predictions = onset_probs_flat > hparams.predict_onset_threshold
    offset_predictions = offset_probs_flat > hparams.predict_offset_threshold
    # Add a leading axis of size 1 to the flattened predictions.
    frame_predictions = tf.expand_dims(frame_predictions, axis=0)
    onset_predictions = tf.expand_dims(onset_predictions, axis=0)
    offset_predictions = tf.expand_dims(offset_predictions, axis=0)
    velocity_values = tf.expand_dims(velocity_values_flat, axis=0)
    metrics_values = metrics.define_metrics(
        frame_probs=frame_probs,
        onset_probs=onset_probs,
        frame_predictions=frame_predictions,
        onset_predictions=onset_predictions,
        offset_predictions=offset_predictions,
        velocity_values=velocity_values,
        length=features.length,
        sequence_label=labels.note_sequence,
        frame_labels=labels.labels,
        sequence_id=features.sequence_id,
        hparams=hparams)
    # Expose the per-head losses alongside the other metrics.
    for label, loss_collection in losses.items():
        loss_label = 'losses/' + label
        metrics_values[loss_label] = loss_collection

    def predict_sequence():
        """Convert frame predictions into a sequence (TF)."""

        def _predict(frame_probs, onset_probs, frame_predictions,
                     onset_predictions, offset_predictions, velocity_values):
            """Convert frame predictions into a sequence (Python)."""
            sequence = infer_util.predict_sequence(
                frame_probs=frame_probs,
                onset_probs=onset_probs,
                frame_predictions=frame_predictions,
                onset_predictions=onset_predictions,
                offset_predictions=offset_predictions,
                velocity_values=velocity_values,
                hparams=hparams,
                min_pitch=constants.MIN_MIDI_PITCH)
            return sequence.SerializeToString()

        # Only element 0 along the leading axis of each tensor is decoded.
        sequence = tf.py_func(_predict,
                              inp=[
                                  frame_probs[0],
                                  onset_probs[0],
                                  frame_predictions[0],
                                  onset_predictions[0],
                                  offset_predictions[0],
                                  velocity_values[0],
                              ],
                              Tout=tf.string,
                              stateful=False)
        # py_func loses static shape information; the result is a scalar.
        sequence.set_shape([])
        return tf.expand_dims(sequence, axis=0)

    predictions = {
        'frame_probs': frame_probs,
        'onset_probs': onset_probs,
        'frame_predictions': frame_predictions,
        'onset_predictions': onset_predictions,
        'offset_predictions': offset_predictions,
        'velocity_values': velocity_values,
        'sequence_predictions': predict_sequence(),
        # Include some features and labels in output because Estimator 'predict'
        # API does not give access to them.
        'sequence_ids': features.sequence_id,
        'sequence_labels': labels.note_sequence,
        'frame_labels': labels.labels,
        'onset_labels': labels.onsets,
    }
    for k, v in metrics_values.items():
        predictions[k] = tf.stack(v)
    metric_ops = {k: tf.metrics.mean(v) for k, v in metrics_values.items()}
    train_op = None
    loss = None
    if is_training:
        # Create pianoroll images with labels in the first (red) channel,
        # probabilities in the second (green) channel and zeros in the third.
        images = {}
        onset_pianorolls = tf.concat([
            onset_labels[:, :, :, tf.newaxis],
            onset_probs[:, :, :, tf.newaxis],
            tf.zeros(tf.shape(onset_labels))[:, :, :, tf.newaxis]
        ],
                                     axis=3)
        images['OnsetPianorolls'] = onset_pianorolls
        offset_pianorolls = tf.concat([
            offset_labels[:, :, :, tf.newaxis],
            offset_probs[:, :, :, tf.newaxis],
            tf.zeros(tf.shape(offset_labels))[:, :, :, tf.newaxis]
        ],
                                      axis=3)
        images['OffsetPianorolls'] = offset_pianorolls
        activation_pianorolls = tf.concat([
            frame_labels[:, :, :, tf.newaxis],
            frame_probs[:, :, :, tf.newaxis],
            tf.zeros(tf.shape(frame_labels))[:, :, :, tf.newaxis]
        ],
                                          axis=3)
        images['ActivationPianorolls'] = activation_pianorolls
        for name, image in images.items():
            tf.summary.image(name, image)
        # Total of every loss registered through tf.losses.add_loss above.
        loss = tf.losses.get_total_loss()
        tf.summary.scalar('loss', loss)
        for label, loss_collection in losses.items():
            loss_label = 'losses/' + label
            tf.summary.scalar(loss_label, tf.reduce_mean(loss_collection))
        train_op = slim.optimize_loss(
            name='training',
            loss=loss,
            global_step=tf.train.get_or_create_global_step(),
            learning_rate=hparams.learning_rate,
            learning_rate_decay_fn=functools.partial(
                tf.train.exponential_decay,
                decay_steps=hparams.decay_steps,
                decay_rate=hparams.decay_rate,
                staircase=True),
            clip_gradients=hparams.clip_norm,
            optimizer='Adam')
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=metric_ops)
def inception_resnet_v2(inputs,
                        num_classes=1001,
                        is_training=True,
                        dropout_keep_prob=0.8,
                        reuse=None,
                        scope='InceptionResnetV2',
                        create_aux_logits=True,
                        activation_fn=tf.nn.relu):
    """Creates the Inception Resnet V2 model.

    Args:
      inputs: a 4-D tensor of size [batch_size, height, width, 3].
        Dimension batch_size may be undefined. If create_aux_logits is
        false, also height and width may be undefined.
      num_classes: number of predicted classes. If 0 or None, the logits
        layer is omitted and the input features to the logits layer (before
        dropout) are returned instead.
      is_training: whether is training or not.
      dropout_keep_prob: float, the fraction to keep before final layer.
      reuse: whether or not the network and its variables should be reused.
        To be able to reuse 'scope' must be given.
      scope: Optional variable_scope.
      create_aux_logits: Whether to include the auxiliary logits.
      activation_fn: Activation function for conv2d.

    Returns:
      net: the output of the logits layer (if num_classes is a non-zero
        integer), or the non-dropped-out input to the logits layer (if
        num_classes is 0 or None).
      end_points: the set of end_points from the inception model.
    """
    with tf.compat.v1.variable_scope(
            scope, 'InceptionResnetV2', [inputs], reuse=reuse) as model_scope:
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            net, end_points = inception_resnet_v2_base(
                inputs, scope=model_scope, activation_fn=activation_fn)

            # Auxiliary classifier on top of the 'PreAuxLogits' endpoint.
            if create_aux_logits and num_classes:
                with tf.compat.v1.variable_scope('AuxLogits'):
                    aux_net = end_points['PreAuxLogits']
                    aux_net = slim.avg_pool2d(aux_net, 5,
                                              stride=3,
                                              padding='VALID',
                                              scope='Conv2d_1a_3x3')
                    aux_net = slim.conv2d(aux_net, 128, 1,
                                          scope='Conv2d_1b_1x1')
                    # Kernel covers the whole remaining spatial extent.
                    aux_spatial = aux_net.get_shape()[1:3]
                    aux_net = slim.conv2d(aux_net, 768, aux_spatial,
                                          padding='VALID',
                                          scope='Conv2d_2a_5x5')
                    aux_net = slim.flatten(aux_net)
                    aux_net = slim.fully_connected(aux_net, num_classes,
                                                   activation_fn=None,
                                                   scope='Logits')
                    end_points['AuxLogits'] = aux_net

            with tf.compat.v1.variable_scope('Logits'):
                # TODO(sguada,arnoegw): Consider adding a parameter global_pool
                # which can be set to False to disable pooling here (as in
                # resnet_*()).
                spatial_dims = net.get_shape()[1:3]
                if not spatial_dims.is_fully_defined():
                    # Height/width unknown at graph time: average dynamically.
                    net = tf.reduce_mean(input_tensor=net,
                                         axis=[1, 2],
                                         keepdims=True,
                                         name='global_pool')
                else:
                    net = slim.avg_pool2d(net, spatial_dims,
                                          padding='VALID',
                                          scope='AvgPool_1a_8x8')
                end_points['global_pool'] = net

                # Feature-extractor mode: stop before dropout and logits.
                if not num_classes:
                    return net, end_points

                net = slim.flatten(net)
                net = slim.dropout(net, dropout_keep_prob,
                                   is_training=is_training,
                                   scope='Dropout')
                end_points['PreLogitsFlatten'] = net
                logits = slim.fully_connected(net, num_classes,
                                              activation_fn=None,
                                              scope='Logits')
                end_points['Logits'] = logits
                end_points['Predictions'] = tf.nn.softmax(
                    logits, name='Predictions')
        return logits, end_points
def style_prediction_mobilenet(style_input_,
                               activation_names,
                               activation_depths,
                               mobilenet_end_point='layer_19',
                               mobilenet_trainable=True,
                               style_params_trainable=False,
                               style_prediction_bottleneck=100,
                               reuse=None):
    """Maps style images to the style embeddings using MobileNetV2.

    Args:
      style_input_: Tensor. Batch of style input images.
      activation_names: string. Scope names of the activations of the
        transformer network which are used to apply style normalization.
      activation_depths: Shapes of the activations of the transformer
        network which are used to apply style normalization.
      mobilenet_end_point: string. Specifies the endpoint to construct the
        MobileNetV2 network up to. This network is part of the style
        prediction network.
      mobilenet_trainable: bool. Should the MobileNetV2 parameters be marked
        as trainable?
      style_params_trainable: bool. Should the mapping from bottleneck to
        beta and gamma parameters be marked as trainable?
      style_prediction_bottleneck: int. Specifies the bottleneck size in the
        number of parameters of the style embedding. If not positive, the
        bottleneck convolution is skipped.
      reuse: bool. Whether to reuse model parameters. Defaults to None and
        is forwarded to `tf.variable_scope`.

    Returns:
      style_params: dict mapping '<activation_name>/beta' and
        '<activation_name>/gamma' to the corresponding squeezed tensors.
      bottleneck_feat: Tensor for the bottleneck of style parameters of the
        style prediction network.
    """
    # Enter BOTH context managers. The previous `name_scope(...) and
    # variable_scope(...)` evaluated to the variable scope alone, so the
    # name scope was silently never applied. Only op names are grouped by
    # the name scope; variables are named by the variable scope.
    with tf.name_scope('style_prediction_mobilenet'), tf.variable_scope(
            tf.get_variable_scope(), reuse=reuse):
        with slim.arg_scope(
                mobilenet_v2.training_scope(is_training=mobilenet_trainable)):
            _, end_points = mobilenet.mobilenet_base(
                style_input_,
                conv_defs=mobilenet_v2.V2_DEF,
                final_endpoint=mobilenet_end_point,
                scope='MobilenetV2')

        feat_convlayer = end_points[mobilenet_end_point]
        with tf.name_scope('bottleneck'):
            # (batch_size, 1, 1, depth).
            bottleneck_feat = tf.reduce_mean(feat_convlayer,
                                             axis=[1, 2],
                                             keep_dims=True)

        if style_prediction_bottleneck > 0:
            with tf.variable_scope('mobilenet_conv'):
                with slim.arg_scope([slim.conv2d],
                                    activation_fn=None,
                                    normalizer_fn=None,
                                    trainable=mobilenet_trainable):
                    # (batch_size, 1, 1, style_prediction_bottleneck).
                    bottleneck_feat = slim.conv2d(
                        bottleneck_feat, style_prediction_bottleneck, [1, 1])

        style_params = {}
        with tf.variable_scope('style_params'):
            for i, depth in enumerate(activation_depths):
                name = activation_names[i]
                with tf.variable_scope(name, reuse=reuse):
                    with slim.arg_scope([slim.conv2d],
                                        activation_fn=None,
                                        normalizer_fn=None,
                                        trainable=style_params_trainable):
                        # Computing beta parameter of the style normalization
                        # for the `name` layer of the style transformer
                        # network.
                        # (batch_size, 1, 1, depth)
                        beta = slim.conv2d(bottleneck_feat, depth, [1, 1])
                        # (batch_size, depth)
                        beta = tf.squeeze(beta, [1, 2], name='SpatialSqueeze')
                        style_params['{}/beta'.format(name)] = beta

                        # Computing gamma parameter of the style
                        # normalization for the `name` layer of the style
                        # transformer network.
                        # (batch_size, 1, 1, depth)
                        gamma = slim.conv2d(bottleneck_feat, depth, [1, 1])
                        # (batch_size, depth)
                        gamma = tf.squeeze(gamma, [1, 2],
                                           name='SpatialSqueeze')
                        style_params['{}/gamma'.format(name)] = gamma

    return style_params, bottleneck_feat
def inception_resnet_v2_base(inputs,
                             final_endpoint='Conv2d_7b_1x1',
                             output_stride=16,
                             align_feature_maps=False,
                             scope=None,
                             activation_fn=tf.nn.relu):
    """Inception model from http://arxiv.org/abs/1602.07261.

    Constructs an Inception Resnet v2 network from inputs to the given final
    endpoint. This method can construct the network up to the final
    inception block Conv2d_7b_1x1.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      final_endpoint: specifies the endpoint to construct the network up to.
        It can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
        'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
        'Mixed_5b', 'Mixed_6a', 'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1']
      output_stride: A scalar that specifies the requested ratio of input to
        output spatial resolution. Only supports 8 and 16.
      align_feature_maps: When true, changes all the VALID paddings in the
        network to SAME padding so that the feature maps are aligned.
      scope: Optional variable_scope.
      activation_fn: Activation function for block scopes.

    Returns:
      tensor_out: output tensor corresponding to the final_endpoint.
      end_points: a set of activations for external use, for example
        summaries or losses.

    Raises:
      ValueError: if final_endpoint is not set to one of the predefined
        values, or if the output_stride is not 8 or 16, or if the
        output_stride is 8 and we request an end point after 'PreAuxLogits'.
    """
    if output_stride != 8 and output_stride != 16:
        raise ValueError('output_stride must be 8 or 16.')

    padding = 'SAME' if align_feature_maps else 'VALID'

    end_points = {}

    def add_and_check_final(name, net):
        """Records `net` under `name`; True if this is the last endpoint."""
        end_points[name] = net
        return name == final_endpoint

    with tf.compat.v1.variable_scope(scope, 'InceptionResnetV2', [inputs]):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                            stride=1, padding='SAME'):
            # 149 x 149 x 32
            net = slim.conv2d(inputs, 32, 3, stride=2, padding=padding,
                              scope='Conv2d_1a_3x3')
            if add_and_check_final('Conv2d_1a_3x3', net):
                return net, end_points

            # 147 x 147 x 32
            net = slim.conv2d(net, 32, 3, padding=padding,
                              scope='Conv2d_2a_3x3')
            if add_and_check_final('Conv2d_2a_3x3', net):
                return net, end_points
            # 147 x 147 x 64
            net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
            if add_and_check_final('Conv2d_2b_3x3', net):
                return net, end_points
            # 73 x 73 x 64
            net = slim.max_pool2d(net, 3, stride=2, padding=padding,
                                  scope='MaxPool_3a_3x3')
            if add_and_check_final('MaxPool_3a_3x3', net):
                return net, end_points
            # 73 x 73 x 80
            net = slim.conv2d(net, 80, 1, padding=padding,
                              scope='Conv2d_3b_1x1')
            if add_and_check_final('Conv2d_3b_1x1', net):
                return net, end_points
            # 71 x 71 x 192
            net = slim.conv2d(net, 192, 3, padding=padding,
                              scope='Conv2d_4a_3x3')
            if add_and_check_final('Conv2d_4a_3x3', net):
                return net, end_points
            # 35 x 35 x 192
            net = slim.max_pool2d(net, 3, stride=2, padding=padding,
                                  scope='MaxPool_5a_3x3')
            if add_and_check_final('MaxPool_5a_3x3', net):
                return net, end_points

            # 35 x 35 x 320
            with tf.compat.v1.variable_scope('Mixed_5b'):
                with tf.compat.v1.variable_scope('Branch_0'):
                    tower_conv = slim.conv2d(net, 96, 1, scope='Conv2d_1x1')
                with tf.compat.v1.variable_scope('Branch_1'):
                    tower_conv1_0 = slim.conv2d(net, 48, 1,
                                                scope='Conv2d_0a_1x1')
                    tower_conv1_1 = slim.conv2d(tower_conv1_0, 64, 5,
                                                scope='Conv2d_0b_5x5')
                with tf.compat.v1.variable_scope('Branch_2'):
                    tower_conv2_0 = slim.conv2d(net, 64, 1,
                                                scope='Conv2d_0a_1x1')
                    tower_conv2_1 = slim.conv2d(tower_conv2_0, 96, 3,
                                                scope='Conv2d_0b_3x3')
                    tower_conv2_2 = slim.conv2d(tower_conv2_1, 96, 3,
                                                scope='Conv2d_0c_3x3')
                with tf.compat.v1.variable_scope('Branch_3'):
                    tower_pool = slim.avg_pool2d(net, 3, stride=1,
                                                 padding='SAME',
                                                 scope='AvgPool_0a_3x3')
                    tower_pool_1 = slim.conv2d(tower_pool, 64, 1,
                                               scope='Conv2d_0b_1x1')
                net = tf.concat(
                    [tower_conv, tower_conv1_1, tower_conv2_2, tower_pool_1],
                    3)

            if add_and_check_final('Mixed_5b', net):
                return net, end_points
            # TODO(alemi): Register intermediate endpoints
            net = slim.repeat(net, 10, block35, scale=0.17,
                              activation_fn=activation_fn)

            # With the default VALID padding:
            # 33 x 33 x 1088 if output_stride == 8 (stride 1 below),
            # 17 x 17 x 1088 if output_stride == 16 (stride 2 below)
            use_atrous = output_stride == 8

            with tf.compat.v1.variable_scope('Mixed_6a'):
                with tf.compat.v1.variable_scope('Branch_0'):
                    tower_conv = slim.conv2d(net, 384, 3,
                                             stride=1 if use_atrous else 2,
                                             padding=padding,
                                             scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_1'):
                    tower_conv1_0 = slim.conv2d(net, 256, 1,
                                                scope='Conv2d_0a_1x1')
                    tower_conv1_1 = slim.conv2d(tower_conv1_0, 256, 3,
                                                scope='Conv2d_0b_3x3')
                    tower_conv1_2 = slim.conv2d(tower_conv1_1, 384, 3,
                                                stride=1 if use_atrous else 2,
                                                padding=padding,
                                                scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_2'):
                    tower_pool = slim.max_pool2d(net, 3,
                                                 stride=1 if use_atrous else 2,
                                                 padding=padding,
                                                 scope='MaxPool_1a_3x3')
                net = tf.concat([tower_conv, tower_conv1_2, tower_pool], 3)

            if add_and_check_final('Mixed_6a', net):
                return net, end_points

            # TODO(alemi): register intermediate endpoints
            # Dilate the block17 convolutions when the Mixed_6a stride was
            # dropped to 1 for output_stride == 8.
            with slim.arg_scope([slim.conv2d], rate=2 if use_atrous else 1):
                net = slim.repeat(net, 20, block17, scale=0.10,
                                  activation_fn=activation_fn)
            if add_and_check_final('PreAuxLogits', net):
                return net, end_points

            if output_stride == 8:
                # TODO(gpapan): Properly support output_stride for the rest of
                # the net.
                raise ValueError(
                    'output_stride==8 is only supported up to the '
                    'PreAuxlogits end_point for now.')

            # 8 x 8 x 2080
            with tf.compat.v1.variable_scope('Mixed_7a'):
                with tf.compat.v1.variable_scope('Branch_0'):
                    tower_conv = slim.conv2d(net, 256, 1,
                                             scope='Conv2d_0a_1x1')
                    tower_conv_1 = slim.conv2d(tower_conv, 384, 3, stride=2,
                                               padding=padding,
                                               scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_1'):
                    tower_conv1 = slim.conv2d(net, 256, 1,
                                              scope='Conv2d_0a_1x1')
                    tower_conv1_1 = slim.conv2d(tower_conv1, 288, 3, stride=2,
                                                padding=padding,
                                                scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_2'):
                    tower_conv2 = slim.conv2d(net, 256, 1,
                                              scope='Conv2d_0a_1x1')
                    tower_conv2_1 = slim.conv2d(tower_conv2, 288, 3,
                                                scope='Conv2d_0b_3x3')
                    tower_conv2_2 = slim.conv2d(tower_conv2_1, 320, 3,
                                                stride=2,
                                                padding=padding,
                                                scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_3'):
                    tower_pool = slim.max_pool2d(net, 3, stride=2,
                                                 padding=padding,
                                                 scope='MaxPool_1a_3x3')
                net = tf.concat(
                    [tower_conv_1, tower_conv1_1, tower_conv2_2, tower_pool],
                    3)

            if add_and_check_final('Mixed_7a', net):
                return net, end_points

            # TODO(alemi): register intermediate endpoints
            net = slim.repeat(net, 9, block8, scale=0.20,
                              activation_fn=activation_fn)
            net = block8(net, activation_fn=None)

            # 8 x 8 x 1536
            net = slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1')
            if add_and_check_final('Conv2d_7b_1x1', net):
                return net, end_points

        # Format the endpoint into the message; it used to be passed as a
        # second exception argument and was never interpolated.
        raise ValueError(
            'final_endpoint (%s) not recognized' % final_endpoint)
def predict(self, inputs, **kwargs):
    """Predicts the resulting tensors.

    Args:
      inputs: A dictionary of input tensors keyed by names.
      **kwargs: Accepted but not used by this method.

    Returns:
      predictions: A dictionary of prediction tensors keyed by name. It
        holds 'detection_features' and 'answer_prediction'.
    """
    predictions = {}
    is_training = self._is_training
    options = self._model_proto

    # Decode image fields from `inputs`.
    image = inputs[InputFields.img_data]
    image_height = inputs[InputFields.img_height]
    image_width = inputs[InputFields.img_width]
    num_detections = inputs[InputFields.num_detections]
    detection_boxes = inputs[InputFields.detection_boxes]
    detection_classes = inputs[InputFields.detection_classes]
    detection_scores = inputs[InputFields.detection_scores]
    batch_size = image.shape[0]

    trimmed = remove_detections(
        num_detections,
        detection_boxes,
        detection_classes,
        detection_scores,
        max_num_detections=options.max_num_detections)
    (max_num_detections, num_detections, detection_boxes, detection_classes,
     detection_scores) = trimmed

    # Extract Fast-RCNN features.
    image_batch_shape = tf.shape(image)
    detection_boxes = convert_to_batch_coordinates(
        detection_boxes, image_height, image_width, image_batch_shape[1],
        image_batch_shape[2])
    detection_features, _ = fast_rcnn.FastRCNN(
        image,
        detection_boxes,
        options=options.fast_rcnn_config,
        is_training=is_training)
    predictions['detection_features'] = detection_features

    with slim.arg_scope(self._slim_fc_scope):
        detection_features = self.project_detection_features(
            detection_features)

    # Ground objects.
    choice_ids = inputs[self._field_choices]
    choice_tag_ids = inputs[self._field_choices_tag]
    choice_lengths = inputs[self._field_choices_len]
    choice_tag_ids = preprocess_tags(choice_tag_ids, max_num_detections)
    choice_tag_features = ground_detection_features(detection_features,
                                                    choice_tag_ids)

    # Create BERT prediction: one pass per answer choice along axis 1.
    per_choice = zip(
        tf.unstack(choice_ids, axis=1),
        tf.unstack(choice_tag_ids, axis=1),
        tf.unstack(choice_tag_features, axis=1),
        tf.unstack(choice_lengths, axis=1))
    feature_to_predict_choices = []
    for index, (caption_ids, caption_tag_ids, caption_tag_features,
                caption_length) in enumerate(per_choice):
        # The first choice creates the variables; later choices reuse them.
        with tf.variable_scope(tf.get_variable_scope(), reuse=index > 0):
            bert_output, _ = self.image_text_matching(
                num_detections, detection_boxes, detection_classes,
                detection_scores, detection_features, caption_ids,
                caption_tag_ids, caption_tag_features, caption_length)
            feature_to_predict_choices.append(bert_output)

    # Predicting the answer.
    with slim.arg_scope(self._slim_fc_scope):
        features = tf.stack(feature_to_predict_choices, 1)
        logits = slim.fully_connected(features,
                                      num_outputs=1,
                                      activation_fn=None,
                                      scope='itm/logits')
    predictions['answer_prediction'] = tf.squeeze(logits, -1)

    # Restore from BERT checkpoint.
    # IMPORTANT to filter using `bert`.
    bert_variables = [
        x for x in tf.global_variables() if x.op.name.startswith('bert')
    ]
    assignment_map, _ = checkpoints.get_assignment_map_from_checkpoint(
        bert_variables, options.bert_checkpoint_file)
    tf.train.init_from_checkpoint(options.bert_checkpoint_file,
                                  assignment_map)
    return predictions
def inception_resnet_v2(inputs,
                        is_training=True,
                        dropout_keep_prob=0.8,
                        bottleneck_layer_size=128,
                        reuse=None,
                        scope='InceptionResnetV2'):
    """Creates the Inception Resnet V2 model with a bottleneck output layer.

    Args:
      inputs: a 4-D tensor of size [batch_size, height, width, 3].
      is_training: whether is training or not.
      dropout_keep_prob: float, the fraction to keep before final layer.
      bottleneck_layer_size: number of outputs of the final fully connected
        'Bottleneck' layer.
      reuse: whether or not the network and its variables should be reused.
        To be able to reuse 'scope' must be given.
      scope: Optional variable_scope.

    Returns:
      net: the output of the 'Bottleneck' layer (no activation function).
      end_points: the set of end_points from the inception model.
    """
    end_points = {}

    def _keep(name, tensor):
        """Stores `tensor` in end_points under `name` and returns it."""
        end_points[name] = tensor
        return tensor

    with tf.variable_scope(scope, 'InceptionResnetV2', [inputs], reuse=reuse):
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training), \
                slim.arg_scope(
                    [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                    stride=1, padding='SAME'):
            # 149 x 149 x 32
            net = _keep('Conv2d_1a_3x3',
                        slim.conv2d(inputs, 32, 3, stride=2, padding='VALID',
                                    scope='Conv2d_1a_3x3'))
            # 147 x 147 x 32
            net = _keep('Conv2d_2a_3x3',
                        slim.conv2d(net, 32, 3, padding='VALID',
                                    scope='Conv2d_2a_3x3'))
            # 147 x 147 x 64
            net = _keep('Conv2d_2b_3x3',
                        slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3'))
            # 73 x 73 x 64
            net = _keep('MaxPool_3a_3x3',
                        slim.max_pool2d(net, 3, stride=2, padding='VALID',
                                        scope='MaxPool_3a_3x3'))
            # 73 x 73 x 80
            net = _keep('Conv2d_3b_1x1',
                        slim.conv2d(net, 80, 1, padding='VALID',
                                    scope='Conv2d_3b_1x1'))
            # 71 x 71 x 192
            net = _keep('Conv2d_4a_3x3',
                        slim.conv2d(net, 192, 3, padding='VALID',
                                    scope='Conv2d_4a_3x3'))
            # 35 x 35 x 192
            net = _keep('MaxPool_5a_3x3',
                        slim.max_pool2d(net, 3, stride=2, padding='VALID',
                                        scope='MaxPool_5a_3x3'))

            # 35 x 35 x 320
            with tf.variable_scope('Mixed_5b'):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, 96, 1, scope='Conv2d_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, 48, 1, scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, 64, 5,
                                           scope='Conv2d_0b_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, 64, 1, scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, 96, 3,
                                           scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2, 96, 3,
                                           scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, 3, stride=1,
                                               padding='SAME',
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, 64, 1,
                                           scope='Conv2d_0b_1x1')
                net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
            end_points['Mixed_5b'] = net

            net = slim.repeat(net, 10, block35, scale=0.17)

            # 17 x 17 x 1024
            with tf.variable_scope('Mixed_6a'):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, 384, 3, stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, 256, 3,
                                           scope='Conv2d_0b_3x3')
                    branch_1 = slim.conv2d(branch_1, 384, 3, stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.max_pool2d(net, 3, stride=2,
                                               padding='VALID',
                                               scope='MaxPool_1a_3x3')
                net = tf.concat([branch_0, branch_1, branch_2], 3)
            end_points['Mixed_6a'] = net

            net = slim.repeat(net, 20, block17, scale=0.10)

            with tf.variable_scope('Mixed_7a'):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
                    branch_0 = slim.conv2d(branch_0, 384, 3, stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, 288, 3, stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, 288, 3,
                                           scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2, 320, 3, stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.max_pool2d(net, 3, stride=2,
                                               padding='VALID',
                                               scope='MaxPool_1a_3x3')
                net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
            end_points['Mixed_7a'] = net

            net = slim.repeat(net, 9, block8, scale=0.20)
            net = block8(net, activation_fn=None)

            net = _keep('Conv2d_7b_1x1',
                        slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1'))

            with tf.variable_scope('Logits'):
                end_points['PrePool'] = net
                #pylint: disable=no-member
                # Average over the full remaining spatial extent.
                pool_window = net.get_shape()[1:3]
                net = slim.avg_pool2d(net, pool_window, padding='VALID',
                                      scope='AvgPool_1a_8x8')
                net = slim.flatten(net)
                net = _keep('PreLogitsFlatten',
                            slim.dropout(net, dropout_keep_prob,
                                         is_training=is_training,
                                         scope='Dropout'))

            net = slim.fully_connected(net, bottleneck_layer_size,
                                       activation_fn=None,
                                       scope='Bottleneck', reuse=False)

    return net, end_points
def inception_v4(inputs,
                 num_classes=1001,
                 is_training=True,
                 dropout_keep_prob=0.8,
                 reuse=None,
                 scope='InceptionV4',
                 create_aux_logits=True):
    """Creates the Inception V4 feature extractor.

    Only the base network is built. The auxiliary-logits head and the final
    pooling / dropout / logits / softmax head are disabled in this version,
    so `num_classes`, `dropout_keep_prob` and `create_aux_logits` are
    accepted for signature compatibility but have no effect.

    Args:
      inputs: a 4-D tensor of size [batch_size, height, width, 3].
      num_classes: Unused (classification head is disabled).
      is_training: whether is training or not; applied to slim.batch_norm
        and slim.dropout layers.
      dropout_keep_prob: Unused (classification head is disabled).
      reuse: whether or not the network and its variables should be reused.
        To be able to reuse 'scope' must be given.
      scope: Optional variable_scope.
      create_aux_logits: Unused (auxiliary head is disabled).

    Returns:
      net: the output tensor of `inception_v4_base` (no pooling, dropout or
        logits applied).
      end_points: the set of end_points returned by `inception_v4_base`.
    """
    with tf.variable_scope(
            scope, 'InceptionV4', [inputs], reuse=reuse) as scope:
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            net, end_points = inception_v4_base(inputs, scope=scope)
            # Disabled head (previously kept here as commented-out code):
            #   * 'AuxLogits': avg-pool 5x5/3 on end_points['Mixed_6h'],
            #     1x1 conv (128), full-extent conv (768), flatten, and a
            #     fully connected layer with num_classes outputs.
            #   * 'Logits': global average pooling ('global_pool'), dropout
            #     'Dropout_1b', flatten 'PreLogitsFlatten', a fully connected
            #     'Logits' layer and a softmax stored as 'Predictions'.
            # TODO(sguada,arnoegw): Consider adding a parameter global_pool
            # which can be set to False to disable pooling (as in
            # resnet_*()) if the head is ever restored.
    return net, end_points
def encode_effect(states, contexts, use_relation, use_point_cloud, dim_fc_state,
                  dim_fc_context):
    """Fuse position, relation, point-cloud and context features into effects.

    Args:
        states: Dict of state tensors. Reads 'position' and 'body_mask', and
            'cloud_feat' when `use_point_cloud` is set.
        contexts: Context tensor, or None to skip the context branch.
        use_relation: If true, append the output of `encode_relation`.
        use_point_cloud: If true, append `states['cloud_feat']` unchanged.
        dim_fc_state: Width of the position layer and of the two final
            fully connected layers.
        dim_fc_context: Width of the context layer.

    Returns:
        The output of the final fully connected stack (last dimension
        `dim_fc_state`), passed through an identity op named 'effects'.
    """
    body_positions = states['position']
    masks = states['body_mask']
    # The last axis of the mask gives the number of bodies.
    body_count = int(masks.shape[-1])

    fc_defaults = dict(activation_fn=tf.nn.relu,
                       normalizer_fn=NORMALIZER_FN,
                       normalizer_params=NORMALIZER_PARAMS)
    with slim.arg_scope([slim.fully_connected], **fc_defaults):
        with tf.compat.v1.variable_scope('encode_position'):
            parts = [
                slim.fully_connected(body_positions, dim_fc_state, scope='fc')
            ]

        if use_relation:
            with tf.compat.v1.variable_scope('encode_relation'):
                parts.append(
                    encode_relation(body_positions, masks,
                                    dim_fc_state=dim_fc_state))

        if use_point_cloud:
            parts.append(states['cloud_feat'])

        if contexts is not None:
            with tf.compat.v1.variable_scope('encode_context'):
                ctx = slim.fully_connected(contexts, dim_fc_context,
                                           scope='fc')
                # Insert axis 1 and repeat the encoding once per body.
                ctx = tf.tile(tf.expand_dims(ctx, 1), [1, body_count, 1])
            parts.append(ctx)

        fused = tf.concat(parts, axis=-1)
        fused = slim.repeat(fused, 2, slim.fully_connected, dim_fc_state,
                            scope='fc')
        return tf.identity(fused, 'effects')
def prediction_layers(
    self,
    features,
    end_points,
    input_shape,
    scope="pose",
    reuse=None,
):
    """Build the prediction heads on top of the backbone features.

    Two paths are selected by ``self.cfg["multi_stage"]``:

    * multi-stage: two intermediate ResNet activations are resized,
      convolved down and concatenated with ``features``; the result is
      optionally upsampled depending on ``cfg["stride"]`` and fed to the
      heads, including an iteratively refined part/PAF stack.
    * otherwise: one intermediate activation is resized and concatenated
      with an upsampled copy of ``features``; the heads come from the parent
      class, plus an optional intermediate-supervision head.

    Args:
        features: Backbone output tensor; concatenated on axis 3 with the
            intermediate banks.
        end_points: Dict of named intermediate activations, indexed here by
            layer name.
        input_shape: Indexable shape; ``input_shape[1:3]`` is divided by 16
            to size the intermediate maps (presumably height and width of
            the input batch -- TODO confirm against caller).
        scope: Variable scope under which the heads are created.
        reuse: Forwarded to the variable scope of the heads.

    Returns:
        Dict mapping head names (e.g. "part_pred", "locref",
        "pairwise_pred", "part_pred_interm") to their output tensors. Which
        keys are present depends on the configuration flags read below.

    Raises:
        ValueError: In the non-multi-stage path, if ``cfg["net_type"]`` is
            not a resnet, mobilenet or efficientnet variant.
    """
    net_type = self.cfg["net_type"]
    if self.cfg["multi_stage"]:  # MuNet! (multi_stage decoder + multi_fusion)
        # Defining multi_fusion backbone
        # NOTE(review): this path always parses a "resnet_<N>" net_type;
        # re.findall(...)[0] raises IndexError for any other backbone.
        num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
        # Depth is filled in now; block and unit indices are filled in later.
        layer_name = (
            "resnet_v1_{}".format(num_layers) + "/block{}/unit_{}/bottleneck_v1"
        )
        mid_pt_block1 = layer_name.format(1, 3)
        mid_pt_block2 = layer_name.format(2, 3)

        # Spatial size of the input divided by 16, rounded up.
        final_dims = tf.math.ceil(
            tf.divide(input_shape[1:3], tf.convert_to_tensor(16))
        )

        # 2x and 4x the size above (by name, the stride-8 and stride-4 grids).
        interim_dims_s8 = tf.scalar_mul(2, final_dims)
        interim_dims_s8 = tf.cast(interim_dims_s8, tf.int32)
        interim_dims_s4 = tf.scalar_mul(2, interim_dims_s8)
        interim_dims_s4 = tf.cast(interim_dims_s4, tf.int32)

        bank_1 = end_points[mid_pt_block1]
        bank_2 = end_points[mid_pt_block2]

        bank_2_s8 = tf.compat.v1.image.resize_images(bank_2, interim_dims_s8)
        bank_1_s4 = tf.compat.v1.image.resize_images(bank_1, interim_dims_s4)

        with slim.arg_scope(
            [slim.conv2d],
            padding="SAME",
            normalizer_fn=slim.layers.batch_norm,
            activation_fn=tf.nn.relu,
            weights_regularizer=slim.l2_regularizer(self.cfg["weight_decay"]),
        ):
            with tf.compat.v1.variable_scope("decoder_filters"):
                # Stride-2 convolutions halve each bank until both reach the
                # same resolution; 1x1 convolutions then reduce to 128 channels.
                bank_2_s16 = slim.conv2d(
                    bank_2_s8,
                    512,
                    kernel_size=[3, 3],
                    stride=2,
                    scope="decoder_parallel_1",
                )
                bank_2_s16 = slim.conv2d(
                    bank_2_s16,
                    128,
                    kernel_size=[1, 1],
                    stride=1,
                    scope="decoder_parallel_2",
                )

                bank_1_s8 = slim.conv2d(
                    bank_1_s4,
                    256,
                    kernel_size=[3, 3],
                    stride=2,
                    scope="decoder_parallel_3",
                )
                bank_1_s16 = slim.conv2d(
                    bank_1_s8,
                    256,
                    kernel_size=[3, 3],
                    stride=2,
                    scope="decoder_parallel_4",
                )
                bank_1_s16 = slim.conv2d(
                    bank_1_s16,
                    128,
                    kernel_size=[1, 1],
                    stride=1,
                    scope="decoder_parallel_5",
                )

        with slim.arg_scope(
            [slim.conv2d_transpose],
            padding="SAME",
            normalizer_fn=None,
            weights_regularizer=slim.l2_regularizer(self.cfg["weight_decay"]),
        ):
            with tf.compat.v1.variable_scope("upsampled_features"):
                concat_3_s16 = tf.concat([bank_1_s16, bank_2_s16, features], 3)

                # NOTE(review): `net` is only bound for stride 8, 4 or 2; any
                # other cfg["stride"] leaves it undefined for the code below.
                if self.cfg["stride"] == 8:
                    net = concat_3_s16

                elif self.cfg["stride"] == 4:
                    upsampled_features_2x = slim.conv2d_transpose(
                        concat_3_s16,
                        self.cfg.get("bank3", 128),
                        kernel_size=[3, 3],
                        stride=2,
                        scope="block3",
                    )
                    net = upsampled_features_2x

                elif self.cfg["stride"] == 2:
                    upsampled_features_2x = slim.conv2d_transpose(
                        concat_3_s16,
                        self.cfg.get("bank3", 128),
                        kernel_size=[3, 3],
                        stride=2,
                        scope="block3",
                    )
                    upsampled_features_4x = slim.conv2d_transpose(
                        upsampled_features_2x,
                        self.cfg.get("bank5", 128),
                        kernel_size=[3, 3],
                        stride=2,
                        scope="block4",
                    )
                    net = upsampled_features_4x

        out = {}
        # Attaching multi-stage decoder
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # Stage-1 part heatmaps; only consumed by the PAF branch below.
            stage1_hm_out = prediction_layer(
                self.cfg,
                net,
                "part_pred_s1",
                self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
            )

            if self.cfg["location_refinement"]:
                out["locref"] = prediction_layer(
                    self.cfg, net, "locref_pred", self.cfg["num_joints"] * 2
                )
            if (
                self.cfg["pairwise_predict"]
                and "multi-animal" not in self.cfg["dataset_type"]
            ):
                out["pairwise_pred"] = prediction_layer(
                    self.cfg,
                    net,
                    "pairwise_pred",
                    self.cfg["num_joints"] * (self.cfg["num_joints"] - 1) * 2,
                )
            if (
                self.cfg["partaffinityfield_predict"]
                and "multi-animal" in self.cfg["dataset_type"]
            ):
                feature = slim.conv2d_transpose(
                    net, self.cfg.get("bank3", 128), kernel_size=[3, 3], stride=2
                )

                stage1_paf_out = prediction_layer(
                    self.cfg, net, "pairwise_pred_s1", self.cfg["num_limbs"] * 2
                )

                stage2_in = tf.concat([stage1_hm_out, stage1_paf_out, feature], 3)
                stage_input = stage2_in
                stage_paf_output = stage1_paf_out
                stage_hm_output = stage1_hm_out

                # Refinement stages 2..4: each stage predicts from the
                # previous stage's outputs concatenated with `feature`.
                for i in range(2, 5):
                    pre_stage_paf_output = stage_paf_output
                    pre_stage_hm_output = stage_hm_output

                    stage_paf_output = prediction_layer_stage(
                        self.cfg,
                        stage_input,
                        f"pairwise_pred_s{i}",
                        self.cfg["num_limbs"] * 2,
                    )

                    stage_hm_output = prediction_layer_stage(
                        self.cfg,
                        stage_input,
                        f"part_pred_s{i}",
                        self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                    )

                    # From stage 3 on, heatmaps get a residual connection to
                    # the previous stage; the PAF residual is disabled.
                    if i > 2:
                        # stage_paf_output = stage_paf_output + pre_stage_paf_output
                        stage_hm_output = stage_hm_output + pre_stage_hm_output

                    stage_input = tf.concat(
                        [stage_hm_output, stage_paf_output, feature], 3
                    )

                # Final heads read the input assembled by the last stage.
                out["part_pred"] = prediction_layer_stage(
                    self.cfg,
                    stage_input,
                    "part_pred",
                    self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                )

                out["pairwise_pred"] = prediction_layer_stage(
                    self.cfg,
                    stage_input,
                    "pairwise_pred",
                    self.cfg["num_limbs"] * 2,
                )

            if self.cfg["intermediate_supervision"]:
                # layer_name already carries the depth; fill block 3 and unit.
                interm_name = layer_name.format(
                    3, self.cfg["intermediate_supervision_layer"]
                )
                block_interm_out = end_points[interm_name]
                out["part_pred_interm"] = prediction_layer(
                    self.cfg,
                    block_interm_out,
                    "intermediate_supervision",
                    self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                )

    else:  # dual fusion net (for stride 4 experiments)
        # Pick the intermediate activation to fuse, per backbone family.
        if "resnet" in net_type:
            num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
            layer_name = "resnet_v1_{}/block{}/unit_{}/bottleneck_v1"
            mid_pt = layer_name.format(num_layers, 2, 3)
        elif "mobilenet" in net_type:
            mid_pt = "layer_7"
        elif "efficientnet" in net_type:
            mid_pt = f"block_{parallel_layers[net_type.split('-')[1]]}"
        else:
            raise ValueError(f"Unknown network of type {net_type}")

        # Spatial size of the input divided by 16, rounded up, then doubled.
        final_dims = tf.math.ceil(
            tf.divide(input_shape[1:3], tf.convert_to_tensor(value=16))
        )
        interim_dims = tf.scalar_mul(2, final_dims)
        interim_dims = tf.cast(interim_dims, tf.int32)
        bank_3 = end_points[mid_pt]
        bank_3 = tf.image.resize(bank_3, interim_dims)

        with slim.arg_scope(
            [slim.conv2d],
            padding="SAME",
            normalizer_fn=None,
            weights_regularizer=tf.keras.regularizers.l2(
                0.5 * (self.cfg["weight_decay"])
            ),
        ):
            with tf.compat.v1.variable_scope("decoder_filters"):
                bank_3 = slim.conv2d(
                    bank_3,
                    self.cfg.get("bank3", 128),
                    1,
                    scope="decoder_parallel_1",
                )

        with slim.arg_scope(
            [slim.conv2d_transpose],
            padding="SAME",
            normalizer_fn=None,
            weights_regularizer=tf.keras.regularizers.l2(
                0.5 * (self.cfg["weight_decay"])
            ),
        ):
            with tf.compat.v1.variable_scope("upsampled_features"):
                upsampled_features = slim.conv2d_transpose(
                    features,
                    self.cfg.get("bank5", 128),
                    kernel_size=[3, 3],
                    stride=2,
                    scope="block4",
                )
        net = tf.concat([bank_3, upsampled_features], 3)
        # The standard heads are built by the parent class on the fused map.
        out = super(PoseMultiNet, self).prediction_layers(
            net,
            scope,
            reuse,
        )
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            if (
                self.cfg["intermediate_supervision"]
                and "efficientnet" not in net_type
            ):
                if "mobilenet" in net_type:
                    feat = end_points[
                        f"layer_{self.cfg['intermediate_supervision_layer']}"
                    ]
                elif "resnet" in net_type:
                    layer_name = "resnet_v1_{}/block{}/unit_{}/bottleneck_v1"
                    num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
                    interm_name = layer_name.format(
                        num_layers, 3, self.cfg["intermediate_supervision_layer"]
                    )
                    feat = end_points[interm_name]
                else:
                    # No known intermediate layer for this backbone.
                    return out
                pred_layer = out["part_pred_interm"] = prediction_layer(
                    self.cfg,
                    feat,
                    "intermediate_supervision",
                    self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                )
                # Redundant with the chained assignment above; kept as is.
                out["part_pred_interm"] = pred_layer
    return out
def conv_hyperparams_fn():
    """Return an arg_scope: conv2d uses batch_norm, batch_norm in inference mode."""
    conv_scope = slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm)
    with conv_scope:
        # The scope yielded by the inner manager is the one returned.
        with slim.arg_scope([slim.batch_norm], is_training=False) as sc:
            return sc
def extract_features(self,
                     preprocessed_inputs,
                     state_saver=None,
                     state_name='lstm_state',
                     unroll_length=5,
                     scope=None):
    """Extracts features from preprocessed inputs.

    The features include the base network features, lstm features and SSD
    features, organized in the following name scope:

    <parent scope>/MobilenetV1/...
    <parent scope>/LSTM/...
    <parent scope>/FeatureMaps/...

    As a side effect this sets `self._states_out` and, when a state saver is
    given, `self._step`.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float tensor
        representing a batch of consecutive frames from video clips.
      state_saver: A state saver object with methods `state` and `save_state`.
      state_name: A python string for the name to use with the state_saver.
      unroll_length: The number of steps to unroll the lstm.
      scope: The scope for the base network of the feature extractor.

    Returns:
      A list of tensors where the ith tensor has shape [batch, height_i,
      width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
        # Override the MobileNet defaults with the configured conv
        # hyperparameters only when requested; otherwise use a no-op context.
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
            with slim.arg_scope([slim.batch_norm], fused=False):
                # Base network.
                with tf.variable_scope(
                    scope, self._base_network_scope,
                    reuse=self._reuse_weights) as scope:
                    net, image_features = mobilenet_v1.mobilenet_v1_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        final_endpoint='Conv2d_13_pointwise',
                        min_depth=self._min_depth,
                        depth_multiplier=self._depth_multiplier,
                        scope=scope)

    with slim.arg_scope(self._conv_hyperparams_fn()):
        with slim.arg_scope(
            [slim.batch_norm], fused=False, is_training=self._is_training):
            # ConvLSTM layers.
            # Reads the static batch dimension and splits it into
            # `unroll_length` consecutive chunks, one per LSTM step.
            batch_size = net.shape[0].value // unroll_length
            with tf.variable_scope('LSTM',
                                   reuse=self._reuse_weights) as lstm_scope:
                lstm_cell, init_state, _ = self.create_lstm_cell(
                    batch_size,
                    (net.shape[1].value, net.shape[2].value),
                    state_saver,
                    state_name,
                    dtype=preprocessed_inputs.dtype)
                net_seq = list(tf.split(net, unroll_length))

                # Identities added for inputing state tensors externally.
                c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
                h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
                init_state = (c_ident, h_ident)

                net_seq, states_out = rnn_decoder.rnn_decoder(
                    net_seq, init_state, lstm_cell, scope=lstm_scope)
                batcher_ops = None
                self._states_out = states_out
                if state_saver is not None:
                    self._step = state_saver.state('%s_step' % state_name)
                    # Save the final (c, h) state and the incremented step.
                    batcher_ops = [
                        state_saver.save_state('%s_c' % state_name,
                                               states_out[-1][0]),
                        state_saver.save_state('%s_h' % state_name,
                                               states_out[-1][1]),
                        state_saver.save_state('%s_step' % state_name,
                                               self._step + 1)
                    ]
                # The LSTM feature map is created under a control dependency
                # on the save ops (None when no state saver is used).
                with tf_ops.control_dependencies(batcher_ops):
                    image_features['Conv2d_13_pointwise_lstm'] = tf.concat(
                        net_seq, 0)

                # Identities added for reading output states, to be reused
                # externally.
                tf.identity(states_out[-1][0], name='lstm_state_out_c')
                tf.identity(states_out[-1][1], name='lstm_state_out_h')

            # SSD layers.
            with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
                feature_maps = feature_map_generators.multi_resolution_feature_maps(
                    feature_map_layout=self._feature_map_layout,
                    depth_multiplier=(self._depth_multiplier),
                    min_depth=self._min_depth,
                    insert_1x1_conv=True,
                    image_features=image_features)

    return list(feature_maps.values())
def conv_hyperparams_fn():
    """Return an arg_scope with batch-normalised conv2d hyperparameters.

    The returned scope sets `normalizer_fn=slim.batch_norm` for `slim.conv2d`
    and `decay=0.97, epsilon=1e-3` for `slim.batch_norm`.

    Returns:
      The arg_scope object yielded by the innermost `slim.arg_scope`.
    """
    # The scopes must be *entered* one inside the other. Joining the two
    # context managers with `and` evaluates to the second manager only, so
    # the conv2d normalizer setting was silently dropped.
    with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm):
        with slim.arg_scope([slim.batch_norm], decay=0.97,
                            epsilon=1e-3) as sc:
            return sc
def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              spatial_squeeze=True,
              reuse=None,
              scope=None):
    """Generator for v1 ResNet feature extractors (classification head disabled).

    This function generates a family of ResNet v1 models from the given block
    definitions. Unlike the stock slim implementation, the global pooling,
    logits and softmax layers are NOT built: the function returns the output
    of the last ResNet block together with the collected end points, and adds
    the aliases 'pool2' .. 'pool5' to the end points.

    For dense prediction tasks the original guidance still applies: inputs
    with spatial dimensions that are multiples of 32 plus 1 (e.g. [321, 321])
    give feature maps whose corners align with the image corners, and
    output_stride=16 is a good choice to increase feature density, cf.
    http://arxiv.org/abs/1606.00915.

    Args:
      inputs: A tensor of size [batch, height_in, width_in, channels].
      blocks: A list of length equal to the number of ResNet blocks. Each
        element is a resnet_utils.Block object describing the units in the
        block.
      num_classes: Unused (the logits layer is not built); kept for
        signature compatibility.
      is_training: whether is training or not; forwarded to slim.batch_norm.
      global_pool: Unused (no global pooling is applied); kept for signature
        compatibility.
      output_stride: If None, then the output will be computed at the nominal
        network stride. If output_stride is not None, it specifies the
        requested ratio of input to output spatial resolution.
      include_root_block: If True, include the initial convolution followed
        by max-pooling, if False excludes it.
      spatial_squeeze: Unused; kept for signature compatibility.
      reuse: whether or not the network and its variables should be reused.
        To be able to reuse 'scope' must be given.
      scope: Optional variable_scope.

    Returns:
      net: The output tensor of the last ResNet block.
      end_points: A dictionary from components of the network to the
        corresponding activation, extended with 'pool3' (block1), 'pool4'
        (block2), 'pool5' (`net`) and, when the root block is built, 'pool2'
        (output of the root max-pool).

    Raises:
      ValueError: If the target output_stride is not valid.
      KeyError: If block1/block2 are found neither under 'resnet_v1_50/' nor
        under 'Detection/resnet_v1_50/'.
    """
    with tf.compat.v1.variable_scope(
            scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with slim.arg_scope([slim.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError(
                                'The output_stride needs to be a multiple of 4.')
                        # The root block already strides by 4. Floor division
                        # is exact here and keeps the stride an integer.
                        output_stride //= 4
                    net = resnet_utils.conv2d_same(
                        net, 64, 7, stride=2, scope='conv1')
                    net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
                    # Register the root max-pool output under the alias 'pool2'.
                    net = slim.utils.collect_named_outputs(
                        end_points_collection, 'pool2', net)

                net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)

                end_points = slim.utils.convert_collection_to_dict(
                    end_points_collection)

                # The block end points are keyed by their full scope name,
                # which gains a 'Detection/' prefix when the network is built
                # inside that outer scope. Only a missing key triggers the
                # fallback; any other error propagates.
                try:
                    end_points['pool3'] = end_points['resnet_v1_50/block1']
                    end_points['pool4'] = end_points['resnet_v1_50/block2']
                except KeyError:
                    end_points['pool3'] = end_points[
                        'Detection/resnet_v1_50/block1']
                    end_points['pool4'] = end_points[
                        'Detection/resnet_v1_50/block2']
                end_points['pool5'] = net

                return net, end_points
def extract_features(images,
                     model_options,
                     weight_decay=0.0001,
                     reuse=None,
                     is_training=False,
                     fine_tune_batch_norm=False,
                     nas_training_hyper_parameters=None):
    """Runs the backbone and, if configured, the ASPP / dense prediction cell.

    Args:
      images: A tensor of size [batch, height, width, channels].
      model_options: A ModelOptions instance to configure models.
      weight_decay: The weight decay for model variables.
      reuse: Reuse the model variables or not.
      is_training: Is training or not.
      fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
      nas_training_hyper_parameters: A dictionary storing hyper-parameters
        for training nas models. Its keys are:
        - `drop_path_keep_prob`: Probability to keep each path in the cell
          when training.
        - `total_training_steps`: Total training steps to help drop path
          probability calculation.

    Returns:
      concat_logits: A tensor of size [batch, feature_height, feature_width,
        feature_channels], where feature_height/feature_width are determined
        by the images height/width and output_stride. When
        `aspp_with_batch_norm` is off these are the raw backbone features.
      end_points: A dictionary from components of the network to the
        corresponding activation.
    """
    features, end_points = feature_extractor.extract_features(
        images,
        output_stride=model_options.output_stride,
        multi_grid=model_options.multi_grid,
        model_variant=model_options.model_variant,
        depth_multiplier=model_options.depth_multiplier,
        divisible_by=model_options.divisible_by,
        weight_decay=weight_decay,
        reuse=reuse,
        is_training=is_training,
        preprocessed_images_dtype=model_options.preprocessed_images_dtype,
        fine_tune_batch_norm=fine_tune_batch_norm,
        nas_architecture_options=model_options.nas_architecture_options,
        nas_training_hyper_parameters=nas_training_hyper_parameters,
        use_bounded_activation=model_options.use_bounded_activation)

    # No batch-normalised ASPP requested: hand back the backbone features.
    if not model_options.aspp_with_batch_norm:
        return features, end_points

    # A dense prediction cell, when configured, replaces the ASPP module.
    if model_options.dense_prediction_cell_config is not None:
        tf.logging.info('Using dense prediction cell config.')
        cell = dense_prediction_cell.DensePredictionCell(
            config=model_options.dense_prediction_cell_config,
            hparams={
                'conv_rate_multiplier': 16 // model_options.output_stride,
            })
        cell_output = cell.build_cell(
            features,
            output_stride=model_options.output_stride,
            crop_size=model_options.crop_size,
            image_pooling_crop_size=model_options.image_pooling_crop_size,
            weight_decay=weight_decay,
            reuse=reuse,
            is_training=is_training,
            fine_tune_batch_norm=fine_tune_batch_norm)
        return cell_output, end_points

    # The following codes employ the DeepLabv3 ASPP module. The ASPP module
    # could be expressed as one particular dense prediction cell, but this
    # path is kept for backward compatibility.
    bn_params = utils.get_batch_norm_params(
        decay=0.9997,
        epsilon=1e-5,
        scale=True,
        is_training=(is_training and fine_tune_batch_norm),
        sync_batch_norm_method=model_options.sync_batch_norm_method)
    batch_norm = utils.get_batch_norm_fn(model_options.sync_batch_norm_method)
    if model_options.use_bounded_activation:
        activation_fn = tf.nn.relu6
    else:
        activation_fn = tf.nn.relu

    with slim.arg_scope(
            [slim.conv2d, slim.separable_conv2d],
            weights_regularizer=slim.l2_regularizer(weight_decay),
            activation_fn=activation_fn,
            normalizer_fn=batch_norm,
            padding='SAME',
            stride=1,
            reuse=reuse):
        with slim.arg_scope([batch_norm], **bn_params):
            depth = model_options.aspp_convs_filters
            branches = []

            if model_options.add_image_level_feature:
                if model_options.crop_size is not None:
                    pooling_crop = model_options.image_pooling_crop_size
                    # If image_pooling_crop_size is not specified, use
                    # crop_size.
                    if pooling_crop is None:
                        pooling_crop = model_options.crop_size
                    pool_height = scale_dimension(
                        pooling_crop[0], 1. / model_options.output_stride)
                    pool_width = scale_dimension(
                        pooling_crop[1], 1. / model_options.output_stride)
                    image_feature = slim.avg_pool2d(
                        features, [pool_height, pool_width],
                        model_options.image_pooling_stride, padding='VALID')
                    resize_height = scale_dimension(
                        model_options.crop_size[0],
                        1. / model_options.output_stride)
                    resize_width = scale_dimension(
                        model_options.crop_size[1],
                        1. / model_options.output_stride)
                else:
                    # If crop_size is None, we simply do global pooling.
                    pool_height = tf.shape(features)[1]
                    pool_width = tf.shape(features)[2]
                    image_feature = tf.reduce_mean(
                        features, axis=[1, 2], keepdims=True)
                    resize_height = pool_height
                    resize_width = pool_width

                pooled_activation = tf.nn.relu
                pooled_normalizer = batch_norm
                if model_options.aspp_with_squeeze_and_excitation:
                    # Squeeze-and-excitation gates with a (q)sigmoid and
                    # skips normalization.
                    pooled_activation = tf.nn.sigmoid
                    if model_options.image_se_uses_qsigmoid:
                        pooled_activation = utils.q_sigmoid
                    pooled_normalizer = None
                image_feature = slim.conv2d(
                    image_feature, depth, 1,
                    activation_fn=pooled_activation,
                    normalizer_fn=pooled_normalizer,
                    scope=IMAGE_POOLING_SCOPE)
                image_feature = _resize_bilinear(
                    image_feature,
                    [resize_height, resize_width],
                    image_feature.dtype)
                # Only non-Tensor sizes can be recorded in the static shape.
                if isinstance(resize_height, tf.Tensor):
                    resize_height = None
                if isinstance(resize_width, tf.Tensor):
                    resize_width = None
                image_feature.set_shape(
                    [None, resize_height, resize_width, depth])
                if not model_options.aspp_with_squeeze_and_excitation:
                    branches.append(image_feature)

            # Employ a 1x1 convolution.
            branches.append(
                slim.conv2d(features, depth, 1, scope=ASPP_SCOPE + str(0)))

            if model_options.atrous_rates:
                # Employ 3x3 convolutions with different atrous rates.
                for index, rate in enumerate(model_options.atrous_rates, 1):
                    branch_scope = ASPP_SCOPE + str(index)
                    if model_options.aspp_with_separable_conv:
                        branch = split_separable_conv2d(
                            features,
                            filters=depth,
                            rate=rate,
                            weight_decay=weight_decay,
                            scope=branch_scope)
                    else:
                        branch = slim.conv2d(
                            features, depth, 3, rate=rate, scope=branch_scope)
                    branches.append(branch)

            # Merge branch logits.
            merged = tf.concat(branches, 3)
            if model_options.aspp_with_concat_projection:
                merged = slim.conv2d(
                    merged, depth, 1, scope=CONCAT_PROJECTION_SCOPE)
                merged = slim.dropout(
                    merged,
                    keep_prob=0.9,
                    is_training=is_training,
                    scope=CONCAT_PROJECTION_SCOPE + '_dropout')
            if (model_options.add_image_level_feature and
                    model_options.aspp_with_squeeze_and_excitation):
                # Gate the merged features with the image-level feature.
                merged *= image_feature

            return merged, end_points
def encoder(images, style_size=8, keep_prob=1.0, phase_train=True,
            weight_decay=0.0, reuse=None, scope='Encoders'):
    """Encode images into a content feature map and a style vector.

    Two sub-networks share the same three-layer convolutional stem layout:
    'StyleEncoder' (no normalization, global average pooling and a linear
    layer of width `style_size`) and 'ContentEncoder' (instance-normalized,
    followed by three residual blocks). Layer shapes are printed while the
    graph is built.

    Args:
        images: Input image tensor.
        style_size: Width of the style vector.
        keep_prob: Unused.
        phase_train: Forwarded as `is_training` to slim.dropout and
            slim.batch_norm.
        weight_decay: L2 weight decay; the regularizer uses half this value.
        reuse: Forwarded to the enclosing variable scope.
        scope: Name of the enclosing variable scope.

    Returns:
        A tuple `(encoded, style_vec)`: the content encoder output and the
        style vector (an identity op named 'style_vec').
    """
    base_width = 64
    layer_ops = [slim.conv2d, slim.conv2d_transpose, slim.fully_connected]

    def static_dims(tensor):
        # Static shape as a plain list, for logging.
        return [dim.value for dim in tensor.shape]

    def conv_stem(net):
        # conv0 keeps resolution; conv1 and conv2 each use stride 2.
        stem_layers = (
            ('conv0', base_width, 7, {'stride': 1, 'pad': 3}),
            ('conv1', 2 * base_width, 4, {'stride': 2}),
            ('conv2', 4 * base_width, 4, {'stride': 2}),
        )
        for layer_name, channels, kernel, extra in stem_layers:
            net = conv(net, channels, kernel, scope=layer_name, **extra)
            print('module {} shape:'.format(layer_name), static_dims(net))
        return net

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        with slim.arg_scope(
                layer_ops,
                activation_fn=tf.nn.relu,
                weights_initializer=(
                    tf.compat.v1.keras.initializers.VarianceScaling(scale=2.0)),
                weights_regularizer=tf.keras.regularizers.l2(
                    0.5 * (weight_decay))):
            with slim.arg_scope([slim.dropout, slim.batch_norm],
                                is_training=phase_train):
                with slim.arg_scope([slim.fully_connected],
                                    normalizer_fn=layer_norm,
                                    normalizer_params=None):
                    print('{} input shape:'.format(scope), static_dims(images))

                    # Not used below; kept so the built graph is unchanged.
                    _unused_batch_size = tf.shape(input=images)[0]

                    with tf.compat.v1.variable_scope('StyleEncoder'):
                        with slim.arg_scope(layer_ops,
                                            normalizer_fn=None,
                                            normalizer_params=None):
                            print('-- StyleEncoder')
                            style_net = conv_stem(images)

                            style_net = slim.avg_pool2d(
                                style_net, style_net.shape[1:3],
                                padding='VALID', scope='global_pool')
                            style_net = slim.flatten(style_net)

                            style_vec = slim.fully_connected(
                                style_net, style_size, activation_fn=None,
                                normalizer_fn=None, scope='fc1')
                            # Reports the flattened input to fc1, as before.
                            print('module fc1 shape:', static_dims(style_net))
                            style_vec = tf.identity(style_vec,
                                                    name='style_vec')

                    # Transform textures
                    with tf.compat.v1.variable_scope('ContentEncoder'):
                        with slim.arg_scope(layer_ops,
                                            normalizer_fn=instance_norm,
                                            normalizer_params=None):
                            print('-- ContentEncoder')
                            content_net = conv_stem(images)

                            for block in range(3):
                                branch = conv(content_net, 4 * base_width, 3,
                                              scope='res{}_0'.format(block))
                                content_net = content_net + conv(
                                    branch, 4 * base_width, 3,
                                    activation_fn=None,
                                    biases_initializer=None,
                                    scope='res{}_1'.format(block))
                                print('module res{} shape:'.format(block),
                                      static_dims(content_net))

                            return content_net, style_vec
def refine_by_decoder(features,
                      end_points,
                      crop_size=None,
                      decoder_output_stride=None,
                      decoder_use_separable_conv=False,
                      decoder_use_sum_merge=False,
                      decoder_filters=256,
                      decoder_output_is_logits=False,
                      model_variant=None,
                      weight_decay=0.0001,
                      reuse=None,
                      is_training=False,
                      fine_tune_batch_norm=False,
                      use_bounded_activation=False,
                      sync_batch_norm_method='None'):
    """Adds the decoder to obtain sharper segmentation results.

    For every requested output stride, the named low-level end points are
    projected with a 1x1 convolution, resized together with the running
    decoder features to the target size, and merged (by sum or by concat).

    Args:
      features: A tensor of size [batch, features_height, features_width,
        features_channels].
      end_points: A dictionary from components of the network to the
        corresponding activation.
      crop_size: A tuple [crop_height, crop_width] specifying whole patch
        crop size.
      decoder_output_stride: A list of integers specifying the output stride
        of low-level features used in the decoder module. It is iterated
        directly, so it must not be None.
      decoder_use_separable_conv: Employ separable convolution for decoder or
        not.
      decoder_use_sum_merge: Boolean, decoder uses simple sum merge or not.
      decoder_filters: Integer, decoder filter size.
      decoder_output_is_logits: Boolean, using decoder output as logits or
        not.
      model_variant: Model variant for feature extraction.
      weight_decay: The weight decay for model variables.
      reuse: Reuse the model variables or not.
      is_training: Is training or not.
      fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
      use_bounded_activation: Whether or not to use bounded activations.
        Bounded activations better lend themselves to quantized inference.
      sync_batch_norm_method: String, method used to sync batch norm.
        Currently only support `None` (no sync batch norm) and `tpu` (use tpu
        code to sync batch norm).

    Returns:
      Decoder output with size [batch, decoder_height, decoder_width,
        decoder_channels].

    Raises:
      ValueError: If crop_size is None.
    """
    if crop_size is None:
        raise ValueError('crop_size must be provided when using decoder.')
    batch_norm_params = utils.get_batch_norm_params(
        decay=0.9997,
        epsilon=1e-5,
        scale=True,
        is_training=(is_training and fine_tune_batch_norm),
        sync_batch_norm_method=sync_batch_norm_method)
    batch_norm = utils.get_batch_norm_fn(sync_batch_norm_method)
    decoder_depth = decoder_filters
    projected_filters = 48
    if decoder_use_sum_merge:
        # When using sum merge, the projected filters must be equal to decoder
        # filters.
        projected_filters = decoder_filters
    if decoder_output_is_logits:
        # Overwrite the setting when decoder output is logits.
        activation_fn = None
        normalizer_fn = None
        conv2d_kernel = 1
        # Use original conv instead of separable conv.
        decoder_use_separable_conv = False
    else:
        # Default setting when decoder output is not logits.
        activation_fn = tf.nn.relu6 if use_bounded_activation else tf.nn.relu
        normalizer_fn = batch_norm
        conv2d_kernel = 3
    with slim.arg_scope(
        [slim.conv2d, slim.separable_conv2d],
        weights_regularizer=slim.l2_regularizer(weight_decay),
        activation_fn=activation_fn,
        normalizer_fn=normalizer_fn,
        padding='SAME',
        stride=1,
        reuse=reuse):
        with slim.arg_scope([batch_norm], **batch_norm_params):
            with tf.variable_scope(DECODER_SCOPE, DECODER_SCOPE, [features]):
                decoder_features = features
                decoder_stage = 0
                scope_suffix = ''
                for output_stride in decoder_output_stride:
                    # Names of the low-level end points for this variant and
                    # output stride.
                    feature_list = feature_extractor.networks_to_feature_maps[
                        model_variant][
                            feature_extractor.DECODER_END_POINTS][output_stride]
                    # If only one decoder stage, we do not change the scope
                    # name in order for backward compatibility.
                    if decoder_stage:
                        scope_suffix = '_{}'.format(decoder_stage)
                    for i, name in enumerate(feature_list):
                        decoder_features_list = [decoder_features]
                        # MobileNet and NAS variants use different naming
                        # convention.
                        if ('mobilenet' in model_variant or
                            model_variant.startswith('mnas') or
                            model_variant.startswith('nas')):
                            feature_name = name
                        else:
                            feature_name = '{}/{}'.format(
                                feature_extractor.name_scope[model_variant],
                                name)
                        # Project the low-level feature with a 1x1 conv.
                        decoder_features_list.append(
                            slim.conv2d(
                                end_points[feature_name],
                                projected_filters,
                                1,
                                scope='feature_projection' + str(i) + scope_suffix))
                        # Determine the output size.
                        decoder_height = scale_dimension(
                            crop_size[0], 1.0 / output_stride)
                        decoder_width = scale_dimension(
                            crop_size[1], 1.0 / output_stride)
                        # Resize to decoder_height/decoder_width.
                        for j, feature in enumerate(decoder_features_list):
                            decoder_features_list[j] = _resize_bilinear(
                                feature, [decoder_height, decoder_width],
                                feature.dtype)
                            # Only non-Tensor sizes can be recorded in the
                            # static shape.
                            h = (None if isinstance(decoder_height, tf.Tensor)
                                 else decoder_height)
                            w = (None if isinstance(decoder_width, tf.Tensor)
                                 else decoder_width)
                            decoder_features_list[j].set_shape(
                                [None, h, w, None])
                        if decoder_use_sum_merge:
                            decoder_features = _decoder_with_sum_merge(
                                decoder_features_list,
                                decoder_depth,
                                conv2d_kernel=conv2d_kernel,
                                decoder_use_separable_conv=
                                decoder_use_separable_conv,
                                weight_decay=weight_decay,
                                scope_suffix=scope_suffix)
                        else:
                            # NOTE(review): scope_suffix is rebound here and is
                            # not reset inside this inner loop, so with several
                            # entries in feature_list the index prefix
                            # accumulates ('0', then '10', ...). Variable names
                            # depend on it -- confirm before changing.
                            if not decoder_use_separable_conv:
                                scope_suffix = str(i) + scope_suffix
                            decoder_features = _decoder_with_concat_merge(
                                decoder_features_list,
                                decoder_depth,
                                decoder_use_separable_conv=
                                decoder_use_separable_conv,
                                weight_decay=weight_decay,
                                scope_suffix=scope_suffix)
                    decoder_stage += 1
                return decoder_features
def deconv(x, *args, pad=1, **kwargs):
    """Pad `x` with `padding(x, pad)`, then apply a VALID transposed conv.

    Args:
        x: Input tensor.
        *args: Positional arguments forwarded to `slim.conv2d_transpose`.
        pad: Amount passed to the `padding` helper.
        **kwargs: Keyword arguments forwarded to `slim.conv2d_transpose`.

    Returns:
        The output of `slim.conv2d_transpose` on the padded input.
    """
    valid_ops = [slim.conv2d, slim.conv2d_transpose]
    with slim.arg_scope(valid_ops, padding='VALID'):
        padded = padding(x, pad)
        result = slim.conv2d_transpose(padded, *args, **kwargs)
    return result
def get_branch_logits(features,
                      num_classes,
                      atrous_rates=None,
                      aspp_with_batch_norm=False,
                      kernel_size=1,
                      weight_decay=0.0001,
                      reuse=None,
                      scope_suffix=''):
    """Gets the logits from each model's branch and sums them.

    The underlying model is branched out in the last layer when atrous
    spatial pyramid pooling is employed, and all branches are sum-merged to
    form the final logits.

    Args:
      features: A float tensor of shape [batch, height, width, channels].
      num_classes: Number of classes to predict.
      atrous_rates: A list of atrous convolution rates for last layer.
      aspp_with_batch_norm: Use batch normalization layers for ASPP.
      kernel_size: Kernel size for convolution.
      weight_decay: Weight decay for the model variables.
      reuse: Reuse model variables or not.
      scope_suffix: Scope suffix for the model variables.

    Returns:
      Merged logits with shape [batch, height, width, num_classes].

    Raises:
      ValueError: Upon invalid input kernel_size value.
    """
    # With batch-normalised ASPP the pyramid has already been applied in
    # extract_features, so a single 1x1 convolution is all that is left.
    single_branch = aspp_with_batch_norm or atrous_rates is None
    if single_branch:
        if kernel_size != 1:
            raise ValueError(
                'Kernel size must be 1 when atrous_rates is None or '
                'using aspp_with_batch_norm. Gets %d.' % kernel_size)
        atrous_rates = [1]

    with slim.arg_scope(
            [slim.conv2d],
            weights_regularizer=slim.l2_regularizer(weight_decay),
            weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
            reuse=reuse):
        with tf.variable_scope(LOGITS_SCOPE_NAME, LOGITS_SCOPE_NAME,
                               [features]):
            # The first branch uses the bare suffix; later ones append _<i>.
            per_branch = [
                slim.conv2d(
                    features,
                    num_classes,
                    kernel_size=kernel_size,
                    rate=rate,
                    activation_fn=None,
                    normalizer_fn=None,
                    scope=(scope_suffix if index == 0
                           else scope_suffix + '_%d' % index))
                for index, rate in enumerate(atrous_rates)
            ]
            return tf.add_n(per_branch)
def conv_hyperparams_fn(self):
  """Returns the scope yielded by a `slim.arg_scope` over an empty op list.

  No ops are listed and no defaults are supplied, so this adds no
  hyperparameter overrides of its own.

  Returns:
    The object bound by `with slim.arg_scope([]) as ...`.
  """
  with slim.arg_scope([]) as empty_scope:
    captured_scope = empty_scope
  return captured_scope
def inception_v4_base(inputs, final_endpoint='Mixed_7d', scope=None):
  """Builds the Inception V4 trunk up to and including `final_endpoint`.

  Besides the canonical endpoint names, the outputs of 'Conv2d_1a_3x3',
  'Mixed_3a', 'Mixed_5a', 'Mixed_6a' and 'Mixed_7a' are also stored under the
  aliases "pool1" .. "pool5" respectively (only for stages that get built).

  Args:
    inputs: a 4-D tensor of size [batch_size, height, width, 3].
    final_endpoint: name of the last endpoint to construct. One of
      ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'Mixed_3a',
      'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_5e',
      'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e', 'Mixed_6f',
      'Mixed_6g', 'Mixed_6h', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c', 'Mixed_7d'].
    scope: Optional variable_scope.

  Returns:
    A `(net, end_points)` tuple: the tensor produced at `final_endpoint` and
    a dict mapping every endpoint name built so far to its tensor.

  Raises:
    ValueError: if `final_endpoint` matches none of the endpoints. The check
      only fires after every stage has been constructed.
  """
  end_points = {}

  def _is_final(endpoint_name, tensor):
    """Stores `tensor` as an endpoint; True when construction should stop."""
    end_points[endpoint_name] = tensor
    return endpoint_name == final_endpoint

  with tf.variable_scope(scope, 'InceptionV4', [inputs]), \
       slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                      stride=1, padding='SAME'):
    # 299 x 299 x 3
    net = slim.conv2d(inputs, 32, [3, 3], stride=2, padding='VALID',
                      scope='Conv2d_1a_3x3')
    end_points["pool1"] = net
    if _is_final('Conv2d_1a_3x3', net):
      return net, end_points

    # 149 x 149 x 32
    net = slim.conv2d(net, 32, [3, 3], padding='VALID',
                      scope='Conv2d_2a_3x3')
    if _is_final('Conv2d_2a_3x3', net):
      return net, end_points

    # 147 x 147 x 32
    net = slim.conv2d(net, 64, [3, 3], scope='Conv2d_2b_3x3')
    if _is_final('Conv2d_2b_3x3', net):
      return net, end_points

    # 147 x 147 x 64
    with tf.variable_scope('Mixed_3a'):
      with tf.variable_scope('Branch_0'):
        pool_path = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
                                    scope='MaxPool_0a_3x3')
      with tf.variable_scope('Branch_1'):
        conv_path = slim.conv2d(net, 96, [3, 3], stride=2, padding='VALID',
                                scope='Conv2d_0a_3x3')
      net = tf.concat(axis=3, values=[pool_path, conv_path])
    end_points["pool2"] = net
    if _is_final('Mixed_3a', net):
      return net, end_points

    # 73 x 73 x 160
    with tf.variable_scope('Mixed_4a'):
      with tf.variable_scope('Branch_0'):
        short_path = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1')
        short_path = slim.conv2d(short_path, 96, [3, 3], padding='VALID',
                                 scope='Conv2d_1a_3x3')
      with tf.variable_scope('Branch_1'):
        long_path = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1')
        long_path = slim.conv2d(long_path, 64, [1, 7],
                                scope='Conv2d_0b_1x7')
        long_path = slim.conv2d(long_path, 64, [7, 1],
                                scope='Conv2d_0c_7x1')
        long_path = slim.conv2d(long_path, 96, [3, 3], padding='VALID',
                                scope='Conv2d_1a_3x3')
      net = tf.concat(axis=3, values=[short_path, long_path])
    if _is_final('Mixed_4a', net):
      return net, end_points

    # 71 x 71 x 192
    with tf.variable_scope('Mixed_5a'):
      with tf.variable_scope('Branch_0'):
        conv_path = slim.conv2d(net, 192, [3, 3], stride=2, padding='VALID',
                                scope='Conv2d_1a_3x3')
      with tf.variable_scope('Branch_1'):
        pool_path = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
                                    scope='MaxPool_1a_3x3')
      net = tf.concat(axis=3, values=[conv_path, pool_path])
    end_points["pool3"] = net
    if _is_final('Mixed_5a', net):
      return net, end_points

    # 35 x 35 x 384
    # 4 x Inception-A blocks: Mixed_5b .. Mixed_5e
    for suffix in 'bcde':
      block_scope = 'Mixed_5' + suffix
      net = block_inception_a(net, block_scope)
      if _is_final(block_scope, net):
        return net, end_points

    # 35 x 35 x 384
    # Reduction-A block
    net = block_reduction_a(net, 'Mixed_6a')
    end_points["pool4"] = net
    if _is_final('Mixed_6a', net):
      return net, end_points

    # 17 x 17 x 1024
    # 7 x Inception-B blocks: Mixed_6b .. Mixed_6h
    for suffix in 'bcdefgh':
      block_scope = 'Mixed_6' + suffix
      net = block_inception_b(net, block_scope)
      if _is_final(block_scope, net):
        return net, end_points

    # 17 x 17 x 1024
    # Reduction-B block
    net = block_reduction_b(net, 'Mixed_7a')
    end_points["pool5"] = net
    if _is_final('Mixed_7a', net):
      return net, end_points

    # 8 x 8 x 1536
    # 3 x Inception-C blocks: Mixed_7b .. Mixed_7d
    for suffix in 'bcd':
      block_scope = 'Mixed_7' + suffix
      net = block_inception_c(net, block_scope)
      if _is_final(block_scope, net):
        return net, end_points

  raise ValueError('Unknown final endpoint %s' % final_endpoint)
def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope
  'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32
  and shape [batch_size, num_frames, num_bands], where batch_size is variable
  and num_frames / num_bands are the constants params.NUM_FRAMES and
  params.NUM_BANDS. [num_frames, num_bands] represents a log-mel-scale
  spectrogram patch covering num_bands frequency bands and num_frames time
  frames (where each frame step is usually 10ms). This is produced by
  computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).

  The output is an op named 'vggish/embedding' which produces the activations
  of a params.EMBEDDING_SIZE-wide embedding layer, which is usually the
  penultimate layer when used as part of a full model with a final classifier
  layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embedding'.
  """
  # (output depth, number of stacked convolutions) for each conv/pool stage.
  conv_stages = ((64, 1), (128, 1), (256, 2), (512, 2))

  # Layer defaults:
  # - weights ~ truncated N(0, INIT_STDDEV), biases start at 0, ReLU outputs.
  # - convolutions: 3x3, stride 1, SAME padding.
  # - max-pools: 2x2, stride 2, SAME padding.
  with slim.arg_scope(
      [slim.conv2d, slim.fully_connected],
      weights_initializer=tf.truncated_normal_initializer(
          stddev=params.INIT_STDDEV),
      biases_initializer=tf.zeros_initializer(),
      activation_fn=tf.nn.relu,
      trainable=training):
    with slim.arg_scope([slim.conv2d],
                        kernel_size=[3, 3], stride=1, padding='SAME'):
      with slim.arg_scope([slim.max_pool2d],
                          kernel_size=[2, 2], stride=2, padding='SAME'):
        with tf.variable_scope('vggish'):
          # Input: a batch of 2-D log-mel-spectrogram patches.
          input_shape = (None, params.NUM_FRAMES, params.NUM_BANDS)
          features = tf.placeholder(
              tf.float32, shape=input_shape, name='input_features')
          # Add a trailing channel axis so conv2d() can consume the batch.
          net = tf.reshape(
              features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

          # The VGG stack of alternating convolutions and max-pools.
          for stage, (depth, repeats) in enumerate(conv_stages, start=1):
            conv_scope = 'conv%d' % stage
            if repeats == 1:
              net = slim.conv2d(net, depth, scope=conv_scope)
            else:
              net = slim.repeat(
                  net, repeats, slim.conv2d, depth, scope=conv_scope)
            net = slim.max_pool2d(net, scope='pool%d' % stage)

          # Flatten before entering fully-connected layers.
          net = slim.flatten(net)
          net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
          # The embedding layer.
          net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
          return tf.identity(net, name='embedding')
def model(self,
          images,
          grasp_params,
          num_classes=1,
          is_training=False,
          softmax=False,
          restore=True,
          grasp_param_names=None,
          goal_spatial_fn=None,
          goal_vector_fn=None,
          scope=None,
          reuse=None,
          **kwargs):
  """Creates a tensorflow graph for this model.

  Args:
    images: A list of 4D tensors containing image data. It is unpacked as a
      pair and only the second element is fed to the network.
    grasp_params: A 3D tensor of batch_size x action_batch_size x PARAMS_SIZE
      containing grasp params, or a 2D tensor of batch_size x PARAMS_SIZE.
      A rank-3 input is flattened to 2D before use, the image features are
      tiled `self._action_batch_size` times, and the predictions are reshaped
      back to include the action dimension.
    num_classes: Number of classes to predict in the final layer.
    is_training: If the model is in training or not.
    softmax: If true the final layer is a softmax, logistic otherwise.
    restore: To restore logit weights or not when initializing from a
      checkpoint. Only True is supported.
    grasp_param_names: A dictionary that maps sub-blocks of `grasp_params` to
      names (string). If not None, the naming is used in graph construction.
      A key `block_name` and value (`offset`, `size`,) assign a name to a
      block `grasp_params[:, offset:(offset + size)]`.
    goal_spatial_fn: Optional function, returns a 3-D tensor to merge into
      the features, for instance conditioning the Q function on some goal
      feature map.
    goal_vector_fn: Optional function, returns a 1-D vector to merge into
      features, conditioning Q function on some goal embedding.
    scope: The top-level scope of the tensorflow graph. Only used when
      `self._create_var_scope` is set; defaults to the class name.
    reuse: True, None, or tf.AUTO_REUSE; if True, we go into reuse mode for
      this scope as well as all sub-scopes; if tf.AUTO_REUSE, we create
      variables if they do not exist, and return them otherwise; if None, we
      inherit the parent scope's reuse flag.
    **kwargs: Model-specific arguments. They are discarded.

  Returns:
    A `(logits, end_points)` tuple. `end_points` maps 'pool2', 'fcgrasp',
    'vsum', 'final_conv', 'logits' and 'predictions' to their tensors.

  Raises:
    ValueError: if restore=False as it is currently not supported
  """
  del kwargs
  if not restore:
    raise ValueError("This model doesn't yet support restore=False")

  moving_vars_collection = 'moving_vars'
  bn_params = {
      # Decay for the moving averages.
      'decay': self._batch_norm_decay,
      # epsilon to prevent 0s in variance.
      'epsilon': self._batch_norm_epsilon,
      # collection containing the moving mean and moving variance.
      'variables_collections': {
          'beta': None,
          'gamma': None,
          'moving_mean': [moving_vars_collection],
          'moving_variance': [moving_vars_collection],
      },
      # Whether to scale after normalization.
      'scale': True,
  }
  end_points = {}

  tile_batch = len(grasp_params.shape) == 3
  if tile_batch:

    def _collapse_action_dim(feature):
      """Merges the first two dimensions of a rank-3 megabatch tensor."""
      return tf.reshape(feature, [-1, tf.shape(feature)[2]])

    grasp_params = contrib_framework.nest.map_structure(
        _collapse_action_dim, grasp_params)

  def _run():
    """Forward pass through the network."""
    with slim.arg_scope([slim.dropout], is_training=is_training), \
         slim.arg_scope(
             [slim.conv2d, slim.fully_connected],
             weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
             weights_regularizer=slim.l2_regularizer(self._l2_regularization),
             activation_fn=tf.nn.relu,
             trainable=is_training), \
         slim.arg_scope(
             [slim.conv2d, slim.max_pool2d], stride=1, padding='SAME'), \
         slim.arg_scope(
             [slim.conv2d, slim.fully_connected],
             normalizer_fn=slim.batch_norm,
             normalizer_params=bn_params):
      _, grasp_image = images
      net = slim.conv2d(
          grasp_image,
          64, [6, 6],
          stride=2,
          scope='conv1_1',
          activation_fn=None,
          normalizer_fn=None,
          normalizer_params=None)
      # Old checkpoints (such as those used for tests) did not have scaling
      # on the stand-alone batch norm operations (those not attached to a
      # conv operation), so enabling scale only through arg_scope would break
      # the tests. scale=False is therefore kept on the stand-alone batch
      # norms for now; future users are encouraged to drop it so that
      # batch_norm parameters are consistent through the whole network.
      net = tf.nn.relu(slim.batch_norm(net, scale=False))
      net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool1')
      self.activation_layers.append(net)

      for layer_index in range(2, 2 + self.num_convs[0]):
        net = slim.conv2d(net, 64, [5, 5], scope='conv%d' % layer_index)
        self.activation_layers.append(net)
      net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool2')
      end_points['pool2'] = net
      self.activation_layers.append(net)
      logging.debug('pool2')
      logging.debug(net.get_shape())

      # Pair every grasp-parameter block with the scope name of its layer.
      if grasp_param_names is None:
        named_blocks = [('fcgrasp', grasp_params)]
      else:
        # Note: Creating variables must happen in a deterministic order,
        # otherwise some workers will look for variables on the wrong
        # parameter servers, so the block names are sorted here.
        named_blocks = []
        for block_name in sorted(grasp_param_names):
          offset, size = grasp_param_names[block_name]
          block = tf.slice(grasp_params, [0, offset], [-1, size])
          named_blocks.append((block_name, block))

      block_embeddings = [
          slim.fully_connected(
              block,
              256,
              scope=block_name,
              activation_fn=None,
              normalizer_fn=None,
              normalizer_params=None)
          for block_name, block in named_blocks
      ]
      fcgrasp = tf.add_n(block_embeddings)
      # Stand-alone batch norm: scale=False for the same checkpoint
      # compatibility reason as after conv1_1.
      fcgrasp = tf.nn.relu(slim.batch_norm(fcgrasp, scale=False))
      fcgrasp = slim.fully_connected(fcgrasp, 64, scope='fcgrasp2')
      context = tf.reshape(fcgrasp, [-1, 1, 1, 64])
      end_points['fcgrasp'] = fcgrasp

      # Tile the image embedding action_batch_size times to align with the
      # expanded action dimension; the same image is used with all the
      # actions in an action batch.
      # net pre expansion should be [batch, *, *, *]
      # net post expansion should be [batch x action_batch, *, *, *]
      if tile_batch:
        net = contrib_seq2seq.tile_batch(net, self._action_batch_size)
      net = tf.add(net, context)
      logging.debug('net post add %s', net)
      end_points['vsum'] = net
      self.activation_layers.append(net)
      logging.debug('vsum')
      logging.debug(net.get_shape())

      for layer_index in range(2 + sum(self.num_convs[:1]),
                               2 + sum(self.num_convs[:2])):
        net = slim.conv2d(net, 64, [3, 3], scope='conv%d' % layer_index)
        logging.debug('conv%d', layer_index)
        self.activation_layers.append(net)
        logging.debug(net.get_shape())
      net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool3')
      logging.debug('pool3')
      logging.debug(net.get_shape())
      self.activation_layers.append(net)

      for layer_index in range(2 + sum(self.num_convs[:2]),
                               2 + sum(self.num_convs[:3])):
        net = slim.conv2d(
            net, 64, [3, 3], scope='conv%d' % layer_index, padding='VALID')
        self.activation_layers.append(net)
      logging.debug('final conv')
      logging.debug(net.get_shape())
      end_points['final_conv'] = net

      batch_size = tf.shape(net)[0]
      if goal_spatial_fn is not None:
        goal_spatial = goal_spatial_fn()
        # Tile goal to match net batch size (e.g. CEM).
        goal_batch_size = tf.shape(goal_spatial)[0]
        goal_spatial = tf.tile(
            goal_spatial, [batch_size//goal_batch_size, 1, 1, 1])
        # Merging features in style of Fang 2017.
        net = tf.concat([net, goal_spatial], axis=3)
      net = slim.flatten(net, scope='flatten')
      if goal_vector_fn is not None:
        goal_vector = goal_vector_fn()
        goal_batch_size = tf.shape(goal_vector)[0]
        goal_vector = tf.tile(
            goal_vector, [batch_size//goal_batch_size, 1])
        net = tf.concat([net, goal_vector], axis=1)

      for layer_index in range(self.hid_layers):
        net = slim.fully_connected(net, 64, scope='fc%d' % layer_index)

      logit_scope = 'logit_%d' % num_classes if num_classes > 1 else 'logit'
      logits = slim.fully_connected(
          net,
          num_classes,
          activation_fn=None,
          scope=logit_scope,
          normalizer_fn=None,
          normalizer_params=None)
      end_points['logits'] = logits

      squash = tf.nn.softmax if softmax else tf.nn.sigmoid
      predictions = squash(logits)
      if tile_batch:
        # Restore the action dimension that was collapsed on the way in.
        prediction_shape = [-1, self._action_batch_size]
        if num_classes > 1:
          prediction_shape.append(num_classes)
        predictions = tf.reshape(predictions, prediction_shape)
      end_points['predictions'] = predictions
      return logits, end_points

  bn_scope_kwargs = dict(
      is_training=is_training,
      decay=bn_params['decay'],
      epsilon=bn_params['epsilon'],
      scale=bn_params['scale'])

  if not self._create_var_scope:
    with slim.arg_scope([slim.batch_norm],
                        updates_collections=None,
                        **bn_scope_kwargs):
      logits, end_points = _run()
    return logits, end_points

  # The class name is substituted before entering tf.variable_scope because,
  # per the original author's note, reuse=True does not appear to work with
  # scope=None even when a default_name is passed.
  # TODO(T2R_CONTRIBUTORS): Remove this None check and pass in the class name
  # as the default_name in the tf.variable_scope initialization.
  if scope is None:
    scope = self.__class__.__name__
  with tf.variable_scope(scope, values=[images], reuse=reuse), \
       slim.arg_scope([slim.batch_norm], **bn_scope_kwargs):
    logits, end_points = _run()
  return logits, end_points