def test_aliases(self):
    """Each tensor collected under one alias should report exactly that alias."""
    first = constant_op.constant(1.0, name='t1')
    second = constant_op.constant(2.0, name='t2')
    for alias, tensor in (('a1', first), ('a2', second)):
        utils.collect_named_outputs('end_points', alias, tensor)
    self.assertEqual(first.aliases, ['a1'])
    self.assertEqual(second.aliases, ['a2'])
def test_gather_aliases(self):
    """Gather aliases from a collection holding aliased and un-aliased tensors."""
    aliased_a = constant_op.constant(1.0, name='t1')
    aliased_b = constant_op.constant(2.0, name='t2')
    plain = constant_op.constant(2.0, name='t3')
    utils.collect_named_outputs('end_points', 'a1', aliased_a)
    utils.collect_named_outputs('end_points', 'a2', aliased_b)
    # The third tensor goes into the collection directly, without an alias.
    ops.add_to_collection('end_points', plain)
    collected = ops.get_collection('end_points')
    self.assertEqual(
        utils.gather_tensors_aliases(collected), ['a1', 'a2', 't3'])
def mlp(feature, hparams, name="mlp"):
    """Stack of dense -> layer norm -> relu -> dropout layers.

    Args:
      feature: tensor fed to the first dense layer.
      hparams: hyperparameter object; `num_mlp_layers`, `mlp_size` and
        `dropout` are read from it.
      name: name passed to the enclosing variable scope.

    Returns:
      The tensor produced by the last layer (or `feature` unchanged when
      `hparams.num_mlp_layers` is 0).
    """
    with tf.variable_scope(name, "mlp", values=[feature]):
        layer_count = hparams.num_mlp_layers
        width = hparams.mlp_size
        for _ in range(layer_count):
            projected = common_layers.dense(feature, width, activation=None)
            # Record the norm of the raw projection, before normalization.
            utils.collect_named_outputs(
                "norms", "mlp_feature", tf.norm(projected, axis=-1))
            activated = tf.nn.relu(common_layers.layer_norm(projected))
            feature = tf.nn.dropout(activated, keep_prob=1.-hparams.dropout)
        return feature
def flatten(inputs, outputs_collections=None, scope=None):
    """Reshapes `inputs` to 2-D, keeping the first (batch) dimension.

    Args:
      inputs: a tensor of size [batch_size, ...].
      outputs_collections: collection to add the outputs.
      scope: Optional scope for op_scope.

    Returns:
      a flattened tensor with shape [batch_size, k], where k is the number of
      elements in all dimensions after the first.

    Raises:
      ValueError: if the rank of `inputs` is unknown or below 2, or if any
        dimension after the first is not statically defined.
    """
    with ops.op_scope([inputs], scope, 'Flatten') as sc:
        inputs = ops.convert_to_tensor(inputs)
        static_shape = inputs.get_shape()
        rank = static_shape.ndims
        if rank is None or rank < 2:
            raise ValueError('Inputs must have a least 2 dimensions.')
        # Everything but the batch axis must be known to compute k statically.
        trailing = static_shape[1:]
        if not trailing.is_fully_defined():
            raise ValueError('Inputs 2nd dimension must be defined.')
        flat = array_ops.reshape(inputs, [-1, trailing.num_elements()])
        return utils.collect_named_outputs(outputs_collections, sc, flat)
def one_hot_encoding(labels,
                     num_classes,
                     on_value=1.0,
                     off_value=0.0,
                     outputs_collections=None,
                     scope=None):
    """Converts numeric labels to one-hot vectors.

    Args:
      labels: [batch_size] target labels.
      num_classes: total number of classes.
      on_value: A scalar defining the on-value.
      off_value: A scalar defining the off-value.
      outputs_collections: collection to add the outputs.
      scope: Optional scope for op_scope.

    Returns:
      one hot encoding of the labels.
    """
    with ops.op_scope([labels, num_classes], scope, 'OneHotEncoding') as sc:
        # int32 labels are widened to int64 before encoding.
        needs_cast = labels.dtype == dtypes.int32
        if needs_cast:
            labels = standard_ops.to_int64(labels)
        encoded = standard_ops.one_hot(
            labels, num_classes, on_value=on_value, off_value=off_value)
        return utils.collect_named_outputs(outputs_collections, sc, encoded)
def bottleneck_hole(inputs,
                    depth,
                    depth_bottleneck,
                    stride,
                    rate=2,
                    outputs_collections=None,
                    scope=None):
    """Bottleneck residual unit whose 3x3 convolution receives `rate`.

    The residual path is 1x1 -> 3x3 (with `rate`) -> 1x1, all with stride 1;
    the result is added to the shortcut and passed through relu.

    NOTE(review): `stride` is only applied on the shortcut path. With
    stride > 1 the shortcut and residual would presumably differ in spatial
    size -- confirm that callers always pass 1.

    Args:
      inputs: tensor of rank >= 4; its last dimension is the input depth.
      depth: number of output channels of the unit.
      depth_bottleneck: number of channels of the two inner convolutions.
      stride: stride used for the shortcut.
      rate: value forwarded as `rate` to the 3x3 convolution.
      outputs_collections: collection to add the unit output to.
      scope: optional variable scope.

    Returns:
      The unit output, after being registered under the scope name.
    """
    with variable_scope.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
        input_depth = utils.last_dimension(inputs.get_shape(), min_rank=4)
        if depth != input_depth:
            # Channel count changes: project the input with a 1x1 convolution.
            shortcut = layers.conv2d(
                inputs, depth, [1, 1], stride=stride, activation_fn=None,
                scope='shortcut')
        else:
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')

        branch = layers.conv2d(
            inputs, depth_bottleneck, [1, 1], stride=1, scope='conv1')
        branch = layers_lib.conv2d(
            branch, depth_bottleneck, [3, 3], stride=1, rate=rate,
            padding='SAME', scope='conv2')
        branch = layers.conv2d(
            branch, depth, [1, 1], stride=1, activation_fn=None, scope='conv3')

        unit_output = nn_ops.relu(shortcut + branch)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, unit_output)
def dropout(inputs,
            keep_prob=0.5,
            noise_shape=None,
            is_training=True,
            outputs_collections=None,
            scope=None):
    """Applies dropout to `inputs` when `is_training` evaluates to true.

    With probability `keep_prob`, outputs the input element scaled up by
    `1 / keep_prob`, otherwise outputs `0`. The scaling is so that the
    expected sum is unchanged.

    Args:
      inputs: the tensor to pass to the nn.dropout op.
      keep_prob: A scalar `Tensor` with the same type as x. The probability
        that each element is kept.
      noise_shape: A 1-D `Tensor` of type `int32`, representing the shape for
        randomly generated keep/drop flags.
      is_training: A bool `Tensor` indicating whether or not the model is in
        training mode. If so, dropout is applied and values scaled.
        Otherwise, inputs is returned.
      outputs_collections: collection to add the outputs.
      scope: Optional scope for op_scope.

    Returns:
      a tensor representing the output of the operation.
    """
    with ops.op_scope([inputs], scope, 'Dropout') as sc:
        is_training = ops.convert_to_tensor(is_training)

        def _with_dropout():
            return nn.dropout(inputs, keep_prob, noise_shape)

        def _passthrough():
            return inputs

        # The branch is selected at run time from the `is_training` tensor.
        result = control_flow_ops.cond(is_training, _with_dropout, _passthrough)
        return utils.collect_named_outputs(outputs_collections, sc, result)
def avg_pool2d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               outputs_collections=None,
               scope=None):
    """Adds an average pooling op over the two middle axes of `inputs`.

    It is assumed by the wrapper that the pooling is only done per image and
    not in depth or batch.

    Args:
      inputs: a tensor of size [batch_size, height, width, depth].
      kernel_size: a list of length 2: [kernel_height, kernel_width] of the
        pooling kernel over which the op is computed. Can be an int if both
        values are the same.
      stride: a list of length 2: [stride_height, stride_width].
        Can be an int if both strides are the same. Note that presently
        both strides must have the same value.
      padding: the padding method, either 'VALID' or 'SAME'.
      outputs_collections: collection to add the outputs.
      scope: Optional scope for op_scope.

    Returns:
      a tensor representing the results of the pooling operation.
    """
    with ops.op_scope([inputs], scope, 'AvgPool2D') as sc:
        kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
        stride_h, stride_w = utils.two_element_tuple(stride)
        # Window and stride are 1 on the first and last axes.
        window = [1, kernel_h, kernel_w, 1]
        steps = [1, stride_h, stride_w, 1]
        pooled = nn.avg_pool(
            inputs, ksize=window, strides=steps, padding=padding)
        return utils.collect_named_outputs(outputs_collections, sc, pooled)
def test_multiple_aliases(self):
    """A tensor collected twice should list both aliases in insertion order."""
    first = tf.constant(1.0, name='t1')
    second = tf.constant(2.0, name='t2')
    registrations = (
        ('a11', first),
        ('a12', first),
        ('a21', second),
        ('a22', second),
    )
    for alias, tensor in registrations:
        utils.collect_named_outputs('end_points', alias, tensor)
    self.assertEqual(first.aliases, ['a11', 'a12'])
    self.assertEqual(second.aliases, ['a21', 'a22'])
def bottleneck(inputs,
               depth,
               depth_bottleneck,
               stride,
               rate=1,
               outputs_collections=None,
               scope=None):
    """Pre-activation bottleneck residual unit (BN + relu before the convs).

    This is the full preactivation residual unit variant proposed in [2]. See
    Fig. 1(b) of [2] for its definition. Note that we use here the bottleneck
    variant which has an extra bottleneck layer.

    When putting together two consecutive ResNet blocks that use this unit,
    one should use stride = 2 in the last unit of the first block.

    Args:
      inputs: A tensor of size [batch, height, width, channels].
      depth: The depth of the ResNet unit output.
      depth_bottleneck: The depth of the bottleneck layers.
      stride: The ResNet unit's stride. Determines the amount of downsampling
        of the units output compared to its input.
      rate: An integer, rate for atrous convolution.
      outputs_collections: Collection to add the ResNet unit output.
      scope: Optional variable_scope.

    Returns:
      The ResNet unit's output.
    """
    with variable_scope.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
        input_depth = utils.last_dimension(inputs.get_shape(), min_rank=4)
        preact = layers.batch_norm(
            inputs, activation_fn=nn_ops.relu, scope='preact')

        if depth != input_depth:
            # Depth changes: the shortcut is a 1x1 projection of the
            # pre-activated input, without normalizer or activation.
            shortcut = layers_lib.conv2d(
                preact, depth, [1, 1], stride=stride, normalizer_fn=None,
                activation_fn=None, scope='shortcut')
        else:
            # Same depth: the shortcut is the (subsampled) raw input.
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')

        branch = layers_lib.conv2d(
            preact, depth_bottleneck, [1, 1], stride=1, scope='conv1')
        branch = resnet_utils.conv2d_same(
            branch, depth_bottleneck, 3, stride, rate=rate, scope='conv2')
        branch = layers_lib.conv2d(
            branch, depth, [1, 1], stride=1, normalizer_fn=None,
            activation_fn=None, scope='conv3')

        unit_output = shortcut + branch
        return utils.collect_named_outputs(
            outputs_collections, sc.name, unit_output)
def image_encoder(image_feat,
                  hparams,
                  name="image_encoder",
                  save_weights_to=None,
                  make_image_summary=True):
    """A stack of self-attention + feed-forward layers over image features.

    The norm (over the last axis) of each sub-layer output, before and after
    `layer_postprocess`, is registered in the "norms" collection.

    Args:
      image_feat: input tensor for the first layer.
      hparams: hyperparameter object read throughout the stack.
      name: name of the enclosing variable scope.
      save_weights_to: forwarded to `vqa_layers.multihead_attention`.
      make_image_summary: forwarded to `vqa_layers.multihead_attention`.

    Returns:
      `common_layers.layer_preprocess` applied to the last layer's output.
    """

    def _record_norm(alias, tensor):
        # Register the last-axis norm of `tensor` under `alias`.
        utils.collect_named_outputs("norms", alias, tf.norm(tensor, axis=-1))

    x = image_feat
    image_hidden_size = hparams.image_hidden_size or hparams.hidden_size
    image_filter_size = hparams.image_filter_size or hparams.filter_size
    with tf.variable_scope(name):
        num_layers = hparams.num_encoder_layers or hparams.num_hidden_layers
        for layer in range(num_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = vqa_layers.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        None,
                        hparams.attention_key_channels or image_hidden_size,
                        hparams.attention_value_channels or image_hidden_size,
                        image_hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.image_self_attention_type,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        scale_dotproduct=hparams.scale_dotproduct,
                    )
                    _record_norm("image_feat_self_attention_%d"%(layer), y)
                    x = common_layers.layer_postprocess(x, y, hparams)
                    _record_norm(
                        "image_feat_self_attention_postprocess_%d"%(layer), x)
                with tf.variable_scope("ffn"):
                    y = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        image_filter_size,
                        image_hidden_size,
                        dropout=hparams.relu_dropout,
                    )
                    _record_norm("image_feat_ffn_%d"%(layer), y)
                    x = common_layers.layer_postprocess(x, y, hparams)
                    _record_norm("image_feat_ffn_postprocess_%d"%(layer), x)
        # If normalization is done in layer_preprocess, then it should also be
        # done on the output, since the output can grow very large, being the
        # sum of a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
def body(self, features):
    """Encodes image + question and decodes a single learned query.

    Reads `features["inputs"]` and `features["question"]`, runs
    `recurrent_transformer_decoder` once as the encoder and once as the
    decoder, and registers several tensor norms in the "norms" collection,
    which are then summarized.

    Args:
      features: dict providing the "inputs" and "question" tensors.

    Returns:
      The decoder output with a singleton axis inserted at position 1.
    """
    hp = self.hparams

    def _record_norm(alias, tensor):
        # Register the last-axis norm of `tensor` under `alias`.
        utils.collect_named_outputs("norms", alias, tf.norm(tensor, axis=-1))

    # pylint: disable=eval-used
    if hp.image_input_type != "image":
        image_feat = features["inputs"]
    else:
        # NOTE(review): `eval` resolves the model function from a
        # hyperparameter string; it must never receive untrusted input.
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)

    image_feat = common_layers.flatten4d3d(image_feat)
    image_feat = common_layers.dense(image_feat, hp.hidden_size)
    _record_norm("image_feat_after_proj", image_feat)

    question = common_layers.flatten4d3d(features["question"])
    _record_norm("question_embedding", question)

    prepared = prepare_image_question_encoder(image_feat, question, hp)
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = prepared

    encoder_input = tf.nn.dropout(
        encoder_input, keep_prob=1.-hp.layer_prepostprocess_dropout)

    encoder_output, _ = recurrent_transformer_decoder(
        encoder_input, None, encoder_self_attention_bias, None,
        hp, name="encoder")
    _record_norm("encoder_output", encoder_output)

    # scale query by sqrt(hidden_size)
    query = tf.get_variable("query", [hp.hidden_size]) * hp.hidden_size **0.5
    # Add two leading singleton axes, then repeat along the first one to
    # match the batch size of the encoder input.
    query = tf.expand_dims(tf.expand_dims(query, axis=0), axis=0)
    batch_size = common_layers.shape_list(encoder_input)[0]
    query = tf.tile(query, [batch_size, 1, 1])
    query = tf.nn.dropout(
        query, keep_prob=1.-hp.layer_prepostprocess_dropout)

    decoder_output, _ = recurrent_transformer_decoder(
        query, encoder_output, None, encoder_decoder_attention_bias,
        hp, name="decoder")
    _record_norm("decoder_output", decoder_output)

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Insert a singleton axis at position 1.
    return tf.expand_dims(decoder_output, axis=1)
def l2_normalization(
        inputs,
        scaling=False,
        scale_initializer=init_ops.ones_initializer(),
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None):
    """L2-normalizes `inputs` over axes 1 .. rank-2 (spatial normalization).

    Should be extended in some near future to other dimensions, providing a
    more flexible normalization framework.

    Args:
      inputs: a 4-D tensor with dimensions [batch_size, height, width,
        channels].
      scaling: whether or not to add a post scaling operation along the
        dimensions which have been normalized.
      scale_initializer: An initializer for the weights.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: optional list of collections for all the
        variables or a dictionary containing a different list of collection
        per variable.
      outputs_collections: collection to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_scope`.

    Returns:
      A `Tensor` representing the output of the operation.
    """
    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
        static_shape = inputs.get_shape()
        rank = static_shape.ndims
        # One scale value per entry of the last dimension.
        gamma_shape = static_shape[-1:]
        base_dtype = inputs.dtype.base_dtype

        # Every axis except the first and the last is normalized.
        norm_axes = tf.range(1, rank-1)
        normalized = nn.l2_normalize(inputs, norm_axes, epsilon=1e-12)

        if scaling:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            gamma = variables.model_variable(
                'gamma',
                shape=gamma_shape,
                dtype=base_dtype,
                initializer=scale_initializer,
                collections=gamma_collections,
                trainable=trainable)
            normalized = tf.multiply(normalized, gamma)

        return utils.collect_named_outputs(
            outputs_collections, sc.original_name_scope, normalized)
def test_convert_collection_to_dict_clear_collection(self):
    """Converting with `clear_collection=True` should empty the collection."""
    first = constant_op.constant(1.0, name='t1')
    second = constant_op.constant(2.0, name='t2')
    for alias, tensor in (('a1', first), ('a21', second), ('a22', second)):
        utils.collect_named_outputs('end_points', alias, tensor)
    utils.convert_collection_to_dict('end_points', clear_collection=True)
    remaining = ops.get_collection('end_points')
    self.assertEqual(remaining, [])
def bias_add(inputs,
             activation_fn=None,
             initializer=init_ops.zeros_initializer,
             regularizer=None,
             reuse=None,
             variables_collections=None,
             outputs_collections=None,
             trainable=True,
             scope=None):
    """Adds a learned bias (one value per last-dimension entry) to `inputs`.

    Can be used as a normalizer function for conv2d and fully_connected.

    Args:
      inputs: a tensor of with at least rank 2 and value for the last
        dimension, e.g. `[batch_size, depth]`, `[None, None, None, depth]`.
      activation_fn: Optional activation function.
      initializer: An initializer for the bias, defaults to 0.
      regularizer: A regularizer like the result of
        `l1_regularizer` or `l2_regularizer`.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: optional collections for the variables.
      outputs_collections: collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_op_scope.

    Returns:
      a tensor representing the result of adding biases to the inputs.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'BiasAdd', reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        base_dtype = inputs.dtype.base_dtype
        depth = utils.last_dimension(inputs.get_shape(), min_rank=2)
        bias_collections = utils.get_variable_collections(
            variables_collections, 'biases')
        bias = variables.model_variable(
            'biases',
            shape=[depth,],
            dtype=base_dtype,
            initializer=initializer,
            regularizer=regularizer,
            collections=bias_collections,
            trainable=trainable)
        result = nn.bias_add(inputs, bias)
        if activation_fn:
            result = activation_fn(result)
        return utils.collect_named_outputs(outputs_collections, sc.name, result)
def test_convert_collection_to_dict(self):
    """Every alias should map back to the tensor it was registered with."""
    first = constant_op.constant(1.0, name='t1')
    second = constant_op.constant(2.0, name='t2')
    registrations = (('a1', first), ('a21', second), ('a22', second))
    for alias, tensor in registrations:
        utils.collect_named_outputs('end_points', alias, tensor)
    end_points = utils.convert_collection_to_dict('end_points')
    for alias, tensor in registrations:
        self.assertEqual(end_points[alias], tensor)
def l2_normalization(
        inputs,
        scaling=False,  # Scaling after normalization
        scale_initializer=init_ops.ones_initializer(),
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        data_format='NHWC',
        trainable=True,
        scope=None):
    """L2-normalizes `inputs` along the channel axis, with optional scaling.

    Args:
      inputs: a tensor laid out as 'NHWC' or 'NCHW' (see `data_format`).
      scaling: whether to multiply the normalized output by a learned
        per-channel variable named 'gamma'.
      scale_initializer: initializer for the 'gamma' variable.
      reuse: forwarded to the variable scope.
      variables_collections: collections for the 'gamma' variable.
      outputs_collections: collection to add the output to.
      data_format: 'NHWC' (channel is the last axis) or 'NCHW' (channel is
        axis 1).
      trainable: whether 'gamma' is trainable.
      scope: optional variable scope.

    Returns:
      The normalized (and optionally scaled) tensor.

    Raises:
      ValueError: if `data_format` is neither 'NHWC' nor 'NCHW'.
    """
    # Reject unknown layouts up front; otherwise `norm_dim`/`params_shape`
    # would be left unbound and fail later with an UnboundLocalError.
    if data_format not in ('NHWC', 'NCHW'):
        raise ValueError('data_format has to be either NCHW or NHWC.')

    with variable_scope.variable_scope(
            scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        dtype = inputs.dtype.base_dtype
        if data_format == 'NHWC':
            # Normalize over the last axis only.
            norm_dim = tf.range(inputs_rank - 1, inputs_rank)
            params_shape = inputs_shape[-1:]
        else:  # 'NCHW'
            # Normalize over axis 1 only.
            norm_dim = tf.range(1, 2)
            params_shape = inputs_shape[1]

        outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)

        if scaling:
            scale_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            scale = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=scale_initializer,
                collections=scale_collections,
                trainable=trainable)
            if data_format == 'NCHW':
                # Append two trailing singleton axes so the per-channel scale
                # lines up with axis 1 when broadcast against the output.
                scale = tf.expand_dims(scale, axis=-1)
                scale = tf.expand_dims(scale, axis=-1)
            outputs = tf.multiply(outputs, scale)

        return utils.collect_named_outputs(
            outputs_collections, sc.original_name_scope, outputs)
def naive(inputs,
          depth,
          stride,
          rate=1,
          outputs_collections=None,
          scope=None):
    """Residual unit made of two 3x3 convolutions plus a shortcut.

    The residual path is a strided `conv2d_same` followed by a 3x3
    convolution without activation; the sum with the shortcut goes through
    relu.

    When putting together two consecutive ResNet blocks that use this unit,
    one should use stride = 2 in the last unit of the first block.

    Args:
      inputs: A tensor of size [batch, height, width, channels].
      depth: The depth of the ResNet unit output.
      stride: The ResNet unit's stride. Determines the amount of downsampling
        of the units output compared to its input.
      rate: An integer, rate for atrous convolution.
      outputs_collections: Collection to add the ResNet unit output.
      scope: Optional variable_scope.

    Returns:
      The ResNet unit's output.
    """
    with variable_scope.variable_scope(scope, 'naive_v1', [inputs]) as sc:
        input_depth = utils.last_dimension(inputs.get_shape(), min_rank=4)
        if depth != input_depth:
            # Depth changes: project the input with a 1x1 convolution.
            shortcut = layers.conv2d(
                inputs, depth, [1, 1], stride=stride, activation_fn=None,
                scope='shortcut')
        else:
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')

        branch = resnet_utils.conv2d_same(
            inputs, depth, 3, stride, rate=rate, scope='conv1')
        branch = layers.conv2d(
            branch, depth, [3, 3], stride=1, activation_fn=None, scope='conv2')

        unit_output = nn_ops.relu(shortcut + branch)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, unit_output)
def bias_add(inputs,
             activation_fn=None,
             initializer=init_ops.zeros_initializer,
             regularizer=None,
             reuse=None,
             variables_collections=None,
             outputs_collections=None,
             trainable=True,
             scope=None):
    """Adds a learned bias (one value per last-dimension entry) to `inputs`.

    Can be used as a normalizer function for conv2d and fully_connected.
    `inputs` is used as-is: it must already expose `dtype` and `get_shape()`.

    Args:
      inputs: a tensor of with at least rank 2 and value for the last
        dimension, e.g. `[batch_size, depth]`, `[None, None, None, depth]`.
      activation_fn: Optional activation function.
      initializer: An initializer for the bias, defaults to 0.
      regularizer: A regularizer like the result of
        `l1_regularizer` or `l2_regularizer`.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: optional collections for the variables.
      outputs_collections: collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_op_scope.

    Returns:
      a tensor representing the result of adding biases to the inputs.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'BiasAdd', reuse=reuse) as sc:
        base_dtype = inputs.dtype.base_dtype
        depth = utils.last_dimension(inputs.get_shape(), min_rank=2)
        bias_collections = utils.get_variable_collections(
            variables_collections, 'biases')
        bias = variables.model_variable(
            'biases',
            shape=[depth,],
            dtype=base_dtype,
            initializer=initializer,
            regularizer=regularizer,
            collections=bias_collections,
            trainable=trainable)
        result = nn.bias_add(inputs, bias)
        if activation_fn:
            result = activation_fn(result)
        return utils.collect_named_outputs(outputs_collections, sc.name, result)
def spatial_normalization(self, inputs):
    """L2-normalizes `inputs` along its last axis and applies a learned scale.

    Args:
      inputs: tensor with a statically known rank.

    Returns:
      The normalized tensor multiplied by the trainable 'gamma' variable,
      which has one entry per last-dimension element and starts at ones.
    """
    with variable_scope.variable_scope(
            None, 'L2Normalization', [inputs], reuse=None) as sc:
        static_shape = inputs.get_shape()
        rank = static_shape.ndims
        # Only the last axis is normalized.
        last_axis = tf.range(rank-1, rank)
        gamma_shape = static_shape[-1:]
        normalized = nn.l2_normalize(inputs, last_axis, epsilon=1e-12)

        # Additional scaling.
        gamma_collections = utils.get_variable_collections(None, 'scale')
        gamma = variables.model_variable(
            'gamma',
            shape=gamma_shape,
            dtype=inputs.dtype.base_dtype,
            initializer=init_ops.ones_initializer(),
            collections=gamma_collections,
            trainable=True)
        scaled = tf.multiply(normalized, gamma)
        return utils.collect_named_outputs(
            None, sc.original_name_scope, scaled)
def bottleneck(inputs, depth, depth_bottleneck, stride,
               outputs_collections=None, scope=None):
    """Pre-activation bottleneck residual unit built from slim layers.

    Args:
      inputs: tensor of rank >= 4; its last dimension is the input depth.
      depth: number of output channels of the unit.
      depth_bottleneck: number of channels of the two inner convolutions.
      stride: stride of the shortcut and of the 3x3 convolution.
      outputs_collections: collection to add the unit output to.
      scope: optional variable scope.

    Returns:
      The unit output, after being registered under the scope name.
    """
    with tf.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
        # Last dimension of the input, i.e. its channel count.
        input_depth = utils.last_dimension(inputs.get_shape(), min_rank=4)
        # Batch-normalize the input, then pre-activate it with relu.
        preact = slim.batch_norm(
            inputs, activation_fn=tf.nn.relu, scope='preact')

        if depth != input_depth:
            # Input and output channel counts differ: use a 1x1 convolution
            # with the unit stride so the shortcut has `depth` channels.
            shortcut = slim.conv2d(
                preact, depth, [1, 1], stride=stride, normalizer_fn=None,
                activation_fn=None, scope='shortcut')
        else:
            # Same channel count: just subsample the raw input.
            shortcut = subsample(inputs, stride, 'shortcut')

        # Residual path:
        #   1. 1x1 convolution, stride 1, `depth_bottleneck` channels
        #   2. 3x3 convolution, stride `stride`, `depth_bottleneck` channels
        #   3. 1x1 convolution, stride 1, `depth` channels
        branch = slim.conv2d(
            preact, depth_bottleneck, [1, 1], stride=1, scope='conv1')
        branch = slim.conv2d(
            branch, depth_bottleneck, [3, 3], stride=stride, scope='conv2')
        branch = slim.conv2d(branch, depth, [1, 1], stride=1, scope='conv3')

        unit_output = shortcut + branch
        # Register the result in `outputs_collections` under the scope name.
        return utils.collect_named_outputs(
            outputs_collections, sc.name, unit_output)
def bottleneck(inputs,
               depth,
               depth_bottleneck,
               stride,
               rate=1,
               outputs_collections=None,
               scope=None):
    """Pre-activation bottleneck unit with extra BN + relu before each conv.

    On top of the 'preact' batch norm, the residual path applies
    `tf.layers.batch_normalization` followed by relu before each of its three
    convolutions.

    Args:
      inputs: tensor of rank >= 4; its last dimension is the input depth.
      depth: number of output channels of the unit.
      depth_bottleneck: number of channels of the two inner convolutions.
      stride: stride of the shortcut and of the 3x3 convolution.
      rate: value forwarded as `rate` to `conv2d_same`.
      outputs_collections: collection to add the unit output to.
      scope: optional variable scope.

    Returns:
      The unit output, after being registered under the scope name.
    """

    def _bn_relu(tensor):
        # Batch-normalize, then apply relu.
        return tf.nn.relu(tf.layers.batch_normalization(tensor))

    with variable_scope.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
        input_depth = utils.last_dimension(inputs.get_shape(), min_rank=4)
        preact = layers.batch_norm(
            inputs, activation_fn=nn_ops.relu, scope='preact')

        if depth != input_depth:
            shortcut = layers_lib.conv2d(
                preact, depth, [1, 1], stride=stride, normalizer_fn=None,
                activation_fn=None, scope='shortcut')
        else:
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')

        branch = _bn_relu(preact)
        branch = layers_lib.conv2d(
            branch, depth_bottleneck, [1, 1], stride=1, scope='conv1')
        branch = _bn_relu(branch)
        branch = resnet_utils.conv2d_same(
            branch, depth_bottleneck, 3, stride, rate=rate, scope='conv2')
        branch = _bn_relu(branch)
        branch = layers_lib.conv2d(
            branch, depth, [1, 1], stride=1, normalizer_fn=None,
            activation_fn=None, scope='conv3')

        unit_output = shortcut + branch
        return utils.collect_named_outputs(
            outputs_collections, sc.name, unit_output)
def preact_conv2d(inputs,
                  num_outputs,
                  kernel_size,
                  stride=1,
                  padding='SAME',
                  activation_fn=nn.relu,
                  normalizer_fn=None,
                  normalizer_params=None,
                  weights_initializer=initializers.xavier_initializer(),
                  weights_regularizer=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  scope=None):
    """Adds a 2D convolution preceded by normalization and activation.

    The activation is not applied by this function directly: it is handed to
    `normalizer_fn` as `activation_fn`. When `normalizer_fn` is None the
    input is convolved as-is. No bias is added.

    Args:
      inputs: tensor of rank >= 4; its last dimension is the input depth.
      num_outputs: number of output filters.
      kernel_size: int or pair [kernel_height, kernel_width].
      stride: int or pair [stride_height, stride_width].
      padding: padding forwarded to `nn.conv2d`.
      activation_fn: activation forwarded to `normalizer_fn`.
      normalizer_fn: optional callable applied to the inputs first.
      normalizer_params: optional keyword arguments for `normalizer_fn`.
      weights_initializer: initializer for the 'weights' variable.
      weights_regularizer: optional regularizer for the 'weights' variable.
      reuse: forwarded to the variable scope.
      variables_collections: collections for the 'weights' variable.
      outputs_collections: collection to add the output to.
      trainable: whether the 'weights' variable is trainable.
      scope: optional scope for variable_op_scope.

    Returns:
      The convolution output, after being registered under the scope name.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'Conv', reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        # The weights dtype is taken from the inputs before normalization.
        base_dtype = inputs.dtype.base_dtype
        if normalizer_fn:
            normalizer_params = normalizer_params or {}
            inputs = normalizer_fn(
                inputs, activation_fn=activation_fn, **normalizer_params)

        kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
        stride_h, stride_w = utils.two_element_tuple(stride)
        depth_in = utils.last_dimension(inputs.get_shape(), min_rank=4)
        kernel_collections = utils.get_variable_collections(
            variables_collections, 'weights')
        kernel = variables.model_variable(
            'weights',
            shape=[kernel_h, kernel_w, depth_in, num_outputs],
            dtype=base_dtype,
            initializer=weights_initializer,
            regularizer=weights_regularizer,
            collections=kernel_collections,
            trainable=trainable)
        convolved = nn.conv2d(
            inputs, kernel, [1, stride_h, stride_w, 1], padding=padding)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, convolved)
def avg_pool2d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               data_format=DATA_FORMAT_NHWC,
               outputs_collections=None,
               scope=None,
               quantizer=None):
    """Adds a 2D average pooling op backed by an `AveragePooling2D` layer.

    It is assumed that the pooling is done per image but not in batch or
    channels.

    Args:
      inputs: A 4-D tensor of shape `[batch_size, height, width, channels]` if
        `data_format` is `NHWC`, and `[batch_size, channels, height, width]`
        if `data_format` is `NCHW`.
      kernel_size: A list of length 2: [kernel_height, kernel_width] of the
        pooling kernel over which the op is computed. Can be an int if both
        values are the same.
      stride: A list of length 2: [stride_height, stride_width].
        Can be an int if both strides are the same. Note that presently
        both strides must have the same value.
      padding: The padding method, either 'VALID' or 'SAME'.
      data_format: A string. `NHWC` (default) and `NCHW` are supported.
      outputs_collections: The collections to which the outputs are added.
      scope: Optional scope for name_scope.
      quantizer: forwarded unchanged to the `AveragePooling2D` constructor.

    Returns:
      A `Tensor` representing the results of the pooling operation.

    Raises:
      ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    """
    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
        raise ValueError('data_format has to be either NCHW or NHWC.')
    with ops.name_scope(scope, 'AvgPool2D', [inputs]) as sc:
        inputs = ops.convert_to_tensor(inputs)
        # Translate the layout string into the layer's naming convention.
        if data_format and data_format.startswith('NC'):
            layer_format = 'channels_first'
        else:
            layer_format = 'channels_last'
        pooling = AveragePooling2D(
            pool_size=kernel_size,
            strides=stride,
            padding=padding,
            data_format=layer_format,
            _scope=sc,
            quantizer=quantizer)
        pooled = pooling.apply(inputs)
        return slim_utils.collect_named_outputs(
            outputs_collections, sc, pooled)
def bottleneck(
        inputs,
        depth,
        expansion,
        stride,
        outputs_collections=None,
        scope=None):
    """Inverted-residual style bottleneck: expand, separable 3x3, project.

    The input is expanded to `expansion * depth_in` channels by a 1x1
    convolution, passed through a separable 3x3 convolution and projected to
    `depth` channels by a final 1x1 convolution. Each convolution is followed
    by batch norm; the first two are additionally followed by relu6.

    The input is added back as a skip connection only when the output has the
    same shape as the input, i.e. when `stride == 1` and `depth` equals the
    input depth. Adding it whenever `stride == 1` alone fails on a channel
    mismatch as soon as a unit changes the number of channels.

    :param inputs: tensor of rank >= 4; its last dimension is the input depth.
    :param depth: number of output channels of the unit.
    :param expansion: multiplier applied to the input depth for the first
        1x1 convolution.
    :param stride: stride of the separable 3x3 convolution.
    :param outputs_collections: collection to add the unit output to.
    :param scope: optional variable scope.
    :return output: the unit output, registered under the scope name.
    """
    with tf.variable_scope(scope, 'bottleneck', [inputs]) as sc:
        residual = inputs
        depth_in = utils.last_dimension(inputs.get_shape(), min_rank=4)

        # 1x1 expansion (positional args: num_outputs, kernel_size, stride).
        output = slim.conv2d(
            inputs, expansion*depth_in, 1, 1, scope='conv1x1_1')
        output = slim.batch_norm(output, scope='conv1x1_1_bn')
        output = tf.nn.relu6(output)

        # Separable 3x3 (positional args: num_outputs, kernel_size,
        # depth_multiplier, stride).
        output = slim.separable_conv2d(
            output, depth, 3, 1, stride, scope='separable_conv3x3')
        output = slim.batch_norm(output, scope='separable_conv3x3_bn')
        output = tf.nn.relu6(output)

        # 1x1 projection, no activation afterwards.
        output = slim.conv2d(output, depth, 1, 1, scope='conv1x1_2')
        output = slim.batch_norm(output, scope='conv1x1_2_bn')

        # Skip connection only when input and output shapes agree.
        if stride == 1 and depth == depth_in:
            output += residual

        return utils.collect_named_outputs(outputs_collections, sc.name, output)
def max_pool2d(x, window, stride=1, padding='SAME', name='MaxPool'):
    """Max-pools `x` with `tf.nn.pool` and registers the result as an activation.

    Rank-3 inputs get a trailing singleton axis before pooling. `window` and
    `stride` are normalized to two positive integers and left-padded with
    ones when the input has more than two pooled axes.

    Args:
      x: input tensor with a statically known rank of at least 3.
      window: pooling window, an int or a pair of ints.
      stride: pooling stride, an int or a pair of ints.
      padding: padding forwarded to `tf.nn.pool`.
      name: op name, also used as the alias in the ACTIVATIONS collection.

    Returns:
      The pooled tensor.

    Raises:
      ValueError: if the rank of `x` is unknown or smaller than 3.
    """
    rank = x.get_shape().ndims
    if rank is None:
        raise ValueError('Rank of inputs must be known')
    if rank < 3:
        raise ValueError('Rank of inputs is %d, which is < 3' % rank)
    if rank == 3:
        x = tf.expand_dims(x, 3)

    # Number of pooled axes, computed from the rank before any expansion.
    pooled_axes = rank - 2

    window = utils.n_positive_integers(2, window)
    if len(window) < pooled_axes:
        window = (1, ) * (pooled_axes - len(window)) + window

    stride = utils.n_positive_integers(2, stride)
    if len(stride) < pooled_axes:
        stride = (1, ) * (pooled_axes - len(stride)) + stride

    pooled = tf.nn.pool(x, window, 'MAX', padding, strides=stride, name=name)
    return utils.collect_named_outputs(tf.GraphKeys.ACTIVATIONS, name, pooled)
def stack_blocks_dense(self, net, blocks, output_stride=None,
                       outputs_collections=None):
    """Stacks residual blocks, switching to atrous rates at `output_stride`.

    Units are applied with their own stride until the accumulated stride
    equals `output_stride`; from then on they are applied with stride 1 and
    the accumulated `rate` instead.

    Args:
      net: input tensor.
      blocks: iterable of blocks exposing `scope`, `args` and `unit_fn`;
        each entry of `args` is a (depth, depth_bottleneck, stride) triple.
      output_stride: target accumulated stride, or None for no limit.
      outputs_collections: collection to add each block output to.

    Returns:
      The output of the last block.

    Raises:
      ValueError: if the target `output_stride` cannot be reached.
    """
    current_stride = 1
    rate = 1
    limited = output_stride is not None

    for block in blocks:
        with variable_scope.variable_scope(
                block.scope, 'block', values=[net]) as sc:
            for index, unit in enumerate(block.args):
                if limited and current_stride > output_stride:
                    raise ValueError(
                        'The target output_stride cannot be reached.')

                # Units are named through the `name` keyword rather than by
                # opening a dedicated variable scope here.
                unit_name = 'unit_%d' % (index + 1)
                unit_depth, unit_depth_bottleneck, unit_stride = unit
                at_target = limited and current_stride == output_stride

                if at_target:
                    applied_stride, applied_rate = 1, rate
                else:
                    applied_stride, applied_rate = unit_stride, 1

                net = block.unit_fn(
                    net,
                    depth=unit_depth,
                    depth_bottleneck=unit_depth_bottleneck,
                    stride=applied_stride,
                    rate=applied_rate,
                    name=unit_name)

                # The skipped stride is folded into the rate once the target
                # is reached; otherwise it accumulates in the overall stride.
                if at_target:
                    rate *= unit_stride
                else:
                    current_stride *= unit_stride

            net = utils.collect_named_outputs(
                outputs_collections, sc.name, net)

    if limited and current_stride != output_stride:
        raise ValueError('The target output_stride cannot be reached.')

    return net
def dropout(inputs,
            keep_prob=0.5,
            noise_shape=None,
            is_training=True,
            outputs_collections=None,
            scope=None):
    """Applies dropout, resolving `is_training` statically when possible.

    With probability `keep_prob`, outputs the input element scaled up by
    `1 / keep_prob`, otherwise outputs `0`. The scaling is so that the
    expected sum is unchanged.

    Args:
      inputs: the tensor to pass to the nn.dropout op.
      keep_prob: A scalar `Tensor` with the same type as x. The probability
        that each element is kept.
      noise_shape: A 1-D `Tensor` of type `int32`, representing the shape for
        randomly generated keep/drop flags.
      is_training: A bool `Tensor` indicating whether or not the model is in
        training mode. If so, dropout is applied and values scaled.
        Otherwise, inputs is returned.
      outputs_collections: collection to add the outputs.
      scope: Optional scope for op_scope.

    Returns:
      a tensor representing the output of the operation.
    """
    with ops.op_scope([inputs], scope, 'Dropout') as sc:
        inputs = ops.convert_to_tensor(inputs)
        static_flag = utils.constant_value(is_training, dtypes.bool)

        if static_flag is None:
            # Value unknown while building the graph: choose at run time.
            def _dropout():
                return nn.dropout(inputs, keep_prob, noise_shape)

            result = control_flow_ops.cond(
                is_training, _dropout, lambda: inputs)
        elif static_flag:
            result = nn.dropout(inputs, keep_prob, noise_shape)
        else:
            result = inputs

        return utils.collect_named_outputs(outputs_collections, sc, result)
def body(self, features):
    """Encodes image + question and decodes a single learned query.

    Reads `features["inputs"]` and `features["question"]`, runs
    `iterative_encoder_decoder`, and registers several tensor norms in the
    "norms" collection, which are then summarized.

    Args:
      features: dict providing the "inputs" and "question" tensors.

    Returns:
      The decoder output with a singleton axis inserted at position 1.
    """
    hp = self.hparams

    def _record_norm(alias, tensor):
        # Register the last-axis norm of `tensor` under `alias`.
        utils.collect_named_outputs("norms", alias, tf.norm(tensor, axis=-1))

    # pylint: disable=eval-used
    if hp.image_input_type != "image":
        image_feat = features["inputs"]
    else:
        # NOTE(review): `eval` resolves the model function from a
        # hyperparameter string; it must never receive untrusted input.
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)

    image_feat = common_layers.flatten4d3d(image_feat)
    image_feat = common_layers.dense(image_feat, hp.hidden_size)
    _record_norm("image_feat_after_proj", image_feat)

    question = common_layers.flatten4d3d(features["question"])
    _record_norm("question_embedding", question)

    prepared = prepare_image_question_encoder(image_feat, question, hp)
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = prepared

    encoder_input = tf.nn.dropout(
        encoder_input, keep_prob=1.-hp.layer_prepostprocess_dropout)

    # scale query by sqrt(hidden_size)
    query = tf.get_variable("query", [hp.hidden_size]) * hp.hidden_size **0.5
    # Add two leading singleton axes, then repeat along the first one to
    # match the batch size of the encoder input.
    query = tf.expand_dims(tf.expand_dims(query, axis=0), axis=0)
    batch_size = common_layers.shape_list(encoder_input)[0]
    query = tf.tile(query, [batch_size, 1, 1])
    query = tf.nn.dropout(
        query, keep_prob=1.-hp.layer_prepostprocess_dropout)

    decoder_output = iterative_encoder_decoder(
        encoder_input,
        encoder_self_attention_bias,
        encoder_decoder_attention_bias,
        query,
        hp)
    _record_norm("decoder_output", decoder_output)

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Insert a singleton axis at position 1.
    return tf.expand_dims(decoder_output, axis=1)
def bottleneck(inputs, depth, depth_bottleneck, stride,
               outputs_collections=None, scope=None):
    """Bottleneck residual unit with pre-activation (scope 'bottleneck_v2').

    Batch norm + ReLU is applied to `inputs` first. The residual branch is
    three convolutions (1x1, 3x3 via `conv2d_same`, 1x1) on that
    pre-activation, and its output is added to the shortcut.

    Args:
      inputs: tensor of rank >= 4 (enforced by `utils.last_dimension`).
      depth: number of output channels of the last convolution.
      depth_bottleneck: number of channels of the first two convolutions.
      stride: stride applied by the shortcut and by the 3x3 convolution.
      outputs_collections: collection to add the output to.
      scope: optional variable scope.

    Returns:
      The result of `utils.collect_named_outputs` for `residual + shortcut`.
    """
    with tf.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
        depth_in = utils.last_dimension(inputs.get_shape(), min_rank=4)
        preact = slim.batch_norm(inputs, activation_fn=tf.nn.relu,
                                 scope='preact')

        # `depth` is the number of output channels of the third convolution.
        if depth != depth_in:
            # Channel count changes: project the pre-activation with a 1x1 conv.
            shortcut = slim.conv2d(preact, depth, [1, 1], stride=stride,
                                   normalizer_fn=None, activation_fn=None,
                                   scope='shortcut')
        else:
            shortcut = subsample(inputs, stride, 'shortcut')

        branch = slim.conv2d(preact, depth_bottleneck, [1, 1], stride=1,
                             scope='conv1')
        branch = conv2d_same(branch, depth_bottleneck, 3, stride,
                             scope='conv2')
        branch = slim.conv2d(branch, depth, [1, 1], stride=1,
                             normalizer_fn=None, activation_fn=None,
                             scope='conv3')

        summed = branch + shortcut
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           summed)
def stack_blocks(net, blocks, outputs_collections=None):
    """Chains every unit of every block onto `net`.

    Each block opens a variable scope (`block.scope`, default name 'block')
    and each of its units a nested 'unit_<k>' scope, with k starting at 1.
    After the last unit of a block, the running tensor is registered through
    `utils.collect_named_outputs` under the block's scope name.

    Args:
      net: input handed to the first unit.
      blocks: iterable of objects exposing `scope`, `args` and `unit_fn`;
        each entry of `args` unpacks to (depth, expansion, stride).
      outputs_collections: forwarded to `utils.collect_named_outputs`.

    Returns:
      The output after the last block (`net` unchanged if `blocks` is empty).
    """
    for block in blocks:
        with tf.variable_scope(block.scope, 'block', [net]) as block_scope:
            for unit_index, unit in enumerate(block.args, start=1):
                with tf.variable_scope('unit_%d' % unit_index, values=[net]):
                    depth, expansion, stride = unit
                    net = block.unit_fn(net,
                                        depth=depth,
                                        expansion=expansion,
                                        stride=stride)
            net = utils.collect_named_outputs(outputs_collections,
                                              block_scope.name, net)
    return net
def stack_blocks_dense(self, net, blocks, outputs_collections=None):
    """Stacks blocks while keeping the total stride at `self.output_stride`.

    Units are applied with their own stride until the accumulated stride
    equals `self.output_stride`. From then on units run with stride 1 and an
    atrous rate that accumulates the strides that were skipped.

    Args:
      net: input handed to the first unit.
      blocks: iterable of objects exposing `scope`, `args` and `unit_fn`;
        each entry of `args` unpacks to (depth, depth_bottleneck, stride).
      outputs_collections: forwarded to `utils.collect_named_outputs`.

    Returns:
      The output after the last block.

    Raises:
      ValueError: if `self.output_stride` is set and the accumulated stride
        overshoots it or does not equal it once all blocks are stacked.
    """
    target_stride = self.output_stride
    # Effective stride of the activations produced so far. Once it reaches
    # the target, atrous convolution replaces strided convolution.
    current_stride = 1
    # The atrous convolution rate parameter.
    rate = 1

    for block in blocks:
        with tf.variable_scope(block.scope, 'block', [net]) as block_scope:
            for unit_index, unit in enumerate(block.args, start=1):
                if target_stride is not None and current_stride > target_stride:
                    raise ValueError(
                        'The target output_stride cannot be reached.')

                with tf.variable_scope('unit_%d' % unit_index, values=[net]):
                    unit_depth, unit_depth_bottleneck, unit_stride = unit
                    at_target = (target_stride is not None
                                 and current_stride == target_stride)
                    if at_target:
                        # Target reached: run with stride 1 and fold this
                        # unit's stride into the rate for the layers after it.
                        net = block.unit_fn(
                            net,
                            depth=unit_depth,
                            depth_bottleneck=unit_depth_bottleneck,
                            stride=1,
                            rate=rate)
                        rate *= unit_stride
                    else:
                        net = block.unit_fn(
                            net,
                            depth=unit_depth,
                            depth_bottleneck=unit_depth_bottleneck,
                            stride=unit_stride,
                            rate=1)
                        current_stride *= unit_stride
            print(block_scope.name, net.shape)
            # Add the block output to the collections under the block's name.
            net = utils.collect_named_outputs(outputs_collections,
                                              block_scope.name, net)

    if target_stride is not None and current_stride != target_stride:
        raise ValueError('The target output_stride cannot be reached.')
    return net
def max_pool1d(inputs, kernel_size, stride=2, padding='VALID',
               data_format=DATA_FORMAT_NHWC, outputs_collections=None,
               scope=None):
    """Adds a 1D Max Pooling op.

    Args:
      inputs: value converted to a tensor and pooled.
      kernel_size: forwarded to `MaxPooling1D` as `pool_size`.
      stride: forwarded to `MaxPooling1D` as `strides`.
      padding: forwarded to `MaxPooling1D` as `padding`.
      data_format: `DATA_FORMAT_NCHW` or `DATA_FORMAT_NHWC`; values starting
        with 'NC' select 'channels_first', anything else 'channels_last'.
      outputs_collections: collection to add the outputs.
      scope: optional scope for name_scope.

    Returns:
      The result of `utils.collect_named_outputs` for the pooled tensor.

    Raises:
      ValueError: if `data_format` is neither of the two supported constants.
    """
    supported_formats = (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC)
    if data_format not in supported_formats:
        raise ValueError('data_format has to be either NCHW or NHWC.')

    with ops.name_scope(scope, 'MaxPool1D', [inputs]) as sc:
        inputs = ops.convert_to_tensor(inputs)
        if data_format and data_format.startswith('NC'):
            layer_format = 'channels_first'
        else:
            layer_format = 'channels_last'
        pool_layer = pooling_layers.MaxPooling1D(pool_size=kernel_size,
                                                 strides=stride,
                                                 padding=padding,
                                                 data_format=layer_format,
                                                 _scope=sc)
        pooled = pool_layer.apply(inputs)
        return utils.collect_named_outputs(outputs_collections, sc, pooled)
def flatten(inputs, outputs_collections=None, scope=None):
    """Flattens the input while maintaining the batch_size.

    Assumes that the first dimension represents the batch.

    Args:
      inputs: a tensor of size [batch_size, ...]. Anything accepted by
        `ops.convert_to_tensor` is allowed.
      outputs_collections: collection to add the outputs.
      scope: Optional scope for op_scope.

    Returns:
      a flattened tensor with shape [batch_size, k].

    Raises:
      ValueError: if the rank of `inputs` is unknown or less than 2, or if
        any dimension after the first is not statically defined.
    """
    with ops.op_scope([inputs], scope, 'Flatten') as sc:
        # Accept non-tensor inputs instead of failing on `.get_shape()`.
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None or inputs_rank < 2:
            raise ValueError('Inputs must have at least 2 dimensions.')
        dims = inputs_shape[1:]
        # `num_elements()` is None for a partially known shape, which would
        # otherwise reach `reshape` as `[-1, None]` and fail obscurely there.
        if not dims.is_fully_defined():
            raise ValueError(
                'Inputs dimensions after the first must be fully defined.')
        k = dims.num_elements()
        outputs = array_ops.reshape(inputs, [-1, k])
        return utils.collect_named_outputs(outputs_collections, sc, outputs)
def resnet_block_fn(block, current_stride, net, output_stride,
                    outputs_collections, rate, store_non_strided_activations,
                    scope_suffix):
    """Applies all units of one block and returns the updated stride state.

    Args:
      block: object exposing `scope`, `args` (a sequence of dicts of keyword
        arguments, optionally with a 'stride' key) and `unit_fn`.
      current_stride: accumulated stride before this block.
      net: input handed to the first unit.
      output_stride: target accumulated stride, or None for no target.
      outputs_collections: forwarded to `utils.collect_named_outputs`.
      rate: atrous rate accumulated before this block.
      store_non_strided_activations: if true, the last unit runs with
        stride 1 and its stride is applied after the block output has been
        collected.
      scope_suffix: appended to 'block' to form the default scope name.

    Returns:
      Tuple `(current_stride, net, rate)` after the block.

    Raises:
      ValueError: if the accumulated stride exceeds `output_stride`.
    """
    with variable_scope.variable_scope(block.scope, f'block{scope_suffix}',
                                       [net]) as sc:
        # Stride deferred to the end of the block; stays 1 unless
        # `store_non_strided_activations` moves the last unit's stride here.
        block_stride = 1
        for i, unit in enumerate(block.args):
            if store_non_strided_activations and i == len(block.args) - 1:
                # Move stride from the block's last unit to the end of the block.
                block_stride = unit.get('stride', 1)
                unit = dict(unit, stride=1)
            with variable_scope.variable_scope('unit_%d' % (i + 1),
                                               values=[net]):
                # If we have reached the target output_stride, then we need to employ
                # atrous convolution with stride=1 and multiply the atrous rate by the
                # current unit's stride for use in subsequent layers.
                if output_stride is not None and current_stride == output_stride:
                    net = block.unit_fn(net, rate=rate, **dict(unit, stride=1))
                    rate *= unit.get('stride', 1)
                else:
                    net = block.unit_fn(net, rate=1, **unit)
                    current_stride *= unit.get('stride', 1)
                    if output_stride is not None and current_stride > output_stride:
                        raise ValueError(
                            'The target output_stride cannot be reached.')
        # Collect activations at the block's end before performing subsampling.
        net = utils.collect_named_outputs(outputs_collections, sc.name, net)
        # Subsampling of the block's output activations.
        if output_stride is not None and current_stride == output_stride:
            # Already at the target: fold the deferred stride into the rate.
            rate *= block_stride
        else:
            net = subsample(net, block_stride)
            current_stride *= block_stride
            if output_stride is not None and current_stride > output_stride:
                raise ValueError('The target output_stride cannot be reached.')
    return current_stride, net, rate
def bottleneck(self, inputs, depth, depth_bottleneck, stride, rate=1,
               outputs_collections=None, scope=None):
    """Bottleneck residual unit with post-activation (scope 'bottleneck_v1').

    The residual branch is three convolutions (1x1, 3x3 via
    `self.conv2d_same`, 1x1). ReLU is applied to the sum of that branch and
    the shortcut.

    Args:
      inputs: tensor of rank >= 4 (enforced by `utils.last_dimension`).
      depth: number of output channels of the unit.
      depth_bottleneck: number of channels of the first two convolutions.
      stride: stride applied by the shortcut and by the 3x3 convolution.
      rate: atrous rate forwarded to the 3x3 convolution.
      outputs_collections: collection to add the output to.
      scope: optional variable scope.

    Returns:
      The result of `utils.collect_named_outputs`, registered under the
      scope's `original_name_scope`.
    """
    with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
        depth_in = utils.last_dimension(inputs.get_shape(), min_rank=4)

        if depth != depth_in:
            # Channel count changes: project the input with a 1x1 convolution.
            shortcut = slim.conv2d(inputs, depth, [1, 1], stride=stride,
                                   activation_fn=None, scope='shortcut')
        else:
            # Same channel count: the shortcut only applies the stride.
            shortcut = self.subsample(inputs, stride, 'shortcut')

        # The trailing shapes are the original author's example annotations.
        branch = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
                             scope='conv1')  # 3x128x128x64
        branch = self.conv2d_same(branch, depth_bottleneck, 3, stride,
                                  rate=rate, scope='conv2')  # 3x128x128x64
        branch = slim.conv2d(branch, depth, [1, 1], activation_fn=None,
                             stride=1, scope='conv3')  # 3x128x128x256

        activated = tf.nn.relu(shortcut + branch)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, activated)
def Global_Average_Pooling(inputs, axis=[1, 2], outputs_collections=None,
                           scope=None):
    """Global Average Pooling.

    Input:
        4-D Tensor [batch, height, width, in_channels].

    Output:
        2-D Tensor [batch, pooled dim] (with the default `axis`).

    Arguments:
        inputs: `Tensor`. Incoming 4-D Tensor.
        axis: axes handed to `tf.reduce_mean`.
        outputs_collections: The collections to which the outputs are added.
        scope: Optional scope for name_scope.
    """
    rank = len(get_incoming_shape(inputs))
    assert rank == 4, "Incoming Tensor shape must be 4-D"

    with tf.name_scope(scope, 'GlobalAvgPool', [inputs]) as sc:
        pooled = tf.reduce_mean(inputs, axis=axis)
        return utils.collect_named_outputs(outputs_collections, sc, pooled)
def bottleneck_hole(inputs, depth, depth_bottleneck, stride, rate=2,
                    outputs_collections=None, scope=None):
    """Bottleneck residual unit whose 3x3 convolution is atrous.

    The residual branch is a 1x1 convolution, a 3x3 'SAME' convolution with
    dilation `rate`, and a 1x1 convolution. ReLU is applied to the sum of
    that branch and the shortcut.

    Args:
      inputs: tensor of rank >= 4 (enforced by `utils.last_dimension`).
      depth: number of output channels of the unit.
      depth_bottleneck: number of channels of the first two convolutions.
      stride: stride used by the shortcut only.
      rate: dilation rate of the 3x3 convolution.
      outputs_collections: collection to add the output to.
      scope: optional variable scope (default name 'bottleneck_v1').

    Returns:
      The result of `utils.collect_named_outputs` for the activated sum.
    """
    with variable_scope.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
        depth_in = utils.last_dimension(inputs.get_shape(), min_rank=4)

        # NOTE(review): `stride` only reaches the shortcut; every convolution
        # of the residual branch uses stride=1. Presumably callers always
        # pass stride=1 here - confirm.
        if depth != depth_in:
            shortcut = layers.conv2d(inputs, depth, [1, 1], stride=stride,
                                     activation_fn=None, scope='shortcut')
        else:
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')

        branch = layers.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
                               scope='conv1')
        branch = layers_lib.conv2d(branch, depth_bottleneck, [3, 3], stride=1,
                                   rate=rate, padding='SAME', scope='conv2')
        branch = layers.conv2d(branch, depth, [1, 1], stride=1,
                               activation_fn=None, scope='conv3')

        activated = nn_ops.relu(shortcut + branch)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           activated)
def stack_blocks_dense(net, blocks, outputs_collections=None):
    """Stacks the residual units of every block on top of `net`.

    Args:
      net: A Tensor of size [batch, height, width, channels]; the input.
      blocks: list of Block objects exposing `scope`, `args` and `unit_fn`;
        each entry of `args` unpacks to (depth, depth_bottleneck, stride).
      outputs_collections: collections gathering the per-block end points.

    Returns:
      net: Output tensor.
    """
    # Walk the blocks and stack their residual units one after another.
    for block in blocks:
        with tf.variable_scope(block.scope, 'block', [net]) as block_scope:
            for unit_index, unit in enumerate(block.args, start=1):
                with tf.variable_scope('unit_%d' % unit_index, values=[net]):
                    depth, depth_bottleneck, stride = unit
                    net = block.unit_fn(net,
                                        depth=depth,
                                        depth_bottleneck=depth_bottleneck,
                                        stride=stride)
            net = utils.collect_named_outputs(outputs_collections,
                                              block_scope.name, net)
    return net
def l2_normalization(inputs, scaling=False,
                     scale_initializer=init_ops.ones_initializer(),
                     reuse=None, variables_collections=None,
                     outputs_collections=None, trainable=True, scope=None):
    """L2-normalizes `inputs` along the last axis, with optional scaling.

    Original author's note: conv4_3 needs L2 normalization first, to reduce
    the discrepancy between that layer and the layers after it.

    Args:
      inputs: tensor with a statically known rank.
      scaling: if true, multiply the normalized tensor by a 'gamma' variable
        shaped like the last dimension of `inputs`.
      scale_initializer: initializer of the 'gamma' variable.
      reuse: forwarded to the variable scope.
      variables_collections: collections for the 'gamma' variable.
      outputs_collections: collection to add the output to.
      trainable: whether 'gamma' is trainable.
      scope: optional variable scope (default name 'L2Normalization').

    Returns:
      The result of `utils.collect_named_outputs`, registered under the
      scope's `original_name_scope`.
    """
    with variable_scope.variable_scope(scope, 'L2Normalization', [inputs],
                                       reuse=reuse) as sc:
        static_shape = inputs.get_shape()
        rank = static_shape.ndims
        # A one-element range selecting the last axis only.
        last_axis = tf.range(rank - 1, rank)
        outputs = nn.l2_normalize(inputs, last_axis, epsilon=1e-12)

        if scaling:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'scale')
            gamma = variables.model_variable(
                'gamma',
                shape=static_shape[-1:],
                dtype=inputs.dtype.base_dtype,
                initializer=scale_initializer,
                collections=gamma_collections,
                trainable=trainable)
            outputs = tf.multiply(outputs, gamma)

        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
def max_pool2d(inputs, kernel_size, stride=2, padding='VALID',
               outputs_collections=None, scope=None):
    """Adds a Max Pooling op.

    It is assumed by the wrapper that the pooling is only done per image and
    not in depth or batch.

    Args:
      inputs: a tensor of size [batch_size, height, width, depth].
      kernel_size: a list of length 2: [kernel_height, kernel_width] of the
        pooling kernel over which the op is computed. Can be an int if both
        values are the same.
      stride: a list of length 2: [stride_height, stride_width].
        Can be an int if both strides are the same. Note that presently
        both strides must have the same value.
      padding: the padding method, either 'VALID' or 'SAME'.
      outputs_collections: collection to add the outputs.
      scope: Optional scope for op_scope.

    Returns:
      a tensor representing the results of the pooling operation.

    Raises:
      ValueError: if 'kernel_size' is not a 2-D list
    """
    with ops.op_scope([inputs], scope, 'MaxPool2D') as sc:
        inputs = ops.convert_to_tensor(inputs)
        kernel_hw = utils.two_element_tuple(kernel_size)
        stride_hw = utils.two_element_tuple(stride)
        # Batch and depth get a window and a stride of 1: no pooling there.
        window = [1, kernel_hw[0], kernel_hw[1], 1]
        steps = [1, stride_hw[0], stride_hw[1], 1]
        pooled = nn.max_pool(inputs, ksize=window, strides=steps,
                             padding=padding)
        return utils.collect_named_outputs(outputs_collections, sc, pooled)
def squeezenet(images, is_training=True, batch_norm_decay=0.999,
               num_classes=1000):
    """Original squeezenet architecture for 224x224 images.

    Args:
      images: input tensor handed to the first convolution.
      is_training: forwarded to `squeezenet_arg_scope` and to dropout.
      batch_norm_decay: forwarded to `squeezenet_arg_scope`.
      num_classes: number of output channels of the final convolution.

    Returns:
      Tuple `(logits, end_points)`: the squeezed output of the last pooling
      layer, and a dict built from the network's end-point collection.
    """
    # (scope, second positional arg, third positional arg) of each fire
    # module, in the order the modules are created.
    fires_before_pool4 = (('fire2', 16, 64),
                          ('fire3', 16, 64),
                          ('fire4', 32, 128))
    fires_before_pool8 = (('fire5', 32, 128),
                          ('fire6', 48, 192),
                          ('fire7', 48, 192),
                          ('fire8', 64, 256))

    with slim.arg_scope(squeezenet_arg_scope(is_training, batch_norm_decay)):
        with tf.variable_scope('squeezenet', values=[images]) as sc:
            end_point_collection = sc.original_name_scope + '_end_points'
            collecting_ops = [fire_module, slim.conv2d, slim.max_pool2d,
                              slim.avg_pool2d]
            with slim.arg_scope(collecting_ops,
                                outputs_collections=[end_point_collection]):
                net = slim.conv2d(images, 96, [7, 7], stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool1')
                for fire_scope, size_a, size_b in fires_before_pool4:
                    net = fire_module(net, size_a, size_b, scope=fire_scope)
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool4')
                for fire_scope, size_a, size_b in fires_before_pool8:
                    net = fire_module(net, size_a, size_b, scope=fire_scope)
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool8')
                net = fire_module(net, 64, 256, scope='fire9')
                net = slim.dropout(net, is_training=is_training, scope='drop9')
                net = slim.conv2d(net, num_classes, [1, 1], stride=1,
                                  scope='conv10')
                net = slim.avg_pool2d(net, [13, 13], stride=1,
                                      scope='avgpool10')
                logits = tf.squeeze(net, [1, 2], name='logits')
                logits = utils.collect_named_outputs(end_point_collection,
                                                     sc.name + '/logits',
                                                     logits)
            end_points = utils.convert_collection_to_dict(end_point_collection)
            return logits, end_points
def maxout(inputs, num_units, axis=None, outputs_collections=None, scope=None):
    """Adds a maxout op: a max pooling performed in filter/channel dimension.

    This can also be used after fully-connected layers to reduce number of
    features.

    Args:
      inputs: A Tensor on which maxout will be performed.
      num_units: Specifies how many features will remain after max pooling at
        the channel dimension. The number of channels must be a multiple of
        `num_units`.
      axis: The dimension where max pooling will be performed. Default is the
        last dimension.
      outputs_collections: The collections to which the outputs are added.
      scope: Optional scope for name_scope.

    Returns:
      A `Tensor` representing the results of the pooling operation.

    Raises:
      ValueError: if the size of `axis` is not statically known, or if it is
        not a multiple of `num_units`.
    """
    with ops.name_scope(scope, 'MaxOut', [inputs]) as sc:
        inputs = ops.convert_to_tensor(inputs)
        shape = inputs.get_shape().as_list()
        if axis is None:
            # Assume that channel is the last dimension.
            axis = -1
        num_channels = shape[axis]
        if num_channels is None:
            raise ValueError('number of features along axis {} must be '
                             'statically defined'.format(axis))
        if num_channels % num_units:
            raise ValueError('number of features({}) is not '
                             'a multiple of num_units({})'
                             .format(num_channels, num_units))

        # Split the pooled axis into `num_units` groups and append the group
        # size as a new last dimension to reduce over.
        # NOTE(review): because the group dimension is appended last, the
        # grouping looks channel-wise only when `axis` is the last dimension;
        # confirm before relying on another axis.
        shape[axis] = num_units
        shape += [num_channels // num_units]

        # Statically unknown sizes (e.g. the batch size) show up as None and
        # cannot be passed to reshape; take them from the run-time shape.
        if any(dim is None for dim in shape):
            dynamic_shape = gen_array_ops.shape(inputs)
            shape = [dynamic_shape[i] if dim is None else dim
                     for i, dim in enumerate(shape)]

        outputs = math_ops.reduce_max(gen_array_ops.reshape(inputs, shape),
                                      -1, keep_dims=False)
        return utils.collect_named_outputs(outputs_collections, sc, outputs)
def cifar_squeezenet(images, is_training=True, batch_norm_decay=0.999,
                     num_classes=10):
    """Modified version of squeezenet for CIFAR images.

    Args:
      images: input tensor handed to the first convolution.
      is_training: forwarded to `squeezenet_arg_scope`.
      batch_norm_decay: forwarded to `squeezenet_arg_scope`.
      num_classes: number of output channels of the final convolution.

    Returns:
      Tuple `(logits, end_points)`: the squeezed output of the final
      convolution, and a dict built from the network's end-point collection.
    """
    # (scope, second positional arg, third positional arg) of each fire
    # module, in the order the modules are created.
    fires_before_pool4 = (('fire2', 16, 64),
                          ('fire3', 16, 64),
                          ('fire4', 32, 128))
    fires_before_pool8 = (('fire5', 32, 128),
                          ('fire6', 48, 192),
                          ('fire7', 48, 192),
                          ('fire8', 64, 256))

    with slim.arg_scope(squeezenet_arg_scope(is_training, batch_norm_decay)):
        with tf.variable_scope('squeezenet', values=[images]) as sc:
            end_point_collection = sc.original_name_scope + '_end_points'
            collecting_ops = [fire_module, slim.conv2d, slim.max_pool2d,
                              slim.avg_pool2d]
            with slim.arg_scope(collecting_ops,
                                outputs_collections=[end_point_collection]):
                net = slim.conv2d(images, 96, [2, 2], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], scope='maxpool1')
                for fire_scope, size_a, size_b in fires_before_pool4:
                    net = fire_module(net, size_a, size_b, scope=fire_scope)
                net = slim.max_pool2d(net, [2, 2], scope='maxpool4')
                for fire_scope, size_a, size_b in fires_before_pool8:
                    net = fire_module(net, size_a, size_b, scope=fire_scope)
                net = slim.max_pool2d(net, [2, 2], scope='maxpool8')
                net = fire_module(net, 64, 256, scope='fire9')
                # Use global average pooling per 'Network in Network [1]'
                net = slim.avg_pool2d(net, [4, 4], scope='avgpool10')
                net = slim.conv2d(net, num_classes, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='conv10')
                logits = tf.squeeze(net, [1, 2], name='logits')
                logits = utils.collect_named_outputs(end_point_collection,
                                                     sc.name + '/logits',
                                                     logits)
            end_points = utils.convert_collection_to_dict(end_point_collection)
            return logits, end_points
def Squeeze_excitation_layer2(inputs, ratio, outputs_collections=None,
                              scope=None):
    """Rescales `inputs` by a sigmoid gate built from two 1x1 convolutions.

    The first convolution (ReLU) reduces the channel count by `ratio`. The
    second (sigmoid) restores it. `inputs` is multiplied element-wise by the
    result.

    Args:
      inputs: tensor of rank >= 4 (enforced by `utils.last_dimension`).
      ratio: integer divisor applied to the channel count.
      outputs_collections: collection to add the output to.
      scope: optional scope for name_scope.

    Returns:
      The result of `utils.collect_named_outputs` for the rescaled tensor.
    """
    channels = utils.last_dimension(inputs.get_shape(), min_rank=4)
    reduced_channels = channels // ratio
    # Settings shared by both 1x1 convolutions.
    conv_settings = dict(stride=1, normalizer_fn=None)

    # NOTE(review): only a name scope is opened here, so the convolution
    # scopes 'squeeze' and 'excitation' are not nested under `scope` as
    # variable scopes - confirm this is intended when the layer is used more
    # than once.
    with tf.name_scope(scope, 'SE_layer2', [inputs]) as sc:
        squeezed = slim.conv2d(inputs, reduced_channels, [1, 1],
                               activation_fn=tf.nn.relu, scope='squeeze',
                               **conv_settings)
        gate = slim.conv2d(squeezed, channels, [1, 1],
                           activation_fn=tf.nn.sigmoid, scope='excitation',
                           **conv_settings)
        rescaled = inputs * gate
        return utils.collect_named_outputs(outputs_collections, sc, rescaled)
def stack_block_dense(net, blocks, outputs_collections=None):
    """Stacks the residual units of every block on top of `net`.

    Args:
      net: the input.
      blocks: list of Block objects exposing `scope`, `args` and `unit_fn`;
        each entry of `args` unpacks to (depth, depth_bottleneck, stride).
      outputs_collections: collections gathering the per-block end points.

    Returns:
      The output after the last block.
    """
    # Two nested loops: over the blocks, then over each block's units.
    for block in blocks:
        # The two nested variable scopes name every residual unit in the
        # form block1/unit_1.
        with tf.variable_scope(block.scope, 'block', [net]) as block_scope:
            for unit_index, unit in enumerate(block.args, start=1):
                with tf.variable_scope('unit_%d' % unit_index, values=[net]):
                    # Each entry of block.args unpacks to
                    # (depth, depth_bottleneck, stride).
                    depth, depth_bottleneck, stride = unit
                    # unit_fn creates the residual units one after another
                    # and connects each to the previous output.
                    net = block.unit_fn(net,
                                        depth=depth,
                                        depth_bottleneck=depth_bottleneck,
                                        stride=stride)
            # Register the block output in the collections.
            net = utils.collect_named_outputs(outputs_collections,
                                              block_scope.name, net)
    return net
def batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001,
               activation_fn=None,
               updates_collections=ops.GraphKeys.UPDATE_OPS,
               is_training=True, reuse=None, variables_collections=None,
               outputs_collections=None, trainable=True, scope=None):
    """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

      "Batch Normalization: Accelerating Deep Network Training by Reducing
      Internal Covariate Shift"

      Sergey Ioffe, Christian Szegedy

    Can be used as a normalizer function for conv2d and fully_connected.

    Args:
      inputs: a tensor of size `[batch_size, height, width, channels]`
        or `[batch_size, channels]`.
      decay: decay for the moving average.
      center: If True, a `beta` variable is created and passed as the offset
        to `nn.batch_normalization`. If False, `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is
        not used. When the next layer is linear (also e.g. `nn.relu`), this
        can be disabled since the scaling can be done by the next layer.
      epsilon: small float added to variance to avoid dividing by zero.
      activation_fn: Optional activation function.
      updates_collections: collections to collect the update ops for
        computation. If None, a control dependency would be added to make
        sure the updates are computed.
      is_training: whether or not the layer is in training mode. In training
        mode it would accumulate the statistics of the moments into
        `moving_mean` and `moving_variance` using an exponential moving
        average with the given `decay`. When it is not in training mode then
        it would use the values of the `moving_mean` and the
        `moving_variance`. The value is tested with a plain Python `if`.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: optional collections for the variables.
      outputs_collections: collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for `variable_op_scope`.

    Returns:
      a tensor representing the output of the operation.
    """
    with variable_scope.variable_op_scope([inputs],scope, 'BatchNorm', reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        dtype = inputs.dtype.base_dtype
        # Moments are taken over every axis except the last one.
        axis = list(range(len(inputs_shape) - 1))
        # One parameter per entry of the last dimension.
        params_shape = inputs_shape[-1:]
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(variables_collections,'beta')
            beta = variables.model_variable('beta',shape=params_shape,dtype=dtype,
                                            initializer=init_ops.zeros_initializer,
                                            collections=beta_collections,trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(variables_collections,'gamma')
            gamma = variables.model_variable('gamma',shape=params_shape,dtype=dtype,
                                             initializer=init_ops.ones_initializer,
                                             collections=gamma_collections,trainable=trainable)
        # Create moving_mean and moving_variance variables and add them to the
        # appropriate collections.
        moving_mean_collections = utils.get_variable_collections(variables_collections, 'moving_mean')
        moving_mean = variables.model_variable('moving_mean',shape=params_shape,dtype=dtype,
                                               initializer=init_ops.zeros_initializer,trainable=False,
                                               collections=moving_mean_collections)
        moving_variance_collections = utils.get_variable_collections(variables_collections, 'moving_variance')
        moving_variance = variables.model_variable('moving_variance',shape=params_shape,dtype=dtype,
                                                   initializer=init_ops.ones_initializer,trainable=False,
                                                   collections=moving_variance_collections)
        if is_training:
            # Calculate the moments based on the individual batch.
            mean, variance = nn.moments(inputs, axis, shift=moving_mean)
            # Update the moving_mean and moving_variance moments.
            update_moving_mean = moving_averages.assign_moving_average(moving_mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(moving_variance, variance, decay)
            if updates_collections is None:
                # Make sure the updates are computed here.
                with ops.control_dependencies([update_moving_mean,update_moving_variance]):
                    outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon)
            else:
                # Collect the updates to be computed later.
                ops.add_to_collections(updates_collections, update_moving_mean)
                ops.add_to_collections(updates_collections, update_moving_variance)
                outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon)
        else:
            # Not training: normalize with the accumulated moving statistics.
            outputs = nn.batch_normalization(
                inputs, moving_mean, moving_variance, beta, gamma, epsilon)
        # Restore the static shape of the input on the output.
        outputs.set_shape(inputs.get_shape())
        if activation_fn:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def fully_connected(inputs,
                    num_outputs,
                    activation_fn=nn.relu,
                    normalizer_fn=None,
                    normalizer_params=None,
                    weights_initializer=initializers.xavier_initializer(),
                    weights_regularizer=None,
                    biases_initializer=init_ops.zeros_initializer,
                    biases_regularizer=None,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    trainable=True,
                    scope=None):
    """Adds a fully connected layer.

    `fully_connected` creates a variable called `weights`, representing a
    fully connected weight matrix, which is multiplied by the `inputs` to
    produce a `Tensor` of hidden units. If a `normalizer_fn` is provided (such
    as `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is
    None and a `biases_initializer` is provided then a `biases` variable would
    be created and added the hidden units. Finally, if `activation_fn` is not
    `None`, it is applied to the hidden units as well.

    Note: that if `inputs` have a rank greater than 2, then `inputs` is
    flattened prior to the initial matrix multiply by `weights`.

    Args:
      inputs: A tensor of with at least rank 2 and value for the last
        dimension, i.e. `[batch_size, depth]`, `[None, None, None, channels]`.
      num_outputs: Integer, the number of output units in the layer.
      activation_fn: activation function.
      normalizer_fn: normalization function to use instead of `biases`. If
        `normalize_fn` is provided then `biases_initializer` and
        `biases_regularizer` are ignored and `biases` are not created nor
        added.
      normalizer_params: normalization function parameters.
      weights_initializer: An initializer for the weights.
      weights_regularizer: Optional regularizer for the weights.
      biases_initializer: An initializer for the biases. If None skip biases.
      biases_regularizer: Optional regularizer for the biases.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: Optional list of collections for all the
        variables or a dictionary containing a different list of collections
        per variable.
      outputs_collections: collection to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_op_scope.

    Returns:
      the tensor variable representing the result of the series of operations.

    Raises:
      ValueError: if `num_outputs` is not an integer, if x has rank less than
        2 or if its last dimension is not set.
    """
    if not isinstance(num_outputs, int):
        # Format the offending value into the message. The one-element tuple
        # keeps `%` safe even when `num_outputs` is itself a tuple.
        raise ValueError(
            'num_outputs should be integer, got %s.' % (num_outputs,))
    with variable_scope.variable_op_scope([inputs],
                                          scope,
                                          'fully_connected',
                                          reuse=reuse) as sc:
        dtype = inputs.dtype.base_dtype
        num_input_units = utils.last_dimension(inputs.get_shape(), min_rank=2)

        # Static and dynamic output shapes: the input shape with the last
        # dimension replaced by `num_outputs`.
        static_shape = inputs.get_shape().as_list()
        static_shape[-1] = num_outputs

        out_shape = array_ops.unpack(array_ops.shape(inputs))
        out_shape[-1] = num_outputs

        weights_shape = [num_input_units, num_outputs]
        weights_collections = utils.get_variable_collections(
            variables_collections, 'weights')
        weights = variables.model_variable('weights',
                                           shape=weights_shape,
                                           dtype=dtype,
                                           initializer=weights_initializer,
                                           regularizer=weights_regularizer,
                                           collections=weights_collections,
                                           trainable=trainable)
        if len(static_shape) > 2:
            # Reshape inputs
            inputs = array_ops.reshape(inputs, [-1, num_input_units])
        outputs = standard_ops.matmul(inputs, weights)
        if normalizer_fn:
            normalizer_params = normalizer_params or {}
            outputs = normalizer_fn(outputs, **normalizer_params)
        else:
            if biases_initializer is not None:
                biases_collections = utils.get_variable_collections(
                    variables_collections, 'biases')
                biases = variables.model_variable('biases',
                                                  shape=[num_outputs,],
                                                  dtype=dtype,
                                                  initializer=biases_initializer,
                                                  regularizer=biases_regularizer,
                                                  collections=biases_collections,
                                                  trainable=trainable)
                outputs = nn.bias_add(outputs, biases)
        if len(static_shape) > 2:
            # Reshape back outputs
            outputs = array_ops.reshape(outputs, array_ops.pack(out_shape))
            outputs.set_shape(static_shape)
        if activation_fn:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001,
               updates_collections=ops.GraphKeys.UPDATE_OPS, is_training=True,
               reuse=None, variables_collections=None, outputs_collections=None,
               trainable=True, scope=None):
    """Batch normalization whose train/inference choice is made by `tf.cond`.

    Code modification of tensorflow/contrib/layers/python/layers/layers.py

    Both the batch-statistics output and the moving-statistics output are
    always built; `tf.cond(is_training, ...)` then selects between them.

    Args:
      inputs: value converted to a tensor; its rank and last dimension must
        be statically known.
      decay: decay for the moving averages.
      center: if true, create `beta` and pass it as the offset.
      scale: if true, create `gamma` and pass it as the scale.
      epsilon: passed to `nn.batch_normalization` as the variance epsilon.
      updates_collections: collections receiving the moving-average update
        ops. If None, the batch-statistics output gets a control dependency
        on the updates instead.
      is_training: predicate handed to `tf.cond`.
      reuse: forwarded to the variable scope.
      variables_collections: optional collections for the variables.
      outputs_collections: collections to add the outputs.
      trainable: whether `beta` and `gamma` are trainable.
      scope: optional scope for `variable_op_scope`.

    Returns:
      The result of `utils.collect_named_outputs` for the selected output.

    Raises:
      ValueError: if the rank or the last dimension of `inputs` is undefined.
    """
    with variable_scope.variable_op_scope([inputs], scope, 'BatchNorm',
                                          reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        # Moments are taken over every axis except the last one.
        axis = list(range(inputs_rank - 1))
        params_shape = inputs_shape[-1:]
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined last dimension %s.' % (
                inputs.name, params_shape))
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(variables_collections,
                                                              'beta')
            beta = variables.model_variable('beta',
                                            shape=params_shape,
                                            dtype=dtype,
                                            initializer=init_ops.zeros_initializer,
                                            collections=beta_collections,
                                            trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(variables_collections,
                                                               'gamma')
            gamma = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=init_ops.ones_initializer,
                                             collections=gamma_collections,
                                             trainable=trainable)
        # Create moving_mean and moving_variance variables and add them to the
        # appropriate collections.
        moving_mean_collections = utils.get_variable_collections(
            variables_collections, 'moving_mean')
        moving_mean = variables.model_variable(
            'moving_mean',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=False,
            collections=moving_mean_collections)
        moving_variance_collections = utils.get_variable_collections(
            variables_collections, 'moving_variance')
        moving_variance = variables.model_variable(
            'moving_variance',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.ones_initializer,
            trainable=False,
            collections=moving_variance_collections)
        # Calculate the moments based on the individual batch.
        mean, variance = nn.moments(inputs, axis, shift=moving_mean)
        # Update the moving_mean and moving_variance moments.
        # NOTE(review): these update ops are created outside the `tf.cond`
        # below, so whether they also run when `is_training` is false depends
        # on how the caller triggers them - confirm.
        update_moving_mean = moving_averages.assign_moving_average(
            moving_mean, mean, decay)
        update_moving_variance = moving_averages.assign_moving_average(
            moving_variance, variance, decay)
        if updates_collections is None:
            # Make sure the updates are computed here.
            with ops.control_dependencies([update_moving_mean,
                                           update_moving_variance]):
                outputs = nn.batch_normalization(
                    inputs, mean, variance, beta, gamma, epsilon)
        else:
            # Collect the updates to be computed later.
            ops.add_to_collections(updates_collections, update_moving_mean)
            ops.add_to_collections(updates_collections, update_moving_variance)
            outputs = nn.batch_normalization(
                inputs, mean, variance, beta, gamma, epsilon)
        # Inference-mode output, normalized with the moving statistics.
        test_outputs = nn.batch_normalization(
            inputs, moving_mean, moving_variance, beta, gamma, epsilon)
        # NOTE(review): the default `is_training=True` is a Python bool;
        # confirm the TensorFlow version in use accepts a non-tensor
        # predicate in `tf.cond`.
        outputs = tf.cond(is_training, lambda: outputs, lambda: test_outputs)
        outputs.set_shape(inputs_shape)
        return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def batch_norm_mine_old(inputs,
                        decay=0.999,
                        center=True,
                        scale=False,
                        epsilon=0.001,
                        activation_fn=None,
                        param_initializers=None,
                        param_regularizers=None,
                        updates_collections=ops.GraphKeys.UPDATE_OPS,
                        is_training=True,
                        reuse=None,
                        variables_collections=None,
                        outputs_collections=None,
                        trainable=True,
                        batch_weights=None,
                        fused=False,
                        data_format=DATA_FORMAT_NHWC,
                        zero_debias_moving_mean=False,
                        scope=None,
                        renorm=False,
                        renorm_clipping=None,
                        renorm_decay=0.99):
    """Adds a Batch Normalization layer (earlier, superseded modification).

    Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.
    Copy of the tensorflow.contrib.layers implementation with one change on
    the legacy path: the "variance" is the batch mean of
    `(inputs - moving_mean)**2`, i.e. the spread around the moving mean.

    This earlier version of the modification uses the current batch mean and
    variance if `is_training` is True and `moving_mean` / `moving_variance`
    otherwise. That led to a large divergence between the results depending
    on whether `is_training` was set to True or not. Per the original
    author's note, it should ideally always use `moving_mean` and
    `moving_variance`, and `batch_norm_mine` does this.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension
        has `batch_size`. The normalization is over all but the last
        dimension if `data_format` is `NHWC` and the second dimension if
        `data_format` is `NCHW`.
      decay: Decay for the moving average. Reasonable values for `decay` are
        close to 1.0, typically in the multiple-nines range: 0.999, 0.99,
        0.9, etc. Lower `decay` value (recommend trying `decay`=0.9) if model
        experiences reasonably good training performance but poor validation
        and/or test performance. Try zero_debias_moving_mean=True for
        improved stability.
      center: If True, add offset of `beta` to normalized tensor. If False,
        `beta` is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is not used.
        When the next layer is linear (also e.g. `nn.relu`), this can be
        disabled since the scaling can be done by the next layer.
      epsilon: Small float added to variance to avoid dividing by zero.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      param_initializers: Optional initializers for beta, gamma, moving mean
        and moving variance.
      param_regularizers: Optional regularizer for beta and gamma.
      updates_collections: Collections to collect the update ops for
        computation. The updates_ops need to be executed with the train_op.
        If None, a control dependency would be added to make sure the
        updates are computed in place.
      is_training: Whether or not the layer is in training mode. In training
        mode it would accumulate the statistics of the moments into
        `moving_mean` and `moving_variance` using an exponential moving
        average with the given `decay`. When it is not in training mode then
        it would use the values of the `moving_mean` and the
        `moving_variance`.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      batch_weights: An optional tensor of shape `[batch_size]`, containing a
        frequency weight for each batch item. If present, then the batch
        normalization uses weighted mean and variance. (This can be used to
        correct for bias in training example selection.)
      fused: Use nn.fused_batch_norm if True, nn.batch_normalization
        otherwise.
      data_format: A string. `NHWC` (default) and `NCHW` are supported.
      zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a
        new pair of variables 'moving_mean/biased' and
        'moving_mean/local_step'.
      scope: Optional scope for `variable_scope`.
      renorm: Whether to use Batch Renormalization
        (https://arxiv.org/abs/1702.03275). This adds extra variables during
        training. The inference is the same for either value of this
        parameter.
      renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax'
        to scalar `Tensors` used to clip the renorm correction. The
        correction `(r, d)` is used as
        `corrected_value = normalized_value * r + d`, with `r` clipped to
        [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin, dmax are
        set to inf, 0, inf, respectively.
      renorm_decay: Momentum used to update the moving means and standard
        deviations with renorm. Unlike `momentum`, this affects training and
        should be neither too small (which would add noise) nor too large
        (which would give stale estimates). Note that `decay` is still
        applied to get the means and variances for inference.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If `batch_weights` is not None and `fused` is True.
      ValueError: If `param_regularizers` is not None and `fused` is True.
      ValueError: If `renorm` is True and `fused` is True, or `renorm` is
        True together with `batch_weights`, a custom `updates_collections`
        or `zero_debias_moving_mean`.
      ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
      ValueError: If the rank of `inputs` is undefined.
      ValueError: If rank or channels dimension of `inputs` is undefined.
    """
    if fused:
        if batch_weights is not None:
            raise ValueError('Weighted mean and variance is not currently '
                             'supported for fused batch norm.')
        if param_regularizers is not None:
            raise ValueError('Regularizers are not currently '
                             'supported for fused batch norm.')
        if renorm:
            raise ValueError('Renorm is not supported for fused batch norm.')
        return _fused_batch_norm(
            inputs,
            decay=decay,
            center=center,
            scale=scale,
            epsilon=epsilon,
            activation_fn=activation_fn,
            param_initializers=param_initializers,
            updates_collections=updates_collections,
            is_training=is_training,
            reuse=reuse,
            variables_collections=variables_collections,
            outputs_collections=outputs_collections,
            trainable=trainable,
            data_format=data_format,
            zero_debias_moving_mean=zero_debias_moving_mean,
            scope=scope)

    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
        raise ValueError('data_format has to be either NCHW or NHWC.')

    layer_variable_getter = _build_variable_getter()
    with variable_scope.variable_scope(
            scope, 'BatchNorm', [inputs], reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)

        # Determine whether we can use the core layer class.
        if (batch_weights is None and
                updates_collections is ops.GraphKeys.UPDATE_OPS and
                not zero_debias_moving_mean):
            # Use the core layer class.
            axis = 1 if data_format == DATA_FORMAT_NCHW else -1
            if not param_initializers:
                param_initializers = {}
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            moving_mean_initializer = param_initializers.get(
                'moving_mean', init_ops.zeros_initializer())
            moving_variance_initializer = param_initializers.get(
                'moving_variance', init_ops.ones_initializer())
            if not param_regularizers:
                param_regularizers = {}
            beta_regularizer = param_regularizers.get('beta')
            gamma_regularizer = param_regularizers.get('gamma')
            layer = normalization_layers.BatchNormalization(
                axis=axis,
                momentum=decay,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                moving_mean_initializer=moving_mean_initializer,
                moving_variance_initializer=moving_variance_initializer,
                beta_regularizer=beta_regularizer,
                gamma_regularizer=gamma_regularizer,
                trainable=trainable,
                renorm=renorm,
                renorm_clipping=renorm_clipping,
                renorm_momentum=renorm_decay,
                name=sc.name,
                _scope=sc,
                _reuse=reuse)
            outputs = layer.apply(inputs, training=is_training)

            # Add variables to collections.
            _add_variable_to_collections(
                layer.moving_mean, variables_collections, 'moving_mean')
            _add_variable_to_collections(
                layer.moving_variance, variables_collections,
                'moving_variance')
            # Compare against None explicitly instead of relying on the
            # truthiness of a variable object.
            if layer.beta is not None:
                _add_variable_to_collections(
                    layer.beta, variables_collections, 'beta')
            if layer.gamma is not None:
                _add_variable_to_collections(
                    layer.gamma, variables_collections, 'gamma')

            if activation_fn is not None:
                outputs = activation_fn(outputs)
            return utils.collect_named_outputs(outputs_collections,
                                               sc.original_name_scope,
                                               outputs)

        # Not supported by layer class: batch_weights argument,
        # and custom updates_collections. In that case, use the legacy BN
        # implementation.
        # Custom updates collections are not supported because the update
        # logic is different in this case, in particular w.r.t. "forced
        # updates" and update op reuse.
        if renorm:
            raise ValueError('renorm is not supported with batch_weights, '
                             'updates_collections or zero_debias_moving_mean')
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        if batch_weights is not None:
            batch_weights = ops.convert_to_tensor(batch_weights)
            inputs_shape[0:1].assert_is_compatible_with(
                batch_weights.get_shape())
            # Reshape batch weight values so they broadcast across inputs.
            nshape = [-1] + [1 for _ in range(inputs_rank - 1)]
            batch_weights = array_ops.reshape(batch_weights, nshape)

        if data_format == DATA_FORMAT_NCHW:
            moments_axes = [0] + list(range(2, inputs_rank))
            params_shape = inputs_shape[1:2]
            # For NCHW format, rather than relying on implicit broadcasting,
            # we explicitly reshape the params to params_shape_broadcast when
            # computing the moments and the batch normalization.
            params_shape_broadcast = list(
                [1, inputs_shape[1].value] +
                [1 for _ in range(2, inputs_rank)])
        else:
            moments_axes = list(range(inputs_rank - 1))
            params_shape = inputs_shape[-1:]
            params_shape_broadcast = None
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s has undefined channels dimension %s.' % (
                    inputs.name, params_shape))

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if not param_initializers:
            param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            beta = variables.model_variable('beta',
                                            shape=params_shape,
                                            dtype=dtype,
                                            initializer=beta_initializer,
                                            collections=beta_collections,
                                            trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            gamma = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=gamma_initializer,
                                             collections=gamma_collections,
                                             trainable=trainable)

        # Create moving_mean and moving_variance variables and add them to
        # the appropriate collections. We disable variable partitioning while
        # creating them, because assign_moving_average is not yet supported
        # for partitioned variables.
        partitioner = variable_scope.get_variable_scope().partitioner
        try:
            variable_scope.get_variable_scope().set_partitioner(None)
            moving_mean_collections = utils.get_variable_collections(
                variables_collections, 'moving_mean')
            moving_mean_initializer = param_initializers.get(
                'moving_mean', init_ops.zeros_initializer())
            moving_mean = variables.model_variable(
                'moving_mean',
                shape=params_shape,
                dtype=dtype,
                initializer=moving_mean_initializer,
                trainable=False,
                collections=moving_mean_collections)
            moving_variance_collections = utils.get_variable_collections(
                variables_collections, 'moving_variance')
            moving_variance_initializer = param_initializers.get(
                'moving_variance', init_ops.ones_initializer())
            moving_variance = variables.model_variable(
                'moving_variance',
                shape=params_shape,
                dtype=dtype,
                initializer=moving_variance_initializer,
                trainable=False,
                collections=moving_variance_collections)
        finally:
            variable_scope.get_variable_scope().set_partitioner(partitioner)

        # If `is_training` doesn't have a constant value, because it is a
        # `Tensor`, a `Variable` or `Placeholder` then is_training_value will
        # be None and `need_moments` will be true.
        is_training_value = utils.constant_value(is_training)
        need_moments = is_training_value is None or is_training_value
        if need_moments:
            # Calculate the moments based on the individual batch. The
            # "variance" is the mean squared deviation from moving_mean.
            # In NCHW the per-channel moving_mean is reshaped to
            # params_shape_broadcast first, so that it lines up with the
            # channel axis (1) instead of broadcasting against the trailing
            # axis.
            if batch_weights is None:
                if data_format == DATA_FORMAT_NCHW:
                    mean, _ = nn.moments(inputs, moments_axes, keep_dims=True)
                    centered = inputs - array_ops.reshape(
                        moving_mean, params_shape_broadcast)
                    variance, _ = nn.moments(
                        centered**2, moments_axes, keep_dims=True)
                    mean = array_ops.reshape(mean, [-1])
                    variance = array_ops.reshape(variance, [-1])
                else:
                    mean, _ = nn.moments(inputs, moments_axes)
                    variance, _ = nn.moments(
                        (inputs - moving_mean)**2, moments_axes)
            else:
                if data_format == DATA_FORMAT_NCHW:
                    mean, _ = nn.weighted_moments(
                        inputs, moments_axes, batch_weights, keep_dims=True)
                    centered = inputs - array_ops.reshape(
                        moving_mean, params_shape_broadcast)
                    variance, _ = nn.weighted_moments(
                        centered**2, moments_axes, batch_weights,
                        keep_dims=True)
                    mean = array_ops.reshape(mean, [-1])
                    variance = array_ops.reshape(variance, [-1])
                else:
                    mean, _ = nn.weighted_moments(
                        inputs, moments_axes, batch_weights)
                    variance, _ = nn.weighted_moments(
                        (inputs - moving_mean)**2, moments_axes,
                        batch_weights)

            moving_vars_fn = lambda: (moving_mean, moving_variance)
            if updates_collections is None:
                def _force_updates():
                    """Forces updates of moving_vars if is_training."""
                    update_moving_mean = (
                        moving_averages.assign_moving_average(
                            moving_mean, mean, decay,
                            zero_debias=zero_debias_moving_mean))
                    update_moving_variance = (
                        moving_averages.assign_moving_average(
                            moving_variance, variance, decay,
                            zero_debias=False))
                    with ops.control_dependencies(
                            [update_moving_mean, update_moving_variance]):
                        return (array_ops.identity(mean),
                                array_ops.identity(variance))
                mean, variance = utils.smart_cond(
                    is_training, _force_updates, moving_vars_fn)
            else:
                def _delay_updates():
                    """Delays updates of moving_vars if is_training."""
                    update_moving_mean = (
                        moving_averages.assign_moving_average(
                            moving_mean, mean, decay,
                            zero_debias=zero_debias_moving_mean))
                    update_moving_variance = (
                        moving_averages.assign_moving_average(
                            moving_variance, variance, decay,
                            zero_debias=False))
                    return update_moving_mean, update_moving_variance
                update_mean, update_variance = utils.smart_cond(
                    is_training, _delay_updates, moving_vars_fn)
                ops.add_to_collections(updates_collections, update_mean)
                ops.add_to_collections(updates_collections, update_variance)
                # Use computed moments during training and moving_vars
                # otherwise.
                vars_fn = lambda: (mean, variance)
                mean, variance = utils.smart_cond(
                    is_training, vars_fn, moving_vars_fn)
        else:
            mean, variance = moving_mean, moving_variance

        if data_format == DATA_FORMAT_NCHW:
            mean = array_ops.reshape(mean, params_shape_broadcast)
            variance = array_ops.reshape(variance, params_shape_broadcast)
            # beta is None when center=False and gamma is None when
            # scale=False; only reshape the parameters that exist.
            if beta is not None:
                beta = array_ops.reshape(beta, params_shape_broadcast)
            if gamma is not None:
                gamma = array_ops.reshape(gamma, params_shape_broadcast)

        # Compute batch_normalization.
        outputs = nn.batch_normalization(
            inputs, mean, variance, beta, gamma, epsilon)
        outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
def convolution2d(inputs,
                  num_outputs,
                  kernel_size,
                  stride=1,
                  padding='SAME',
                  activation_fn=nn.relu,
                  normalizer_fn=None,
                  normalizer_params=None,
                  weights_initializer=initializers.xavier_initializer(),
                  weights_regularizer=None,
                  biases_initializer=init_ops.zeros_initializer,
                  biases_regularizer=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  scope=None):
    """Adds a 2D convolution with optional normalizer, bias and activation.

    A `weights` variable of shape
    `[kernel_height, kernel_width, input_channels, num_outputs]` is created
    and convolved with `inputs` via `nn.conv2d`. Then exactly one of the
    following happens: if `normalizer_fn` is truthy it is applied to the
    result; otherwise, if `biases_initializer` is not None, a `biases`
    variable of shape `[num_outputs]` is created and added. Finally, a truthy
    `activation_fn` is applied.

    Args:
      inputs: a 4-D tensor `[batch_size, height, width, channels]`.
      num_outputs: integer, the number of output filters.
      kernel_size: a list of length 2 `[kernel_height, kernel_width]` of the
        filters. Can be an int if both values are the same.
      stride: a list of length 2 `[stride_height, stride_width]`. Can be an
        int if both strides are the same. (Documented restriction carried
        over from the original: presently both strides must have the same
        value.)
      padding: one of `VALID` or `SAME`.
      activation_fn: activation function; skipped when falsy.
      normalizer_fn: normalization function to use instead of `biases`. If
        `normalizer_fn` is provided then `biases_initializer` and
        `biases_regularizer` are ignored and `biases` are neither created
        nor added.
      normalizer_params: keyword arguments for `normalizer_fn`.
      weights_initializer: an initializer for the weights.
      weights_regularizer: optional regularizer for the weights.
      biases_initializer: an initializer for the biases. If None, skip
        biases.
      biases_regularizer: optional regularizer for the biases.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer, scope must be given.
      variables_collections: optional list of collections for all the
        variables, or a dictionary containing a different list of
        collections per variable.
      outputs_collections: collection to add the outputs to.
      trainable: if `True`, also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: optional scope for `variable_op_scope`.

    Returns:
      A tensor representing the output of the operation.
    """
    with variable_scope.variable_op_scope(
            [inputs], scope, 'Conv', reuse=reuse) as sc:
        dtype = inputs.dtype.base_dtype
        filter_height, filter_width = utils.two_element_tuple(kernel_size)
        stride_rows, stride_cols = utils.two_element_tuple(stride)
        in_channels = utils.last_dimension(inputs.get_shape(), min_rank=4)

        kernel = variables.model_variable(
            'weights',
            shape=[filter_height, filter_width, in_channels, num_outputs],
            dtype=dtype,
            initializer=weights_initializer,
            regularizer=weights_regularizer,
            collections=utils.get_variable_collections(
                variables_collections, 'weights'),
            trainable=trainable)
        outputs = nn.conv2d(inputs, kernel,
                            [1, stride_rows, stride_cols, 1],
                            padding=padding)

        if normalizer_fn:
            # The normalizer replaces the bias term entirely.
            outputs = normalizer_fn(outputs, **(normalizer_params or {}))
        elif biases_initializer is not None:
            bias = variables.model_variable(
                'biases',
                shape=[num_outputs,],
                dtype=dtype,
                initializer=biases_initializer,
                regularizer=biases_regularizer,
                collections=utils.get_variable_collections(
                    variables_collections, 'biases'),
                trainable=trainable)
            outputs = nn.bias_add(outputs, bias)

        if activation_fn:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, outputs)
def masked_fully_connected(
        inputs,
        num_outputs,
        activation_fn=nn.relu,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=initializers.xavier_initializer(),
        weights_regularizer=None,
        biases_initializer=init_ops.zeros_initializer(),
        biases_regularizer=None,
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None):
    """Adds a sparse (masked) fully connected layer.

    The layer itself is a `core.MaskedFullyConnected` instance applied to
    `inputs`; the masking of the weight matrix is implemented by that class,
    not here. The layer's variables are exposed under the names `weights`
    and `biases` through a custom variable getter.

    After the layer, `normalizer_fn` (if not None) is applied, and then
    `activation_fn` (if not None). A bias is only requested from the layer
    when `normalizer_fn` is falsy and `biases_initializer` is truthy.

    Args:
      inputs: A tensor of at least rank 2 and static value for the last
        dimension; i.e. `[batch_size, depth]`,
        `[None, None, None, channels]`.
      num_outputs: Integer or long, the number of output units in the layer.
      activation_fn: Activation function. The default value is a ReLU
        function. Explicitly set it to None to skip it and maintain a linear
        activation.
      normalizer_fn: Normalization function to use instead of `biases`. If
        `normalizer_fn` is provided then `biases_initializer` and
        `biases_regularizer` are ignored and `biases` are neither created
        nor added. Defaults to None for no normalizer function.
      normalizer_params: Keyword arguments for `normalizer_fn`.
      weights_initializer: An initializer for the weights.
      weights_regularizer: Optional regularizer for the weights.
      biases_initializer: An initializer for the biases. If None, skip
        biases.
      biases_regularizer: Optional regularizer for the biases.
      reuse: Whether or not the layer and its variables should be reused. To
        be able to reuse the layer, scope must be given.
      variables_collections: Optional list of collections for all the
        variables, or a dictionary containing a different list of
        collections per variable.
      outputs_collections: Collection to add the outputs to.
      trainable: If `True`, also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
      scope: Optional scope for variable_scope.

    Returns:
      The tensor variable representing the result of the series of
      operations.

    Raises:
      ValueError: If `num_outputs` is not an integer type.
        NOTE(review): the original documentation also lists a ValueError for
        inputs of rank below 2 or with an unset last dimension; presumably
        raised by the underlying layer - confirm.
    """
    if not isinstance(num_outputs, six.integer_types):
        raise ValueError('num_outputs should be int or long, got %s.' %
                         (num_outputs,))

    # Map the core layer's variable names onto the contrib-style names.
    renamed_variables = {
        'bias': 'biases',
        'kernel': 'weights'
    }
    with variable_scope.variable_scope(
            scope, 'fully_connected', [inputs], reuse=reuse,
            custom_getter=_build_variable_getter(renamed_variables)) as sc:
        inputs = ops.convert_to_tensor(inputs)

        # Evaluates to a falsy value, or to the initializer object itself.
        wants_bias = not normalizer_fn and biases_initializer
        masked_layer = core.MaskedFullyConnected(
            units=num_outputs,
            activation=None,
            use_bias=wants_bias,
            kernel_initializer=weights_initializer,
            bias_initializer=biases_initializer,
            kernel_regularizer=weights_regularizer,
            bias_regularizer=biases_regularizer,
            activity_regularizer=None,
            trainable=trainable,
            name=sc.name,
            dtype=inputs.dtype.base_dtype,
            _scope=sc,
            _reuse=reuse)
        outputs = masked_layer.apply(inputs)

        # Register the layer's variables in the requested collections.
        _add_variable_to_collections(
            masked_layer.kernel, variables_collections, 'weights')
        if masked_layer.bias is not None:
            _add_variable_to_collections(
                masked_layer.bias, variables_collections, 'biases')

        # Normalizer first, activation last.
        if normalizer_fn is not None:
            outputs = normalizer_fn(outputs, **(normalizer_params or {}))
        if activation_fn is not None:
            outputs = activation_fn(outputs)

        return utils.collect_named_outputs(
            outputs_collections, sc.original_name_scope, outputs)
def decoder(decoder_input,
            encoder_output,
            decoder_self_attention_bias,
            encoder_decoder_attention_bias,
            hparams,
            name="decoder",
            save_weights_to=None,
            make_image_summary=True,):
    """Runs a stack of transformer decoder layers, recording output norms.

    Every layer applies self-attention, then encoder-decoder attention (only
    when `encoder_output` is not None), then a feed-forward block. After each
    sublayer, and again after its `layer_postprocess`, the norm of the
    activations along the last axis is added to the "norms" collection under
    an alias that includes the layer index.

    Args:
      decoder_input: a Tensor
      encoder_output: a Tensor, or None to skip encoder-decoder attention
      decoder_self_attention_bias: bias Tensor for self-attention
        (see common_attention.attention_bias())
      encoder_decoder_attention_bias: bias Tensor for encoder-decoder
        attention (see common_attention.attention_bias())
      hparams: hyperparameters for model
      name: a string
      save_weights_to: an optional dictionary to capture attention weights
        for visualization; the weights tensor will be appended there under a
        string key created from the variable scope (including name).
      make_image_summary: Whether to make an attention image summary.

    Returns:
      A Tensor: the output of the last layer passed through
      `common_layers.layer_preprocess`.
    """
    def _record_norm(alias, tensor):
        """Adds the last-axis norm of `tensor` to the "norms" collection."""
        utils.collect_named_outputs("norms", alias, tf.norm(tensor, axis=-1))

    x = decoder_input
    with tf.variable_scope(name):
        num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers
        for layer_idx in range(num_layers):
            with tf.variable_scope("layer_%d" % layer_idx):
                with tf.variable_scope("self_attention"):
                    attended = common_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        decoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels or
                        hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                    )
                    _record_norm(
                        "decoder_self_attention_%d" % (layer_idx), attended)
                    x = common_layers.layer_postprocess(x, attended, hparams)
                    _record_norm(
                        "decoder_self_attention_post_%d" % (layer_idx), x)

                if encoder_output is not None:
                    with tf.variable_scope("encdec_attention"):
                        attended = common_attention.multihead_attention(
                            common_layers.layer_preprocess(x, hparams),
                            encoder_output,
                            encoder_decoder_attention_bias,
                            hparams.attention_key_channels or
                            hparams.hidden_size,
                            hparams.attention_value_channels or
                            hparams.hidden_size,
                            hparams.hidden_size,
                            hparams.num_heads,
                            hparams.attention_dropout,
                            save_weights_to=save_weights_to,
                            make_image_summary=make_image_summary,
                        )
                        _record_norm(
                            "decoder_encoder_attention_%d" % (layer_idx),
                            attended)
                        x = common_layers.layer_postprocess(
                            x, attended, hparams)
                        _record_norm(
                            "decoder_encoder_attention_post_%d" % (layer_idx),
                            x)

                with tf.variable_scope("ffn"):
                    transformed = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        hparams.filter_size,
                        hparams.hidden_size,
                        dropout=hparams.relu_dropout,
                    )
                    _record_norm("decoder_ffn_%d" % (layer_idx), transformed)
                    x = common_layers.layer_postprocess(
                        x, transformed, hparams)
                    _record_norm("decoder_ffn_post_%d" % (layer_idx), x)

        # If normalization is done in layer_preprocess, then it should also
        # be done on the output, since the output can grow very large, being
        # the sum of a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
def body(self, features):
    """Builds the model body: image/question encoding, attention, and MLP.

    Steps, in order:
      1. Obtain image features, either by embedding `features["inputs"]`
         with the model named by `hparams.image_model_fn` (when
         `hparams.image_input_type == "image"`) or by using them directly.
      2. Optionally project, apply dropout, then either encode or
         layer-normalize the image features.
      3. Encode the question and max-pool the encoding over axis 1.
      4. Attend over the image features with the pooled query.
      5. Combine the attended image vector and the query according to
         `hparams.multimodal_combine`, apply dropout and the MLP.
    After most steps, the last-axis norm of the intermediate tensor is
    added to the "norms" collection, which is summarized at the end.

    Args:
      features: dict read at the keys "inputs" and "question".

    Returns:
      The MLP output with singleton dimensions inserted at axes 1 and 2.

    Raises:
      ValueError: if `hparams.multimodal_combine` is not one of "concat",
        "sum" or "product".
    """
    hp = self.hparams

    def _record_norm(alias, tensor):
        """Adds the last-axis norm of `tensor` to the "norms" collection."""
        utils.collect_named_outputs("norms", alias, tf.norm(tensor, axis=-1))

    if hp.image_input_type == "image":
        # NOTE: eval() resolves the model function from a hyperparameter
        # string; this is only safe while hparams come from trusted
        # configuration.
        # pylint: disable=eval-used
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
        image_feat = features["inputs"]

    image_feat = common_layers.flatten4d3d(image_feat)
    image_hidden_size = hp.image_hidden_size or hp.hidden_size
    if hp.image_feat_preprocess_proj:
        image_feat = common_layers.dense(image_feat, image_hidden_size)
        _record_norm("image_feat_after_proj", image_feat)
    else:
        # Without the projection the features are used as-is, so the
        # configured size must be 2048 (presumably the raw feature depth;
        # confirm against the image model).
        assert image_hidden_size == 2048

    image_feat = tf.nn.dropout(
        image_feat, keep_prob=1.-hp.layer_prepostprocess_dropout)

    if hp.image_feat_encode:
        image_feat = image_encoder(image_feat, hp)
        _record_norm("image_feat_encoded", image_feat)
    else:
        image_feat = common_layers.layer_norm(image_feat)
        _record_norm("image_feat_after_layer", image_feat)

    question = common_layers.flatten4d3d(features["question"])
    _record_norm("question_embedding", question)
    question, question_self_attention_bias = prepare_question_encoder(
        question, hp)
    question = tf.nn.dropout(
        question, keep_prob=1.-hp.layer_prepostprocess_dropout)
    query = question_encoder(question, question_self_attention_bias, hp)
    _record_norm("query_encode", query)

    # Add the self-attention bias to the encoding before max-pooling over
    # axis 1 (presumably so that padded positions cannot win the max;
    # confirm against prepare_question_encoder).
    query = (query + tf.expand_dims(
        tf.squeeze(question_self_attention_bias, [1, 2]), axis=2))
    query = tf.reduce_max(query, axis=1)
    _record_norm("query_maxpool", query)

    image_ave = attn(image_feat, query, hp)
    _record_norm("image_ave", image_ave)

    if hp.multimodal_combine == "concat":
        image_question = tf.concat([image_ave, query], axis=1)
    elif hp.multimodal_combine == "sum":
        image_question = image_ave + query
    elif hp.multimodal_combine == "product":
        image_question = image_ave * query
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `image_question` below.
        raise ValueError(
            'Unsupported multimodal_combine %r; expected "concat", "sum" '
            'or "product".' % (hp.multimodal_combine,))
    _record_norm("image_question", image_question)

    image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

    output = mlp(image_question, hp)
    _record_norm("output", output)

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Expand dimension 1 and 2
    return tf.expand_dims(tf.expand_dims(output, axis=1), axis=2)
def masked_convolution(inputs,
                       num_outputs,
                       kernel_size,
                       stride=1,
                       padding='SAME',
                       data_format=None,
                       rate=1,
                       activation_fn=nn.relu,
                       normalizer_fn=None,
                       normalizer_params=None,
                       weights_initializer=initializers.xavier_initializer(),
                       weights_regularizer=None,
                       biases_initializer=init_ops.zeros_initializer(),
                       biases_regularizer=None,
                       reuse=None,
                       variables_collections=None,
                       outputs_collections=None,
                       trainable=True,
                       scope=None):
  """Adds a masked 2D convolution with optional normalization and activation.

  The convolution is performed by a `core.MaskedConv2D` layer, whose `kernel`
  and `bias` variables are exposed under the names `weights` and `biases`.
  After the convolution, `normalizer_fn` is applied if given; otherwise a bias
  is used when `biases_initializer` is provided. Finally `activation_fn` is
  applied if it is not `None`.

  Only rank-4 inputs are accepted; any other rank (including an unknown rank)
  raises `ValueError`.

  Args:
    inputs: A rank-4 Tensor. The channel axis is last for `data_format` `None`
      or "NHWC" and second for "NCHW".
    num_outputs: Integer, the number of output filters.
    kernel_size: Spatial size of the filters, forwarded to the layer as
      `kernel_size`.
    stride: Stride of the convolution, forwarded to the layer as `strides`.
    padding: One of `"VALID"` or `"SAME"`.
    data_format: `None`, "NHWC" or "NCHW". The strings "NWC", "NCW", "NDHWC"
      and "NCDHW" pass the initial validation but are then rejected as
      unsupported.
    rate: Dilation rate, forwarded to the layer as `dilation_rate`.
    activation_fn: Activation function applied last. Defaults to ReLU; set to
      `None` to keep a linear output.
    normalizer_fn: Normalization function used instead of biases. When
      provided, no bias is created or added.
    normalizer_params: Keyword arguments for `normalizer_fn`.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If `None`, biases are
      skipped.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer, `scope` must be given.
    variables_collections: Optional list of collections for all the variables,
      or a dictionary containing a different list of collections per variable.
    outputs_collections: Collection to add the outputs to.
    trainable: If `True`, also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.

  Returns:
    A tensor representing the output of the operation.

  Raises:
    ValueError: If `data_format` is invalid or unsupported.
    ValueError: If the rank of `inputs` is not 4.
  """
  recognized_formats = (None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW')
  if data_format not in recognized_formats:
    raise ValueError('Invalid data_format: %r' % (data_format,))

  # Expose the layer's `bias`/`kernel` variables under the contrib names.
  renaming_getter = _build_variable_getter({
      'bias': 'biases',
      'kernel': 'weights'
  })

  with variable_scope.variable_scope(
      scope, 'Conv', [inputs], reuse=reuse,
      custom_getter=renaming_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)
    input_rank = inputs.get_shape().ndims

    # Only the 2D (rank-4) case has a masked layer implementation.
    if input_rank != 4:
      raise ValueError('Sparse Convolution not supported for input with rank',
                       input_rank)
    layer_class = core.MaskedConv2D

    keras_formats = {
        None: 'channels_last',
        'NHWC': 'channels_last',
        'NCHW': 'channels_first',
    }
    df = keras_formats.get(data_format)
    if df is None:
      raise ValueError('Unsupported data format', data_format)

    layer = layer_class(
        filters=num_outputs,
        kernel_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=df,
        dilation_rate=rate,
        activation=None,
        use_bias=not normalizer_fn and biases_initializer,
        kernel_initializer=weights_initializer,
        bias_initializer=biases_initializer,
        kernel_regularizer=weights_regularizer,
        bias_regularizer=biases_regularizer,
        activity_regularizer=None,
        trainable=trainable,
        name=sc.name,
        dtype=inputs.dtype.base_dtype,
        _scope=sc,
        _reuse=reuse)
    outputs = layer.apply(inputs)

    # Register the layer's variables in the requested collections.
    _add_variable_to_collections(layer.kernel, variables_collections,
                                 'weights')
    if layer.use_bias:
      _add_variable_to_collections(layer.bias, variables_collections,
                                   'biases')

    if normalizer_fn is not None:
      outputs = normalizer_fn(outputs, **(normalizer_params or {}))

    if activation_fn is not None:
      outputs = activation_fn(outputs)

    return utils.collect_named_outputs(outputs_collections,
                                       sc.original_name_scope, outputs)
def instance_norm(inputs,
                  center=True,
                  scale=True,
                  epsilon=1e-6,
                  activation_fn=None,
                  param_initializers=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  data_format=DATA_FORMAT_NHWC,
                  scope=None):
  """Functional interface for the instance normalization layer.

  Reference: https://arxiv.org/abs/1607.08022.

    "Instance Normalization: The Missing Ingredient for Fast Stylization"
    Dmitry Ulyanov, Andrea Vedaldi, Victor Lempitsky

  Moments are computed per example over every axis except the batch axis
  (axis 0) and the channels axis, which is the last axis for `NHWC` and axis 1
  for `NCHW`.

  Args:
    inputs: A tensor with 2 or more dimensions, where the first dimension has
      `batch_size`.
    center: If True, add offset of `beta` to the normalized tensor. If False,
      `beta` is not created.
    scale: If True, multiply by `gamma`. If False, `gamma` is not created.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function; `None` (default) keeps the output
      linear.
    param_initializers: Optional dict of initializers keyed by 'beta' and
      'gamma'. Missing entries default to zeros and ones respectively.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer, `scope` must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs to.
    trainable: If `True`, also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    scope: Optional scope for `variable_scope`.

  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If the channels dimension of `inputs` is undefined.
  """
  inputs = ops.convert_to_tensor(inputs)
  inputs_shape = inputs.shape
  inputs_rank = inputs.shape.ndims

  if inputs_rank is None:
    raise ValueError('Inputs %s has undefined rank.' % inputs.name)
  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')

  with variable_scope.variable_scope(
      scope, 'InstanceNorm', [inputs], reuse=reuse) as sc:
    if data_format == DATA_FORMAT_NCHW:
      reduction_axis = 1
      # Channels are not the trailing axis here, so beta/gamma are reshaped
      # explicitly to [1, C, 1, ...] instead of relying on broadcasting.
      params_shape_broadcast = (
          [1, inputs_shape[1].value] + [1] * (inputs_rank - 2))
    else:
      reduction_axis = inputs_rank - 1
      params_shape_broadcast = None

    # Every axis except the channels axis and the batch axis.
    moments_axes = list(range(inputs_rank))
    del moments_axes[reduction_axis]
    del moments_axes[0]

    params_shape = inputs_shape[reduction_axis:reduction_axis + 1]
    if not params_shape.is_fully_defined():
      raise ValueError('Inputs %s has undefined channels dimension %s.' % (
          inputs.name, params_shape))

    dtype = inputs.dtype.base_dtype
    if param_initializers is None:
      param_initializers = {}

    def _make_param(param_name, default_initializer_fn):
      """Creates one per-channel variable, reshaped for NCHW if required."""
      collections = utils.get_variable_collections(
          variables_collections, param_name)
      initializer = param_initializers.get(
          param_name, default_initializer_fn())
      param = variables.model_variable(param_name,
                                       shape=params_shape,
                                       dtype=dtype,
                                       initializer=initializer,
                                       collections=collections,
                                       trainable=trainable)
      if params_shape_broadcast:
        param = array_ops.reshape(param, params_shape_broadcast)
      return param

    # Offset and scale are optional; a missing one is passed on as None.
    beta = _make_param('beta', init_ops.zeros_initializer) if center else None
    gamma = _make_param('gamma', init_ops.ones_initializer) if scale else None

    # Per-instance statistics, then the affine normalization.
    mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
    outputs = nn.batch_normalization(
        inputs, mean, variance, beta, gamma, epsilon, name='instancenorm')

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def group_norm(inputs,
               groups=32,
               channels_axis=-1,
               reduction_axes=(-3, -2),
               center=True,
               scale=True,
               epsilon=1e-6,
               activation_fn=None,
               param_initializers=None,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None,
               mean_close_to_zero=False):
  """Functional interface for the group normalization layer.

  Reference: https://arxiv.org/abs/1803.08494.

    "Group Normalization", Yuxin Wu, Kaiming He

  The channels axis is split into `[groups, channels // groups]`, moments are
  computed over the within-group axis together with `reduction_axes`, and the
  normalized result is reshaped back to the original shape.

  Args:
    inputs: A Tensor with at least 2 dimensions, one of which is channels. All
      shape dimensions must be fully defined.
    groups: Integer. Divide the channels into this number of groups over which
      normalization statistics are computed. Must evenly divide the number of
      channels in `inputs`.
    channels_axis: An integer. Index of the channels axis, which is broken
      into `groups`. Must not appear in `reduction_axes`. Negative values
      count from the end, which makes the call agnostic to whether a batch
      dimension is present.
    reduction_axes: Tuple of integers. Axes over which statistics are
      accumulated. Must not contain `channels_axis`. Negative values count
      from the end.
      Some sample usage cases:
        NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
        NCHW format: channels_axis=-3, reduction_axes=[-2, -1]
    center: If True, add offset of `beta` to the normalized tensor. If False,
      `beta` is not created.
    scale: If True, multiply by `gamma`. If False, `gamma` is not created.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function; `None` (default) keeps the output
      linear.
    param_initializers: Optional dict of initializers keyed by 'beta' and
      'gamma'. Missing entries default to zeros and ones respectively.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer, `scope` must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs to.
    trainable: If `True`, also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    scope: Optional scope for `variable_scope`.
    mean_close_to_zero: If `True`, compute the moments with
      `nn.sufficient_statistics` and `nn.normalize_moments` (a one-pass
      computation, which behaves better when the mean is close to zero). If
      `False`, use `nn.moments`, which is more accurate when the mean is
      large.

  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If the channels dimension or a reduction dimension of `inputs`
      is undefined.
    ValueError: If number of groups is not commensurate with number of
      channels.
    ValueError: If reduction_axes or channels_axis are out of bounds.
    ValueError: If reduction_axes are not mutually exclusive with
      channels_axis.
  """
  # TODO(shlens): Support partially defined shapes for the inputs.
  inputs = ops.convert_to_tensor(inputs)
  original_shape = inputs.shape

  ndims = inputs.shape.ndims
  if ndims is None:
    raise ValueError('Inputs %s has undefined rank.' % inputs.name)

  # Valid axes are -ndims .. ndims - 1. The lower bound used to be unchecked,
  # so a too-negative axis escaped as an error from shape indexing.
  if not -ndims <= channels_axis < ndims:
    raise ValueError('Axis is out of bounds.')

  # Standardize the channels_axis to be positive and identify # of channels.
  if channels_axis < 0:
    channels_axis += ndims
  channels = inputs.shape[channels_axis].value

  if channels is None:
    raise ValueError('Inputs %s has undefined channel dimension: %d.' % (
        inputs.name, channels_axis))

  # Standardize the reduction_axes to be positive.
  reduction_axes = [a + ndims if a < 0 else a for a in reduction_axes]
  for a in reduction_axes:
    # `a == ndims` is already out of range (the old check used `>`), and an
    # axis that is still negative after normalization is out of range too.
    if a < 0 or a >= ndims:
      raise ValueError('Axis is out of bounds.')
    if inputs.shape[a].value is None:
      raise ValueError('Inputs %s has undefined dimensions %d.' % (
          inputs.name, a))
    if channels_axis == a:
      raise ValueError('reduction_axis must be mutually exclusive '
                       'with channels_axis')

  if groups > channels:
    raise ValueError('Invalid groups %d for %d channels.' % (groups, channels))
  if channels % groups != 0:
    raise ValueError('%d channels is not commensurate with %d groups.' %
                     (channels, groups))

  # Determine axes before channels. Some examples of common image formats:
  #   'NCHW': before = [N], after = [HW]
  #   'NHWC': before = [NHW], after = []
  axes_before_channels = inputs.shape.as_list()[:channels_axis]
  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]

  # Manually broadcast the parameters to conform to the number of groups.
  params_shape_broadcast = ([1] * len(axes_before_channels) +
                            [groups, channels // groups] +
                            [1] * len(axes_after_channels))

  # Reshape the input by the group within the channel dimension.
  inputs_shape = (axes_before_channels + [groups, channels // groups] +
                  axes_after_channels)
  inputs = array_ops.reshape(inputs, inputs_shape)

  # Determine the dimensions across which moments are calculated. Splitting
  # the channels axis inserts one new axis right after it, so the within-group
  # axis is `channels_axis + 1` and every later axis shifts by one.
  moments_axes = [channels_axis + 1] + [
      a + 1 if a > channels_axis else a for a in reduction_axes]

  with variable_scope.variable_scope(
      scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
    # Note that the params_shape is the number of channels always.
    params_shape = [channels]

    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    dtype = inputs.dtype.base_dtype
    if param_initializers is None:
      param_initializers = {}
    if center:
      beta_collections = utils.get_variable_collections(
          variables_collections, 'beta')
      beta_initializer = param_initializers.get(
          'beta', init_ops.zeros_initializer())
      beta = variables.model_variable('beta',
                                      shape=params_shape,
                                      dtype=dtype,
                                      initializer=beta_initializer,
                                      collections=beta_collections,
                                      trainable=trainable)
      beta = array_ops.reshape(beta, params_shape_broadcast)

    if scale:
      gamma_collections = utils.get_variable_collections(
          variables_collections, 'gamma')
      gamma_initializer = param_initializers.get(
          'gamma', init_ops.ones_initializer())
      gamma = variables.model_variable('gamma',
                                       shape=params_shape,
                                       dtype=dtype,
                                       initializer=gamma_initializer,
                                       collections=gamma_collections,
                                       trainable=trainable)
      gamma = array_ops.reshape(gamma, params_shape_broadcast)

    # Calculate the moments.
    if mean_close_to_zero:
      # One pass algorithm returns better result when mean is close to zero.
      counts, means_ss, variance_ss, _ = nn.sufficient_statistics(
          inputs, moments_axes, keep_dims=True)
      mean, variance = nn.normalize_moments(
          counts, means_ss, variance_ss, shift=None)
    else:
      mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)

    # Compute normalization.
    # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
    # appropriately so that this operation may be faster.
    # outputs = (inputs - mean) * rsqrt(variance + epsilon) * gamma + beta,
    # folded into a single multiply-add on `inputs`.
    gain = math_ops.rsqrt(variance + epsilon)
    offset = -mean * gain
    if gamma is not None:
      gain *= gamma
      offset *= gamma
    if beta is not None:
      offset += beta
    outputs = inputs * gain + offset

    # Collapse the groups into the channel dimension.
    outputs = array_ops.reshape(outputs, original_shape)
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
def stack_blocks_dense(net,
                       blocks,
                       output_stride=None,
                       outputs_collections=None):
  """Stacks ResNet `Blocks` and controls output feature density.

  Each block is built inside its own variable scope and each of its units
  inside a nested 'unit_<k>' scope (k starting at 1), giving names such as
  'block_name/unit_1', 'block_name/unit_2', etc.

  When `output_stride` is given, units are applied with their own stride until
  the accumulated stride equals `output_stride`. From then on every unit runs
  with stride 1 and an atrous `rate` that accumulates the strides that were
  skipped, so the spatial resolution stops shrinking. This is useful for dense
  prediction tasks such as semantic segmentation or object detection.

  Args:
    net: A `Tensor` of size [batch, height, width, channels].
    blocks: A list of ResNet `Block` objects. Each provides `scope`, `unit_fn`
      and `args`, where `args` is a sequence of keyword-argument dicts, one
      per unit.
    output_stride: If `None`, the output is computed at the nominal network
      stride. Otherwise the requested ratio of input to output spatial
      resolution, which must equal the product of unit strides from the start
      up to some level of the ResNet. For example, if the ResNet employs units
      with strides 1, 2, 1, 3, 4, 1, then valid values are 1, 2, 6, 24 or
      `None` (equivalent to 24).
    outputs_collections: Collection to add the ResNet block outputs to.

  Returns:
    net: Output tensor with stride equal to the specified output_stride.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  has_target = output_stride is not None

  # Effective stride of the activations produced so far.
  current_stride = 1
  # Atrous rate used once the target stride has been reached.
  rate = 1

  for block in blocks:
    with variable_scope.variable_scope(block.scope, 'block', [net]) as sc:
      for unit_index, unit in enumerate(block.args, start=1):
        if has_target and current_stride > output_stride:
          raise ValueError('The target output_stride cannot be reached.')

        unit_stride = unit.get('stride', 1)
        with variable_scope.variable_scope(
            'unit_%d' % unit_index, values=[net]):
          if has_target and current_stride == output_stride:
            # Target reached: run the unit unstrided with the accumulated
            # rate, and fold its stride into the rate for later units.
            net = block.unit_fn(net, rate=rate, **dict(unit, stride=1))
            rate *= unit_stride
          else:
            net = block.unit_fn(net, rate=1, **unit)
            current_stride *= unit_stride

      net = utils.collect_named_outputs(outputs_collections, sc.name, net)

  if has_target and current_stride != output_stride:
    raise ValueError('The target output_stride cannot be reached.')

  return net
def test_collect(self):
  """Collected tensors show up in the collection in insertion order."""
  first = constant_op.constant(1.0, name='t1')
  second = constant_op.constant(2.0, name='t2')

  for alias, tensor in (('a1', first), ('a2', second)):
    utils.collect_named_outputs('end_points', alias, tensor)

  collected = ops.get_collection('end_points')
  self.assertEqual(collected, [first, second])