def _factorized_reduction(scope_name, x, out_filters, data_format):
    """Reduces the spatial shape of x without information loss due to striding.

    Copied from https://github.com/melodyguan/enas/blob/master/src/cifar10/general_child.py
    """
    assert out_filters % 2 == 0, (
        "Need even number of filters when using this factorized reduction.")
    # with tf.variable_scope(scope_name):
    #     layer = Conv2D('conv3x3_path', x, out_filters, 3, strides=2, activation=BNReLU)
    #     return layer
    with tf.variable_scope(scope_name):
        # Path 1: stride-2 average pooling followed by a 1x1 conv.
        path1 = AvgPooling('path1', x, pool_size=1, strides=2, padding='valid')
        path1 = Conv2D('path1_conv', path1, out_filters // 2, 1, padding='same')

        # Path 2: first pad with 0's on the right and bottom, then shift the
        # input by one pixel so the pooling grid covers the added 0's.
        data_format = get_data_format(data_format, keras_mode=False)
        if data_format == "NHWC":
            pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]]
            path2 = tf.pad(x, pad_arr)[:, 1:, 1:, :]
            ch_dim = 3
        else:
            pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]]
            path2 = tf.pad(x, pad_arr)[:, :, 1:, 1:]
            ch_dim = 1

        path2 = AvgPooling('path2', path2, pool_size=1, strides=2, padding='valid')
        path2 = Conv2D('path2_conv', path2, out_filters // 2, 1, padding='same')

        final_path = tf.concat(values=[path1, path2], axis=ch_dim)
        final_path = BatchNorm('bn', final_path)
        return final_path
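
# A minimal usage sketch for _factorized_reduction (hypothetical, not called anywhere
# in this module). It assumes the tensorpack imports used above and a 4D NHWC feature
# map: the helper halves the spatial resolution while producing `out_filters` channels,
# split evenly between the two average-pooling paths.
def _example_factorized_reduction():
    x = tf.placeholder(tf.float32, [None, 32, 32, 64], name='example_input')
    # 32x32x64 -> 16x16x128, without the information loss of a single strided conv
    return _factorized_reduction('reduce0', x, out_filters=128, data_format='channels_last')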
def residual_layer(name, l, out_filters, strides, data_format):
    ch_out = out_filters
    data_format = get_data_format(data_format, keras_mode=False)
    ch_dim = 3 if data_format == 'NHWC' else 1
    ch_in = _get_dim(l, ch_dim)
    l_in = l
    with tf.variable_scope('{}.0'.format(name)):
        l = BNReLU(l)
        l = SeparableConv2D('conv1', l, ch_out, 3, strides=strides, activation=BNReLU)
        l = SeparableConv2D('conv2', l, ch_out, 3)
        # The second conv needs a BN before the addition.
        l = BatchNorm('bn2', l)

        shortcut = l_in
        if strides > 1:
            shortcut = AvgPooling('pool', shortcut, 2)
        if ch_in < ch_out:
            pad_paddings = [[0, 0], [0, 0], [0, 0], [0, 0]]
            pad_width = (ch_out - ch_in)
            pad_paddings[ch_dim] = [0, pad_width]
            shortcut = tf.pad(shortcut, pad_paddings)
        elif ch_in > ch_out:
            if data_format == 'NHWC':
                shortcut1 = shortcut[:, :, :, :ch_out]
                shortcut2 = shortcut[:, :, :, ch_out:]
            else:
                shortcut1 = shortcut[:, :ch_out, :, :]
                shortcut2 = shortcut[:, ch_out:, :, :]
            shortcut2 = Conv2D('conv_short', shortcut2, ch_out, 1, strides=strides)
            shortcut2 = BatchNorm('bn_short', shortcut2)
            shortcut = shortcut1 + shortcut2
        l += shortcut
    return l
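
# Hypothetical usage sketch for residual_layer (names and shapes are illustrative only):
# stack a few residual blocks, the second one strided to downsample and widen.
def _example_residual_stack():
    image = tf.placeholder(tf.float32, [None, 32, 32, 3], name='example_image')
    l = Conv2D('conv0', image, 16, 3, activation=BNReLU)
    l = residual_layer('res1', l, 16, strides=1, data_format='channels_last')
    l = residual_layer('res2', l, 32, strides=2, data_format='channels_last')  # /2 spatially, 16 -> 32 channels
    return l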
def InstanceNorm5d(x, epsilon=1e-5, use_affine=True, gamma_init=None, data_format='channels_last'):
    """
    Instance Normalization, as in the paper:
    `Instance Normalization: The Missing Ingredient for Fast Stylization
    <https://arxiv.org/abs/1607.08022>`_.

    Args:
        x (tf.Tensor): a 4D or 5D tensor.
        epsilon (float): avoid divide-by-zero
        use_affine (bool): whether to apply learnable affine transformation
    """
    data_format = get_data_format(data_format, keras_mode=False)
    shape = x.get_shape().as_list()
    # assert len(shape) == 4, "Input of InstanceNorm has to be 4D!"
    if len(shape) == 5:
        if data_format == 'NHWC':
            axis = [1, 2, 3]
            ch = shape[4]
            new_shape = [1, 1, 1, 1, ch]
        else:
            axis = [2, 3, 4]
            ch = shape[1]
            new_shape = [1, ch, 1, 1, 1]
    else:
        if data_format == 'NHWC':
            axis = [1, 2]
            ch = shape[3]
            new_shape = [1, 1, 1, ch]
        else:
            axis = [2, 3]
            ch = shape[1]
            new_shape = [1, ch, 1, 1]
    assert ch is not None, "Input of InstanceNorm requires a known channel dimension!"

    mean, var = tf.nn.moments(x, axis, keep_dims=True)

    if not use_affine:
        return tf.divide(x - mean, tf.sqrt(var + epsilon), name='output')

    beta = tf.get_variable('beta', [ch], initializer=tf.constant_initializer())
    beta = tf.reshape(beta, new_shape)
    if gamma_init is None:
        gamma_init = tf.constant_initializer(1.0)
    gamma = tf.get_variable('gamma', [ch], initializer=gamma_init)
    gamma = tf.reshape(gamma, new_shape)
    ret = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon, name='output')

    vh = ret.variables = VariableHolder()
    if use_affine:
        vh.gamma = gamma
        vh.beta = beta
    return ret
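
# A small sketch of what InstanceNorm5d computes, assuming a 5D NDHWC volume with a
# known channel dimension: every (sample, channel) pair is normalized over its own
# D*H*W voxels, unlike BatchNorm, which also averages over the batch. The scope name
# below is hypothetical; in the repo the function may also be wrapped as a tensorpack layer.
def _example_instance_norm_5d():
    volume = tf.placeholder(tf.float32, [None, 16, 64, 64, 8], name='example_volume')
    with tf.variable_scope('in_norm_example'):
        return InstanceNorm5d(volume, use_affine=True, data_format='channels_last')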
def map_common_tfargs(kwargs):
    df = kwargs.pop('data_format', None)
    if df is not None:
        df = get_data_format(df, tfmode=True)
        kwargs['data_format'] = df

    old_nl = kwargs.pop('nl', None)
    if old_nl is not None:
        kwargs['activation'] = lambda x, name=None: old_nl(x, name=name)

    if 'W_init' in kwargs:
        kwargs['kernel_initializer'] = kwargs.pop('W_init')
    if 'b_init' in kwargs:
        kwargs['bias_initializer'] = kwargs.pop('b_init')
    return kwargs
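
# Illustration of the renaming performed by map_common_tfargs: legacy tensorpack argument
# names ('nl', 'W_init', 'b_init') are translated into the tf.layers-style names expected
# by the wrappers in this module. The concrete values are placeholders.
def _example_map_common_tfargs():
    legacy = dict(nl=tf.nn.relu,
                  W_init=tf.zeros_initializer(),
                  b_init=tf.zeros_initializer())
    mapped = map_common_tfargs(legacy)
    # mapped now contains 'activation', 'kernel_initializer' and 'bias_initializer'
    return mapped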
def InstanceNorm(x, epsilon=1e-5, use_affine=True, gamma_init=None, data_format='channels_last'):
    data_format = get_data_format(data_format, tfmode=False)
    shape = x.get_shape().as_list()
    if len(shape) == 5:
        if data_format == 'NHWC':
            axis = [1, 2, 3]
            ch = shape[4]
            new_shape = [1, 1, 1, 1, ch]
        else:
            axis = [2, 3, 4]
            ch = shape[1]
            new_shape = [1, ch, 1, 1, 1]
    else:
        if data_format == 'NHWC':
            axis = [1, 2]
            ch = shape[3]
            new_shape = [1, 1, 1, ch]
        else:
            axis = [2, 3]
            ch = shape[1]
            new_shape = [1, ch, 1, 1]
    assert ch is not None, "Input of InstanceNorm requires a known channel dimension!"

    mean, var = tf.nn.moments(x, axis, keep_dims=True)

    if not use_affine:
        return tf.divide(x - mean, tf.sqrt(var + epsilon), name='output')

    beta = tf.get_variable('beta', [ch], initializer=tf.constant_initializer())
    beta = tf.reshape(beta, new_shape)
    if gamma_init is None:
        gamma_init = tf.constant_initializer(1.0)
    gamma = tf.get_variable('gamma', [ch], initializer=gamma_init)
    gamma = tf.reshape(gamma, new_shape)
    ret = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon, name='output')

    vh = ret.variables = VariableHolder()
    if use_affine:
        vh.gamma = gamma
        vh.beta = beta
    return ret
def residual_bottleneck_layer(name, l, out_filters, strides, data_format):
    data_format = get_data_format(data_format, keras_mode=False)
    ch_dim = 3 if data_format == 'NHWC' else 1
    ch_in = _get_dim(l, ch_dim)
    ch_base = out_filters
    ch_last = ch_base * 4
    l_in = l
    with tf.variable_scope('{}.0'.format(name)):
        l = BatchNorm('bn0', l)
        l = tf.nn.relu(l)
        l = (LinearWrap(l)
             .Conv2D('conv1x1_0', ch_base, 1, activation=BNReLU)
             .Conv2D('conv3x3_1', ch_base, 3, strides=strides, activation=BNReLU)
             .Conv2D('conv1x1_2', ch_last, 1)())
        l = BatchNorm('bn_3', l)

        shortcut = l_in
        if ch_in != ch_last:
            shortcut = Conv2D('conv_short', shortcut, ch_last, 1, strides=strides)
            shortcut = BatchNorm('bn_short', shortcut)
        l = l + shortcut
    return l
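
# Usage sketch for residual_bottleneck_layer (hypothetical names): each block outputs
# 4 * out_filters channels, so the 1x1 shortcut conv is taken whenever the input width
# differs from that.
def _example_bottleneck_stack():
    l = tf.placeholder(tf.float32, [None, 56, 56, 64], name='example_feature')
    l = residual_bottleneck_layer('block0', l, out_filters=64, strides=1,
                                  data_format='channels_last')   # -> 256 channels
    l = residual_bottleneck_layer('block1', l, out_filters=128, strides=2,
                                  data_format='channels_last')   # -> 512 channels, /2 spatially
    return l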
def Conv3D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=tf.contrib.layers.variance_scaling_initializer(2.0),
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1):
    """
    A wrapper around `tf.layers.Conv3D`.
    Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if split == 1:
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv3D(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                dilation_rate=dilation_rate,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias
    else:
        # group conv implementation, adapted from the 2D group conv path;
        # only the default dilation rate is supported here
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = 4 if data_format == 'NHWC' else 1   # 5D input: NDHWC or NCDHW
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv3D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv now!"

        out_channel = filters
        assert out_channel % split == 0
        assert tuple(dilation_rate) == (1, 1, 1), "Dilated group conv is not supported by this wrapper."

        kernel_shape = [kernel_size] * 3 if isinstance(kernel_size, int) else list(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        if isinstance(strides, int):
            strides = (strides, strides, strides)
        if data_format == 'NHWC':
            stride = [1] + list(strides) + [1]
            conv_format = 'NDHWC'
        else:
            stride = [1, 1] + list(strides)
            conv_format = 'NCDHW'

        W = tf.get_variable('W', filter_shape, initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

        inputs = tf.split(inputs, split, channel_axis)
        # split the kernel along its output-channel axis (axis 4 of a 5D kernel)
        kernels = tf.split(W, split, 4)
        outputs = [tf.nn.conv3d(i, k, stride, padding.upper(), data_format=conv_format)
                   for i, k in zip(inputs, kernels)]
        conv = tf.concat(outputs, channel_axis)
        if activation is None:
            activation = tf.identity
        ret = activation(tf.nn.bias_add(conv, b, data_format=data_format)
                         if use_bias else conv, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret
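
# Minimal sketch of calling the Conv3D wrapper above on a 5D NDHWC volume, once as a
# plain conv and once as a group conv (split=4). Shapes and scope names are examples only.
def _example_conv3d():
    # volume: [batch, depth, height, width, channels]
    volume = tf.placeholder(tf.float32, [None, 16, 64, 64, 32], name='example_volume')
    with tf.variable_scope('c3d_plain'):
        a = Conv3D(volume, filters=64, kernel_size=3)
    with tf.variable_scope('c3d_group'):
        b = Conv3D(volume, filters=64, kernel_size=3, split=4)  # 4 groups of 8 input channels
    return a, b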
def BatchNorm3d(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
                center=True, scale=True,
                beta_initializer=tf.zeros_initializer(),
                gamma_initializer=tf.ones_initializer(),
                virtual_batch_size=None,
                data_format='channels_last',
                internal_update=False,
                sync_statistics=None):
    """
    Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful)
    in the following:

    1. Accepts an alternative `data_format` option when `axis` is None. For 2D input,
       this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`,
       but can be overwritten.
    4. Support the `internal_update` option, which enables the use of BatchNorm layer
       inside conditionals.
    5. Support the `sync_statistics` option, which is very useful in small-batch models.

    Args:
        internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`.
            If True, update EMA inside the layer by control dependencies.
            They are very similar in speed, but `internal_update=True` can be used
            when you have conditionals in your model, or when you have multiple networks to train.
            Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics: either None or "nccl". By default (None), it uses statistics of the
            input tensor to normalize. When set to "nccl", this layer must be used under
            tensorpack multi-gpu trainers, and it then uses per-machine (multiple GPU)
            statistics to normalize.

            Note that this implementation averages the per-tower E[x] and E[x^2] among towers
            to compute global mean & variance. The result is the global mean & variance only
            if each tower has the same batch size.

            This option has no effect when not training.

            This option is also known as "Cross-GPU BatchNorm" as mentioned in
            https://arxiv.org/abs/1711.07240.
            Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        Combinations of ``training`` and ``ctx.is_training``:

        * ``training == ctx.is_training``: standard BN, EMA are maintained during training
          and used during inference. This is the default.
        * ``training and not ctx.is_training``: still use batch statistics in inference.
        * ``not training and ctx.is_training``: use EMA to normalize in training.
          This is useful when you load a pre-trained BN and don't want to fine tune the EMA.
          EMA will not be updated in this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    # in 3d conv, we have 5d dim [batch, c, d, h, w]
    # assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        elif ndims == 5:
            axis = 1 if data_format == 'NCHW' else 4
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_number()
    if not training and ctx.is_training:
        assert TF_version >= 1.4, \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable({
                'moving_mean': 'mean/EMA',
                'moving_variance': 'variance/EMA'}):
            tf_args = dict(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                fused=True,
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= 1.5:
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

        # maintaining the EMA on only one GPU is OK, even in replicated mode,
        # because the EMA isn't used during training
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2])
        if ndims == 5:
            red_axis = [0, 2, 3, 4] if axis == 1 else [0, 1, 2, 3]
        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]
        if ndims == 5 and axis == 1:
            new_shape = [1, num_chan, 1, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= 1.8 and ctx.is_main_training_tower:
                logger.warn(
                    "A TensorFlow bug will cause cross-GPU BatchNorm to fail. "
                    "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360")

            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
            num_dev = ctx.total
            batch_mean = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
            batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean_square,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            # Proof-of-concept, not ready yet.
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
        else:
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var, beta, gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(
                xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var,
                momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
def Conv(inputs,
         filters,
         kernel_size,
         strides=(1, 1),
         padding='same',
         data_format='channels_last',
         dilation_rate=(1, 1),
         activation=None,
         use_bias=True,
         kernel_initializer=None,
         bias_initializer=tf.zeros_initializer(),
         kernel_regularizer=None,
         bias_regularizer=None,
         activity_regularizer=None,
         split=1,
         norm=False):
    """
    Similar to `tf.layers.Conv2D`, but with some differences:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group convolution.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)  # deprecated
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(
                2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    if True:  # group conv implementation
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = 3 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv or dilated conv!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), \
            'TF>=1.5 required for dilated conv.'

        kernel_shape = shape2d(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = shape4d(strides, data_format=data_format)

        kwargs = {"data_format": data_format}
        if get_tf_version_tuple() >= (1, 5):
            kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

        # match the input dtype (e.g. tf.float16), since the default dtype of a variable is tf.float32
        inputs_dtype = inputs.dtype
        W = tf.get_variable('parseweigth', filter_shape, dtype=inputs_dtype,
                            initializer=kernel_initializer)

        if norm:
            use_bias = False
            # softmax-normalize the kernel over groups of 4 input channels
            W = tf.reshape(W, kernel_shape + [4, in_channel // 4, out_channel])
            W = tf.nn.softmax(W, 2)
            W = tf.reshape(W, filter_shape)
            # dynamics = tf.reduce_mean(inputs, 0)
            # dynamics = tf.transpose(dynamics, [1, 2, 0])
            # dynamics = tf.image.resize_images(dynamics, kernel_shape)
            # dynamics = tf.expand_dims(dynamics, -1)
            # W = W + 0.001 * dynamics
            # tf.random_normal(shape=tf.shape(W), mean=0.0, stddev=0.012, dtype=tf.float32)
            # W = W * tf.random_uniform(shape=W.get_shape().as_list(), minval=0., maxval=2.)

        if use_bias:
            b = tf.get_variable('parsebias', [out_channel], dtype=inputs_dtype,
                                initializer=bias_initializer)

        if split == 1:
            conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
        else:
            conv = None
            try:
                conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
            except ValueError:
                log_once(
                    "CUDNN group convolution support is only available with "
                    "https://github.com/tensorflow/tensorflow/pull/25818 . "
                    "Will fall back to a loop-based slow implementation instead!", 'warn')
            if conv is None:
                # loop-based fallback: one conv2d per group, concatenated along the channel axis
                inputs = tf.split(inputs, split, channel_axis)
                kernels = tf.split(W, split, 3)
                outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                           for i, k in zip(inputs, kernels)]
                conv = tf.concat(outputs, channel_axis)

        ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
        if activation is not None:
            ret = activation(ret)
        ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret
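
# Usage sketch for the Conv wrapper above (hypothetical scope names and shapes). With
# norm=True the kernel is softmax-normalized over groups of 4 along its input-channel
# dimension and the bias is dropped, so the input channel count must be divisible by 4.
def _example_conv():
    feature = tf.placeholder(tf.float32, [None, 32, 32, 16], name='example_feature')
    with tf.variable_scope('conv_plain'):
        a = Conv(feature, filters=64, kernel_size=3, activation=tf.nn.relu)
    with tf.variable_scope('conv_norm'):
        b = Conv(feature, filters=64, kernel_size=3, norm=True)
    return a, b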
def mpusim_depthwise_convolution2d(inputs,
                                   kernel_size,
                                   strides=(1, 1),
                                   padding='valid',
                                   depth_multiplier=1,
                                   data_format='channels_last',
                                   activation=None,
                                   use_bias=True,
                                   depthwise_initializer='glorot_uniform',
                                   bias_initializer='zeros',
                                   depthwise_regularizer=None,
                                   bias_regularizer=None,
                                   depthwise_constraint=None,
                                   bias_constraint=None,
                                   activations_datatype_size_byte=1,
                                   weights_datatype_size_byte=1,
                                   results_datatype_size_byte=4,
                                   systolic_array_height=256,
                                   systolic_array_width=256,
                                   activation_fifo_depth=8,
                                   accumulator_array_height=4096,
                                   log_file_output_dir='.',
                                   model_name='unnamed'):

    # depthwise_initializer = initializers.get(depthwise_initializer)
    # depthwise_regularizer = regularizers.get(depthwise_regularizer)
    # depthwise_constraint = constraints.get(depthwise_constraint)
    # bias_initializer = initializers.get(bias_initializer)

    data_format = get_data_format(data_format, keras_mode=False)
    input_shape = inputs.get_shape().as_list()
    strides = shape4d(strides, data_format=data_format)

    if len(input_shape) < 4:
        raise ValueError('Inputs to `mpusim_depthwise_conv2d` should have rank 4. '
                         'Received input shape:', str(input_shape))

    if data_format == 'NCHW':
        raise ValueError('mpusim_depthwise_convolution2d '
                         'only supports NHWC data format')
    else:
        channel_axis = 3

    if input_shape[channel_axis] is None:
        raise ValueError('The channel dimension of the inputs to '
                         '`mpusim_depthwise_convolution2d` '
                         'should be defined. Found `None`.')

    input_dim = int(input_shape[channel_axis])
    depthwise_kernel_shape = (kernel_size[0], kernel_size[1], input_dim, depth_multiplier)

    depthwise_kernel = tf.get_variable('W',
                                       shape=depthwise_kernel_shape,
                                       initializer=depthwise_initializer,
                                       regularizer=depthwise_regularizer,
                                       constraint=depthwise_constraint)

    if use_bias:
        biases = tf.get_variable('b',
                                 shape=(input_dim * depth_multiplier,),
                                 initializer=bias_initializer,
                                 regularizer=bias_regularizer,
                                 constraint=bias_constraint)

    result = mpusim_depthwise_conv2d(inputs,
                                     depthwise_kernel,
                                     strides=strides,
                                     padding=padding,
                                     data_format=data_format,
                                     activations_datatype_size_byte=activations_datatype_size_byte,
                                     weights_datatype_size_byte=weights_datatype_size_byte,
                                     results_datatype_size_byte=results_datatype_size_byte,
                                     systolic_array_height=systolic_array_height,
                                     systolic_array_width=systolic_array_width,
                                     activation_fifo_depth=activation_fifo_depth,
                                     accumulator_array_height=accumulator_array_height,
                                     log_file_output_dir=log_file_output_dir,
                                     model_name=model_name)

    if use_bias:
        result = tf.nn.bias_add(result, biases, data_format=data_format)

    if activation is not None:
        result = activation(result)

    result = tf.identity(result, name='output')
    result.variables = VariableHolder(W=depthwise_kernel)
    if use_bias:
        result.variables.b = biases
    return result
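
# Hypothetical usage sketch for mpusim_depthwise_convolution2d: a 3x3 depthwise conv whose
# execution is traced by the MPU simulator with a 256x256 systolic array. All simulator
# parameters below are illustrative. Initializer objects (not the default strings) are
# passed explicitly because the string-to-initializer conversion above is commented out
# and tf.get_variable expects an initializer object.
def _example_mpusim_depthwise():
    feature = tf.placeholder(tf.float32, [None, 32, 32, 16], name='example_feature')
    with tf.variable_scope('dw_conv0'):
        return mpusim_depthwise_convolution2d(
            feature,
            kernel_size=(3, 3),
            strides=(1, 1),
            depthwise_initializer=tf.variance_scaling_initializer(2.0),
            bias_initializer=tf.zeros_initializer(),
            systolic_array_height=256,
            systolic_array_width=256,
            log_file_output_dir='./mpusim_logs',
            model_name='example_model')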
def MaskedConv2D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1,
        masking=False):
    """
    A wrapper around `tf.layers.Conv2D`.
    Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(
                2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    if (not masking) and (split == 1) and (dilation_rate == [1, 1]):
        # tf.layers.Conv2D has bugs with dilations (https://github.com/tensorflow/tensorflow/issues/26797)
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv2D(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                dilation_rate=dilation_rate,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer,
                _reuse=tf.get_variable_scope().reuse)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias
    else:
        if masking:
            assert split == 1, "Pruning group conv is not supported yet"

        # group conv implementation
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = 3 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv or dilated conv!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

        kernel_shape = shape2d(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = shape4d(strides, data_format=data_format)

        kwargs = dict(data_format=data_format)
        if get_tf_version_tuple() >= (1, 5):
            kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

        W = tf.get_variable('W', filter_shape, initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

        if split == 1:
            if masking:
                W = pruning.apply_mask(W)
            conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
        else:
            conv = None
            if get_tf_version_tuple() >= (1, 13):
                try:
                    conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
                except ValueError:
                    log_once("CUDNN group convolution support is only available with "
                             "https://github.com/tensorflow/tensorflow/pull/25818 . "
                             "Will fall back to a loop-based slow implementation instead!", 'warn')
            if conv is None:
                inputs = tf.split(inputs, split, channel_axis)
                kernels = tf.split(W, split, 3)
                outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                           for i, k in zip(inputs, kernels)]
                conv = tf.concat(outputs, channel_axis)

        ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
        if activation is not None:
            ret = activation(ret)
        ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret
def mpusim_conv2d(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1,
        activations_datatype_size_byte=1,
        weights_datatype_size_byte=1,
        results_datatype_size_byte=4,
        systolic_array_height=256,
        systolic_array_width=256,
        activation_fifo_depth=8,
        accumulator_array_height=4096,
        log_file_output_dir='.',
        model_name='unnamed'):
    """
    Similar to `tf.layers.Conv2D`, but with some differences:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group convolution.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(
                2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    # group conv implementation
    data_format = get_data_format(data_format, keras_mode=False)
    in_shape = inputs.get_shape().as_list()
    channel_axis = 3 if data_format == 'NHWC' else 1
    in_channel = in_shape[channel_axis]
    assert in_channel is not None, "[mpusim_conv2d] Input cannot have unknown channel!"
    assert in_channel % split == 0

    assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
        "Not supported by group conv or dilated conv!"

    out_channel = filters
    assert out_channel % split == 0
    assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

    kernel_shape = shape2d(kernel_size)
    filter_shape = kernel_shape + [in_channel // split, out_channel]
    stride = shape4d(strides, data_format=data_format)

    kwargs = dict(data_format=data_format)
    if get_tf_version_tuple() >= (1, 5):
        kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

    W = tf.get_variable('W', filter_shape, initializer=kernel_initializer)

    if use_bias:
        b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

    if split == 1:
        conv = mpu_sim_conv2d_lib.mpu_sim_conv2d(inputs,
                                                 W,
                                                 activations_datatype_size_byte,
                                                 weights_datatype_size_byte,
                                                 results_datatype_size_byte,
                                                 systolic_array_height,
                                                 systolic_array_width,
                                                 activation_fifo_depth,
                                                 accumulator_array_height,
                                                 log_file_output_dir,
                                                 model_name,
                                                 stride,
                                                 padding.upper(),
                                                 **kwargs)
    else:
        inputs = tf.split(inputs, split, channel_axis)
        kernels = tf.split(W, split, 3)
        outputs = [mpu_sim_conv2d_lib.mpu_sim_conv2d(input_block,
                                                     kernel_block,
                                                     activations_datatype_size_byte,
                                                     weights_datatype_size_byte,
                                                     results_datatype_size_byte,
                                                     systolic_array_height,
                                                     systolic_array_width,
                                                     activation_fifo_depth,
                                                     accumulator_array_height,
                                                     log_file_output_dir,
                                                     model_name,
                                                     stride,
                                                     padding.upper(),
                                                     **kwargs)
                   for input_block, kernel_block in zip(inputs, kernels)]
        conv = tf.concat(outputs, channel_axis)

    ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
    if activation is not None:
        ret = activation(ret)
    ret = tf.identity(ret, name='output')

    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    return ret
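
# Usage sketch for mpusim_conv2d (illustrative parameters and names): a standard 3x3
# convolution plus a grouped variant, both routed through mpu_sim_conv2d so the simulator
# writes one trace per layer under log_file_output_dir.
def _example_mpusim_conv2d():
    feature = tf.placeholder(tf.float32, [None, 32, 32, 16], name='example_feature')
    with tf.variable_scope('sim_conv0'):
        a = mpusim_conv2d(feature, filters=64, kernel_size=3,
                          log_file_output_dir='./mpusim_logs', model_name='example_model')
    with tf.variable_scope('sim_conv1'):
        b = mpusim_conv2d(a, filters=64, kernel_size=3, split=2,
                          log_file_output_dir='./mpusim_logs', model_name='example_model')
    return a, b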
def _data_format_to_ch_dim(data_format):
    data_format = get_data_format(data_format, keras_mode=False)
    ch_dim = -1 if data_format == 'NHWC' else 1
    return ch_dim
def BatchNorm3d(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
                center=True, scale=True,
                beta_initializer=tf.zeros_initializer(),
                gamma_initializer=tf.ones_initializer(),
                virtual_batch_size=None,
                data_format='channels_last',
                internal_update=False,
                sync_statistics=None):
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        elif ndims == 5:
            axis = 1 if data_format == 'NCHW' else 4
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_tuple()
    if not training and ctx.is_training:
        assert TF_version >= (1, 4)
        if ctx.is_main_training_tower:
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable(
                {'moving_mean': 'mean/EMA',
                 'moving_variance': 'variance/EMA'}):
            tf_args = dict(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                fused=True,
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

        # maintain EMA only on one GPU
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2])
        if ndims == 5:
            red_axis = [0, 2, 3, 4] if axis == 1 else [0, 1, 2, 3]
        new_shape = None
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]
        if ndims == 5 and axis == 1:
            new_shape = [1, num_chan, 1, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= (1, 8) and ctx.is_main_training_tower:
                logger.warn("A TensorFlow bug may cause cross-GPU BatchNorm to fail.")
            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
            num_dev = ctx.total
            batch_mean = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
            batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean_square,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
        else:
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var, beta, gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(
                xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var,
                momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,
            moving_variance=moving_var,
            variance=moving_var)
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret