def _factorized_reduction(scope_name, x, out_filters, data_format):
    """Reduces the shape of x without information loss due to striding.
    Copied from https://github.com/melodyguan/enas/blob/master/src/cifar10/general_child.py
    """
    assert out_filters % 2 == 0, (
        "Need even number of filters when using this factorized reduction.")

    #with tf.variable_scope(scope_name):
    #    layer = Conv2D('conv3x3_path', x, out_filters, 3, strides=2, activation=BNReLU)
    #return layer

    with tf.variable_scope(scope_name):
        path1 = AvgPooling('path1', x, pool_size=1, strides=2, padding='valid')
        path1 = Conv2D('path1_conv', path1, out_filters // 2, 1, padding='same')

        # Skip path 2
        # First pad with 0's on the right and bottom, then shift the filter to
        # include those 0's that were added.
        data_format = get_data_format(data_format, keras_mode=False)

        if data_format == "NHWC":
            pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]]
            path2 = tf.pad(x, pad_arr)[:, 1:, 1:, :]
            ch_dim = 3
        else:
            pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]]
            path2 = tf.pad(x, pad_arr)[:, :, 1:, 1:]
            ch_dim = 1

        path2 = AvgPooling('path2', path2, pool_size=1, strides=2, padding='valid')
        path2 = Conv2D('path2_conv', path2, out_filters // 2, 1, padding='same')

        final_path = tf.concat(values=[path1, path2], axis=ch_dim)
        final_path = BatchNorm('bn', final_path)
        return final_path
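
# The two stride-2 paths above sample complementary sub-grids: path1 keeps the
# (even, even) positions, while the pad-and-shift in path2 moves the (odd, odd)
# positions onto the even grid before the same stride-2 pooling. A toy NumPy
# sketch of that indexing (illustrative only, not the tensorpack ops):
def _factorized_reduction_indexing_sketch():
    import numpy as np
    x = np.arange(16).reshape(4, 4)          # toy 4x4 feature map
    path1 = x[::2, ::2]                      # stride 2: (even, even) positions
    x_pad = np.pad(x, ((0, 1), (0, 1)))      # pad one zero row/col at bottom/right
    path2 = x_pad[1:, 1:][::2, ::2]          # shift by one, stride 2: (odd, odd)
    # the two paths see disjoint pixels, so their concatenation mixes two
    # offset sub-grids instead of only the even one
    assert set(path1.ravel()).isdisjoint(path2.ravel())
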
def residual_layer(name, l, out_filters, strides, data_format):
    ch_out = out_filters
    data_format = get_data_format(data_format, keras_mode=False)
    ch_dim = 3 if data_format == 'NHWC' else 1
    ch_in = _get_dim(l, ch_dim)

    l_in = l
    with tf.variable_scope('{}.0'.format(name)):
        l = BNReLU(l)
        l = SeparableConv2D('conv1', l, ch_out, 3, strides=strides, activation=BNReLU)
        l = SeparableConv2D('conv2', l, ch_out, 3)
        # The second conv needs a BN before the addition.
        l = BatchNorm('bn2', l)

        shortcut = l_in
        if strides > 1:
            shortcut = AvgPooling('pool', shortcut, 2)
        if ch_in < ch_out:
            pad_paddings = [[0, 0], [0, 0], [0, 0], [0, 0]]
            pad_width = (ch_out - ch_in)
            pad_paddings[ch_dim] = [0, pad_width]
            shortcut = tf.pad(shortcut, pad_paddings)
        elif ch_in > ch_out:
            if data_format == 'NHWC':
                shortcut1 = shortcut[:, :, :, :ch_out]
                shortcut2 = shortcut[:, :, :, ch_out:]
            else:
                shortcut1 = shortcut[:, :ch_out, :, :]
                shortcut2 = shortcut[:, ch_out:, :, :]
            # the pooling above already handled the striding, so use stride 1 here
            shortcut2 = Conv2D('conv_short', shortcut2, ch_out, 1)
            shortcut2 = BatchNorm('bn_short', shortcut2)
            shortcut = shortcut1 + shortcut2
        l += shortcut
    return l
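
# When the residual branch widens the channels (ch_in < ch_out), the shortcut
# above is matched by zero-padding rather than a 1x1 projection. A small NumPy
# sketch of that padding step, assuming NHWC layout:
def _zero_pad_shortcut_sketch():
    import numpy as np
    ch_in, ch_out = 16, 32
    shortcut = np.random.randn(1, 8, 8, ch_in).astype('float32')  # toy NHWC tensor
    paddings = [(0, 0), (0, 0), (0, 0), (0, ch_out - ch_in)]      # pad channel axis
    shortcut = np.pad(shortcut, paddings)
    # the extra channels are zeros, so the residual addition acts as an
    # identity on the original ch_in channels
    assert shortcut.shape == (1, 8, 8, ch_out)
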
def InstanceNorm5d(x, epsilon=1e-5, use_affine=True, gamma_init=None, data_format='channels_last'):
    """
    Instance Normalization, as in the paper:
    `Instance Normalization: The Missing Ingredient for Fast Stylization
    <https://arxiv.org/abs/1607.08022>`_.
    Args:
        x (tf.Tensor): a 4D or 5D tensor.
        epsilon (float): avoid divide-by-zero
        use_affine (bool): whether to apply learnable affine transformation
    """
    data_format = get_data_format(data_format, keras_mode=True)
    shape = x.get_shape().as_list()
    # assert len(shape) == 4, "Input of InstanceNorm has to be 4D!"
    if len(shape) == 5:
        if data_format == 'NHWC':
            axis = [1, 2, 3]
            ch = shape[4]
            new_shape = [1, 1, 1, 1, ch]
        else:
            axis = [2, 3, 4]
            ch = shape[1]
            new_shape = [1, ch, 1, 1, 1]
    else:
        if data_format == 'NHWC':
            axis = [1, 2]
            ch = shape[3]
            new_shape = [1, 1, 1, ch]
        else:
            axis = [2, 3]
            ch = shape[1]
            new_shape = [1, ch, 1, 1]
    assert ch is not None, "Input of InstanceNorm requires a known channel!"

    mean, var = tf.nn.moments(x, axis, keep_dims=True)

    if not use_affine:
        return tf.divide(x - mean, tf.sqrt(var + epsilon), name='output')

    beta = tf.get_variable('beta', [ch], initializer=tf.constant_initializer())
    beta = tf.reshape(beta, new_shape)
    if gamma_init is None:
        gamma_init = tf.constant_initializer(1.0)
    gamma = tf.get_variable('gamma', [ch], initializer=gamma_init)
    gamma = tf.reshape(gamma, new_shape)
    ret = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon, name='output')

    vh = ret.variables = VariableHolder()
    if use_affine:
        vh.gamma = gamma
        vh.beta = beta
    return ret
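
# A NumPy reference of the normalization itself (affine part omitted) is handy
# for checking the axis bookkeeping above; this sketch assumes a 5D
# channels-last input:
def _instance_norm_reference_sketch():
    import numpy as np

    def instance_norm_ref(x, epsilon=1e-5):
        # x: (N, D, H, W, C); normalize each (sample, channel) pair over D, H, W
        mean = x.mean(axis=(1, 2, 3), keepdims=True)
        var = x.var(axis=(1, 2, 3), keepdims=True)
        return (x - mean) / np.sqrt(var + epsilon)

    x = np.random.randn(2, 4, 8, 8, 3).astype('float32')
    y = instance_norm_ref(x)
    # per-(sample, channel) statistics are now ~0 mean and ~1 std
    print(y.mean(axis=(1, 2, 3)).round(6), y.std(axis=(1, 2, 3)).round(3))
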
def map_common_tfargs(kwargs):
    df = kwargs.pop('data_format', None)
    if df is not None:
        df = get_data_format(df, tfmode=True)
        kwargs['data_format'] = df

    old_nl = kwargs.pop('nl', None)
    if old_nl is not None:
        kwargs['activation'] = lambda x, name=None: old_nl(x, name=name)

    if 'W_init' in kwargs:
        kwargs['kernel_initializer'] = kwargs.pop('W_init')

    if 'b_init' in kwargs:
        kwargs['bias_initializer'] = kwargs.pop('b_init')
    return kwargs
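
# For illustration, a sketch of how the mapping behaves on legacy
# tensorpack-style kwargs; the data_format branch is skipped here so the
# snippet does not touch get_data_format:
def _map_common_tfargs_usage_sketch():
    legacy = {'W_init': 'he_init', 'b_init': 'zeros', 'nl': lambda x, name=None: x}
    mapped = map_common_tfargs(dict(legacy))
    assert mapped['kernel_initializer'] == 'he_init'
    assert mapped['bias_initializer'] == 'zeros'
    assert 'activation' in mapped and 'nl' not in mapped
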
def InstanceNorm(x, epsilon=1e-5, use_affine=True, gamma_init=None, data_format='channels_last'):
    data_format = get_data_format(data_format, tfmode=False)
    shape = x.get_shape().as_list()
    if len(shape) == 5:
        if data_format == 'NHWC':
            axis = [1, 2, 3]
            ch = shape[4]
            new_shape = [1, 1, 1, 1, ch]
        else:
            axis = [2, 3, 4]
            ch = shape[1]
            new_shape = [1, ch, 1, 1, 1]
    else:
        if data_format == 'NHWC':
            axis = [1, 2]
            ch = shape[3]
            new_shape = [1, 1, 1, ch]
        else:
            axis = [2, 3]
            ch = shape[1]
            new_shape = [1, ch, 1, 1]
    assert ch is not None, "Input of InstanceNorm requires a known channel!"

    mean, var = tf.nn.moments(x, axis, keep_dims=True)

    if not use_affine:
        return tf.divide(x - mean, tf.sqrt(var + epsilon), name='output')

    beta = tf.get_variable('beta', [ch], initializer=tf.constant_initializer())
    beta = tf.reshape(beta, new_shape)
    if gamma_init is None:
        gamma_init = tf.constant_initializer(1.0)
    gamma = tf.get_variable('gamma', [ch], initializer=gamma_init)
    gamma = tf.reshape(gamma, new_shape)
    ret = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon, name='output')

    vh = ret.variables = VariableHolder()
    if use_affine:
        vh.gamma = gamma
        vh.beta = beta
    return ret
def residual_bottleneck_layer(name, l, out_filters, strides, data_format):
    data_format = get_data_format(data_format, keras_mode=False)
    ch_dim = 3 if data_format == 'NHWC' else 1
    ch_in = _get_dim(l, ch_dim)

    ch_base = out_filters
    ch_last = ch_base * 4
    l_in = l
    with tf.variable_scope('{}.0'.format(name)):
        l = BatchNorm('bn0', l)
        l = tf.nn.relu(l)
        l = (LinearWrap(l)
             .Conv2D('conv1x1_0', ch_base, 1, activation=BNReLU)
             .Conv2D('conv3x3_1', ch_base, 3, strides=strides, activation=BNReLU)
             .Conv2D('conv1x1_2', ch_last, 1)())
        l = BatchNorm('bn_3', l)

        shortcut = l_in
        if ch_in != ch_last:
            shortcut = Conv2D('conv_short', shortcut, ch_last, 1, strides=strides)
            shortcut = BatchNorm('bn_short', shortcut)
        l = l + shortcut
    return l
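
# The shortcut above only inserts a 1x1 projection when channel counts differ;
# since the block outputs ch_last = 4 * out_filters, that happens exactly at
# the first block of a stage. A toy check with assumed channel counts:
def _bottleneck_shortcut_sketch():
    out_filters = 64
    ch_last = out_filters * 4                  # bottleneck output width, 256 here
    for ch_in in (64, 256):                    # first block vs. later blocks
        print(ch_in, '->', ch_last, 'needs projection shortcut:', ch_in != ch_last)
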
def Conv3D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=tf.contrib.layers.variance_scaling_initializer(2.0),
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1):
    """
    A wrapper around `tf.layers.Conv3D`.
    Some differences to maintain backward-compatibility:
    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv.
    Variable Names:
    * ``W``: weights
    * ``b``: bias
    """
    if split == 1:
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv3D(filters,
                                     kernel_size,
                                     strides=strides,
                                     padding=padding,
                                     data_format=data_format,
                                     dilation_rate=dilation_rate,
                                     activation=activation,
                                     use_bias=use_bias,
                                     kernel_initializer=kernel_initializer,
                                     bias_initializer=bias_initializer,
                                     kernel_regularizer=kernel_regularizer,
                                     bias_regularizer=bias_regularizer,
                                     activity_regularizer=activity_regularizer)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias

    else:
        # group conv implementation
        data_format = get_data_format(data_format, tfmode=False)
        in_shape = inputs.get_shape().as_list()
        # inputs are 5D (NDHWC or NCDHW), so the channel axis is 4 or 1
        channel_axis = 4 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv3D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv now!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == (1, 1, 1) or get_tf_version_number() >= 1.5, \
            'TF>=1.5 required for group dilated conv'

        kernel_shape = [kernel_size] * 3 if isinstance(kernel_size, int) else list(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        strides = [strides] * 3 if isinstance(strides, int) else list(strides)
        stride = ([1] + strides + [1]) if data_format == 'NHWC' else ([1, 1] + strides)

        # tf.nn.conv3d expects 'NDHWC'/'NCDHW' rather than the 2D-style strings
        kwargs = dict(data_format='NDHWC' if data_format == 'NHWC' else 'NCDHW')
        if get_tf_version_number() >= 1.5:
            dilations = list(dilation_rate)
            kwargs['dilations'] = (([1] + dilations + [1]) if data_format == 'NHWC'
                                   else ([1, 1] + dilations))

        W = tf.get_variable('W', filter_shape, initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel],
                                initializer=bias_initializer)

        # split the input channels and the output filters into `split` groups
        inputs = tf.split(inputs, split, channel_axis)
        kernels = tf.split(W, split, 4)
        outputs = [
            tf.nn.conv3d(i, k, stride, padding.upper(), **kwargs)
            for i, k in zip(inputs, kernels)
        ]
        conv = tf.concat(outputs, channel_axis)
        if activation is None:
            activation = tf.identity
        ret = activation(tf.nn.bias_add(conv, b, data_format=data_format)
                         if use_bias else conv,
                         name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret
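
# A quick sketch of the group-conv bookkeeping used in the split > 1 branch,
# with assumed sizes: splitting W on its output axis gives one kernel per
# group, each convolving one slice of the input channels:
def _group_conv3d_shapes_sketch():
    split, in_channel, out_channel = 4, 64, 128
    kernel = [3, 3, 3]
    full_filter = kernel + [in_channel // split, out_channel]         # [3, 3, 3, 16, 128]
    per_group = kernel + [in_channel // split, out_channel // split]  # [3, 3, 3, 16, 32]
    # each of the 4 groups maps 16 input channels to 32 outputs; concatenating
    # the 4 results along the channel axis restores all 128 channels
    print(full_filter, '->', split, 'x', per_group)
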
def BatchNorm3d(inputs,
                axis=None,
                training=None,
                momentum=0.9,
                epsilon=1e-5,
                center=True,
                scale=True,
                beta_initializer=tf.zeros_initializer(),
                gamma_initializer=tf.ones_initializer(),
                virtual_batch_size=None,
                data_format='channels_last',
                internal_update=False,
                sync_statistics=None):
    """
    Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful)
    in the following:
    1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten.
    4. Support the `internal_update` option, which enables the use of BatchNorm layer inside conditionals.
    5. Support the `sync_statistics` option, which is very useful in small-batch models.
    Args:
        internal_update (bool): if False, add EMA update ops to
          `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies.
          They are very similar in speed, but `internal_update=True` can be used
          when you have conditionals in your model, or when you have multiple networks to train.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics: either None or "nccl". By default (None), it uses statistics of the input tensor to normalize.
          When set to "nccl", this layer must be used under tensorpack multi-gpu trainers,
          and it then uses per-machine (multiple GPU) statistics to normalize.
          Note that this implementation averages the per-tower E[x] and E[x^2] among towers to compute
          global mean&variance. The result is the global mean&variance only if each tower has the same batch size.
          This option has no effect when not training.
          This option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222
    Variable Names:
    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.
    Note:
        Combinations of ``training`` and ``ctx.is_training``:
        * ``training == ctx.is_training``: standard BN, EMA are maintained during training
          and used during inference. This is the default.
        * ``training and not ctx.is_training``: still use batch statistics in inference.
        * ``not training and ctx.is_training``: use EMA to normalize in
          training. This is useful when you load a pre-trained BN and
          don't want to fine tune the EMA. EMA will not be updated in
          this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    # in 3d conv, we have 5d dim [batch, c, d, h, w]
    # assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        elif ndims == 5:
            axis = 1 if data_format == 'NCHW' else 4
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_number()
    if not training and ctx.is_training:
        assert TF_version >= 1.4, \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn(
                "[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable({
                'moving_mean': 'mean/EMA',
                'moving_variance': 'variance/EMA'
        }):
            tf_args = dict(axis=axis,
                           momentum=momentum,
                           epsilon=epsilon,
                           center=center,
                           scale=scale,
                           beta_initializer=beta_initializer,
                           gamma_initializer=gamma_initializer,
                           fused=True,
                           _reuse=tf.get_variable_scope().reuse)
            if TF_version >= 1.5:
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs,
                             training=training,
                             scope=tf.get_variable_scope())

        # Maintaining the EMA on only one GPU is OK, even in replicated mode,
        # because the EMA is not used during training.
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else (
            [0, 2, 3] if axis == 1 else [0, 1, 2])
        if ndims == 5:
            red_axis = [0, 2, 3, 4] if axis == 1 else [0, 1, 2, 3]
        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]
        if ndims == 5 and axis == 1:
            new_shape = [1, num_chan, 1, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= 1.8 and ctx.is_main_training_tower:
                logger.warn(
                    "A TensorFlow bug will cause cross-GPU BatchNorm to fail. "
                    "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360"
                )

            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '',
                                 tf.get_variable_scope().name)
            num_dev = ctx.total
            batch_mean = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
            batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean_square,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean_square') * (1.0 /
                                                                  num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            # Proof-of-concept, not ready yet.
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var,
                                           tf.reshape(beta, new_shape),
                                           tf.reshape(gamma, new_shape),
                                           epsilon)
        else:
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta,
                                           gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(xn, batch_mean_vec, batch_var_vec, moving_mean,
                                moving_var, momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
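
# The sync_statistics path reconstructs the global variance from per-tower
# first and second moments: var = E[x^2] - E[x]^2 after averaging the moments
# across towers. A NumPy sketch of why that matches single-device statistics,
# assuming equal per-tower batch sizes:
def _cross_gpu_moments_sketch():
    import numpy as np
    towers = [np.random.randn(32, 8) for _ in range(4)]   # equal batch per tower
    mean = np.mean([t.mean(axis=0) for t in towers], axis=0)
    mean_sq = np.mean([(t ** 2).mean(axis=0) for t in towers], axis=0)
    var = mean_sq - mean ** 2                             # global (biased) variance
    full = np.concatenate(towers, axis=0)
    assert np.allclose(var, full.var(axis=0))
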
def Conv(inputs,
         filters,
         kernel_size,
         strides=(1, 1),
         padding='same',
         data_format='channels_last',
         dilation_rate=(1, 1),
         activation=None,
         use_bias=True,
         kernel_initializer=None,
         bias_initializer=tf.zeros_initializer(),
         kernel_regularizer=None,
         bias_regularizer=None,
         activity_regularizer=None,
         split=1,
         norm=False):
    """
    Similar to `tf.layers.Conv2D`, but with some differences:
    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group convolution.
    Variable Names:
    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(
                2.0)  # deprecated
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(
                2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    # group conv implementation
    data_format = get_data_format(data_format, keras_mode=False)
    in_shape = inputs.get_shape().as_list()
    channel_axis = 3 if data_format == 'NHWC' else 1
    in_channel = in_shape[channel_axis]
    assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
    assert in_channel % split == 0

    assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
        "Not supported by group conv or dilated conv!"

    out_channel = filters
    assert out_channel % split == 0
    assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), \
        'TF>=1.5 required for dilated conv.'

    kernel_shape = shape2d(kernel_size)
    filter_shape = kernel_shape + [in_channel // split, out_channel]
    stride = shape4d(strides, data_format=data_format)

    kwargs = {"data_format": data_format}
    if get_tf_version_tuple() >= (1, 5):
        kwargs['dilations'] = shape4d(dilation_rate,
                                      data_format=data_format)

    # match the input dtype (e.g. tf.float16), since the default variable dtype is tf.float32
    inputs_dtype = inputs.dtype
    W = tf.get_variable('parseweigth',
                        filter_shape,
                        dtype=inputs_dtype,
                        initializer=kernel_initializer)
    if norm:
        # normalize each group of 4 input-channel weights with a softmax so
        # they form a convex combination; requires in_channel % 4 == 0
        use_bias = False
        W = tf.reshape(W, kernel_shape + [4, in_channel // 4, out_channel])
        W = tf.nn.softmax(W, 2)
        W = tf.reshape(W, filter_shape)
    #dynamics = tf.reduce_mean(inputs, 0)
    #dynamics = tf.transpose(dynamics, [1, 2, 0])
    #dynamics = tf.image.resize_images(dynamics, kernel_shape)
    #dynamics = tf.expand_dims(dynamics, -1)
    #W = W + 0.001 * dynamics  # tf.random_normal(shape=tf.shape(W), mean=0.0, stddev=0.012, dtype=tf.float32)

    #W = W * tf.random_uniform(shape=W.get_shape().as_list(), minval=0., maxval=2.)

    if use_bias:
        b = tf.get_variable('parsebias', [out_channel],
                            dtype=inputs_dtype,
                            initializer=bias_initializer)

    if split == 1:
        conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
    else:
        try:
            conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
        except ValueError:
            log_once(
                "CUDNN group convolution support is only available with "
                "https://github.com/tensorflow/tensorflow/pull/25818 . "
                "Will fall back to a loop-based slow implementation instead!",
                'warn')
            # loop-based fallback: convolve each channel group separately
            inputs = tf.split(inputs, split, channel_axis)
            kernels = tf.split(W, split, 3)
            outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                       for i, k in zip(inputs, kernels)]
            conv = tf.concat(outputs, channel_axis)

    ret = tf.nn.bias_add(conv, b,
                         data_format=data_format) if use_bias else conv
    if activation is not None:
        ret = activation(ret)
    ret = tf.identity(ret, name='output')

    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    return ret
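
# The norm=True branch above turns every group of four input-channel weights
# into a convex combination via a softmax. A NumPy sketch of that
# reshape/softmax/reshape round-trip, with assumed shapes:
def _softmax_kernel_sketch():
    import numpy as np
    kh = kw = 3
    in_channel, out_channel = 8, 16
    W = np.random.randn(kh, kw, in_channel, out_channel).astype('float32')
    Wg = W.reshape(kh, kw, 4, in_channel // 4, out_channel)
    e = np.exp(Wg - Wg.max(axis=2, keepdims=True))        # stable softmax, group axis
    Wn = (e / e.sum(axis=2, keepdims=True)).reshape(W.shape)
    # every group of 4 weights now sums to 1
    assert np.allclose(Wn.reshape(kh, kw, 4, -1).sum(axis=2), 1.0)
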
def mpusim_depthwise_convolution2d(inputs,
                                   kernel_size,
                                   strides=(1, 1),
                                   padding='valid',
                                   depth_multiplier=1,
                                   data_format='channels_last',
                                   activation=None,
                                   use_bias=True,
                                   depthwise_initializer='glorot_uniform',
                                   bias_initializer='zeros',
                                   depthwise_regularizer=None,
                                   bias_regularizer=None,
                                   depthwise_constraint=None,
                                   bias_constraint=None,
                                   activations_datatype_size_byte=1,
                                   weights_datatype_size_byte=1,
                                   results_datatype_size_byte=4,
                                   systolic_array_height=256,
                                   systolic_array_width=256,
                                   activation_fifo_depth=8,
                                   accumulator_array_height=4096,
                                   log_file_output_dir='.',
                                   model_name='unnamed'):

    #depthwise_initializer = initializers.get(depthwise_initializer)
    #depthwise_regularizer = regularizers.get(depthwise_regularizer)
    #depthwise_constraint = constraints.get(depthwise_constraint)
    #bias_initializer = initializers.get(bias_initializer)

    data_format = get_data_format(data_format, keras_mode=False)
    input_shape = inputs.get_shape().as_list()

    strides = shape4d(strides, data_format=data_format)

    if len(input_shape) < 4:
        raise ValueError(
            'Inputs to `mpusim_depthwise_conv2d` should have rank 4. '
            'Received input shape:', str(input_shape))

    if data_format == 'NCHW':
        raise ValueError('mpusim_depthwise_convolution2d '
                         'only supports NHWC data format')
    else:
        channel_axis = 3

    if input_shape[channel_axis] is None:
        raise ValueError('The channel dimension of the inputs to '
                         '`mpusim_depthwise_convolution2d` '
                         'should be defined. Found `None`.')

    input_dim = int(input_shape[channel_axis])

    depthwise_kernel_shape = (kernel_size[0], kernel_size[1], input_dim,
                              depth_multiplier)

    depthwise_kernel = tf.get_variable('W',
                                       shape=depthwise_kernel_shape,
                                       initializer=depthwise_initializer,
                                       regularizer=depthwise_regularizer,
                                       constraint=depthwise_constraint)

    if use_bias:
        biases = tf.get_variable('b',
                                 shape=(input_dim * depth_multiplier, ),
                                 initializer=bias_initializer,
                                 regularizer=bias_regularizer,
                                 constraint=bias_constraint)

    result = mpusim_depthwise_conv2d(
        inputs,
        depthwise_kernel,
        strides=strides,
        padding=padding,
        data_format=data_format,
        activations_datatype_size_byte=activations_datatype_size_byte,
        weights_datatype_size_byte=weights_datatype_size_byte,
        results_datatype_size_byte=results_datatype_size_byte,
        systolic_array_height=systolic_array_height,
        systolic_array_width=systolic_array_width,
        activation_fifo_depth=activation_fifo_depth,
        accumulator_array_height=accumulator_array_height,
        log_file_output_dir=log_file_output_dir,
        model_name=model_name)

    if use_bias:
        result = tf.nn.bias_add(result, biases, data_format=data_format)

    if activation is not None:
        result = activation(result)

    result = tf.identity(result, name='output')

    result.variables = VariableHolder(W=depthwise_kernel)

    if use_bias:
        result.variables.b = biases

    return result
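
# Depthwise convolution applies depth_multiplier independent filters to every
# input channel, so the kernel and output shapes differ from a regular conv.
# A short shape check with assumed sizes:
def _depthwise_shapes_sketch():
    kernel_size, input_dim, depth_multiplier = (3, 3), 32, 2
    kernel_shape = (kernel_size[0], kernel_size[1], input_dim, depth_multiplier)
    out_channels = input_dim * depth_multiplier   # each input channel gets its own outputs
    print(kernel_shape, '->', out_channels, 'output channels')   # (3, 3, 32, 2) -> 64
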
def MaskedConv2D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1,
        masking=False):
    """
    A wrapper around `tf.layers.Conv2D`.
    Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    if (not masking) and (split == 1) and (dilation_rate == [1, 1]):
        # tf.layers.Conv2D has bugs with dilations (https://github.com/tensorflow/tensorflow/issues/26797)
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv2D(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                dilation_rate=dilation_rate,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer,
                _reuse=tf.get_variable_scope().reuse)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias

    else:
        if masking:
            assert split == 1, "Pruning group conv is not supported yet"

        # group conv implementation
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = 3 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv or dilated conv!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

        kernel_shape = shape2d(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = shape4d(strides, data_format=data_format)

        kwargs = dict(data_format=data_format)
        if get_tf_version_tuple() >= (1, 5):
            kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

        W = tf.get_variable(
            'W', filter_shape, initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

        if split == 1:
            if masking:
                W = pruning.apply_mask(W)
            conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
        else:
            conv = None
            if get_tf_version_tuple() >= (1, 13):
                try:
                    conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
                except ValueError:
                    log_once("CUDNN group convolution support is only available with "
                             "https://github.com/tensorflow/tensorflow/pull/25818 . "
                             "Will fall back to a loop-based slow implementation instead!", 'warn')
            if conv is None:
                inputs = tf.split(inputs, split, channel_axis)
                kernels = tf.split(W, split, 3)
                outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                           for i, k in zip(inputs, kernels)]
                conv = tf.concat(outputs, channel_axis)

        ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
        if activation is not None:
            ret = activation(ret)
        ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret
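
# pruning.apply_mask comes from tf.contrib.model_pruning; in the forward pass
# it multiplies the kernel by a binary mask maintained by the pruning schedule.
# A rough NumPy sketch of the effect (the threshold rule here is illustrative,
# not the library's actual schedule):
def _pruning_mask_sketch():
    import numpy as np
    W = np.random.randn(3, 3, 16, 32).astype('float32')
    threshold = np.percentile(np.abs(W), 50.0)    # toy criterion: drop the smaller half
    mask = (np.abs(W) >= threshold).astype(W.dtype)
    W_pruned = W * mask                           # masked weights take no part in the conv
    print('sparsity:', 1.0 - mask.mean())
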
def mpusim_conv2d(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1,
        activations_datatype_size_byte=1,
        weights_datatype_size_byte=1,
        results_datatype_size_byte=4,
        systolic_array_height=256,
        systolic_array_width=256,
        activation_fifo_depth=8,
        accumulator_array_height=4096,
        log_file_output_dir='.',
        model_name='unnamed'):
    """
    Similar to `tf.layers.Conv2D`, but with some differences:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group convolution.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    # group conv implementation
    data_format = get_data_format(data_format, keras_mode=False)
    in_shape = inputs.get_shape().as_list()
    channel_axis = 3 if data_format == 'NHWC' else 1
    in_channel = in_shape[channel_axis]
    assert in_channel is not None, "[mpusim_conv2d] Input cannot have unknown channel!"
    assert in_channel % split == 0

    assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
        "Not supported by group conv or dilated conv!"

    out_channel = filters
    assert out_channel % split == 0
    assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

    kernel_shape = shape2d(kernel_size)
    filter_shape = kernel_shape + [in_channel // split, out_channel]
    stride = shape4d(strides, data_format=data_format)

    kwargs = dict(data_format=data_format)
    if get_tf_version_tuple() >= (1, 5):
        kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

    W = tf.get_variable(
            'W', filter_shape, initializer=kernel_initializer)

    if use_bias:
        b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

    if split == 1:
        conv = mpu_sim_conv2d_lib.mpu_sim_conv2d(inputs,
                                                 W,
                                                 activations_datatype_size_byte,
                                                 weights_datatype_size_byte,
                                                 results_datatype_size_byte,
                                                 systolic_array_height,
                                                 systolic_array_width,
                                                 activation_fifo_depth,
                                                 accumulator_array_height,
                                                 log_file_output_dir,
                                                 model_name,
                                                 stride,
                                                 padding.upper(),
                                                 **kwargs)
    else:
        inputs = tf.split(inputs, split, channel_axis)
        kernels = tf.split(W, split, 3)
        outputs = [mpu_sim_conv2d_lib.mpu_sim_conv2d(input_block,
                                                     kernel_block,
                                                     activations_datatype_size_byte,
                                                     weights_datatype_size_byte,
                                                     results_datatype_size_byte,
                                                     systolic_array_height,
                                                     systolic_array_width,
                                                     activation_fifo_depth,
                                                     accumulator_array_height,
                                                     log_file_output_dir,
                                                     model_name,
                                                     stride,
                                                     padding.upper(),
                                                     **kwargs)
                   for input_block, kernel_block in zip(inputs, kernels)]
        conv = tf.concat(outputs, channel_axis)

    ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
    if activation is not None:
        ret = activation(ret)
    ret = tf.identity(ret, name='output')

    ret.variables = VariableHolder(W=W)
    if use_bias:
        ret.variables.b = b
    return ret
def _data_format_to_ch_dim(data_format):
    data_format = get_data_format(data_format, keras_mode=False)
    ch_dim = -1 if data_format == 'NHWC' else 1
    return ch_dim
def BatchNorm3d(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
                center=True, scale=True,
                beta_initializer=tf.zeros_initializer(),
                gamma_initializer=tf.ones_initializer(),
                virtual_batch_size=None,
                data_format='channels_last',
                internal_update=False,
                sync_statistics=None):

    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        elif ndims == 5:
            axis = 1 if data_format == 'NCHW' else 4
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_tuple()
    if not training and ctx.is_training:
        # fine-tuning a BatchNorm model with fixed statistics needs TF>=1.4
        assert TF_version >= (1, 4)
        if ctx.is_main_training_tower:  # only warn in the first tower
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable(
                {'moving_mean': 'mean/EMA',
                 'moving_variance': 'variance/EMA'}):
            tf_args = dict(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                fused=True,
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

        # maintaining the EMA on only one GPU is OK, even in replicated mode
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2])
        if ndims == 5:
            red_axis = [0, 2, 3, 4] if axis == 1 else [0, 1, 2, 3]
        new_shape = None 
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]
        if ndims == 5 and axis == 1:
            new_shape = [1, num_chan, 1, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= (1, 8) and ctx.is_main_training_tower:
                logger.warn("A TensorFlow bug will cause cross-GPU BatchNorm to fail. "
                            "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360")

            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
            num_dev = ctx.total
            batch_mean = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
            batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean_square,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
        else:
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                beta, gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(
                xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var,
                momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret