Example #1
def conv1d(x, scope, nf, std=0.02, relu=False, fast_gelu=False):
    with tf.variable_scope(scope):
        nx    = x.shape[-1].value
        ndims = x.shape.ndims

        # Note: param initializers are not particularly well tuned in this code
        # w = tf.get_variable("w", [nx, nf], initializer=tf.random_normal_initializer(stddev=std), dtype=tf.float32)
        w = tf.get_variable("w", [nx, nf], initializer=tf.random_uniform_initializer(minval=-std, maxval=std), dtype=tf.float32)
        b = tf.get_variable("b", [    nf], initializer=tf.constant_initializer(0.0), dtype=tf.float32)

        # if hps.float16:
        #     # We delay weight casting till just before use to minimize memory footprint.
        #     # In recompute mode these casts are released just after use on forward pass,
        #     # then remade on the recompute pass.
        #     with tf.control_dependencies([x.op]):
        #         # By setting dx_dtype to float16 we prevent useless casting back to fp32 in the backwards pass.
        #         # Our all-reduce and fused optimizers can accept fp16 natively.
        #         w = bs.float_cast(w, dtype=tf.float16, dx_dtype=tf.float16)

        # merge context and batch dims for more efficient matmul
        if ndims > 2:
            y_shape = tf.concat([x.shape[: ndims - 1], [nf]], axis=0)
            x = tf.reshape(x, [-1, nx])

        y = tf.matmul(x, w)

        # avoid atomics in bias grad, but be careful as tf handles temp memory badly in the presence of async ops like all-reduce
        y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False)

        if ndims > 2:
            y = tf.reshape(y, y_shape)

        return y
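A minimal usage sketch for this variant, assuming TensorFlow 1.x graph mode with blocksparse imported as bs (as the code above implies); the input shape, scope name, and layer width are illustrative, not taken from the source:

import tensorflow as tf
import blocksparse as bs

x = tf.placeholder(tf.float32, [8, 128, 512])       # [batch, context, features]
h = conv1d(x, "mlp_proj", nf=2048, fast_gelu=True)  # fused bias + fast-gelu, shape [8, 128, 2048]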
Example #2
    def conv1d(self, x, scope, channel, std=0.02, relu=False, fast_gelu=False, bias=True):
        with tf.variable_scope(scope):
            nx = x.shape[-1].value
            ndims = x.shape.ndims

            # Note: param initializers are not particularly well tuned in this code
            w = tf.get_variable("w", [nx, channel], initializer=create_initializer(initializer_range=std),
                                dtype=tf.float16 if self.config.float16 else tf.float32)
            if bias:
                b = tf.get_variable("bias", [channel], initializer=tf.constant_initializer(0.0))
            else:
                b = 0

            # merge context and batch dims for more efficient matmul
            if ndims > 2:
                y_shape = tf.concat([tf.shape(x)[: ndims - 1], [channel]], axis=0)
                x = tf.reshape(x, [-1, nx])

            y = tf.matmul(x, w)

            # avoid atomics in bias grad, but be careful as tf handles temp memory badly in the presence of async ops like all-reduce
            y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False)

            if ndims > 2:
                y = tf.reshape(y, y_shape)

            return y
Example #3
    def dense(x,
              hidden_size,
              activation=None,
              name='dense',
              kernel_initializer=None,
              bias=True):
        if kernel_initializer is None:
            kernel_initializer = create_initializer(0.02)
        with tf.variable_scope(name):
            nx = x.shape[-1].value
            ndims = x.shape.ndims
            dtype = x.dtype

            # Note: param initializers are not particularly well tuned in this code
            w = tf.get_variable("kernel", [nx, hidden_size],
                                initializer=kernel_initializer,
                                dtype=dtype)

            assert x.op.device != ''

            if bias:
                b = tf.get_variable("bias", [hidden_size],
                                    initializer=tf.zeros_initializer)
            else:
                b = 0

            # merge context and batch dims for more efficient matmul
            if ndims > 2:
                y_shape = tf.concat([tf.shape(x)[:ndims - 1], [hidden_size]],
                                    axis=0)
                x = tf.reshape(x, [-1, nx])

            y = tf.matmul(x, w)

            fast_gelu = activation in ('fast_gelu', 'gelu')
            relu = activation == 'relu'
            y = bs.bias_relu(y,
                             b,
                             relu=relu,
                             fast_gelu=fast_gelu,
                             atomics=False)

            if activation == 'tanh':
                y = tf.tanh(y)
            elif activation == 'sigmoid':
                y = tf.sigmoid(y)

            if ndims > 2:
                y = tf.reshape(y, y_shape)

            return y
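A minimal usage sketch for dense, assuming TensorFlow 1.x graph mode and that dense (plus the create_initializer helper it calls) is reachable at module scope; note that the assert on x.op.device means the input has to be built under an explicit tf.device scope. The device string, shapes, and layer names below are illustrative:

import tensorflow as tf

with tf.device("/gpu:0"):                                 # satisfies the x.op.device assertion
    x = tf.placeholder(tf.float32, [8, 128, 768])         # [batch, context, features]
    h = dense(x, 3072, activation='gelu', name='ffn_in')  # fused bias + fast-gelu path
    y = dense(h, 768, name='ffn_out')                      # plain matmul + bias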
Example #4
def conv1d(x,
           scope,
           channel,
           std=0.02,
           relu=False,
           fast_gelu=False,
           bias=True,
           float16=False):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        ndims = x.shape.ndims

        # Note: param initializers are not particularly well tuned in this code
        w = tf.get_variable(
            "w", [nx, channel],
            initializer=create_initializer(initializer_range=std))
        if bias:
            b = tf.get_variable("bias", [channel],
                                initializer=tf.constant_initializer(0.0))
        else:
            b = tf.zeros([channel])

        if float16:
            # We delay weight casting till just before use to minimize memory footprint.
            # In recompute mode these casts are released just after use on forward pass,
            # then remade on the recompute pass.
            with tf.control_dependencies([x.op]):
                # By setting dx_dtype to float16 we prevent useless casting back to fp32 in the backwards pass.
                # Our all-reduce and fused optimizers can accept fp16 natively.
                w = bs.float_cast(w, dtype=tf.float16, dx_dtype=tf.float16)

        # merge context and batch dims for more efficient matmul
        if ndims > 2:
            y_shape = tf.concat([tf.shape(x)[:ndims - 1], [channel]], axis=0)
            x = tf.reshape(x, [-1, nx])

        y = tf.matmul(x, w)

        # avoid atomics in bias grad, but be careful as tf handles temp memory badly in the presence of async ops like all-reduce
        y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False)

        if ndims > 2:
            y = tf.reshape(y, y_shape)

        return y
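A minimal usage sketch for this standalone variant, assuming TensorFlow 1.x graph mode, blocksparse imported as bs, and a create_initializer helper defined in the same module; the fp32 path is shown, since with float16=True the input would already need to be cast to fp16 (e.g. via bs.float_cast) so the matmul dtypes match. Shapes and the scope name are illustrative:

import tensorflow as tf
import blocksparse as bs

x = tf.placeholder(tf.float32, [8, 128, 512])   # [batch, context, features]
h = conv1d(x, "attn_qkv", channel=1536)         # matmul + fused bias, shape [8, 128, 1536]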