Example #1
    def testTranspose(self):
        with self.test_session(config=config) as sess, tf.device("/gpu:0"):

            for shape in shapes:

                cpuX = np.random.uniform(-1.0, 1.0, shape).astype(
                    np.float16).astype(np.float32)
                x = tf.placeholder(tf.float32, shape, name="x")

                for dtype in (tf.float16, tf.float32):  #tf.float16, tf.float32

                    xf = bs.float_cast(x, dtype=dtype)

                    y = bs.transpose_2d(xf)
                    y = bs.float_cast(y, dtype=tf.float32)

                    Y = tf.transpose(xf)
                    Y = bs.float_cast(Y, dtype=tf.float32)

                    y, Y = sess.run([y, Y], feed_dict={x: cpuX})

                    dif = np.abs(Y - y)
                    avgval = np.average(abs(Y))
                    maxdif = dif.max()
                    max_err = maxdif if avgval == 0 else maxdif / avgval
                    l2_err = np.sqrt(np.square(dif).sum()) / np.sqrt(
                        np.square(Y).sum())

                    print("%s, shape:%16s, err:%17.12f, l2_err:%17.12f" %
                          (dtype.name, str(shape), max_err, l2_err))
    def atestGateGrad(self):

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):

            dtype = tf.float16

            layout = np.ones([2, 2], dtype=np.bool)
            bsmm = bs.BlocksparseMatMul(layout,
                                        block_size=8,
                                        feature_axis=0,
                                        name="test")

            X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(64)).astype(
                np.float16).astype(np.float32)
            W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(
                np.float16).astype(np.float32)
            G = np.random.uniform(0.0, 1.0, bsmm.blocks).astype(
                np.float16).astype(np.float32)
            #G = np.ones([bsmm.blocks], dtype=np.float32)

            x = tf.constant(X)
            w = tf.constant(W)
            g = tf.constant(G)

            wf = bs.float_cast(w, dtype=dtype)
            xf = bs.float_cast(x, dtype=dtype)

            y = bsmm(xf, wf, gate=g, gate_grad=True, bench=0)

            y = bs.float_cast(y, dtype=tf.float32)

            sess.run(tf.global_variables_initializer())

            # y = sess.run( y )
            # exit()

            error = gradient_checker.compute_gradient_error(
                x, x.shape, y,
                y.shape)  #, extra_feed_dict={ x: cpuX, m: mask }
            print(error)

            error = gradient_checker.compute_gradient_error(
                w, w.shape, y,
                y.shape)  #, extra_feed_dict={ x: cpuX, m: mask }
            print(error)

            error = gradient_checker.compute_gradient_error(
                g, g.shape, y,
                y.shape)  #, extra_feed_dict={ x: cpuX, m: mask }
            print(error)
    def testSoftmaxCrossEntropy(self):

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            N = 3 # 80 * 16
            for K in (10, 256, 512, 1024*8, 1024*16, 1024*32, 1024*64,): #10, 256, 512, 1024*8, 1024*16, 1024*32, 1024*64

                np.random.seed(int(time()))
                #cpuX = np.random.uniform(-20.0, 20.0, (N, K)).astype(np.float16).astype(np.float32) #65504
                cpuX = np.random.normal(0.0, 1.0, (N, K)).astype(np.float16).astype(np.float32)
                cpuE = np.random.normal(0.0, 1.0, (N,  )).astype(np.float16).astype(np.float32)
                cpuI = np.random.randint(0, K, size=(N,  ),  dtype=np.uint16)

                x = tf.placeholder(tf.float32, cpuX.shape)
                e = tf.placeholder(tf.float32, cpuE.shape)
                i = tf.placeholder(tf.uint16,  cpuI.shape)
                feed_dict = { x: cpuX, i: cpuI, e: cpuE }

                xf = bs.float_cast(x, dtype=tf.float16)
                y = bs.softmax_cross_entropy(logits=xf, labels=i)

                Y = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=x, labels=tf.cast(i, tf.int32))

                y, (dx,) = sess.run( [ y, tf.gradients(y, [x], e) ], feed_dict )
                Y, (DX,) = sess.run( [ Y, tf.gradients(Y, [x], e) ], feed_dict )

                print("testSoftmaxCrossEntropy", K)

                if not bench:
                    for op, dev, cpu in [
                        [  "Y",  y,  Y ],
                        [ "DX", dx, DX ],
                    ]:
                        self.compare_results(op, dev, cpu)
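
These tests call a compare_results helper that does not appear in the excerpts. A minimal sketch consistent with the inline metrics in testTranspose (relative max error and relative L2 error) might look like the following; the method name matches the call sites, but the body is an assumption, not the suite's actual helper:

    def compare_results(self, op, dev, cpu):
        # Hypothetical checker mirroring the inline metrics in testTranspose above.
        dif     = np.abs(cpu - dev)
        avgval  = np.average(np.abs(cpu))
        maxdif  = dif.max()
        max_err = maxdif if avgval == 0 else maxdif / avgval
        l2_err  = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum() + 1e-12)
        print("%s, err:%17.12f, l2_err:%17.12f" % (op, max_err, l2_err))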
Example #4
def conv1d(x, scope, nf, std=0.02, relu=False, fast_gelu=False):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        ndims = x.shape.ndims

        # Note: param initializers are not particularly well tuned in this code
        w = tf.get_variable(
            "w", [nx, nf],
            initializer=tf.random_normal_initializer(stddev=std))
        b = tf.get_variable("b", [nf],
                            initializer=tf.constant_initializer(0.0))

        if hps.float16:
            # We delay weight casting till just before use to minimize memory footprint.
            # In recompute mode these casts are released just after use on forward pass,
            # then remade on the recompute pass.
            with tf.control_dependencies([x.op]):
                # By setting dx_dtype to float16 we prevent useless casting back to fp32 in the backwards pass.
                # Our all-reduce and fused optimizers can accept fp16 natively.
                w = bs.float_cast(w, dtype=tf.float16, dx_dtype=tf.float16)

        # merge context and batch dims for more efficient matmul
        if ndims > 2:
            y_shape = tf.concat([tf.shape(x)[:ndims - 1], [nf]], axis=0)
            x = tf.reshape(x, [-1, nx])

        y = tf.matmul(x, w)

        # avoid atomics in bias grad, but be careful: tf handles temp memory badly in the presence of async ops like all-reduce
        y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False)

        if ndims > 2:
            y = tf.reshape(y, y_shape)

        return y
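
A hypothetical call site for conv1d, sketching the position-wise MLP of a transformer block (hps and bs are assumed to be in scope as above; this is not the repo's actual transformer_block):

def mlp(h, scope, ratio=4):
    # two conv1d (i.e. dense) layers with a fused gelu in between
    nx = h.shape[-1].value
    with tf.variable_scope(scope):
        h = conv1d(h, "mlp_proj_in",  nx * ratio, fast_gelu=True)
        h = conv1d(h, "mlp_proj_out", nx)
    return h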
    def testBlocksparseTransformerDense(self):
        with self.test_session(config=config) as sess, tf.device("/gpu:0"):

            batch = 2
            heads = 2
            state = 64*2
            scale = 1.0 / np.sqrt(state/heads)

            for bsize in (8, 16, 32, 64):

                ctxQ = 16
                ctxK = 16

                layout = np.ones([heads, ctxQ, ctxK], dtype=np.bool)
                bst = bs.BlocksparseTransformer(layout, block_size=bsize)

                shapeQ = (batch, ctxQ*bsize, heads*state)
                shapeK = (batch, ctxK*bsize, heads*state)

                if ones:
                    cpuQ = np.ones(shapeQ, dtype=np.float32)
                    cpuK = np.ones(shapeK, dtype=np.float32)
                    cpuV = np.ones(shapeK, dtype=np.float32)
                    cpuE = np.ones(shapeQ, dtype=np.float32)
                else:
                    cpuQ = np.random.uniform(-1.0, 1.0, shapeQ).astype(np.float16).astype(np.float32)
                    cpuK = np.random.uniform(-1.0, 1.0, shapeK).astype(np.float16).astype(np.float32)
                    cpuV = np.random.uniform(-1.0, 1.0, shapeK).astype(np.float16).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shapeQ).astype(np.float16).astype(np.float32)

                q = tf.placeholder(tf.float32, shapeQ)
                k = tf.placeholder(tf.float32, shapeK)
                v = tf.placeholder(tf.float32, shapeK)
                e = tf.placeholder(tf.float32, shapeQ)

                feed_dict = { q: cpuQ, k: cpuK, v: cpuV, e: cpuE }

                qf = bs.float_cast(q, dtype=tf.float16)
                kf = bs.float_cast(k, dtype=tf.float16)
                vf = bs.float_cast(v, dtype=tf.float16)

                w = bst.query_key_op(qf, kf, bench=bench)
                w = bst.softmax(w, scale=scale)
                y = bst.weight_value_op(w, vf, bench=bench)

                qf = bs.transpose_0213(tf.reshape(qf, [batch, ctxQ*bsize, heads, state]))
                kf = bs.transpose_0213(tf.reshape(kf, [batch, ctxK*bsize, heads, state]))
                vf = bs.transpose_0213(tf.reshape(vf, [batch, ctxK*bsize, heads, state]))
                W = tf.matmul(qf, kf, transpose_b=True)
                W = bs.softmax(W, scale=scale)
                Y = tf.matmul(W, vf)
                Y = tf.reshape(bs.transpose_0213(Y), [batch, ctxQ*bsize, heads*state])

                y = bs.float_cast(y, dtype=tf.float32)
                Y = bs.float_cast(Y, dtype=tf.float32)

                y, (dq, dk, dv) = sess.run( [ y, tf.gradients(y, [q, k, v], e) ], feed_dict )
                Y, (DQ, DK, DV) = sess.run( [ Y, tf.gradients(Y, [q, k, v], e) ], feed_dict )

                print("testBlocksparseTransformerDense", bsize)
                if not bench:
                    for op, dev, cpu in [
                        [ " Y",  y,  Y ],
                        [ "DV", dv, DV ],
                        [ "DK", dk, DK ],
                        [ "DQ", dq, DQ ],
                    ]:
                        self.compare_results(op, dev, cpu)
    def atestBlocksparseSoftmax(self):

        batch = 1
        heads = 1
        key   = 7

        def checker_callback(blk_shape, head_idx, qry_idx, key_idx, blk_idx):
            mask = np.ones(blk_shape, dtype=np.bool)
            mask[::2,1::2] = False
            mask[1::2,::2] = False
            return mask

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            # for ctx in (16, 32, 64, 128, 256, 512, 1024, 2048, 4096): #16, 32, 64, 128, 256, 512, 1024, 2048, 4096
            #     for bsize in (8, 16, 32, 64,): # 8, 16, 32, 64,
            #         if bsize * (ctx+0) <= 32768:

            for ctx in (16,): #16, 32, 64, 128, 256, 512, 1024, 2048, 4096
                for bsize in (8, 16, 32, 64, ): # 8, 16, 32, 64,
                    if bsize * (ctx) <= 32768:

                        # define outer block structure for blocksparse matmul
                        layout = np.ones([heads, ctx, ctx], dtype=np.bool)

                        bst = bs.BlocksparseTransformer(layout, heads=heads, block_size=bsize, mask_callback=checker_callback) # checker_callback

                        shape = (batch, heads, bst.blocks, bsize, bsize)
                        print(shape)

                        if ones:
                            cpuX = np.ones(shape, dtype=np.float32)
                            cpuE = np.ones(shape, dtype=np.float32)

                        else:
                            cpuX = np.random.normal(0.0, 1.0, shape).astype(np.float16).astype(np.float32)
                            cpuE = np.random.normal(0.0, 1.0, shape).astype(np.float16).astype(np.float32)

                        # np.savetxt("cpuX.txt", cpuX.reshape((-1,bsize)), fmt='%5.2f')

                        # for i, a in enumerate(np.max(cpuX.reshape(-1,bsize), axis=1)):
                        #     print("%2d %.2f" % (i, a))
                        # print()

                        x = tf.placeholder(tf.float32, cpuX.shape)
                        e = tf.placeholder(tf.float32, cpuE.shape)
                        feed_dict = { x: cpuX, e: cpuE }

                        xf = bs.float_cast(x, dtype=tf.bfloat16)

                        y = bst.masked_softmax(xf, scale=0.5, autoregress_at_key=key)

                        y = bs.float_cast(y, dtype=tf.float32)

                        dx, = tf.gradients(y, [ x ], e)

                        y, dx = sess.run( [ y, dx ], feed_dict )

                        Y  = bst.masked_softmax_test(cpuX, scale=0.5, autoregress_at_key=key)
                        DX = bst.masked_softmax_grad_test(cpuE, Y, scale=0.5)

                        print("testBlocksparseSoftmax", ctx*bsize, bsize)
                        for op, dev, cpu in [
                            [  "Y",  y,  Y ],
                            [ "DX", dx, DX ],
                        ]:
                            self.compare_results(op, dev, cpu)
    def testBlocksparseTransformerMatmul(self):

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            for bsize in ( 32, ): # 8, 16, 32, 64

                dtype_qk = tf.float32
                dtype_w  = tf.bfloat16
                ones  = 0
                bench = 0
                batch = 2
                heads = 4
                ctx   = 16
                state = 64*2
                scale = 1.0 # / np.sqrt(state/heads)

                ctxQ = ctx
                ctxK = ctx # *2

                layout = np.ones([1, ctxQ, ctxK], dtype=np.bool)
                for q, k in np.ndindex(ctx, ctx):
                    if k > q:
                        layout[:,q,k] = 0
                #layout[:,0,:] = 1
                bst = bs.BlocksparseTransformer(layout, heads=heads, block_size=bsize, mask_callback=mask_callback)

                q_shape = (batch, ctxQ*bsize, heads*state)
                k_shape = (batch, ctxK*bsize, heads*state)
                w_shape = (batch, heads, bst.blocks, bsize, bsize)

                if ones:
                    cpuQ = np.ones(q_shape, dtype=np.float32)
                    cpuK = np.ones(k_shape, dtype=np.float32)
                    cpuW = np.ones(w_shape, dtype=np.float32)
                    # cpuQ[0,:,:] = np.eye(bsize, dtype=np.float32)
                    # cpuK[0,:,:] = np.eye(bsize, dtype=np.float32)
                    # cpuW[0,0,0,:,:] = np.eye(bsize, dtype=np.float32)
                    # cpuQ[0,0,0,:] = 1
                    # cpuK[0,0,0,:] = range(64)
                    # cpuW[0,0,0,0,:] = 1
                else:
                    cpuQ = np.random.uniform(-1.0, 1.0, q_shape).astype(np.float16).astype(np.float32)
                    cpuK = np.random.uniform(-1.0, 1.0, k_shape).astype(np.float16).astype(np.float32)
                    cpuW = np.random.uniform(-1.0, 1.0, w_shape).astype(np.float16).astype(np.float32)

                q = tf.placeholder(tf.float32, cpuQ.shape)
                k = tf.placeholder(tf.float32, cpuK.shape)
                w = tf.placeholder(tf.float32, cpuW.shape)

                feed_dict = { q: cpuQ, k: cpuK, w: cpuW }

                qf = bs.float_cast(q, dtype=dtype_qk)
                kf = bs.float_cast(k, dtype=dtype_qk)
                wf = bs.float_cast(w, dtype=dtype_w)

                nt = bst.nt_op(qf, kf, bench=bench)
                nn = bst.nn_op(wf, kf, bench=bench)
                tn = bst.tn_op(wf, qf, bench=bench)

                nt = bs.float_cast(nt, dtype=tf.float32)
                nn = bs.float_cast(nn, dtype=tf.float32)
                tn = bs.float_cast(tn, dtype=tf.float32)

                print("testBlocksparseTransformerMatmul", bsize)

                nt, nn, tn = sess.run( [ nt, nn, tn ], feed_dict ) # nt, nn, tn

                if not bench:

                    NT = bst.nt_test(cpuQ, cpuK)
                    NN = bst.nn_test(cpuW, cpuK)
                    TN = bst.tn_test(cpuW, cpuQ)

                    for op, dev, cpu in [
                        [ "NT", nt, NT ],
                        [ "NN", nn, NN ],
                        [ "TN", tn, TN ],
                    ]:
                        self.compare_results(op, dev, cpu)
    def testBlocksparseTransformerSparse(self):

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):

            batch = 2
            heads = 2
            ctx   = 16
            state = 64*2
            scale = 1.0 / np.sqrt(state/heads)
            dtype = tf.float32

            for bsize in ( 32, ): # 8, 16, 32, 64

                layout = np.ones([heads, ctx, ctx], dtype=np.bool)
                for q, k in np.ndindex(ctx, ctx):
                    if k > q:
                        layout[:,q,k] = 0
                bst = bs.BlocksparseTransformer(layout, block_size=bsize, mask_callback=mask_callback)

                shape = (batch, ctx*bsize, heads*state)

                if ones:
                    cpuQ = np.ones(shape, dtype=np.float32)
                    cpuK = np.ones(shape, dtype=np.float32)
                    cpuV = np.ones(shape, dtype=np.float32)
                    cpuE = np.ones(shape, dtype=np.float32)
                else:
                    cpuQ = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                    cpuK = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                    cpuV = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)

                q = tf.placeholder(tf.float32, shape)
                k = tf.placeholder(tf.float32, shape)
                v = tf.placeholder(tf.float32, shape)
                e = tf.placeholder(tf.float32, shape)

                feed_dict = { q: cpuQ, k: cpuK, v: cpuV, e: cpuE }

                qf = bs.float_cast(q, dtype=dtype)
                kf = bs.float_cast(k, dtype=dtype)
                vf = bs.float_cast(v, dtype=dtype)

                w = bst.query_key_op(qf, kf)
                a = bst.masked_softmax(w, scale=scale)
                y = bst.weight_value_op(a, vf)

                w = bs.float_cast(w, dtype=tf.float32)
                a = bs.float_cast(a, dtype=tf.float32)
                y = bs.float_cast(y, dtype=tf.float32)

                dq, dk, dv = tf.gradients(y, [q, k, v], e)
                w, a, y, dq, dk, dv = sess.run( [ w, a, y, dq, dk, dv ], feed_dict )

                W = bst.nt_test(cpuQ, cpuK)
                A = bst.masked_softmax_test(W, scale=scale)
                Y = bst.nn_test(A, cpuV)

                DV = bst.tn_test(   A, cpuE)
                DW = bst.nt_test(cpuE, cpuV)

                DW = bst.masked_softmax_grad_test(DW, A, scale=scale)

                DQ = bst.nn_test(  DW, cpuK)
                DK = bst.tn_test(  DW, cpuQ)

                print("testBlocksparseTransformerSparse", 32)
                if not bench:
                    for op, dev, cpu in [
                        [  "W",  w,  W ],
                        [  "A",  a,  A ],
                        [  "Y",  y,  Y ],
                        [ "DV", dv, DV ],
                        [ "DK", dk, DK ],
                        [ "DQ", dq, DQ ],
                    ]:
                        self.compare_results(op, dev, cpu)
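
Note: mask_callback, referenced by the two tests above, is not defined in these excerpts. Given the autoregressive layouts (blocks with k > q removed), a plausible sketch is a causal mask over the diagonal blocks; the exact helper used by the suite is an assumption:

def mask_callback(blk_shape, head_idx, qry_idx, key_idx, blk_idx):
    # Hypothetical causal mask: full blocks below the diagonal,
    # lower-triangular mask on diagonal blocks.
    mask = np.ones(blk_shape, dtype=np.bool)
    if qry_idx == key_idx:
        mask = np.tril(mask)
    return mask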
Example #9
    def testAdam(self):

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            for dtype in (tf.float32, tf.float16):  # tf.float16
                for shape in (
                    (1, ),
                    (3, ),
                    (127, ),
                    (1, 1024),
                    (1023, 1024),
                    (1024, 1024),
                ):
                    if ones:
                        G = np.ones(shape, dtype=np.float32)
                        P = np.ones(shape, dtype=np.float32)
                        M = np.zeros(shape, dtype=np.float32)
                        V = np.zeros(shape, dtype=np.float32)
                    else:
                        G = np.random.uniform(-1.0, 1.0, shape).astype(
                            np.float16).astype(np.float32)
                        P = np.random.uniform(-1.0, 1.0, shape).astype(
                            np.float16).astype(np.float32)
                        M = np.random.uniform(0.0, 1.0, shape).astype(
                            np.float16).astype(np.float32)
                        V = np.random.uniform(0.0, 1.0, shape).astype(
                            np.float16).astype(np.float32)

                    g = tf.placeholder(tf.float32, G.shape)
                    p = tf.Variable(initial_value=P, name="p")
                    m = tf.Variable(initial_value=M, name="m")
                    v = tf.Variable(initial_value=V, name="v")
                    sess.run(tf.global_variables_initializer())

                    g = bs.float_cast(g, dtype=dtype)

                    global_norm, norm_scale = bs.clip_by_global_norm(
                        [g], grad_scale=grad_scale, clip_norm=clip_norm)

                    p, m, v = sess.run(adam_op(g,
                                               p,
                                               m,
                                               v,
                                               learn_rate,
                                               grad_scale,
                                               clip_sigma, [norm_scale], [],
                                               decay_mean=beta1,
                                               decay_var=beta2,
                                               epsilon=epsilon),
                                       feed_dict={g: G})

                    GN = np.sqrt(
                        np.sum(np.square(G * grad_scale), keepdims=True))
                    NS = clip_norm / np.maximum(GN, clip_norm)
                    G *= NS * grad_scale

                    M = beta1 * M + (1.0 - beta1) * G
                    V = beta2 * V + (1.0 - beta2) * G * G

                    P -= learn_rate * M / (np.sqrt(V) + epsilon)

                    print("testAdam", dtype, GN, NS)
                    for op, dev, cpu in [
                        ["M", m, M],
                        ["V", v, V],
                        ["P", p, P],
                    ]:
                        self.compare_results(op, dev, cpu)
Example #10
    def testBlocksparseReducedDW(self):

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):

            ones     = 0
            norm     = 0
            accum    = 0
            blocks_x = 2
            blocks_y = 4
            bsize    = 32
            axis     = 0
            depth    = 8
            N        = 64
            scale    = 1.0 / (N * depth)
            shape_x  = [N, N]
            shape_y  = [N, N]
            shape_w  = (blocks_x, blocks_y)
            shape_x[axis] = bsize * blocks_x
            shape_y[axis] = bsize * blocks_y

            XS = list()
            YS = list()
            if ones:
                for i in range(depth):
                    XS.append(np.ones(shape_x, dtype=np.float32))
                    YS.append(np.ones(shape_y, dtype=np.float32))
                    if accum:
                        DWA = np.ones(shape_w, dtype=np.float32)

                    XS[0][:] += np.arange(64, dtype=np.float32).reshape(1,64)
            else:
                for i in range(depth):
                    XS.append(np.random.normal(0.0, 1.0, shape_x).astype(np.float16).astype(np.float32))
                    YS.append(np.random.normal(0.0, 1.0, shape_y).astype(np.float16).astype(np.float32))
                    if accum:
                        DWA = np.random.normal(0.0, 1.0, shape_w).astype(np.float32)

            feed_dict = dict()
            xs = list()
            ys = list()
            for i in range(depth):
                x = tf.placeholder(tf.float32, shape_x, name=f"x{i}")
                y = tf.placeholder(tf.float32, shape_y, name=f"y{i}")
                feed_dict[x] = XS[i]
                feed_dict[y] = YS[i]
                xs.append(bs.float_cast(x, dtype=tf.float16))
                ys.append(bs.float_cast(y, dtype=tf.float16))

            if accum:
                dwa = tf.placeholder(tf.float32, DWA.shape, name="dwa")
                feed_dict[dwa] = DWA
                #dwa = bs.float_cast(dwa, dtype=tf.float16)
                dw, x_red, y_red = blocksparse_reduced_dw(xs, ys, scale, [dwa], bsize=bsize, norm=norm, axis=axis)
            else:
                dw, x_red, y_red = blocksparse_reduced_dw(xs, ys, scale, [   ], bsize=bsize, norm=norm, axis=axis)

            #dw    = bs.float_cast(dw,    dtype=tf.float32)
            x_red = bs.float_cast(x_red, dtype=tf.float32)
            y_red = bs.float_cast(y_red, dtype=tf.float32)

            dw, x_red, y_red = sess.run([dw, x_red, y_red], feed_dict=feed_dict)

            if axis == 0:
                X_RED = np.zeros([blocks_x, depth, N], dtype=np.float32)
                Y_RED = np.zeros([blocks_y, depth, N], dtype=np.float32)

                for i in range(depth):
                    X = XS[i].reshape([blocks_x, bsize, N])
                    Y = YS[i].reshape([blocks_y, bsize, N])
                    if norm == 0:
                        X_RED[:,i,:] = np.max(np.abs(X), axis=1)
                        Y_RED[:,i,:] = np.max(np.abs(Y), axis=1)
                    else:
                        X_RED[:,i,:] = np.sqrt(np.sum(np.square(X), axis=1))
                        Y_RED[:,i,:] = np.sqrt(np.sum(np.square(Y), axis=1))

                DW = np.dot(X_RED.reshape(blocks_x, -1), Y_RED.reshape(blocks_y, -1).T) * scale

            else:
                X_RED = np.zeros([depth, N, blocks_x], dtype=np.float32)
                Y_RED = np.zeros([depth, N, blocks_y], dtype=np.float32)

                for i in range(depth):
                    X = XS[i].reshape([N, blocks_x, bsize])
                    Y = YS[i].reshape([N, blocks_y, bsize])
                    if norm == 0:
                        X_RED[i,:,:] = np.max(np.abs(X), axis=2)
                        Y_RED[i,:,:] = np.max(np.abs(Y), axis=2)
                    else:
                        X_RED[i,:,:] = np.sqrt(np.sum(np.square(X), axis=2))
                        Y_RED[i,:,:] = np.sqrt(np.sum(np.square(Y), axis=2))

                DW = np.dot(X_RED.reshape(-1, blocks_x).T, Y_RED.reshape(-1, blocks_y)) * scale

            if accum:
                DW += DWA

            print("BlocksparseReducedDW", norm, bsize, depth)
            for op, dev, cpu in [
                [ "xr", x_red, X_RED ],
                [ "yr", y_red, Y_RED ],
                [ "dw",    dw,    DW ],
            ]:
                #print(op, dev.shape, cpu.shape)
                self.compare_results(op, dev, cpu)
Example #11
    def testAdafactor(self):

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            for dtype in (tf.float32, tf.float16):  # tf.float16
                for shape_g in (
                    (1024, 1024 * 2),
                    (1, 1024 * 2),
                    (1024, 1023 * 1),
                    (1, 1023 * 1),
                ):

                    shape_c = (1, shape_g[1])
                    shape_r = (shape_g[0], 1)

                    if ones:
                        G = np.ones(shape_g, dtype=np.float32)
                        P = np.ones(shape_g, dtype=np.float32)
                        C = np.zeros(shape_c, dtype=np.float32)
                        R = np.zeros(shape_r, dtype=np.float32)
                    else:
                        G = np.random.uniform(-1.0, 1.0, shape_g).astype(
                            np.float16).astype(np.float32)
                        P = np.random.uniform(-1.0, 1.0, shape_g).astype(
                            np.float16).astype(np.float32)
                        C = np.random.uniform(0.0, 1.0, shape_c).astype(
                            np.float16).astype(np.float32)
                        R = np.random.uniform(0.0, 1.0, shape_r).astype(
                            np.float16).astype(np.float32)

                    g = tf.placeholder(tf.float32, G.shape)
                    p = tf.Variable(initial_value=P, name="p")
                    c = tf.Variable(initial_value=C, name="c")
                    r = tf.Variable(initial_value=R, name="r")
                    sess.run(tf.global_variables_initializer())

                    g = bs.float_cast(g, dtype=dtype)

                    # Adafactor has its own fused infinity filtering, but we give this standalone op a quick test here.
                    g = bs.filter_tensor(g)

                    global_norm, norm_scale = bs.clip_by_global_norm(
                        [g], grad_scale=grad_scale, clip_norm=clip_norm)

                    if shape_g[0] > 1:

                        p, c, r, x, _ = sess.run(adafactor2d_op(
                            p,
                            c,
                            r,
                            g,
                            beta2,
                            learn_rate,
                            grad_scale,
                            clip_thresh, [norm_scale],
                            epsilon=epsilon),
                                                 feed_dict={g: G})

                        GN = np.sqrt(
                            np.sum(np.square(G * grad_scale), keepdims=True))
                        NS = clip_norm / np.maximum(GN, clip_norm)
                        G *= NS * grad_scale

                        C = beta2 * C + (1.0 - beta2) * np.mean(
                            np.square(G) + epsilon, axis=0, keepdims=True)
                        R = beta2 * R + (1.0 - beta2) * np.mean(
                            np.square(G) + epsilon, axis=1, keepdims=True)
                        LTM = np.mean(R, keepdims=True)
                        X = G / (np.sqrt(R / LTM) * np.sqrt(C))
                        RMS_X = np.sqrt(np.mean(np.square(X), keepdims=True))

                    else:

                        r = R
                        p, c, x, _ = sess.run(adafactor1d_op(p,
                                                             c,
                                                             g,
                                                             beta2,
                                                             learn_rate,
                                                             grad_scale,
                                                             clip_thresh,
                                                             [norm_scale],
                                                             epsilon=epsilon),
                                              feed_dict={g: G})

                        GN = np.sqrt(
                            np.sum(np.square(G * grad_scale), keepdims=True))
                        NS = clip_norm / np.maximum(GN, clip_norm)
                        G *= NS * grad_scale

                        C = beta2 * C + (1.0 - beta2) * (np.square(G) +
                                                         epsilon)
                        X = G / np.sqrt(C)
                        RMS_X = np.sqrt(np.mean(np.square(X), keepdims=True))

                    P -= learn_rate * X / np.maximum(1.0, RMS_X / clip_thresh)

                    print("testAdafactor", dtype, GN, NS)
                    for op, dev, cpu in [
                        ["C", c, C],
                        ["R", r, R],
                        ["X", x, X],
                        ["P", p, P],
                    ]:
                        self.compare_results(op, dev, cpu)
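
For reference, the 2D branch above verifies Adafactor's factored second-moment update. As a standalone numpy sketch (update clipping by RMS(X) and the norm_scale handling are omitted here):

def adafactor2d_reference(P, C, R, G, beta2, lr, epsilon=1e-30):
    # Column- and row-wise running means of the squared gradient
    # stand in for the full second moment.
    C = beta2 * C + (1.0 - beta2) * np.mean(np.square(G) + epsilon, axis=0, keepdims=True)
    R = beta2 * R + (1.0 - beta2) * np.mean(np.square(G) + epsilon, axis=1, keepdims=True)
    X = G / (np.sqrt(R / np.mean(R, keepdims=True)) * np.sqrt(C))
    return P - lr * X, C, R, X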
Example #12
def model(xs, ys, loss_scale=None, train=False):

    with tf.variable_scope("model", reuse=not train):

        with tf.device("/cpu:0"):
            if train:
                grad_scale = tf.reciprocal(loss_scale) if hps.float16 else 1.0
                global_step = tf.get_variable(
                    "global_step", [],
                    initializer=tf.ones_initializer(),
                    trainable=False)
                learning_rate = tf.minimum(
                    global_step * (1.0 / hps.warmup_iters), 1.0) * hps.lr
            mpi_scale = tf.constant(1.0 / mpi_size)

        with tf.device("/gpu:0"):

            # Contains scope/var_name substrings we use to group gradients for all reduce
            # You'll want to find groupings that are scheduled uniquely by tensorflow, otherwise bs.allreduce could hang.
            # The groups should be listed in the order in which the all-reduces are called.
            # Any gradients not matching the substrings will get appended to the last group.
            grad_groups = []

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = tf.get_variable(
                    "x", [hps.n_vocab, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.02))
                p_embed = tf.get_variable(
                    'pos', [1, hps.n_timesteps, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.01))

                if hps.float16:
                    x_embed = bs.float_cast(x_embed,
                                            dtype=tf.float16,
                                            dx_dtype=tf.float16)
                    p_embed = bs.float_cast(p_embed,
                                            dtype=tf.float16,
                                            dx_dtype=tf.float16)

                # bs.embedding_lookup can be much faster than tf version for low entropy indexes or small vocabs
                x = bs.embedding_lookup(x_embed, xs)

                if train and hps.embed_pdrop > 0.0:
                    # this part of the code is not recomputed, so there is no need to remember the generated mask returned by bs.dropout
                    x, _ = bs.dropout(x, keep_prob=1.0 - hps.embed_pdrop)
                    p_embed, _ = bs.dropout(p_embed,
                                            keep_prob=1.0 - hps.embed_pdrop)

                h = x + p_embed
                grad_groups.insert(0, 'embed')

            for l in range(hps.n_layer):
                layer_name = 'layer_%d' % l
                # enable the recompute decorator in training;
                # see blocksparse/grads.py if you want to understand how this works
                h = transformer_block(h,
                                      layer_name,
                                      train=train,
                                      recompute=train and hps.recompute)
                grad_groups.insert(0, layer_name)

            # flatten transformer features and apply the linear classifier (weights tied to the input embedding)
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            if hps.float16:
                # much faster and more memory efficient (but currently only implemented in fp16)
                loss = bs.softmax_cross_entropy(logits=logits, labels=ys)
            else:
                labels = tf.cast(tf.reshape(ys, [-1]), tf.int32)
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=labels)

            loss = tf.reduce_mean(loss)

            if train:

                # apply loss scaling in fp16 mode
                if hps.float16:
                    grad_loss = bs.scale_tensor(loss, loss_scale)
                else:
                    grad_loss = loss

                # use bs.gradients to allow bs.recomputable decorators to work
                params = tf.trainable_variables()
                grads = bs.gradients(grad_loss, params)

                if mpi_size > 1:
                    # apply (1.0 / mpi_size) scaling prior to all_reduce to allow greater utilization of fp16 dynamic range.
                    # That is, we're OK with flushing some small values to zero to allow growth of large values in the allreduce (without hitting inf).
                    loss = bs.scale_tensor(loss, mpi_scale)
                    grads = [bs.scale_tensor(g, mpi_scale) for g in grads]

                    # allreduce in an mpi context
                    # bias and gain grads will be in fp32, but cast them to fp16 prior to the allreduce
                    cast_all = tf.float16 if hps.float16 else None
                    loss = bs.allreduce(loss)
                    grads = bs.group_allreduce(grads,
                                               params,
                                               search_strings=grad_groups,
                                               cast_all=cast_all)

                # This does not actually perform the clipping; it only computes the norm_scale that needs to be applied.
                # norm_scale is then applied later in the fused optimizer ops (eliminating an extra pass over the gradients).
                # norm_scale is also used to detect inf/nan values in any of the gradients, so the whole update can be skipped
                # and tried again with a new loss_scale.
                global_norm, norm_scale = bs.clip_by_global_norm(
                    grads, grad_scale=grad_scale, clip_norm=hps.clip_norm)

                # Apply AdamOptimizer:
                # fp16 mode is a special feature that stores the running mean and variance variables in custom fp16 formats.
                # Using this mode should incur no loss in accuracy and saves a lot of memory in your model.
                # For further memory savings consider using bs.AdafactorOptimizer.
                adam = bs.AdamOptimizer(learning_rate=learning_rate,
                                        norm_scale=norm_scale,
                                        grad_scale=grad_scale,
                                        fp16=hps.float16)

                train_op = adam.apply_gradients(zip(grads, params))

                # update global step after we're done using it for this update
                with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
                    update_op = tf.assign_add(global_step, 1.0)

                return loss, tf.group(train_op,
                                      update_op), global_norm, norm_scale

            else:
                if mpi_size > 1:
                    loss = bs.allreduce(bs.scale_tensor(loss, mpi_scale))

                return loss
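
The norm_scale-based inf/nan detection above implies a dynamic loss-scaling loop on the training side. A hypothetical driver follows (sess, num_steps, and the loss_scale_ph placeholder are assumptions, not the repo's actual training script):

# Hypothetical dynamic loss-scaling loop around the model() graph above.
loss_scale = 2.0 ** 16
for step in range(num_steps):
    _, gn = sess.run([train_op, global_norm],
                     feed_dict={loss_scale_ph: loss_scale})
    if not np.isfinite(gn):
        # gradients overflowed: the fused optimizers are assumed to have
        # skipped the update, so back off and retry with a smaller scale
        loss_scale = max(loss_scale * 0.5, 1.0)
    elif step > 0 and step % 1000 == 0:
        loss_scale *= 2.0  # periodically probe a larger scale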
    def testDropout(self):

        config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)

        with self.test_session(config=config) as sess:

            bs.set_entropy()
            sess.run(tf.global_variables_initializer())

            # with tf.device("/gpu:0"):
            #     x = tf.ones([10000])*-10.0
            #     g = bs.concrete_gate(x)
            #     g = sess.run(g)
            #     print(g.sum()/g.size)

            # error = gradient_checker.compute_gradient_error(x, x.shape, g, g.shape) #, extra_feed_dict={ x: cpuX, m: mask }
            # print(error)

            for dtype in (tf.float16, ):  #tf.float16, tf.bfloat16
                for x_shape, mask_shapes in shapes:
                    for mask_shape in mask_shapes:

                        m_shape = x_shape if mask_shape is None else mask_shape

                        cpuO = np.ones(x_shape, dtype=np.float32)
                        cpuX = np.random.uniform(-1.0, 1.0, x_shape).astype(
                            np.float16).astype(np.float32)
                        cpuM = np.random.randint(0,
                                                 2,
                                                 size=m_shape,
                                                 dtype=np.bool)

                        mask = np.zeros(ceil_div(cpuM.size, 32) * 32,
                                        dtype=np.bool)
                        mask[:cpuM.size] = cpuM.reshape(-1)
                        mask = np.packbits(mask.reshape(-1, 8)[:, ::-1]).view(
                            np.int32)

                        cpuY = cpuX * cpuM.astype(np.float32) * 2.0

                        with tf.device("/gpu:0"):

                            x = tf.placeholder(tf.float32, cpuX.shape)
                            m = tf.placeholder(tf.int32, mask.shape)

                            xf = bs.float_cast(x, dtype=dtype)
                            y, _ = bs.dropout(xf,
                                              keep_prob=0.5,
                                              mask=m,
                                              mask_shape=mask_shape)
                            y = bs.float_cast(y, dtype=tf.float32)

                            devY, = sess.run([
                                y,
                            ],
                                             feed_dict={
                                                 x: cpuX,
                                                 m: mask
                                             })

                            xf = bs.float_cast(x, dtype=dtype)
                            y, _ = bs.dropout(xf,
                                              keep_prob=0.8,
                                              mask_shape=mask_shape)
                            y = bs.float_cast(y, dtype=tf.float32)

                            devO, = sess.run([
                                y,
                            ], feed_dict={x: cpuO})

                        diff = np.abs(devY - cpuY)
                        print(
                            "dype: %8s x_shape: %-20s m_shape: %-20s err: %4.2f norm_sum: %4.2f"
                            % (dtype.name, str(x_shape), str(mask_shape),
                               diff.sum(), devO.sum() / devO.size))
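
The mask packing in testDropout (flat bool mask, padded to a multiple of 32, packed LSB-first into int32 words) can be sanity-checked with a round trip. A small sketch, reusing the same ceil_div helper as the test; unpack_mask is a hypothetical inverse, assuming a little-endian host:

def pack_mask(cpuM):
    # pad to a multiple of 32 bits, then pack LSB-first into int32 words,
    # mirroring the packing in testDropout above
    mask = np.zeros(ceil_div(cpuM.size, 32) * 32, dtype=np.bool)
    mask[:cpuM.size] = cpuM.reshape(-1)
    return np.packbits(mask.reshape(-1, 8)[:, ::-1]).view(np.int32)

def unpack_mask(packed, shape):
    # inverse of pack_mask
    bits = np.unpackbits(packed.view(np.uint8)).reshape(-1, 8)[:, ::-1]
    return bits.reshape(-1)[:np.prod(shape)].reshape(shape).astype(np.bool)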
    def testBlocksparseMatMul(self):

        # layout = np.zeros((2,2), dtype=np.int32)
        # layout[0,0] = 1

        n, m = 160, 5
        layout = networkx.generators.barabasi_albert_graph(n, m)
        #layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5)
        layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
        layout[0:m,0:m] = 1

        #layout[0:60,0:60] = 1
        #layout = np.zeros((4,4), dtype=np.int32)
        #layout = np.ones((4,4), dtype=np.int32)

        #layout[0,0] = 1

        #layout = np.ones((1,1), dtype=np.int32)
        blocks = layout.sum()
        n = layout.shape[0]
        print(100 * blocks / n**2)
        print(layout.sum(axis=0).max(), layout.sum(axis=0).min())
        #exit()

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            for bsize, axis in ( (32,0), (16,0), (8,0), ): # (32,1), (32,0), (16,0), (8,0)

                bsmm = bs.BlocksparseMatMul(layout, block_size=bsize, feature_axis=axis, name="test")

                if one:

                    W = np.ones(bsmm.w_shape, dtype=np.float32)
                    for w in range(bsmm.blocks):
                        #c, k = bsmm.block_coord(w)
                        #if c == k:
                        W[w] = np.eye(bsmm.bsize, dtype=np.float32)

                    # W = np.ones(bsmm.w_shape, dtype=np.float32)
                    # W[:] += np.arange(32, dtype=np.float32).reshape(1,1,32)
                else:
                    # W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float16).astype(np.float32)
                    W = np.random.normal(loc=0.0, scale=0.01, size=bsmm.w_shape).astype(np.float16).astype(np.float32)



                # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
                # for w, (c, k) in enumerate(bsmm.updat_list):
                #     WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:]

                w = tf.constant(W)

                # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) )
                # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape)
                # print("identity_init: ", (s1 - s2).max())
                # exit()

                for N in (256,128,64,32,16,8,): # 128,64,32,16,1,  256,512,1024,2048,4096, 256,1024,4096,16384

                    if one:
                        X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                        E = np.ones(bsmm.o_shape(N), dtype=np.float32)

                        # X = np.eye(bsmm.bsize, dtype=np.float32)
                        # E = np.arange(X.size, dtype=np.float32).reshape(X.shape)

                        # X[:] += np.arange(X.size, dtype=np.float32).reshape(X.shape)
                        # X[:] += np.arange(32, dtype=np.float32).reshape(32,1)
                        # E[:] += np.arange(16, dtype=np.float32).reshape(1,32)
                        # X[:] += np.arange(64, dtype=np.float32).reshape(1,64)
                        # E[:] += np.arange(64, dtype=np.float32).reshape(1,64)
                    else:
                        # X = np.random.uniform(0.0, 10.0, bsmm.i_shape(N)).astype(np.float16).astype(np.float32)
                        # E = np.random.uniform(0.0, 10.0, bsmm.o_shape(N)).astype(np.float16).astype(np.float32)
                        X = np.random.normal(loc=0.0, scale=0.1, size=bsmm.i_shape(N)).astype(np.float16).astype(np.float32)
                        E = np.random.normal(loc=0.0, scale=0.1, size=bsmm.o_shape(N)).astype(np.float16).astype(np.float32)

                    x = tf.constant(X)
                    e = tf.constant(E)

                    for dtype in dtypes:

                        print("Axis:%d Bsize:%2d N:%d dtype:%s Params:%d" % (axis, bsize, N, dtype.name, bsize*bsize*blocks))

                        # compute in tensorflow
                        if l2norm:
                            w2 = bsmm.l2_normalize(w, dtype=dtype)
                        else:
                            w2 = bs.float_cast(w, dtype=dtype)

                        y = bs.float_cast(x, dtype=dtype)

                        for j in range(depth):
                            repeat = bench if bench and j==depth-1 else 0
                            y = bsmm(y, w2, bench=repeat) # (bench and j==depth-1) (bench and j==0)

                        y = bs.float_cast(y, dtype=tf.float32)

                        #if bench: sess.run( y )
                        #y = sess.run( y )
                        with tf.control_dependencies([y.op]):
                            d = bs.gradients(y, [x, w], e)
                        if depth > 1:
                            d[1] = bs.group_param_grads(d[1], 8)

                        sess.run(tf.global_variables_initializer())


                        #y, = sess.run( [y] )
                        y, (dx, dw) = sess.run( [y, d ] )

                        if not bench:
                            # compute in numpy
                            if l2norm:
                                W2 = bsmm.l2_normalize_test(W)
                            else:
                                W2 = W

                            Ys = [X]
                            for j in range(depth):
                                Ys.append(bsmm.fprop_test(Ys[-1], W2))
                            Y = Ys.pop()

                            DW = np.zeros(bsmm.w_shape, dtype=np.float32)
                            DX = E
                            for j in range(depth):
                                DW += bsmm.updat_test(Ys.pop(), DX)
                                DX  = bsmm.bprop_test(DX, W2)
                            if l2norm:
                                DW = bsmm.l2_normalize_grad_test(W, DW)

                            for op, cpuA, devA in (
                                (" y:",  Y,  y),
                                ("dx:", DX, dx),
                                ("dw:", DW, dw),
                            ):

                                difA = abs(cpuA - devA)

                                avgval  = np.average(abs(cpuA))
                                maxdif  = difA.max()
                                max_err = maxdif if avgval == 0 else maxdif / avgval

                                l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum())

                                #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err))

                                print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100*max_err, l2_err))

                                # rtol = 1e-4 if dtF is tf.float32 else 1e-1
                                # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol)
                                if out:
                                    np.savetxt("out.txt",  difA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                    np.savetxt("outC.txt", cpuA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                    np.savetxt("outD.txt", devA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                    exit()
                            print("")
    def atestBlocksparseMatMulGated(self):

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            N = 128
            K = 8*56*2*4
            n = K//8
            m = 30
            dtype = tf.float32
            repeat = 0
            dw_gated = False
            block_size = 8

            layout = networkx.generators.barabasi_albert_graph(n, m)
            layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
            layout[0:m,0:m] = 1

            blocks = layout.sum()
            n = layout.shape[0]
            print(100 * blocks / n**2)
            print(layout.sum(axis=0).max())

            # layout = np.ones((112,32), dtype=np.int32)
            bsmm = bs.BlocksparseMatMul(layout, block_size=block_size, feature_axis=0, name="test")

            if one:
                X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                E = np.ones(bsmm.o_shape(N), dtype=np.float32)
                W = np.ones(bsmm.w_shape   , dtype=np.float32)
                G = np.ones(bsmm.blocks    , dtype=np.float32)
            else:
                X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(N)).astype(np.float32)
                E = np.random.uniform(-1.0, 1.0, bsmm.o_shape(N)).astype(np.float32)
                W = np.random.uniform(-1.0, 1.0, bsmm.w_shape   ).astype(np.float32)
                G = np.random.uniform( 0.0, 1.0, bsmm.blocks    ).astype(np.float32)

            G = np.ones(bsmm.blocks, dtype=np.float32)
            for w, (c, k) in enumerate(bsmm.updat_list):
                G[w] = (c & 1) ^ (k & 1) ^ 1

            #G[::2] = 0.0

            # block = dict()
            # for w, (c, k) in enumerate(bsmm.updat_list):
            #     block[(c,k)] = w

            # grid = []
            # for c in range(bsmm.CB):
            #     row = []
            #     for k in range(bsmm.KB):
            #         row.append(G[block[(c,k)]])
            #     grid.append(row)

            # for row in grid:
            #     print(row)

            # exit()


            x = tf.constant(X)
            e = tf.constant(E)
            w = tf.constant(W)
            g = tf.constant(G)

            wf = bs.float_cast(w, dtype=dtype)
            xf = bs.float_cast(x, dtype=dtype)

            y = bsmm(xf, wf, gate=g, gate_grad=True, dw_gated=dw_gated, bench=repeat)

            y = bs.float_cast(y, dtype=tf.float32)

            d = bs.gradients(y, [x, w], e)

            sess.run( tf.global_variables_initializer() )

            y, (dx, dw) = sess.run( [y, d] )

            # gpu kernel doesn't touch zero gate blocks
            # for b in range(bsmm.blocks):
            #     if G[b] == 0.0:
            #         dw[b,:,:] = 0.0

            Y  = bsmm.fprop_test(X, W, gate=G)
            DX = bsmm.bprop_test(E, W, gate=G)
            DW = bsmm.updat_test(X, E, gate=G, dw_gated=dw_gated)

            #print(Y.shape, dtype)

            for op, cpuA, devA in (
                (" y:",  Y,  y),
                ("dx:", DX, dx),
                ("dw:", DW, dw),):

                difA = abs(cpuA - devA)

                avgval  = np.average(abs(cpuA))
                maxdif  = difA.max()
                max_err = maxdif if avgval == 0 else maxdif / avgval

                l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum() + 1e-12)

                print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100*max_err, l2_err))

                if out:
                    dim = K if op == "dw:" else N
                    np.savetxt("out.txt",  difA.reshape((-1,dim)), fmt='%5.1f')
                    np.savetxt("outC.txt", cpuA.reshape((-1,dim)), fmt='%5.1f')
                    np.savetxt("outD.txt", devA.reshape((-1,dim)), fmt='%5.1f')
                    exit()
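
For orientation, the commented-out assembly snippet in testBlocksparseMatMul corresponds to building a dense reference weight from the block parameters. Turned into a helper, it clarifies the layout convention (bsmm.C, bsmm.K, and bsmm.updat_list as used above); this is a sketch, not part of the library API:

def dense_weight(bsmm, W, bsize):
    # Scatter the [blocks, bsize, bsize] parameter into a dense [C, K] matrix,
    # mirroring the commented-out snippet in testBlocksparseMatMul.
    WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
    for w, (c, k) in enumerate(bsmm.updat_list):
        WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w, :, :]
    return WW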