Example #1
 def sparse_matmul(self, x, feature_axis, output_dim):
   """
   :param tf.Tensor x: input tensor
   :param int feature_axis: axis of x that carries the features
   :param int output_dim: number of output features
   :return: y, weights, bsmm
   :rtype: (tf.Tensor, tf.Variable, object)
   """
   block_size = self.block_size
   input_dim = x.get_shape().dims[feature_axis].value
   assert input_dim is not None, "%r shape unknown" % (x,)
   assert input_dim % block_size == 0 and output_dim % block_size == 0
   from blocksparse.matmul import BlocksparseMatMul
   seed = self.random.randint(2 ** 31)
   sparsity_pattern = sparsity_pattern_barabasi_albert(
     n1=input_dim // block_size, n2=output_dim // block_size, m=self.connectivity, dense=self.connectivity_dense,
     seed=seed)
   bsmm = BlocksparseMatMul(sparsity_pattern, block_size=block_size, feature_axis=feature_axis)
   if self.weights_identity_init:
     weights_init = bsmm.identity_init()
   else:
     weights_init = None
   weights = tf.get_variable("W", shape=bsmm.w_shape, initializer=weights_init)
   y = bsmm(x, weights)
   return y, weights, bsmm
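The helper `sparsity_pattern_barabasi_albert` used above is not shown in this
example. Below is a minimal sketch of what such a generator could look like,
built on networkx as the later test examples are; the function name, parameters
and behavior here are assumptions, not the actual helper's API.

import numpy
import networkx

def sparsity_pattern_barabasi_albert_sketch(n1, n2, m, seed=None):
  """Sketch: build an (n1, n2) 0/1 block layout from a Barabasi-Albert graph,
  keeping the diagonal connected."""
  graph = networkx.generators.barabasi_albert_graph(n=max(n1, n2), m=m, seed=seed)
  layout = networkx.adjacency_matrix(graph).toarray().astype(numpy.int32)
  layout += numpy.eye(layout.shape[0], dtype=numpy.int32)
  return numpy.clip(layout[:n1, :n2], 0, 1)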
Example #2
def test_blocksparse_simple_feature_axis1():
  init_blocksparse()

  from blocksparse.matmul import BlocksparseMatMul
  import tensorflow as tf
  import numpy
  from numpy.testing import assert_allclose

  n_in = 64
  n_out = 32 * 32
  block_size = 32
  n_batch = 4

  # Create a dense sparsity pattern
  mask = numpy.ones((n_in // block_size, n_out // block_size), dtype=numpy.int32)
  # MatMul object
  bsmm = BlocksparseMatMul(mask, block_size=block_size, feature_axis=1, name="bsmm")
  # Input
  x_np = numpy.arange(n_in * n_batch, dtype=numpy.float32).reshape((n_batch, n_in)) + 1.0
  x = tf.constant(x_np, name='x')
  # Block-sparse weights
  w_np = bsmm.identity_init()()
  w = tf.constant(w_np, name="w")
  # Block-sparse matrix multiplication
  y = bsmm(x, w)
  y.set_shape((n_batch, n_out))
  # Run (`session` is the module-level TF session, see the sketch below)
  result = session.run(y)
  print(result)
  print('L2:', numpy.sum(result ** 2))
  y_test = bsmm.fprop_test(x_np, w_np)
  print(y_test)
  assert_allclose(result, y_test)
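Examples #2, #4, #5 and #6 rely on a module-level `session` and an
`init_blocksparse()` helper that the snippets do not show. A hedged sketch of
the kind of module-level setup they appear to assume (the real helpers in the
source test module may differ):

import tensorflow as tf

session = tf.InteractiveSession()

def init_blocksparse():
  # Placeholder: presumably verifies that the compiled blocksparse CUDA ops
  # can be loaded before any test runs.
  from blocksparse.matmul import BlocksparseMatMul  # import check only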
Example #3
    def atestBlocksparseMatMulCPU(self):
        # The leading "a" in the name appears to disable this test; `conf`,
        # `one` and `bench` are module-level settings in the enclosing
        # test file.
        # n, m = 64*8, 64
        # #layout = networkx.generators.barabasi_albert_graph(n, m)
        # layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .2)
        # layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
        # layout[0:m,0:m] = 1

        # blocks = layout.sum()
        # print(100 * blocks / n**2)
        # print(layout.sum(axis=0).max())

        with self.test_session(config=conf) as sess, tf.device("/cpu:0"):
            for bsize, axis in ((32, 0), (16, 0), (8, 0)):

                layout = np.ones((4 * 1024 // bsize, 4 * 1024 // bsize),
                                 dtype=np.int32)

                bsmm = BlocksparseMatMul(layout,
                                         block_size=bsize,
                                         feature_axis=axis,
                                         name="test")

                if one:
                    W = np.ones(bsmm.w_shape, dtype=np.float32)
                    X = np.ones(bsmm.i_shape(1), dtype=np.float32)
                else:
                    W = np.random.uniform(-1.0, 1.0,
                                          bsmm.w_shape).astype(np.float32)
                    X = np.random.uniform(-1.0, 1.0,
                                          bsmm.i_shape(1)).astype(np.float32)

                w = tf.constant(W)
                x = tf.constant(X)
                y = sess.run(bsmm(x, w, bench=bench))

                #start = time()
                Y = bsmm.fprop_test(X, W)
                #print("np time:", round(time() - start, 2))

                difY = abs(Y - y)

                avgval = np.average(abs(Y))
                maxdif = difY.max()
                max_err = maxdif if avgval == 0 else maxdif / avgval

                l2_err = np.sqrt(np.square(difY).sum()) / np.sqrt(
                    np.square(Y).sum())

                print("cpu max_err%%: %11.8f L2_err: %12.10f" %
                      (100 * max_err, l2_err))
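The `fprop_test` reference used in these checks can be reproduced with a plain
dense matmul, as the commented-out code in Example #11 further below suggests:
scatter each weight block into a dense (C, K) matrix via `bsmm.updat_list` and
multiply. A sketch for feature_axis=0, treating `updat_list`, `C` and `K` as
the attributes those comments imply:

import numpy as np

def dense_reference(bsmm, X, W, bsize):
    # scatter the (blocks, bsize, bsize) weight tensor into a dense matrix
    W_dense = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
    for b, (c, k) in enumerate(bsmm.updat_list):
        W_dense[c * bsize:(c + 1) * bsize, k * bsize:(k + 1) * bsize] = W[b]
    # feature_axis=0 puts features on the leading axis, so y = W_dense.T @ X
    return np.dot(W_dense.T, X)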
Example #4
def test_blocksparse_simple():
    init_blocksparse()

    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy as np
    from numpy.testing import assert_allclose

    hidden_size = 4096
    block_size = 32
    minibatch_size = 64

    # Create a (random) sparsity pattern
    sparsity = np.random.randint(2,
                                 size=(hidden_size // block_size,
                                       hidden_size // block_size))

    # Initialize the sparse matrix multiplication object
    bsmm = BlocksparseMatMul(sparsity, block_size=block_size, feature_axis=0)

    # Input to graph
    x = tf.placeholder(tf.float32, shape=[hidden_size, None])
    x_np = np.ones((hidden_size, minibatch_size), dtype='float32')

    # Initialize block-sparse weights
    w = tf.get_variable("w",
                        bsmm.w_shape,
                        dtype=tf.float32,
                        initializer=tf.random_uniform_initializer(-0.1,
                                                                  0.1,
                                                                  seed=3))

    # Block-sparse matrix multiplication
    y = bsmm(x, w)

    # Run
    print('init vars')
    session.run(tf.global_variables_initializer())
    print('blocksparse matmul')
    result = session.run(y, feed_dict={x: x_np})
    print(result)
    print('test')
    w_np = session.run(w)
    y_test = bsmm.fprop_test(x_np, w_np)
    print(y_test)
    i = np.argmax((y_test - result)**2)
    print('biggest diff at %i: %r vs %r' %
          (i, y_test.flatten()[i], result.flatten()[i]))
    assert_allclose(result, y_test, rtol=1e-2)  # rtol=1e-03 still fails
Example #5
def test_blocksparse_simple():
    init_blocksparse()

    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy as np

    hidden_size = 4096
    block_size = 32
    minibatch_size = 64

    # Create a (random) sparsity pattern
    sparsity = np.random.randint(2,
                                 size=(hidden_size // block_size,
                                       hidden_size // block_size))

    # Initialize the sparse matrix multiplication object
    bsmm = BlocksparseMatMul(sparsity, block_size=block_size, feature_axis=0)

    # Input to graph
    x = tf.placeholder(tf.float32, shape=[None, hidden_size])

    # Initialize block-sparse weights
    w = tf.get_variable("w", bsmm.w_shape, dtype=tf.float32)

    # Block-sparse matrix multiplication
    y = bsmm(x, w)

    # Run
    session.run(tf.global_variables_initializer())
    result = session.run(
        [y],
        feed_dict={x: np.ones((minibatch_size, hidden_size), dtype='float32')})
    print(result)
Example #6
def test_blocksparse_simple_identity():
    init_blocksparse()

    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy
    from numpy.testing import assert_allclose

    n_in = 64
    n_out = 32 * 32
    block_size = 32
    # Note: batch sizes below 4, as well as non-powers of 2, seem to fail.
    n_batch = 4

    # Create a dense sparsity pattern
    mask = numpy.ones((n_in // block_size, n_out // block_size),
                      dtype=numpy.int32)
    # MatMul object
    bsmm = BlocksparseMatMul(mask,
                             block_size=block_size,
                             feature_axis=0,
                             name="bsmm")
    # Input
    x_np = numpy.arange(n_in * n_batch, dtype=numpy.float32).reshape(
        (n_in, n_batch)) + 1.0
    x = tf.constant(x_np, name='x')
    # Block-sparse weights
    w_np = bsmm.identity_init()()
    w = tf.constant(w_np, name="w")
    #for b in range(bsmm.blocks):
    #  cb, kb = bsmm.updat_list[b]
    #  print("block %i/%i, cb %i/%i, kb %i/%i" % (b, bsmm.blocks, cb, bsmm.KB, kb, bsmm.CB))
    # Block-sparse matrix multiplication
    y = bsmm(x, w)
    y.set_shape((n_out, n_batch))
    # Run
    result = session.run(y)
    print(result)
    print('L2:', numpy.sum(result**2))
    y_test = bsmm.fprop_test(x_np, w_np)
    print(y_test)
    i = numpy.argmax((y_test - result)**2)
    print('biggest diff at %i: %r vs %r' %
          (i, y_test.flatten()[i], result.flatten()[i]))
    assert_allclose(result, y_test, rtol=1e-2)
Example #7
def test_sparse_dense_bsr_gray(minibatch_size, N, K, BS_R, BS_C, density):
    """Run and profile a block-sparse (BSR) dense matmul with TensorFlow."""
    print("testing param", minibatch_size, N, K, BS_R, BS_C, density)
    # Initialize the sparse matrix multiplication object
    feature_axis = 0 if BS_R in [8, 16] else 1

    # Create a (random) sparsity pattern
    sparsity = random_bsr_matrix_helper(K, N, BS_R, BS_C, density, 'float32')
    bsmm = BlocksparseMatMul(sparsity,
                             block_size=BS_R,
                             feature_axis=feature_axis)
    # Initialize block-sparse weights
    w = tf.get_variable("w", bsmm.w_shape, dtype=tf.float32)

    if feature_axis == 0:
        # Input to graph
        x = tf.get_variable("x", [K, minibatch_size], dtype=tf.float32)
    else:
        # Input to graph
        x = tf.get_variable("x", [minibatch_size, K], dtype=tf.float32)

    # Block-sparse matrix multiplication
    y = bsmm(x, w)

    # Run
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    run_metadata = tf.RunMetadata()
    sess.run([y],
             run_metadata=run_metadata,
             options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE))

    # Print to stdout an analysis of the memory usage and the timing
    # information, broken down by operation type.
    tf.profiler.profile(
        tf.get_default_graph(),
        run_meta=run_metadata,
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.time_and_memory())

    tf.reset_default_graph()
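The helper `random_bsr_matrix_helper` is not shown in this example. Below is a
plausible stand-in consistent with how it is used above; this is an assumption,
and the real helper may return a different structure or actually use the dtype
argument:

import numpy as np

def random_bsr_matrix_helper(K, N, BS_R, BS_C, density, dtype):
    # 0/1 block layout of shape (K // BS_R, N // BS_C) with ~density ones
    layout = (np.random.rand(K // BS_R, N // BS_C) < density).astype(np.int32)
    layout[0, 0] = 1  # keep at least one block so the op is well-defined
    return layout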
Example #8
def bench_mm_openai(x, w, mode, trans_a, trans_b, layout, block, num_repeat):
  # import and disable all logging
  import os
  os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
  import warnings
  warnings.filterwarnings('ignore', category=FutureWarning)
  import torch  # x, w and layout arrive as torch tensors
  from blocksparse.matmul import BlocksparseMatMul
  from blocksparse.transformer import BlocksparseTransformer
  import tensorflow as tf
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
  import numpy as np
  sparsity = layout.cpu().numpy()
  # create operator
  transformer = BlocksparseTransformer(sparsity, heads=layout.shape[0], block_size=block)
  dot_sdd_nt = transformer.nt_op
  dot_dsd_tn = transformer.tn_op
  dot_dsd_nn = transformer.nn_op
  dot_dds_nn = None if mode != 'dds' else BlocksparseMatMul(sparsity[0,:,:], block_size=block)
  key = (mode, trans_a, trans_b)
  ops = {('sdd', False, True): dot_sdd_nt,
         ('dsd', True, False): dot_dsd_tn,
         ('dsd', False, False): dot_dsd_nn,
         ('dds', False, False): dot_dds_nn}
  if x.dtype == torch.float32 and (mode == 'dsd' or block != 32):
    return None
  if key not in ops:
    return None
  if mode == 'dds' and x.shape[0]*x.shape[1] != 1:
    return None
  op = ops[key]
  # placeholder
  x = x.view(x.shape[0]*x.shape[1], x.shape[2], x.shape[3])
  w = w.view(w.shape[0]*w.shape[1], w.shape[2], w.shape[3])
  sparse_shape = [x.shape[0], layout.shape[0], int(layout[0].sum()), block, block]
  vx = tf.placeholder(tf.float32, shape=sparse_shape if mode == 'dsd' else x.shape)
  vw = tf.placeholder(tf.float32, shape=sparse_shape if mode == 'dds' else w.shape)
  x = np.random.rand(*sparse_shape) if mode == 'dsd' else x.cpu().detach().numpy()
  w = np.random.rand(*sparse_shape) if mode == 'dds' else w.cpu().detach().numpy()
  # Block-sparse matrix multiplication
  y = op(vx, vw, bench=num_repeat)
  # Run
  sess = tf.InteractiveSession()
  sess.run(tf.global_variables_initializer())
  result = sess.run([y], feed_dict={vx: x, vw: w})
  sess.close()
Example #9
import os
import json
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.client import timeline
from blocksparse.matmul import BlocksparseMatMul


def profile(batch_size, input_size, output_size, block_size, sparsity):
    # use integer division so the block counts are valid shape dimensions
    num_input_blocks = input_size // block_size
    num_output_blocks = output_size // block_size
    num_blocks = num_input_blocks * num_output_blocks
    num_pruned_blocks = int(num_blocks * sparsity)
    num_remain_blocks = num_blocks - num_pruned_blocks

    actual_sparsity = num_pruned_blocks / float(num_blocks)

    # generate layout
    layout = np.array([0] * num_pruned_blocks + [1] * num_remain_blocks)
    np.random.shuffle(layout)
    layout = layout.reshape((num_input_blocks, num_output_blocks))

    # generate shuffle order (list() is needed on Python 3, where range is lazy)
    indices = list(range(output_size))
    random.shuffle(indices)

    tf.reset_default_graph()
    with tf.Session() as sess:
        bsmm = BlocksparseMatMul(layout, block_size=block_size)
        i = tf.constant(indices)
        x = tf.placeholder(tf.float32, shape=(batch_size, input_size))
        w = tf.get_variable('w', bsmm.w_shape, dtype=tf.float32)
        y = bsmm(x, w)
        y = tf.gather(y, i, axis=1)

        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        sess.run(tf.global_variables_initializer())
        sess.run(y,
                 feed_dict={x: np.ones((batch_size, input_size), dtype='float32')},
                 options=options,
                 run_metadata=run_metadata)
        fetched_timeline = timeline.Timeline(run_metadata.step_stats)
        chrome_trace = fetched_timeline.generate_chrome_trace_format()
    with open('timeline.json', 'w') as f:
        f.write(chrome_trace)

    # pull the per-op durations (in microseconds) out of the Chrome trace
    with open('timeline.json', 'r') as f:
        o = json.load(f)['traceEvents']
        mm_time = int(next(item for item in o
                           if item['name'] == u'BlocksparseMatmul')['dur'])
        gather_time = int(next(item for item in o
                               if item['name'].startswith(u'Gather'))['dur'])

    os.remove('timeline.json')

    return actual_sparsity, mm_time + gather_time
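A usage sketch for profile(): sweep a few sparsity levels and print the
measured matmul plus gather time. The parameter values here are arbitrary:

if __name__ == '__main__':
    for sparsity in (0.0, 0.5, 0.9):
        actual, usec = profile(batch_size=64, input_size=4096,
                               output_size=4096, block_size=32,
                               sparsity=sparsity)
        print('sparsity %.2f -> %d us' % (actual, usec))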
Example #10
    def gen_masks(self):
        hps = self.hps
        hps.bsmm = bsmm = dict()

        assert hps.nhidden % hps.block_size == 0
        assert hps.nembd % 32 == 0

        # Create block-sparse matmul ops (to be shared by all instances of the model)
        # We only need 1 instance of the lut constants
        with tf.name_scope("BlocksparseMatMul"):

            if hps.nproj_in != hps.nhidden:
                # assume small projection values are actually strides
                if hps.nproj_in <= hps.block_size * 4:
                    hps.sproj_mul = SparseProj(hps.nhidden,
                                               proj_stride=hps.nproj_in)
                    hps.sproj_add = SparseProj(hps.nhidden,
                                               proj_stride=hps.nproj_in)
                    hps.nproj_in = hps.sproj_mul.nproj
                else:
                    hps.sproj_mul = SparseProj(hps.nhidden, nproj=hps.nproj_in)
                    hps.sproj_add = SparseProj(hps.nhidden, nproj=hps.nproj_in)
            else:
                hps.sproj_mul = None
                hps.sproj_add = None

            if hps.nproj_out != hps.nhidden:
                # assume small projection values are actually strides
                if hps.nproj_out <= hps.block_size * 4:
                    hps.sproj_out = SparseProj(hps.nhidden,
                                               proj_stride=hps.nproj_out,
                                               block_size=32)
                    hps.nproj_out = hps.sproj_out.nproj
                else:
                    hps.sproj_out = SparseProj(hps.nhidden,
                                               nproj=hps.nproj_out)
            else:
                hps.sproj_out = None

            # for the input and output projections, use the largest block size that fits
            blk_in, nproj_in = largest_block(hps.nproj_in)
            blk_out, nproj_out = largest_block(hps.nproj_out)

            nhidden = hps.nhidden // hps.block_size
            nembd = hps.nembd // blk_in
            nvocab = ceil_div(hps.nvocab, blk_out)

            # the dense input mask
            mask = np.ones((nembd, nproj_in), dtype=np.int32)
            bsmm["x"] = BlocksparseMatMul(mask,
                                          block_size=blk_in,
                                          feature_axis=hps.axis,
                                          name="lstm_x")

            istep_masks = []
            if hps.share_masks:
                # all gates and internal steps get the same mask
                mask = masks.make_mask(n=nhidden, kind=hps.sparsity)
                bsmm_p = BlocksparseMatMul(mask,
                                           block_size=hps.block_size,
                                           feature_axis=hps.axis,
                                           name="lstm_h")

                for p in list("ifou") + ["h%d" % i for i in range(hps.isteps)]:
                    bsmm[p] = bsmm_p

                istep_masks = [mask for i in range(hps.isteps + 1)]
            else:
                # internal steps get different masks
                for p in ["h%d" % i for i in range(hps.isteps)]:
                    mask = masks.make_mask(n=nhidden, kind=hps.sparsity)
                    bsmm[p] = BlocksparseMatMul(mask,
                                                block_size=hps.block_size,
                                                feature_axis=hps.axis,
                                                name="lstm_%s" % p)
                    istep_masks.append(mask)

                # gates get the same mask (TODO: experiment here with different masks)
                mask = masks.make_mask(n=nhidden, kind=hps.sparsity)
                bsmm_p = BlocksparseMatMul(mask,
                                           block_size=hps.block_size,
                                           feature_axis=hps.axis,
                                           name="lstm_g")
                for p in list("ifou"):
                    bsmm[p] = bsmm_p

                istep_masks.append(mask)

            # the output mask
            mask = np.ones((nproj_out, nvocab), dtype=np.int32)
            bsmm["y"] = BlocksparseMatMul(mask,
                                          block_size=blk_out,
                                          feature_axis=hps.axis,
                                          name="lstm_o")

            hps.mix_factor = masks.mix_factor(istep_masks)
            hps.sparsity += " (%.4f%%)" % (100.0 * bsmm["u"].sparsity)
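`ceil_div` and `largest_block` are helpers not shown in this example. Plausible
definitions consistent with how they are used above (assumptions, not the
actual implementations):

def ceil_div(a, b):
    # integer ceiling division
    return -(-a // b)

def largest_block(n):
    # largest supported block size (32, 16 or 8) that divides n, plus the
    # resulting number of blocks
    for blk in (32, 16, 8):
        if n % blk == 0:
            return blk, n // blk
    return 8, ceil_div(n, 8)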
Example #11
    def testBlocksparseMatMul(self):
        # `networkx`, `conf`, `one`, `dtypes`, `l2norm`, `ew`, `depth`,
        # `bench`, `am`, `group_param_grads` and `out` come from the
        # enclosing test module.

        # layout = np.zeros((2,2), dtype=np.int32)
        # layout[0,0] = 1

        n, m = 56 * 8, 8
        layout = networkx.generators.barabasi_albert_graph(n, m)
        #layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5)
        layout = networkx.adjacency_matrix(layout).toarray().astype(
            np.int32) + np.eye(n, dtype=np.int32)
        layout[0:m, 0:m] = 1

        #layout[0:60,0:60] = 1
        #layout = np.zeros((4,4), dtype=np.int32)
        #layout = np.ones((28*12,28*12), dtype=np.int32)
        #layout[0,0] = 1

        blocks = layout.sum()
        n = layout.shape[0]
        print(100 * blocks / n**2)
        print(layout.sum(axis=0).max())
        #exit()

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            for bsize, axis in (
                (32, 1),
                (32, 0),
                (16, 0),
                (8, 0),
            ):  # (32,1), (32,0), (16,0), (8,0)

                bsmm = BlocksparseMatMul(layout,
                                         block_size=bsize,
                                         feature_axis=axis,
                                         name="test")

                if one:
                    W = np.ones(bsmm.w_shape, dtype=np.float32)
                    #W[:] += np.arange(8, dtype=np.float32).reshape(1,8)
                else:
                    W = np.random.uniform(-1.0, 1.0,
                                          bsmm.w_shape).astype(np.float32)

                # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
                # for w, (c, k) in enumerate(bsmm.updat_list):
                #     WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:]

                w = tf.constant(W)

                # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) )
                # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape)
                # print("identity_init: ", (s1 - s2).max())

                for N in (64, ):  # 128,64,32,16,1,

                    if one:
                        X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                        E = np.ones(bsmm.o_shape(N), dtype=np.float32)
                        #X[:] += np.arange(8, dtype=np.float32).reshape(8,1)
                    else:
                        X = np.random.uniform(
                            -1.0, 1.0, bsmm.i_shape(N)).astype(np.float32)
                        E = np.random.uniform(
                            -1.0, 1.0, bsmm.o_shape(N)).astype(np.float32)

                    x = tf.constant(X)
                    e = tf.constant(E)

                    for dtF, dtB in dtypes:

                        print("Axis:%d Bsize:%2d N:%d F:%s B:%s Params:%d" %
                              (axis, bsize, N, dtF.name, dtB.name,
                               bsize * bsize * blocks))

                        # compute in tensorflow
                        if l2norm:
                            w2 = bsmm.l2_normalize(w, dtype=dtF)
                        else:
                            w2 = ew.float_cast(w, dtype=dtF)

                        y = ew.float_cast(x, dtype=dtF)

                        for j in range(depth):
                            # only benchmark the last matmul in the stack
                            repeat = bench if bench and j == depth - 1 else 0
                            y = bsmm(y, w2, dw_dtype=dtF, bench=repeat)

                        y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtB)
                        if bench: sess.run(y)
                        #y = sess.run( y )

                        d = tf.gradients(y, [x, w], e, aggregation_method=am)
                        if depth > 1:
                            d[1] = group_param_grads(d[1], 8)

                        y, (dx, dw) = sess.run([y, d])

                        if not bench:
                            # compute in numpy
                            if l2norm:
                                W2 = bsmm.l2_normalize_test(W)
                            else:
                                W2 = W

                            # YY = np.dot(WW.T, X)
                            # ZZ = np.dot(WW  , E)
                            # uu = np.dot( X  , E.T)
                            # UU = np.zeros(bsmm.w_shape, dtype=np.float32)
                            # for w, (c, k) in enumerate(bsmm.updat_list):
                            #     UU[w,:,:] = uu[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize]

                            Ys = [X]
                            for j in range(depth):
                                Ys.append(bsmm.fprop_test(Ys[-1], W2))
                            Y = Ys.pop()

                            DW = np.zeros(bsmm.w_shape, dtype=np.float32)
                            DX = E
                            for j in range(depth):
                                DW += bsmm.updat_test(Ys.pop(), DX)
                                DX = bsmm.bprop_test(DX, W2)
                            if l2norm:
                                DW = bsmm.l2_normalize_grad_test(W, DW)

                            for op, cpuA, devA in (
                                    # ("YY:", YY,  y),
                                    # ("ZZ:", ZZ, dx),
                                    # ("UU:", UU, dw),
                                (" y:", Y, y),
                                ("dx:", DX, dx),
                                ("dw:", DW, dw),
                            ):

                                difA = abs(cpuA - devA)

                                avgval = np.average(abs(cpuA))
                                maxdif = difA.max()
                                max_err = maxdif if avgval == 0 else maxdif / avgval

                                l2_err = np.sqrt(
                                    np.square(difA).sum()) / np.sqrt(
                                        np.square(cpuA).sum())

                                #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err))

                                print("%s max_err%%:%11.8f L2_err: %12.10f" %
                                      (op, 100 * max_err, l2_err))

                                # rtol = 1e-4 if dtF is tf.float32 else 1e-1
                                # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol)
                                if out:
                                    dim = bsmm.K if op == "dw:" else N
                                    np.savetxt("out.txt",
                                               difA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    np.savetxt("outC.txt",
                                               cpuA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    np.savetxt("outD.txt",
                                               devA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    exit()
                            print("")
Example #12
    def atestBlocksparseMatMulGated(self):

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            N = 128
            K = 8 * 56 * 2 * 4
            n = K // 8
            m = 30
            dtype = tf.bfloat16
            repeat = 10000

            layout = networkx.generators.barabasi_albert_graph(n, m)
            layout = networkx.adjacency_matrix(layout).toarray().astype(
                np.int32) + np.eye(n, dtype=np.int32)
            layout[0:m, 0:m] = 1

            blocks = layout.sum()
            n = layout.shape[0]
            print(100 * blocks / n**2)
            print(layout.sum(axis=0).max())

            # layout = np.ones((112,32), dtype=np.int32)
            bsmm = BlocksparseMatMul(layout,
                                     block_size=8,
                                     feature_axis=0,
                                     name="test")

            if one:
                X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                E = np.ones(bsmm.o_shape(N), dtype=np.float32)
                W = np.ones(bsmm.w_shape, dtype=np.float32)
                G = np.ones(bsmm.blocks, dtype=np.float32)
            else:
                X = np.random.uniform(-1.0, 1.0,
                                      bsmm.i_shape(N)).astype(np.float32)
                E = np.random.uniform(-1.0, 1.0,
                                      bsmm.o_shape(N)).astype(np.float32)
                W = np.random.uniform(-1.0, 1.0,
                                      bsmm.w_shape).astype(np.float32)
                G = np.random.uniform(0.0, 1.0, bsmm.blocks).astype(np.float32)

            # override: use all-ones gates (the commented code below explores other patterns)
            G = np.ones(bsmm.blocks, dtype=np.float32)
            # for w, (c, k) in enumerate(bsmm.updat_list):
            #     G[w] = (c & 1) ^ (k & 1) ^ 1

            #G[::2] = 0.0

            # block = dict()
            # for w, (c, k) in enumerate(bsmm.updat_list):
            #     block[(c,k)] = w

            # grid = []
            # for c in range(bsmm.CB):
            #     row = []
            #     for k in range(bsmm.KB):
            #         row.append(G[block[(c,k)]])
            #     grid.append(row)

            # for row in grid:
            #     print(row)

            # exit()

            x = tf.constant(X)
            e = tf.constant(E)
            w = tf.constant(W)
            g = tf.constant(G)

            w2 = ew.float_cast(w, dtype=dtype)
            y = ew.float_cast(x, dtype=dtype)

            y = bsmm(y, w2, gate=g, bench=repeat)

            y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtype)

            d = tf.gradients(y, [x, w], e)

            y, (dx, dw) = sess.run([y, d])

            # gpu kernel doesn't touch zero gate blocks
            # for b in range(bsmm.blocks):
            #     if G[b] == 0.0:
            #         dw[b,:,:] = 0.0

            Y = bsmm.fprop_test(X, W, gate=G)
            DX = bsmm.bprop_test(E, W, gate=G)
            DW = bsmm.updat_test(X, E, gate=G)

            #print(Y.shape, dtype)

            for op, cpuA, devA in (
                (" y:", Y, y),
                ("dx:", DX, dx),
                ("dw:", DW, dw),
            ):

                difA = abs(cpuA - devA)

                avgval = np.average(abs(cpuA))
                maxdif = difA.max()
                max_err = maxdif if avgval == 0 else maxdif / avgval

                l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(
                    np.square(cpuA).sum() + 1e-12)

                print("%s max_err%%:%11.8f L2_err: %12.10f" %
                      (op, 100 * max_err, l2_err))

                if out:
                    dim = K if op == "dw:" else N
                    np.savetxt("out.txt", difA.reshape((-1, dim)), fmt='%5.1f')
                    np.savetxt("outC.txt",
                               cpuA.reshape((-1, dim)),
                               fmt='%5.1f')
                    np.savetxt("outD.txt",
                               devA.reshape((-1, dim)),
                               fmt='%5.1f')
                    exit()
Example #13
sys.path.insert(0, "..")  # blocksparse base
sys.path.insert(0, "../../..")  # Returnn base

from blocksparse.matmul import BlocksparseMatMul
import tensorflow as tf
import numpy as np

hidden_size = 4096
block_size = 32
minibatch_size = 64

# Create a (random) sparsity pattern
sparsity = np.random.randint(2, size=(hidden_size // block_size, hidden_size // block_size))

# Initialize the sparse matrix multiplication object
bsmm = BlocksparseMatMul(sparsity, block_size=block_size)

# Input to graph
x = tf.placeholder(tf.float32, shape=[None, hidden_size])

# Initialize block-sparse weights
w = tf.get_variable("w", bsmm.w_shape, dtype=tf.float32)

# Block-sparse matrix multiplication
y = bsmm(x, w)

# Run
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
result = sess.run([y], feed_dict={x: np.ones((minibatch_size, hidden_size), dtype='float32')})
print(result)
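Across these examples, feature_axis=0 is used whenever the block size is 8 or
16 (see Example #7's `feature_axis = 0 if BS_R in [8, 16] else 1`), while a
block size of 32 is also exercised with feature_axis=1 (Examples #2 and #11).
A tiny helper capturing that convention; this is an inference from the
examples, not a documented rule:

def pick_feature_axis(block_size):
    # feature_axis=1 appears to be supported only for 32x32 blocks
    return 1 if block_size == 32 else 0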
                if mask == "ws":
                    layout = networkx.generators.random_graphs.watts_strogatz_graph(
                        n, m * 2, .2)
                    layout = networkx.adjacency_matrix(layout).toarray(
                    ).astype(np.int32) + np.eye(n, dtype=np.int32)
                else:
                    layout = networkx.generators.barabasi_albert_graph(n, m)
                    layout = networkx.adjacency_matrix(layout).toarray(
                    ).astype(np.int32) + np.eye(n, dtype=np.int32)
                    layout[0:m, 0:m] = 1

            # print("axis:%d bsize:%2d hsize:%d params:%d sparsity:%.2f m:%d" % (axis, bsize, hsize, bsize*bsize*blks, spar, m))
            # continue

            bsmm = BlocksparseMatMul(layout,
                                     block_size=bsize,
                                     feature_axis=axis,
                                     name="test")

            W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32)
            w = tf.constant(W)

            for N in (64, ):  # 128,64,32,16,1,

                X = np.random.uniform(-1.0, 1.0,
                                      bsmm.i_shape(N)).astype(np.float32)
                E = np.random.uniform(-1.0, 1.0,
                                      bsmm.o_shape(N)).astype(np.float32)
                x = tf.constant(X)
                e = tf.constant(E)

                for dtype in (tf.bfloat16, ):  # tf.bfloat16, tf.bfloat32,