def testTranspose(self):
    """Print the error of ``bs.transpose_2d`` relative to ``tf.transpose``.

    For every entry of the module-level ``shapes`` a uniform random input in
    [-1, 1) is drawn and round-tripped through float16.  Both transposes are
    evaluated for float16 and float32 inputs and two error figures are
    printed: the max absolute difference (divided by the mean magnitude of
    the reference unless that mean is zero) and the relative L2 error.
    Nothing is asserted.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        for shape in shapes:
            host_in = np.random.uniform(-1.0, 1.0, shape)
            host_in = host_in.astype(np.float16).astype(np.float32)
            x = tf.placeholder(tf.float32, shape, name="x")

            for dtype in (tf.float16, tf.float32):
                x_cast = bs.float_cast(x, dtype=dtype)

                # op under test, then the TensorFlow reference; both are
                # brought back to fp32 for comparison
                dev_op = bs.float_cast(bs.transpose_2d(x_cast), dtype=tf.float32)
                ref_op = bs.float_cast(tf.transpose(x_cast), dtype=tf.float32)

                dev_out, ref_out = sess.run([dev_op, ref_op],
                                            feed_dict={x: host_in})

                abs_diff = np.abs(ref_out - dev_out)
                mean_mag = np.average(abs(ref_out))
                worst = abs_diff.max()
                if mean_mag == 0:
                    max_err = worst
                else:
                    max_err = worst / mean_mag
                diff_norm = np.sqrt(np.square(abs_diff).sum())
                ref_norm = np.sqrt(np.square(ref_out).sum())
                l2_err = diff_norm / ref_norm

                print("%s, shape:%16s, err:%17.12f, l2_err:%17.12f" %
                      (dtype.name, str(shape), max_err, l2_err))
def atestGateGrad(self):
    """Print numeric-vs-analytic gradient errors of a gated BlocksparseMatMul.

    A dense 2x2 block layout (block size 8) is multiplied in float16 with a
    per-block gate and ``gate_grad=True``; the gradient error reported by
    ``gradient_checker.compute_gradient_error`` is printed for the input,
    the weights and the gate, in that order.  Nothing is asserted.

    NOTE(review): the name does not start with ``test``, so the default
    unittest loader will not collect it -- presumably disabled on purpose.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        dtype = tf.float16

        # builtin bool instead of np.bool: that alias was deprecated in
        # NumPy 1.20 and is absent from NumPy 1.24-1.26
        layout = np.ones([2, 2], dtype=bool)
        bsmm = bs.BlocksparseMatMul(layout, block_size=8, feature_axis=0, name="test")

        # inputs are round-tripped through float16 before use
        X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(64)).astype(
            np.float16).astype(np.float32)
        W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(
            np.float16).astype(np.float32)
        G = np.random.uniform(0.0, 1.0, bsmm.blocks).astype(
            np.float16).astype(np.float32)

        x = tf.constant(X)
        w = tf.constant(W)
        g = tf.constant(G)

        wf = bs.float_cast(w, dtype=dtype)
        xf = bs.float_cast(x, dtype=dtype)

        y = bsmm(xf, wf, gate=g, gate_grad=True, bench=0)
        y = bs.float_cast(y, dtype=tf.float32)

        sess.run(tf.global_variables_initializer())

        # same check for each differentiable input: x, then w, then g
        for wrt in (x, w, g):
            error = gradient_checker.compute_gradient_error(
                wrt, wrt.shape, y, y.shape)
            print(error)
def testSoftmaxCrossEntropy(self):
    """Compare ``bs.softmax_cross_entropy`` with TensorFlow's sparse version.

    For several class counts ``K`` the fused op is fed float16 logits and
    uint16 labels, while the reference receives the fp32 logits and int32
    labels.  The loss and the gradient w.r.t. the fp32 logits (seeded with a
    random upstream gradient) go to ``self.compare_results`` unless the
    module-level ``bench`` flag is truthy.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        N = 3
        class_counts = (10, 256, 512, 1024*8, 1024*16, 1024*32, 1024*64)

        for K in class_counts:
            # reseeded from the wall clock on every iteration
            np.random.seed(int(time()))

            logits_np = np.random.normal(0.0, 1.0, (N, K))
            logits_np = logits_np.astype(np.float16).astype(np.float32)
            upstream_np = np.random.normal(0.0, 1.0, (N, ))
            upstream_np = upstream_np.astype(np.float16).astype(np.float32)
            labels_np = np.random.randint(0, K, size=(N, ), dtype=np.uint16)

            x = tf.placeholder(tf.float32, logits_np.shape)
            e = tf.placeholder(tf.float32, upstream_np.shape)
            i = tf.placeholder(tf.uint16, labels_np.shape)
            feed_dict = {x: logits_np, i: labels_np, e: upstream_np}

            # op under test runs on fp16 logits
            logits_fp16 = bs.float_cast(x, dtype=tf.float16)
            dev_loss = bs.softmax_cross_entropy(logits=logits_fp16, labels=i)

            # reference runs on the fp32 logits
            ref_labels = tf.cast(i, tf.int32)
            ref_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=x, labels=ref_labels)

            dev_fetch = [dev_loss, tf.gradients(dev_loss, [x], e)]
            y, (dx,) = sess.run(dev_fetch, feed_dict)
            ref_fetch = [ref_loss, tf.gradients(ref_loss, [x], e)]
            Y, (DX,) = sess.run(ref_fetch, feed_dict)

            print("testSoftmaxCrossEntropy", K)

            if bench:
                continue
            for label, dev, cpu in (("Y", y, Y), ("DX", dx, DX)):
                self.compare_results(label, dev, cpu)
def conv1d(x, scope, nf, std=0.02, relu=False, fast_gelu=False):
    """Apply a learned linear map plus bias to the last axis of ``x``.

    Args:
        x: input tensor whose last dimension is statically known.
        scope: variable scope under which ``w`` and ``b`` are created.
        nf: number of output features.
        std: stddev of the normal initializer used for ``w``.
        relu: forwarded to ``bs.bias_relu``.
        fast_gelu: forwarded to ``bs.bias_relu``.

    Returns:
        A tensor shaped like ``x`` with the last dimension replaced by ``nf``.
    """
    with tf.variable_scope(scope):
        in_features = x.shape[-1].value
        rank = x.shape.ndims
        needs_flatten = rank > 2

        # Note: param initializers are not particularly well tuned in this code
        weight = tf.get_variable(
            "w", [in_features, nf],
            initializer=tf.random_normal_initializer(stddev=std))
        bias = tf.get_variable(
            "b", [nf], initializer=tf.constant_initializer(0.0))

        if hps.float16:
            # Delay the weight cast until just before use to minimize the
            # memory footprint.  In recompute mode these casts are released
            # right after use on the forward pass and remade on the
            # recompute pass.
            with tf.control_dependencies([x.op]):
                # dx_dtype=float16 prevents a useless cast back to fp32 in
                # the backward pass; the all-reduce and fused optimizers
                # accept fp16 natively.
                weight = bs.float_cast(weight, dtype=tf.float16,
                                       dx_dtype=tf.float16)

        # merge context and batch dims for a more efficient matmul
        flat = x
        if needs_flatten:
            leading_dims = tf.shape(x)[:rank - 1]
            out_shape = tf.concat([leading_dims, [nf]], axis=0)
            flat = tf.reshape(x, [-1, in_features])

        product = tf.matmul(flat, weight)

        # avoid atomics in the bias grad, but be careful: tf handles temp
        # memory badly in the presence of async ops like all-reduce
        result = bs.bias_relu(product, bias, relu=relu, fast_gelu=fast_gelu,
                              atomics=False)

        if not needs_flatten:
            return result
        return tf.reshape(result, out_shape)
def testBlocksparseTransformerDense(self):
    """Check a fully dense BlocksparseTransformer against tf.matmul attention.

    For each block size an all-ones layout is used, so the block-sparse
    query-key / softmax / weight-value pipeline covers every block.  The
    reference path splits heads with ``bs.transpose_0213`` and uses plain
    ``tf.matmul`` with ``bs.softmax``.  The output and the gradients w.r.t.
    q, k and v go to ``self.compare_results`` unless the module-level
    ``bench`` flag is truthy.  The module-level ``ones`` flag switches the
    inputs from random to all-ones.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        batch = 2
        heads = 2
        state = 64*2
        scale = 1.0 / np.sqrt(state/heads)

        def rand(shape):
            # uniform [-1, 1), round-tripped through float16
            return np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)

        for bsize in (8, 16, 32, 64):
            ctxQ = 16
            ctxK = 16

            # builtin bool instead of np.bool: that alias was deprecated in
            # NumPy 1.20 and is absent from NumPy 1.24-1.26
            layout = np.ones([heads, ctxQ, ctxK], dtype=bool)
            bst = bs.BlocksparseTransformer(layout, block_size=bsize)

            shapeQ = (batch, ctxQ*bsize, heads*state)
            shapeK = (batch, ctxK*bsize, heads*state)

            if ones:
                cpuQ = np.ones(shapeQ, dtype=np.float32)
                cpuK = np.ones(shapeK, dtype=np.float32)
                cpuV = np.ones(shapeK, dtype=np.float32)
                cpuE = np.ones(shapeQ, dtype=np.float32)
            else:
                cpuQ = rand(shapeQ)
                cpuK = rand(shapeK)
                cpuV = rand(shapeK)
                cpuE = rand(shapeQ)

            q = tf.placeholder(tf.float32, shapeQ)
            k = tf.placeholder(tf.float32, shapeK)
            v = tf.placeholder(tf.float32, shapeK)
            e = tf.placeholder(tf.float32, shapeQ)

            feed_dict = {q: cpuQ, k: cpuK, v: cpuV, e: cpuE}

            qf = bs.float_cast(q, dtype=tf.float16)
            kf = bs.float_cast(k, dtype=tf.float16)
            vf = bs.float_cast(v, dtype=tf.float16)

            # block-sparse path (op under test)
            w = bst.query_key_op(qf, kf, bench=bench)
            w = bst.softmax(w, scale=scale)
            y = bst.weight_value_op(w, vf, bench=bench)

            # dense reference path: split the feature axis into
            # (heads, state) and swap the two middle axes
            qf = bs.transpose_0213(tf.reshape(qf, [batch, ctxQ*bsize, heads, state]))
            kf = bs.transpose_0213(tf.reshape(kf, [batch, ctxK*bsize, heads, state]))
            vf = bs.transpose_0213(tf.reshape(vf, [batch, ctxK*bsize, heads, state]))
            W = tf.matmul(qf, kf, transpose_b=True)
            W = bs.softmax(W, scale=scale)
            Y = tf.matmul(W, vf)
            Y = tf.reshape(bs.transpose_0213(Y), [batch, ctxQ*bsize, heads*state])

            y = bs.float_cast(y, dtype=tf.float32)
            Y = bs.float_cast(Y, dtype=tf.float32)

            y, (dq, dk, dv) = sess.run([y, tf.gradients(y, [q, k, v], e)], feed_dict)
            Y, (DQ, DK, DV) = sess.run([Y, tf.gradients(Y, [q, k, v], e)], feed_dict)

            print("testBlocksparseTransformerDense", bsize)
            if not bench:
                for op, dev, cpu in [
                    [" Y", y, Y],
                    ["DV", dv, DV],
                    ["DK", dk, DK],
                    ["DQ", dq, DQ],
                ]:
                    self.compare_results(op, dev, cpu)
def atestBlocksparseSoftmax(self):
    """Check ``BlocksparseTransformer.masked_softmax`` in bfloat16.

    Every block gets a checkerboard mask from ``checker_callback``.  The
    forward output and the gradient w.r.t. the input are compared against
    ``masked_softmax_test`` / ``masked_softmax_grad_test`` through
    ``self.compare_results``.  The module-level ``ones`` flag switches the
    inputs from random to all-ones.

    NOTE(review): the name does not start with ``test``, so the default
    unittest loader will not collect it -- presumably disabled on purpose.
    """
    batch = 1
    heads = 1
    key = 7

    def checker_callback(blk_shape, head_idx, qry_idx, key_idx, blk_idx):
        # Checkerboard: an element is kept when its row and column index
        # have the same parity.  Builtin bool is used because the np.bool
        # alias is absent from NumPy 1.24-1.26.
        mask = np.ones(blk_shape, dtype=bool)
        mask[::2, 1::2] = False
        mask[1::2, ::2] = False
        return mask

    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        # other context sizes tried: 32, 64, 128, 256, 512, 1024, 2048, 4096
        for ctx in (16,):
            for bsize in (8, 16, 32, 64):
                if bsize * ctx > 32768:
                    continue

                # define outer block structure for blocksparse matmul
                layout = np.ones([heads, ctx, ctx], dtype=bool)

                bst = bs.BlocksparseTransformer(layout, heads=heads, block_size=bsize,
                                                mask_callback=checker_callback)

                shape = (batch, heads, bst.blocks, bsize, bsize)
                print(shape)

                if ones:
                    cpuX = np.ones(shape, dtype=np.float32)
                    cpuE = np.ones(shape, dtype=np.float32)
                else:
                    cpuX = np.random.normal(0.0, 1.0, shape).astype(np.float16).astype(np.float32)
                    cpuE = np.random.normal(0.0, 1.0, shape).astype(np.float16).astype(np.float32)

                x = tf.placeholder(tf.float32, cpuX.shape)
                e = tf.placeholder(tf.float32, cpuE.shape)
                feed_dict = {x: cpuX, e: cpuE}

                xf = bs.float_cast(x, dtype=tf.bfloat16)

                y = bst.masked_softmax(xf, scale=0.5, autoregress_at_key=key)
                y = bs.float_cast(y, dtype=tf.float32)

                dx, = tf.gradients(y, [x], e)

                y, dx = sess.run([y, dx], feed_dict)

                # reference results from the transformer's own test helpers
                Y = bst.masked_softmax_test(cpuX, scale=0.5, autoregress_at_key=key)
                DX = bst.masked_softmax_grad_test(cpuE, Y, scale=0.5)

                print("testBlocksparseSoftmax", ctx*bsize, bsize)
                for op, dev, cpu in [
                    ["Y", y, Y],
                    ["DX", dx, DX],
                ]:
                    self.compare_results(op, dev, cpu)
def testBlocksparseTransformerMatmul(self):
    """Check the nt / nn / tn block-sparse matmuls of BlocksparseTransformer.

    A single lower-triangular block layout is shared by all heads.  ``q`` and
    ``k`` are fed as float32 and the block-sparse operand ``w`` as bfloat16.
    Each op's output is compared with the matching ``*_test`` helper through
    ``self.compare_results`` unless the local ``bench`` flag is set.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        for bsize in (32, ):  # other sizes: 8, 16, 64
            dtype_qk = tf.float32
            dtype_w = tf.bfloat16
            ones = 0
            bench = 0
            batch = 2
            heads = 4
            ctx = 16
            state = 64*2
            ctxQ = ctx
            ctxK = ctx  # *2

            # builtin bool instead of np.bool: that alias was deprecated in
            # NumPy 1.20 and is absent from NumPy 1.24-1.26
            layout = np.ones([1, ctxQ, ctxK], dtype=bool)
            # drop every block whose key index lies after its query index;
            # dedicated index names keep this loop from reusing q / k,
            # which hold the placeholders below
            for q_blk, k_blk in np.ndindex(ctx, ctx):
                if k_blk > q_blk:
                    layout[:, q_blk, k_blk] = 0

            bst = bs.BlocksparseTransformer(layout, heads=heads, block_size=bsize,
                                            mask_callback=mask_callback)

            q_shape = (batch, ctxQ*bsize, heads*state)
            k_shape = (batch, ctxK*bsize, heads*state)
            w_shape = (batch, heads, bst.blocks, bsize, bsize)

            if ones:
                cpuQ = np.ones(q_shape, dtype=np.float32)
                cpuK = np.ones(k_shape, dtype=np.float32)
                cpuW = np.ones(w_shape, dtype=np.float32)
            else:
                cpuQ = np.random.uniform(-1.0, 1.0, q_shape).astype(np.float16).astype(np.float32)
                cpuK = np.random.uniform(-1.0, 1.0, k_shape).astype(np.float16).astype(np.float32)
                cpuW = np.random.uniform(-1.0, 1.0, w_shape).astype(np.float16).astype(np.float32)

            q = tf.placeholder(tf.float32, cpuQ.shape)
            k = tf.placeholder(tf.float32, cpuK.shape)
            w = tf.placeholder(tf.float32, cpuW.shape)

            feed_dict = {q: cpuQ, k: cpuK, w: cpuW}

            qf = bs.float_cast(q, dtype=dtype_qk)
            kf = bs.float_cast(k, dtype=dtype_qk)
            wf = bs.float_cast(w, dtype=dtype_w)

            nt = bst.nt_op(qf, kf, bench=bench)
            nn = bst.nn_op(wf, kf, bench=bench)
            tn = bst.tn_op(wf, qf, bench=bench)

            nt = bs.float_cast(nt, dtype=tf.float32)
            nn = bs.float_cast(nn, dtype=tf.float32)
            tn = bs.float_cast(tn, dtype=tf.float32)

            print("testBlocksparseTransformerMatmul", bsize)

            nt, nn, tn = sess.run([nt, nn, tn], feed_dict)

            if not bench:
                NT = bst.nt_test(cpuQ, cpuK)
                NN = bst.nn_test(cpuW, cpuK)
                TN = bst.tn_test(cpuW, cpuQ)

                for op, dev, cpu in [
                    ["NT", nt, NT],
                    ["NN", nn, NN],
                    ["TN", tn, TN],
                ]:
                    self.compare_results(op, dev, cpu)
def testBlocksparseTransformerSparse(self):
    """Check block-sparse attention end to end on a lower-triangular layout.

    The query-key product, the masked softmax, the weighted values and the
    gradients w.r.t. q, k and v are compared with a reference assembled from
    the transformer's ``nt_test`` / ``nn_test`` / ``tn_test`` /
    ``masked_softmax_test`` / ``masked_softmax_grad_test`` helpers.  The
    comparison is skipped when the module-level ``bench`` flag is truthy;
    the module-level ``ones`` flag switches the inputs to all-ones.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        batch = 2
        heads = 2
        ctx = 16
        state = 64*2
        scale = 1.0 / np.sqrt(state/heads)
        dtype = tf.float32

        def rand(shape):
            # uniform [-1, 1), round-tripped through float16
            return np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)

        for bsize in (32, ):  # other sizes: 8, 16, 64

            # builtin bool instead of np.bool: that alias was deprecated in
            # NumPy 1.20 and is absent from NumPy 1.24-1.26
            layout = np.ones([heads, ctx, ctx], dtype=bool)
            # drop every block whose key index lies after its query index
            for q_blk, k_blk in np.ndindex(ctx, ctx):
                if k_blk > q_blk:
                    layout[:, q_blk, k_blk] = 0

            bst = bs.BlocksparseTransformer(layout, block_size=bsize,
                                            mask_callback=mask_callback)

            shape = (batch, ctx*bsize, heads*state)

            if ones:
                cpuQ = np.ones(shape, dtype=np.float32)
                cpuK = np.ones(shape, dtype=np.float32)
                cpuV = np.ones(shape, dtype=np.float32)
                cpuE = np.ones(shape, dtype=np.float32)
            else:
                cpuQ = rand(shape)
                cpuK = rand(shape)
                cpuV = rand(shape)
                cpuE = rand(shape)

            q = tf.placeholder(tf.float32, shape)
            k = tf.placeholder(tf.float32, shape)
            v = tf.placeholder(tf.float32, shape)
            e = tf.placeholder(tf.float32, shape)

            feed_dict = {q: cpuQ, k: cpuK, v: cpuV, e: cpuE}

            qf = bs.float_cast(q, dtype=dtype)
            kf = bs.float_cast(k, dtype=dtype)
            vf = bs.float_cast(v, dtype=dtype)

            w = bst.query_key_op(qf, kf)
            a = bst.masked_softmax(w, scale=scale)
            y = bst.weight_value_op(a, vf)

            w = bs.float_cast(w, dtype=tf.float32)
            a = bs.float_cast(a, dtype=tf.float32)
            y = bs.float_cast(y, dtype=tf.float32)

            dq, dk, dv = tf.gradients(y, [q, k, v], e)

            w, a, y, dq, dk, dv = sess.run([w, a, y, dq, dk, dv], feed_dict)

            # reference forward pass
            W = bst.nt_test(cpuQ, cpuK)
            A = bst.masked_softmax_test(W, scale=scale)
            Y = bst.nn_test(A, cpuV)

            # reference backward pass
            DV = bst.tn_test(A, cpuE)
            DW = bst.nt_test(cpuE, cpuV)
            DW = bst.masked_softmax_grad_test(DW, A, scale=scale)
            DQ = bst.nn_test(DW, cpuK)
            DK = bst.tn_test(DW, cpuQ)

            # report the block size actually under test (was a literal 32)
            print("testBlocksparseTransformerSparse", bsize)
            if not bench:
                for op, dev, cpu in [
                    ["W", w, W],
                    ["A", a, A],
                    ["Y", y, Y],
                    ["DV", dv, DV],
                    ["DK", dk, DK],
                    ["DQ", dq, DQ],
                ]:
                    self.compare_results(op, dev, cpu)
def testAdafactor(self):
    """Check the fused ``adam_op`` update against a NumPy reference.

    For fp32 and fp16 gradients and several parameter shapes, one update
    step is run on the device and recomputed on the host (global-norm
    clipping, running mean / variance update, parameter step).  The
    resulting M, V and P go to ``self.compare_results``.

    NOTE(review): despite its name this method exercises ``adam_op`` and
    prints "testAdam".  If a second ``testAdafactor`` is defined later in
    the same class it will shadow this one, and this test will silently
    never run -- confirm and rename.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        for dtype in (tf.float32, tf.float16):
            # (127,) was spelled (127), i.e. a bare int; made a 1-tuple so
            # every entry has the same type
            for shape in (
                (1, ),
                (3, ),
                (127, ),
                (1, 1024),
                (1023, 1024),
                (1024, 1024),
            ):
                if ones:
                    G = np.ones(shape, dtype=np.float32)
                    P = np.ones(shape, dtype=np.float32)
                    M = np.zeros(shape, dtype=np.float32)
                    V = np.zeros(shape, dtype=np.float32)
                else:
                    G = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    P = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    M = np.random.uniform(0.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    V = np.random.uniform(0.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)

                g = tf.placeholder(tf.float32, G.shape)
                p = tf.Variable(initial_value=P, name="p")
                m = tf.Variable(initial_value=M, name="m")
                v = tf.Variable(initial_value=V, name="v")
                sess.run(tf.global_variables_initializer())

                # g now names the cast tensor; it is also the key used in
                # feed_dict below
                g = bs.float_cast(g, dtype=dtype)

                global_norm, norm_scale = bs.clip_by_global_norm(
                    [g], grad_scale=grad_scale, clip_norm=clip_norm)

                p, m, v = sess.run(adam_op(g, p, m, v, learn_rate, grad_scale,
                                           clip_sigma, [norm_scale], [],
                                           decay_mean=beta1, decay_var=beta2,
                                           epsilon=epsilon),
                                   feed_dict={g: G})

                # host reference: clip by global norm, then one Adam step
                GN = np.sqrt(
                    np.sum(np.square(G * grad_scale), keepdims=True))
                NS = clip_norm / np.maximum(GN, clip_norm)
                G *= NS * grad_scale

                M = beta1 * M + (1.0 - beta1) * G
                V = beta2 * V + (1.0 - beta2) * G * G

                P -= learn_rate * M / (np.sqrt(V) + epsilon)

                print("testAdam", dtype, GN, NS)
                for op, dev, cpu in [
                    ["M", m, M],
                    ["V", v, V],
                    ["P", p, P],
                ]:
                    self.compare_results(op, dev, cpu)
def testBlocksparseReducedDW(self):
    """Check ``blocksparse_reduced_dw`` against a NumPy reference.

    ``depth`` pairs of x / y activations are reduced per block of ``bsize``
    along ``axis`` (max-abs when ``norm == 0``, otherwise an L2 norm).  The
    reduced tensors are multiplied into a (blocks_x, blocks_y) matrix scaled
    by ``scale``, optionally accumulated onto an existing ``DWA``.  Both
    reductions and the product go to ``self.compare_results``.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):

        ones = 0
        norm = 0
        accum = 0
        blocks_x = 2
        blocks_y = 4
        bsize = 32
        axis = 0
        depth = 8
        N = 64
        scale = 1.0 / (N * depth)
        shape_x = [N, N]
        shape_y = [N, N]
        shape_w = (blocks_x, blocks_y)
        # the blocked axis holds bsize elements per block
        shape_x[axis] = bsize * blocks_x
        shape_y[axis] = bsize * blocks_y

        XS = list()
        YS = list()
        if ones:
            for _ in range(depth):
                XS.append(np.ones(shape_x, dtype=np.float32))
                YS.append(np.ones(shape_y, dtype=np.float32))
            if accum:
                DWA = np.ones(shape_w, dtype=np.float32)
            # add a per-column ramp to the first x; the width follows
            # shape_x instead of a hard-coded 64
            cols = shape_x[1]
            XS[0][:] += np.arange(cols, dtype=np.float32).reshape(1, cols)
        else:
            for _ in range(depth):
                XS.append(np.random.normal(0.0, 1.0, shape_x).astype(np.float16).astype(np.float32))
                YS.append(np.random.normal(0.0, 1.0, shape_y).astype(np.float16).astype(np.float32))
            if accum:
                DWA = np.random.normal(0.0, 1.0, shape_w).astype(np.float32)

        feed_dict = dict()
        xs = list()
        ys = list()
        for i in range(depth):
            x = tf.placeholder(tf.float32, shape_x, name=f"x{i}")
            y = tf.placeholder(tf.float32, shape_y, name=f"y{i}")
            feed_dict[x] = XS[i]
            feed_dict[y] = YS[i]
            xs.append(bs.float_cast(x, dtype=tf.float16))
            ys.append(bs.float_cast(y, dtype=tf.float16))

        if accum:
            dwa = tf.placeholder(tf.float32, DWA.shape, name="dwa")
            feed_dict[dwa] = DWA
            accum_inputs = [dwa]
        else:
            accum_inputs = []

        dw, x_red, y_red = blocksparse_reduced_dw(xs, ys, scale, accum_inputs,
                                                  bsize=bsize, norm=norm, axis=axis)

        x_red = bs.float_cast(x_red, dtype=tf.float32)
        y_red = bs.float_cast(y_red, dtype=tf.float32)

        dw, x_red, y_red = sess.run([dw, x_red, y_red], feed_dict=feed_dict)

        def reduce_blocks(blocked, block_axis):
            # collapse the within-block axis: max-abs or L2 norm
            if norm == 0:
                return np.max(np.abs(blocked), axis=block_axis)
            return np.sqrt(np.sum(np.square(blocked), axis=block_axis))

        if axis == 0:
            X_RED = np.zeros([blocks_x, depth, N], dtype=np.float32)
            Y_RED = np.zeros([blocks_y, depth, N], dtype=np.float32)
            for i in range(depth):
                X = XS[i].reshape([blocks_x, bsize, N])
                Y = YS[i].reshape([blocks_y, bsize, N])
                X_RED[:, i, :] = reduce_blocks(X, 1)
                Y_RED[:, i, :] = reduce_blocks(Y, 1)
            DW = np.dot(X_RED.reshape(blocks_x, -1), Y_RED.reshape(blocks_y, -1).T) * scale
        else:
            X_RED = np.zeros([depth, N, blocks_x], dtype=np.float32)
            Y_RED = np.zeros([depth, N, blocks_y], dtype=np.float32)
            for i in range(depth):
                X = XS[i].reshape([N, blocks_x, bsize])
                Y = YS[i].reshape([N, blocks_y, bsize])
                X_RED[i, :, :] = reduce_blocks(X, 2)
                Y_RED[i, :, :] = reduce_blocks(Y, 2)
            DW = np.dot(X_RED.reshape(-1, blocks_x).T, Y_RED.reshape(-1, blocks_y)) * scale

        if accum:
            DW += DWA

        print("BlocksparseReducedDW", norm, bsize, depth)
        for op, dev, cpu in [
            ["xr", x_red, X_RED],
            ["yr", y_red, Y_RED],
            ["dw", dw, DW],
        ]:
            self.compare_results(op, dev, cpu)
def testAdafactor(self):
    """Check the fused Adafactor update ops against a NumPy reference.

    Gradients with more than one row go through ``adafactor2d_op`` (factored
    column / row statistics); single-row gradients go through
    ``adafactor1d_op`` (column statistics only).  C, R, the normalised update
    X and the parameters P go to ``self.compare_results``.  In the 1-D case
    R is not updated on either side, so that comparison is trivially equal.
    """
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):
        grad_shapes = (
            (1024, 1024 * 2),
            (1, 1024 * 2),
            (1024, 1023 * 1),
            (1, 1023 * 1),
        )
        for dtype in (tf.float32, tf.float16):
            for shape_g in grad_shapes:
                rows, cols = shape_g[0], shape_g[1]
                shape_c = (1, cols)
                shape_r = (rows, 1)
                factored = rows > 1

                if ones:
                    G = np.ones(shape_g, dtype=np.float32)
                    P = np.ones(shape_g, dtype=np.float32)
                    C = np.zeros(shape_c, dtype=np.float32)
                    R = np.zeros(shape_r, dtype=np.float32)
                else:
                    G = np.random.uniform(-1.0, 1.0, shape_g).astype(
                        np.float16).astype(np.float32)
                    P = np.random.uniform(-1.0, 1.0, shape_g).astype(
                        np.float16).astype(np.float32)
                    C = np.random.uniform(0.0, 1.0, shape_c).astype(
                        np.float16).astype(np.float32)
                    R = np.random.uniform(0.0, 1.0, shape_r).astype(
                        np.float16).astype(np.float32)

                g = tf.placeholder(tf.float32, G.shape)
                p = tf.Variable(initial_value=P, name="p")
                c = tf.Variable(initial_value=C, name="c")
                r = tf.Variable(initial_value=R, name="r")
                sess.run(tf.global_variables_initializer())

                g = bs.float_cast(g, dtype=dtype)

                # adafactor has its own fused infinity filtering; this is
                # just a quick test of the standalone op
                g = bs.filter_tensor(g)

                global_norm, norm_scale = bs.clip_by_global_norm(
                    [g], grad_scale=grad_scale, clip_norm=clip_norm)

                # ---- device update ----
                if factored:
                    update = adafactor2d_op(
                        p, c, r, g, beta2, learn_rate, grad_scale,
                        clip_thresh, [norm_scale], epsilon=epsilon)
                    dev_p, dev_c, dev_r, dev_x, _ = sess.run(
                        update, feed_dict={g: G})
                else:
                    update = adafactor1d_op(
                        p, c, g, beta2, learn_rate, grad_scale,
                        clip_thresh, [norm_scale], epsilon=epsilon)
                    dev_p, dev_c, dev_x, _ = sess.run(
                        update, feed_dict={g: G})
                    # no row statistics in the 1-D op
                    dev_r = R

                # ---- host reference ----
                GN = np.sqrt(
                    np.sum(np.square(G * grad_scale), keepdims=True))
                NS = clip_norm / np.maximum(GN, clip_norm)
                G *= NS * grad_scale

                decay_in = 1.0 - beta2
                if factored:
                    C = beta2 * C + decay_in * np.mean(
                        np.square(G) + epsilon, axis=0, keepdims=True)
                    R = beta2 * R + decay_in * np.mean(
                        np.square(G) + epsilon, axis=1, keepdims=True)
                    LTM = np.mean(R, keepdims=True)
                    X = G / (np.sqrt(R / LTM) * np.sqrt(C))
                else:
                    C = beta2 * C + decay_in * (np.square(G) + epsilon)
                    X = G / np.sqrt(C)

                RMS_X = np.sqrt(np.mean(np.square(X), keepdims=True))
                P -= learn_rate * X / np.maximum(1.0, RMS_X / clip_thresh)

                print("testAdafactor", dtype, GN, NS)
                checks = (
                    ("C", dev_c, C),
                    ("R", dev_r, R),
                    ("X", dev_x, X),
                    ("P", dev_p, P),
                )
                for label, dev, cpu in checks:
                    self.compare_results(label, dev, cpu)
def model(xs, ys, loss_scale=None, train=False):
    """Build the transformer graph and return its loss (plus training ops).

    Args:
        xs: indices looked up in the ``[hps.n_vocab, hps.n_state]`` embedding
            table.
        ys: target labels for the cross-entropy loss.
        loss_scale: multiplied into the loss before differentiation when
            ``hps.float16`` is set; only read when ``train`` is true.
        train: when true, variables are created and the optimizer ops are
            built; otherwise variables are reused and only the loss is built.

    Returns:
        ``(loss, train_op, global_norm, norm_scale)`` when ``train`` is true,
        where ``train_op`` groups the optimizer update with the global-step
        increment; otherwise just ``loss``.
    """
    with tf.variable_scope("model", reuse=not train):

        with tf.device("/cpu:0"):
            if train:
                grad_scale = tf.reciprocal(loss_scale) if hps.float16 else 1.0
                global_step = tf.get_variable(
                    "global_step", [],
                    initializer=tf.ones_initializer(),
                    trainable=False)
                # linear warmup over hps.warmup_iters steps, then constant
                learning_rate = tf.minimum(
                    global_step * (1.0 / hps.warmup_iters), 1.0) * hps.lr
            mpi_scale = tf.constant(1.0 / mpi_size)

        with tf.device("/gpu:0"):

            # Contains scope/var_name substrings we use to group gradients for all reduce
            # You'll want to find groupings that are scheduled uniquely by tensorflow, otherwise bs.allreduce could hang.
            # The groups should be ordered in which the all-reduce is called.
            # Any gradients not matching the substrings will get appended to the last group.
            grad_groups = []

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = tf.get_variable(
                    "x", [hps.n_vocab, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.02))
                p_embed = tf.get_variable(
                    'pos', [1, hps.n_timesteps, hps.n_state],
                    initializer=tf.random_normal_initializer(stddev=0.01))

                if hps.float16:
                    x_embed = bs.float_cast(x_embed, dtype=tf.float16, dx_dtype=tf.float16)
                    p_embed = bs.float_cast(p_embed, dtype=tf.float16, dx_dtype=tf.float16)

                # bs.embedding_lookup can be much faster than tf version for low entropy indexes or small vocabs
                x = bs.embedding_lookup(x_embed, xs)

                if train and hps.embed_pdrop > 0.0:
                    # this part of the code is not recomputed so no need to remember the generated mask returned by bs.dropout
                    x, _ = bs.dropout(x, keep_prob=1.0 - hps.embed_pdrop)
                    p_embed, _ = bs.dropout(p_embed, keep_prob=1.0 - hps.embed_pdrop)

                h = x + p_embed
                # groups are inserted at the front, so the list ends up in
                # reverse order of construction
                grad_groups.insert(0, 'embed')

            for l in range(hps.n_layer):
                layer_name = 'layer_%d' % l
                # enable the recompute decorator in training
                # see blocksparse/grads.py if you want understand how this works
                h = transformer_block(h, layer_name, train=train,
                                      recompute=train and hps.recompute)
                grad_groups.insert(0, layer_name)

            # flatten the features and project them onto the input embedding
            # table (x_embed is reused as the output matrix)
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            if hps.float16:
                # much faster and more memory efficient (but currently only implemented in fp16)
                loss = bs.softmax_cross_entropy(logits=logits, labels=ys)
            else:
                labels = tf.cast(tf.reshape(ys, [-1]), tf.int32)
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=labels)

            loss = tf.reduce_mean(loss)

            if train:

                # apply loss scaling in fp16 mode
                if hps.float16:
                    grad_loss = bs.scale_tensor(loss, loss_scale)
                else:
                    grad_loss = loss

                # use bs.gradients to allow bs.recomputable decorators to work
                params = tf.trainable_variables()
                grads = bs.gradients(grad_loss, params)

                if mpi_size > 1:
                    # apply (1.0 / mpi_size) scaling prior to all_reduce to allow greater utilization of fp16 dynamic range.
                    # That is we're ok with flushing some small values to zero to allow growth of large values in allreduce (without hitting inf).
                    loss = bs.scale_tensor(loss, mpi_scale)
                    grads = [bs.scale_tensor(g, mpi_scale) for g in grads]

                    # allreduce in an mpi context
                    # bias and gain grads will be in fp32, but have them fp16 cast prior to allreduce
                    # (was ``H.float16``; every other hyper-parameter in this
                    # function is read from ``hps``)
                    cast_all = tf.float16 if hps.float16 else None
                    loss = bs.allreduce(loss)
                    grads = bs.group_allreduce(grads, params,
                                               search_strings=grad_groups,
                                               cast_all=cast_all)

                # This does not actually perform the clipping, only measures the norm_scale needed to be applied.
                # norm_scale is then later applied in the fused optimizer ops (eliminating an extra pass over the gradients).
                # norm_scale is also used to detect inf/nan values in any of the gradients so the whole update can be skipped
                # and tried again with a new loss_scale.
                global_norm, norm_scale = bs.clip_by_global_norm(
                    grads, grad_scale=grad_scale, clip_norm=hps.clip_norm)

                # Apply AdamOptimizer:
                # fp16 mode is a special feature to store running mean and variance variables in custom fp16 formats.
                # Using this mode should incur no loss in accuracy and save a lot of memory in your model.
                # For further memory savings consider using bs.AdafactorOptimizer.
                adam = bs.AdamOptimizer(learning_rate=learning_rate,
                                        norm_scale=norm_scale,
                                        grad_scale=grad_scale,
                                        fp16=hps.float16)

                train_op = adam.apply_gradients(zip(grads, params))

                # update global step after we're done using it for this update
                with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
                    update_op = tf.assign_add(global_step, 1.0)

                return loss, tf.group(train_op, update_op), global_norm, norm_scale

            else:
                if mpi_size > 1:
                    loss = bs.allreduce(bs.scale_tensor(loss, mpi_scale))

                return loss
def testDropout(self):
    """Print the error of ``bs.dropout`` for explicit and generated masks.

    For each ``(x_shape, mask_shapes)`` entry of the module-level ``shapes``:

    * a random boolean mask is bit-packed into int32 words and passed to
      ``bs.dropout`` with ``keep_prob=0.5``; the result is compared with
      ``x * mask * 2.0`` computed on the host;
    * ``bs.dropout`` is run on all-ones input with ``keep_prob=0.8`` and no
      explicit mask, and the mean of the output is reported.

    Only a summary line is printed; nothing is asserted.
    """
    config = tf.ConfigProto(intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)

    with self.test_session(config=config) as sess:

        bs.set_entropy()
        sess.run(tf.global_variables_initializer())

        for dtype in (tf.float16, ):  # also possible: tf.bfloat16
            for x_shape, mask_shapes in shapes:
                for mask_shape in mask_shapes:

                    m_shape = x_shape if mask_shape is None else mask_shape

                    cpuO = np.ones(x_shape, dtype=np.float32)
                    cpuX = np.random.uniform(-1.0, 1.0, x_shape).astype(
                        np.float16).astype(np.float32)

                    # builtin bool instead of np.bool: that alias was
                    # deprecated in NumPy 1.20 and is absent from
                    # NumPy 1.24-1.26
                    cpuM = np.random.randint(0, 2, size=m_shape, dtype=bool)

                    # pad the flat mask to a multiple of 32 bits so the
                    # packed bytes can be viewed as int32 words
                    mask = np.zeros(ceil_div(cpuM.size, 32) * 32, dtype=bool)
                    mask[:cpuM.size] = cpuM.reshape(-1)
                    # packbits fills each byte from the most significant
                    # bit; reversing every group of 8 puts the first mask
                    # element in the least significant bit
                    mask = np.packbits(mask.reshape(-1, 8)[:, ::-1]).view(
                        np.int32)

                    # expected result for keep_prob=0.5: kept values doubled
                    cpuY = cpuX * cpuM.astype(np.float32) * 2.0

                    with tf.device("/gpu:0"):

                        x = tf.placeholder(tf.float32, cpuX.shape)
                        m = tf.placeholder(tf.int32, mask.shape)

                        # explicit mask
                        xf = bs.float_cast(x, dtype=dtype)
                        y, _ = bs.dropout(xf, keep_prob=0.5, mask=m,
                                          mask_shape=mask_shape)
                        y = bs.float_cast(y, dtype=tf.float32)

                        devY, = sess.run([y, ], feed_dict={x: cpuX, m: mask})

                        # mask generated by the op itself
                        xf = bs.float_cast(x, dtype=dtype)
                        y, _ = bs.dropout(xf, keep_prob=0.8,
                                          mask_shape=mask_shape)
                        y = bs.float_cast(y, dtype=tf.float32)

                        devO, = sess.run([y, ], feed_dict={x: cpuO})

                    diff = np.abs(devY - cpuY)
                    print(
                        "dype: %8s x_shape: %-20s m_shape: %-20s err: %4.2f norm_sum: %4.2f"
                        % (dtype.name, str(x_shape), str(mask_shape),
                           diff.sum(), devO.sum() / devO.size))
def testBlocksparseMatMul(self):
    """Print forward / backward errors of BlocksparseMatMul vs. its NumPy helpers.

    A block layout is derived from a Barabasi-Albert graph (adjacency matrix
    plus identity, with the top-left m x m corner filled).  For every
    (block size, feature axis), minibatch size N and entry of the
    module-level ``dtypes``, ``depth`` chained block-sparse matmuls are run
    on the device together with their gradients w.r.t. x and w.  Unless the
    module-level ``bench`` flag is truthy, the same chain is replayed with
    ``fprop_test`` / ``bprop_test`` / ``updat_test`` and the max and L2
    errors are printed.  Nothing is asserted.

    Module-level switches read here: ``conf``, ``one``, ``dtypes``,
    ``l2norm``, ``depth``, ``bench``, ``out``.
    """

    # layout = np.zeros((2,2), dtype=np.int32)
    # layout[0,0] = 1

    n, m = 160, 5

    layout = networkx.generators.barabasi_albert_graph(n, m)
    #layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5)
    # adjacency matrix plus the diagonal
    layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
    layout[0:m,0:m] = 1

    #layout[0:60,0:60] = 1
    #layout = np.zeros((4,4), dtype=np.int32)
    #layout = np.ones((4,4), dtype=np.int32)
    #layout[0,0] = 1
    #layout = np.ones((1,1), dtype=np.int32)

    blocks = layout.sum()
    n = layout.shape[0]
    # percentage of non-zero blocks, then max / min column sums of the layout
    print(100 * blocks / n**2)
    print(layout.sum(axis=0).max(), layout.sum(axis=0).min())
    #exit()

    with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

        for bsize, axis in ( (32,0), (16,0), (8,0), ): # (32,1), (32,0), (16,0), (8,0)

            bsmm = bs.BlocksparseMatMul(layout, block_size=bsize, feature_axis=axis, name="test")

            if one:
                # every block becomes an identity matrix
                W = np.ones(bsmm.w_shape, dtype=np.float32)
                for w in range(bsmm.blocks):
                    #c, k = bsmm.block_coord(w)
                    #if c == k:
                    W[w] = np.eye(bsmm.bsize, dtype=np.float32)

                # W = np.ones(bsmm.w_shape, dtype=np.float32)
                # W[:] += np.arange(32, dtype=np.float32).reshape(1,1,32)
            else:
                # W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float16).astype(np.float32)
                W = np.random.normal(loc=0.0, scale=0.01, size=bsmm.w_shape).astype(np.float16).astype(np.float32)

            # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
            # for w, (c, k) in enumerate(bsmm.updat_list):
            #     WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:]

            # NOTE: w is rebound here from the loop index above to the weight tensor
            w = tf.constant(W)

            # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) )
            # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape)
            # print("identity_init: ", (s1 - s2).max())
            # exit()

            for N in (256,128,64,32,16,8,): # 128,64,32,16,1, 256,512,1024,2048,4096, 256,1024,4096,16384

                if one:
                    X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                    E = np.ones(bsmm.o_shape(N), dtype=np.float32)
                    # X = np.eye(bsmm.bsize, dtype=np.float32)
                    # E = np.arange(X.size, dtype=np.float32).reshape(X.shape)
                    # X[:] += np.arange(X.size, dtype=np.float32).reshape(X.shape)
                    # X[:] += np.arange(32, dtype=np.float32).reshape(32,1)
                    # E[:] += np.arange(16, dtype=np.float32).reshape(1,32)
                    # X[:] += np.arange(64, dtype=np.float32).reshape(1,64)
                    # E[:] += np.arange(64, dtype=np.float32).reshape(1,64)
                else:
                    # X = np.random.uniform(0.0, 10.0, bsmm.i_shape(N)).astype(np.float16).astype(np.float32)
                    # E = np.random.uniform(0.0, 10.0, bsmm.o_shape(N)).astype(np.float16).astype(np.float32)
                    X = np.random.normal(loc=0.0, scale=0.1, size=bsmm.i_shape(N)).astype(np.float16).astype(np.float32)
                    E = np.random.normal(loc=0.0, scale=0.1, size=bsmm.o_shape(N)).astype(np.float16).astype(np.float32)

                x = tf.constant(X)
                e = tf.constant(E)

                for dtype in dtypes:

                    print("Axis:%d Bsize:%2d N:%d dtype:%s Params:%d" % (axis, bsize, N, dtype.name, bsize*bsize*blocks))

                    # compute in tensorflow
                    if l2norm:
                        w2 = bsmm.l2_normalize(w, dtype=dtype)
                    else:
                        w2 = bs.float_cast(w, dtype=dtype)

                    y = bs.float_cast(x, dtype=dtype)

                    # chain `depth` matmuls; only the last one receives the
                    # bench value, and only when bench is truthy
                    for j in range(depth):
                        repeat = bench if bench and j==depth-1 else 0
                        y = bsmm(y, w2, bench=repeat) # (bench and j==depth-1) (bench and j==0)

                    y = bs.float_cast(y, dtype=tf.float32)

                    #if bench: sess.run( y )
                    #y = sess.run( y )

                    # gradient ops are created under a control dependency on
                    # the forward output op
                    with tf.control_dependencies([y.op]):
                        d = bs.gradients(y, [x, w], e)
                    if depth > 1:
                        # w2 is used `depth` times, so its gradient is passed
                        # through bs.group_param_grads
                        d[1] = bs.group_param_grads(d[1], 8)

                    sess.run(tf.global_variables_initializer())

                    #y, = sess.run( [y] )
                    y, (dx, dw) = sess.run( [y, d ] )

                    if not bench:

                        # compute in numpy
                        if l2norm:
                            W2 = bsmm.l2_normalize_test(W)
                        else:
                            W2 = W

                        # forward: keep every intermediate activation for
                        # the backward replay
                        Ys = [X]
                        for j in range(depth):
                            Ys.append(bsmm.fprop_test(Ys[-1], W2))
                        Y = Ys.pop()

                        # backward: walk the activations in reverse,
                        # accumulating the weight gradient
                        DW = np.zeros(bsmm.w_shape, dtype=np.float32)
                        DX = E
                        for j in range(depth):
                            DW += bsmm.updat_test(Ys.pop(), DX)
                            DX = bsmm.bprop_test(DX, W2)
                        if l2norm:
                            DW = bsmm.l2_normalize_grad_test(W, DW)

                        for op, cpuA, devA in (
                            (" y:", Y, y),
                            ("dx:", DX, dx),
                            ("dw:", DW, dw),
                        ):

                            difA = abs(cpuA - devA)

                            # max error is normalised by the mean magnitude
                            # of the reference unless that mean is zero
                            avgval = np.average(abs(cpuA))
                            maxdif = difA.max()
                            max_err = maxdif if avgval == 0 else maxdif / avgval

                            l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum())

                            #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err))
                            print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100*max_err, l2_err))

                            # rtol = 1e-4 if dtF is tf.float32 else 1e-1
                            # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol)
                            if out:
                                # dump diff / reference / device values and
                                # stop the whole process after the first
                                # compared tensor
                                np.savetxt("out.txt", difA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                np.savetxt("outC.txt", cpuA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                np.savetxt("outD.txt", devA.reshape((-1,cpuA.shape[-1])), fmt='%4.0f')
                                exit()
                        print("")
def atestBlocksparseMatMulGated(self):
    """Print forward / backward errors of a gated BlocksparseMatMul.

    The layout comes from a Barabasi-Albert graph (adjacency matrix plus
    identity, top-left m x m corner filled).  The gate is a 0/1 checkerboard
    over block coordinates.  Device results for y, dx and dw are compared
    with ``fprop_test`` / ``bprop_test`` / ``updat_test`` and the max and L2
    errors are printed.  When the module-level ``out`` flag is truthy the
    first compared tensor is dumped to text files and the process exits.

    NOTE(review): the name does not start with ``test``, so the default
    unittest loader will not collect it -- presumably disabled on purpose.
    """
    with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

        N = 128
        K = 8*56*2*4
        block_size = 8
        dtype = tf.float32
        repeat = 0
        dw_gated = False
        m = 30
        n = K//8

        graph = networkx.generators.barabasi_albert_graph(n, m)
        adjacency = networkx.adjacency_matrix(graph).toarray().astype(np.int32)
        layout = adjacency + np.eye(n, dtype=np.int32)
        layout[0:m,0:m] = 1

        blocks = layout.sum()
        n = layout.shape[0]
        # percentage of non-zero blocks, then the largest column sum
        print(100 * blocks / n**2)
        print(layout.sum(axis=0).max())

        bsmm = bs.BlocksparseMatMul(layout, block_size=block_size,
                                    feature_axis=0, name="test")

        in_shape = bsmm.i_shape(N)
        out_shape = bsmm.o_shape(N)
        if one:
            X = np.ones(in_shape, dtype=np.float32)
            E = np.ones(out_shape, dtype=np.float32)
            W = np.ones(bsmm.w_shape, dtype=np.float32)
            G = np.ones(bsmm.blocks, dtype=np.float32)
        else:
            X = np.random.uniform(-1.0, 1.0, in_shape).astype(np.float32)
            E = np.random.uniform(-1.0, 1.0, out_shape).astype(np.float32)
            W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32)
            G = np.random.uniform( 0.0, 1.0, bsmm.blocks).astype(np.float32)

        # the gate chosen above is replaced by a checkerboard: 1 where both
        # block coordinates have the same parity, 0 otherwise
        G = np.ones(bsmm.blocks, dtype=np.float32)
        for blk, (c, k) in enumerate(bsmm.updat_list):
            G[blk] = (c & 1) ^ (k & 1) ^ 1

        x = tf.constant(X)
        e = tf.constant(E)
        w = tf.constant(W)
        g = tf.constant(G)

        wf = bs.float_cast(w, dtype=dtype)
        xf = bs.float_cast(x, dtype=dtype)

        y_op = bsmm(xf, wf, gate=g, gate_grad=True, dw_gated=dw_gated,
                    bench=repeat)
        y_op = bs.float_cast(y_op, dtype=tf.float32)

        grad_ops = bs.gradients(y_op, [x, w], e)

        sess.run(tf.global_variables_initializer())

        y, (dx, dw) = sess.run([y_op, grad_ops])

        # host reference
        Y = bsmm.fprop_test(X, W, gate=G)
        DX = bsmm.bprop_test(E, W, gate=G)
        DW = bsmm.updat_test(X, E, gate=G, dw_gated=dw_gated)

        comparisons = (
            (" y:", Y, y),
            ("dx:", DX, dx),
            ("dw:", DW, dw),
        )
        for op, cpuA, devA in comparisons:

            difA = abs(cpuA - devA)

            avgval = np.average(abs(cpuA))
            maxdif = difA.max()
            if avgval == 0:
                max_err = maxdif
            else:
                max_err = maxdif / avgval

            # the small constant keeps the denominator away from zero
            ref_norm = np.sqrt(np.square(cpuA).sum() + 1e-12)
            l2_err = np.sqrt(np.square(difA).sum()) / ref_norm

            print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100*max_err, l2_err))

            if not out:
                continue
            if op == "dw:":
                dim = K
            else:
                dim = N
            np.savetxt("out.txt", difA.reshape((-1,dim)), fmt='%5.1f')
            np.savetxt("outC.txt", cpuA.reshape((-1,dim)), fmt='%5.1f')
            np.savetxt("outD.txt", devA.reshape((-1,dim)), fmt='%5.1f')
            exit()