def testBiasRelu(self):
        """Compare ``ew.bias_relu`` and its gradients on GPU against the CPU path.

        For every entry of the module-level ``shapes`` the op is built once per
        device and fed identical inputs; the forward output and the gradients
        with respect to the input and the bias are then compared and the
        errors printed.  When the module-level ``bench`` flag is truthy only
        the GPU graph is run and nothing is compared.

        Relies on module-level names ``shapes``, ``ones``, ``bench`` and
        ``atomics``.
        """
        config = tf.ConfigProto(
            intra_op_parallelism_threads=1,
            inter_op_parallelism_threads=1)

        with self.test_session(config=config) as sess:
            for shape in shapes:

                if ones:
                    # Fixed: dtype was spelled ``p.float32`` (undefined name).
                    cpuX = np.ones(shape, dtype=np.float32)
                    cpuE = np.ones(shape, dtype=np.float32)
                    cpuB = np.ones(shape[1:], dtype=np.float32)
                else:
                    # X and E are round-tripped through float16 before use.
                    cpuX = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shape).astype(np.float16).astype(np.float32)
                    cpuB = np.random.uniform(-1.0, 1.0, shape[1:]).astype(np.float32)

                for relu in (True, False):
                    for dtype in (tf.float32, ):  #tf.float16, tf.bfloat16

                        results = []
                        for device in ("gpu", "cpu"):
                            # Benchmark runs only need the GPU graph.
                            if bench and device == "cpu":
                                break

                            # Only the GPU path is exercised in reduced precision.
                            cast = device == "gpu" and dtype is not tf.float32

                            with tf.device("/%s:0" % device), tf.name_scope(device):

                                x = tf.placeholder(tf.float32, cpuX.shape)
                                e = tf.placeholder(tf.float32, cpuE.shape)
                                b = tf.placeholder(tf.float32, cpuB.shape)

                                feed_dict = {x: cpuX, e: cpuE, b: cpuB}

                                xc = ew.float_cast(x, dtype=dtype) if cast else x

                                y = ew.bias_relu(xc, b, relu=relu, atomics=atomics, bench=bench)

                                if cast:
                                    y = ew.float_cast(y, dtype=tf.float32)

                                dx, db = tf.gradients(y, [x, b], e)

                                results.append(sess.run([y, dx, db], feed_dict))

                        if not bench:
                            # results[0] is the GPU run, results[1] the CPU run.
                            for op, dev, cpu in zip(["y", "dx", "db"], results[0], results[1]):

                                dif    = np.abs(cpu - dev)
                                # "err" is the absolute maximum difference.
                                maxdif = dif.max()
                                l2_err = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum())

                                print("%s, shape:%14s, op:%3s(%d), err:%17.12f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, relu, maxdif, l2_err))
Esempio n. 2
0
def group_allreduce(grads,
                    parms,
                    search_strings=None,
                    cast_map=None,
                    cast_all=None,
                    num_comms=2,
                    prereduce=0):
    """All-reduce gradients in named groups, writing the results into ``grads``.

    Each gradient is assigned to the first entry of ``search_strings`` that is
    a substring of its parameter's name; the last entry always matches and so
    acts as a catch-all.  Within a group, float16 gradients and all remaining
    gradients are reduced separately.  A bucket holding several tensors is
    flattened and concatenated into one buffer, passed through ``allreduce``
    and then sliced and reshaped back to each parameter's shape; a bucket with
    a single tensor is reduced directly.

    Args:
        grads: mutable sequence of gradient tensors; entries are replaced in
            place by their reduced versions.
        parms: parameters matching ``grads`` position by position; their
            ``name`` and ``shape`` are used for grouping and un-flattening.
        search_strings: ordered group names.  ``None`` creates a single group
            reduced at the end (no overlap with compute).
        cast_map: optional mapping from group name to the dtype its gradients
            are cast to before reduction.
        cast_all: optional dtype applied to every gradient; takes precedence
            over ``cast_map``.
        num_comms: forwarded to ``allreduce``.
        prereduce: forwarded to ``allreduce``.

    Returns:
        ``grads`` itself (the same object that was passed in), so the call
        can be used in an expression as well as for its in-place effect.
    """
    # if no grouping specified, create one group to reduce at the end (no overlap with compute)
    if search_strings is None:
        search_strings = ["group_allreduce_all"]

    groups = [(name, [], []) for name in search_strings]

    for i, (grad, param) in enumerate(zip(grads, parms)):
        for name, group16, group32 in groups:
            if name == search_strings[-1] or name in param.name:

                if cast_all is not None:
                    grad = float_cast(grad, dtype=cast_all)

                elif cast_map is not None and name in cast_map:
                    grad = float_cast(grad, dtype=cast_map[name])

                if grad.dtype.base_dtype is tf.float16:
                    group16.append((i, grad, param))
                else:
                    group32.append((i, grad, param))
                # first matching group wins
                break

    for name, group16, group32 in groups:
        count = 0
        for group in (group16, group32):
            if not group:
                continue
            count += len(group)

            if len(group) == 1:
                # a single tensor is reduced as-is, no flatten/concat round trip
                index, grad, _ = group[0]
                grads[index] = allreduce(grad,
                                         num_comms=num_comms,
                                         prereduce=prereduce)
                continue

            concated = tf.concat(
                [tf.reshape(grad, [-1]) for _, grad, _ in group],
                0,
                name="concat_" + name)

            reduced = allreduce(concated,
                                num_comms=num_comms,
                                prereduce=prereduce)

            # carve the flat buffer back into per-parameter tensors
            offset = 0
            for index, _, param in group:
                size = param.shape.num_elements()
                grads[index] = tf.reshape(reduced[offset:offset + size],
                                          param.shape)
                offset += size

        if count == 0:
            print("Warning: no grads found for all_reduce group: ", name)

    # grads is modified in place, but return it as well for convenience
    return grads
    def testBlocksparseSoftmax(self):
        """Check ``BlocksparseTransformer.masked_softmax`` against its reference.

        For each block size a transformer is built over a lower-triangular
        block layout, the softmax is run on the GPU in bfloat16, and the
        forward output and input gradient are compared with
        ``masked_softmax_test`` / ``masked_softmax_grad_test`` through
        ``self.compare_results``.

        Relies on module-level names ``config``, ``ctx``, ``heads``, ``batch``,
        ``ones``, ``scale`` and ``mask_callback``.
        """
        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            for bsize in (8, 16, 32, 64):

                # define outer block structure for blocksparse matmul:
                # entry [q, k] is kept only when k <= q.
                # ``np.bool_`` replaces the deprecated ``np.bool`` alias, which
                # is missing from NumPy 1.24-1.26.
                layout = np.tril(np.ones([1, ctx, ctx], dtype=np.bool_))

                bst = trans.BlocksparseTransformer(layout,
                                                   heads=heads,
                                                   block_size=bsize,
                                                   mask_callback=mask_callback)

                shape = (batch, heads, bst.blocks, bsize, bsize)

                if ones:
                    cpuX = np.ones(shape, dtype=np.float32)
                    cpuE = np.ones(shape, dtype=np.float32)
                else:
                    # inputs are round-tripped through float16 before use
                    cpuX = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)

                x = tf.placeholder(tf.float32, cpuX.shape)
                e = tf.placeholder(tf.float32, cpuE.shape)
                feed_dict = {x: cpuX, e: cpuE}

                xf = ew.float_cast(x, dtype=tf.bfloat16)

                y = bst.masked_softmax(xf, scale=scale)

                y = ew.float_cast(y, dtype=tf.float32)

                dx, = tf.gradients(y, [x], e)

                y, dx = sess.run([y, dx], feed_dict)

                # reference results computed by the transformer's own test helpers
                Y = bst.masked_softmax_test(cpuX, scale=scale)
                DX = bst.masked_softmax_grad_test(cpuE, Y, scale=scale)

                print("testBlocksparseSoftmax", bsize)
                for op, dev, cpu in [
                    [" Y", y, Y],
                    ["DX", dx, DX],
                ]:
                    self.compare_results(op, dev, cpu)
    def testMatMul(self):
        """Print the error of ``dw_matmul_large_n`` and ``tf.matmul`` vs. NumPy.

        For every entry of the module-level ``shapes`` the product
        ``X^T . E`` is computed on the GPU by both ops, in float32 and in
        float16, and compared against a reference computed with NumPy in
        float64.  Results are printed; nothing is asserted.
        """
        session_config = tf.ConfigProto(
            intra_op_parallelism_threads=1,
            inter_op_parallelism_threads=1)

        with self.test_session(config=session_config) as sess:
            for shape in shapes:

                np.random.seed(int(time()))

                # Inputs are round-tripped through float16 before use.
                host_x = np.random.normal(loc=0.1, scale=1.0, size=shape)
                host_x = host_x.astype(np.float16).astype(np.float32)
                host_e = np.random.normal(loc=0.2, scale=1.0, size=shape)
                host_e = host_e.astype(np.float16).astype(np.float32)

                # Reference product accumulated in float64.
                reference = np.dot(
                    host_x.astype(np.float64).T,
                    host_e.astype(np.float64),
                ).astype(np.float32)

                for dtype in (tf.float32, tf.float16):  #tf.float16, tf.bfloat16
                    reduced_precision = dtype is not tf.float32

                    with tf.device("/gpu:0"):
                        x = tf.placeholder(tf.float32, host_x.shape, name="x")
                        e = tf.placeholder(tf.float32, host_e.shape, name="e")

                        xf, ef = x, e
                        if reduced_precision:
                            xf = ew.float_cast(x, dtype=dtype)
                            ef = ew.float_cast(e, dtype=dtype)

                        custom_op = dw_matmul_large_n(xf, ef)
                        cublas_op = tf.matmul(xf, ef, transpose_a=True, transpose_b=False)

                        if reduced_precision:
                            cublas_op = ew.float_cast(cublas_op, dtype=tf.float32, dx_dtype=dtype)

                        custom_out, cublas_out = sess.run(
                            [custom_op, cublas_op], {x: host_x, e: host_e})

                    for op, dev in (("custom", custom_out), ("cublas", cublas_out)):
                        cpu = reference

                        dif     = np.abs(cpu - dev)
                        maxdif  = dif.max()
                        avgval  = np.average(np.abs(cpu))
                        # relative max error; computed but not part of the printout
                        max_err = maxdif / avgval if avgval != 0 else maxdif
                        l2_err  = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum())

                        print("%s, depth:%8d shape:%12s, op:%s, err:%17.12f, l2_err:%17.12f" % (dtype.name, shape[0], str(cpu.shape), op, maxdif, l2_err))
    def testBiasRelu(self):
        """Compare ``ew.reduce_max`` (forward and gradient) on GPU against CPU.

        Despite its name, this method exercises ``ew.reduce_max`` along the
        second-to-last axis with ``keepdims=True``.  For every entry of the
        module-level ``shapes`` it prints how many elements differ by more
        than 0.01, that count as a percentage, and the relative L2 error.
        """
        session_config = tf.ConfigProto(
            intra_op_parallelism_threads=1,
            inter_op_parallelism_threads=1)

        with self.test_session(config=session_config) as sess:

            for shapeX in shapes:
                # The reduced axis collapses to size 1 in the output/error shape.
                axis = len(shapeX) - 2
                shapeY = list(shapeX)
                shapeY[axis] = 1

                np.random.seed(int(time()))
                lo, hi = -2**14, 2**14
                host_x = np.random.uniform(lo, hi, shapeX).astype(np.float16).astype(np.float32)
                host_e = np.random.uniform(lo, hi, shapeY).astype(np.float16).astype(np.float32)

                for dtype in (tf.float16, ):  #tf.float16, tf.float32

                    per_device = []
                    for device in ("gpu", "cpu"):

                        # Only the GPU path runs in reduced precision.
                        cast = device == "gpu" and dtype is not tf.float32

                        with tf.device("/%s:0" % device), tf.name_scope(device):

                            x = tf.placeholder(tf.float32, host_x.shape, name="x")
                            e = tf.placeholder(tf.float32, host_e.shape, name="e")

                            xf = x
                            if cast:
                                xf = ew.float_cast(x, dtype=dtype)

                            y = ew.reduce_max(xf, axis=axis, keepdims=True)

                            if cast:
                                y = ew.float_cast(y, dtype=tf.float32)

                            dx, = tf.gradients(y, [x], e)

                            per_device.append(
                                sess.run([y, dx], {x: host_x, e: host_e}))

                    gpu_out, cpu_out = per_device
                    for op, dev, cpu in zip(["y", "dx"], gpu_out, cpu_out):

                        dif     = np.abs(cpu - dev)
                        sum_err = (dif > .01).sum()
                        pct_err = 100*sum_err / cpu.size
                        l2_err  = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum())

                        print("%s, shape:%22s, op:%3s, sum_err: %4d, pct_err: %.4f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, sum_err, pct_err, l2_err))
Esempio n. 6
0
def grouped_lstm(inputs,
                 width,
                 timesteps,
                 initial_state,
                 scope="grouped_lstm",
                 reuse=None,
                 lstm_id=0,
                 layernorm=True):
    """Unroll an LSTM over ``timesteps`` steps using ``fused_lstm_gates``.

    Args:
        inputs: tensor that is split into ``timesteps`` pieces along axis 1;
            its last dimension is the input width.
        width: hidden width; the kernel has shape
            ``[in_width + width, 4 * width]``.
        timesteps: number of steps to unroll.
        initial_state: pair ``(c, h)`` used as the state for the first step.
        scope: variable scope holding ``kernel``, ``bias`` and, when
            ``layernorm`` is set, ``gain``.
        reuse: forwarded to ``tf.variable_scope``.
        lstm_id: used only to name the per-step matmul ops.
        layernorm: when true, the pre-activations go through ``layer_norm``
            (with gain and bias) before the gates; otherwise the bias is
            handed to ``fused_lstm_gates`` directly.

    Returns:
        ``(output, [c, h])`` where ``output`` stacks the per-step ``h`` along
        axis 1 and ``[c, h]`` is the state after the last step.
    """
    fp16 = inputs.dtype is tf.float16

    if layernorm:
        from blocksparse.norms import layer_norm
    if fp16:
        from blocksparse.ewops import float_cast

    in_width = inputs.shape[-1].value

    with tf.variable_scope(scope, reuse=reuse):
        w = tf.get_variable('kernel', shape=[in_width + width, 4 * width])
        b = tf.get_variable('bias', shape=[4 * width])
        if layernorm:
            g = tf.get_variable('gain', shape=[4 * width])

        c, h = initial_state

        if fp16:
            w = float_cast(w, dtype=tf.float16, dx_dtype=tf.float16)

        if timesteps > 1:
            # Squeeze only the time axis. A bare tf.squeeze() would also drop
            # a batch or feature dimension that happens to be of size 1.
            inputs = [
                tf.squeeze(x, axis=1)
                for x in tf.split(inputs, timesteps, axis=1)
            ]
        else:
            inputs = [tf.reshape(inputs, [-1, in_width])]

        outputs = []
        for t, x in enumerate(inputs):

            # pre-activations for all four gates from [x, h] in one matmul
            h = tf.matmul(tf.concat([x, h], 1),
                          w,
                          name="lstm_%02d/step_%04d" % (lstm_id, t))

            if layernorm:
                h = layer_norm(h, g, b, axis=1, segments=4)
                c, h = fused_lstm_gates(c, h, forget_bias=1.0)
            else:
                c, h = fused_lstm_gates(c, h, bias=b, forget_bias=1.0)

            outputs.append(h)

        output = tf.stack(outputs, axis=1)

    return output, [c, h]
Esempio n. 7
0
    def forward(self, inputs, ema=None):
        """Build the output projection and return its gradient-stopped result.

        The per-step ``inputs`` are concatenated in groups of
        ``hps.x_group_size``, multiplied by the l2-normalized weight through
        ``hps.bsmm["y"]``, cast to float32 and finally scaled and shifted by
        the gain and bias variables.

        Args:
            inputs: list of input tensors; stored unchanged on ``self.inputs``.
            ema: optional object whose ``average(var)`` result replaces each
                variable when given.

        Returns:
            ``tf.stop_gradient`` of ``self.outputs``.
        """
        hps = self.hps
        bsmm = hps.bsmm
        group_size = hps.x_group_size
        num_groups = len(inputs) // group_size
        sproj = hps.sproj_out

        self.inputs = inputs

        if sproj is not None:
            inputs = [sproj.gather(h) for h in inputs]

        with tf.variable_scope(self.scope):
            fc = bsmm["y"]

            w = hps.get_variable("w", fc.w_shape, normal_initializer())
            g = hps.get_variable("g", [hps.nvocab], ones_initializer())
            b = hps.get_variable("b", [hps.nvocab], zeros_initializer())

            self.params = [w, g, b]

            if ema is not None:
                w, g, b = ema.average(w), ema.average(g), ema.average(b)

            w = fc.l2_normalize(w, dtype=hps.dtype)

            concat_axis = 1 - hps.axis

            # compute the fc matmul in groups for better memory efficiency.
            # (nsteps x nbatch, nvocab) = (nsteps x nbatch, hidden) . (nhidden, nvocab)
            ygroups = [
                fc(tf.concat(inputs[start:start + group_size], concat_axis),
                   w,
                   dw_dtype=hps.dw_dtype)
                for start in range(0, num_groups * group_size, group_size)
            ]

            y = tf.concat(ygroups, concat_axis)

            # cast to float32 before entering cost function
            y = ew.float_cast(y, dtype=tf.float32, dx_dtype=hps.dx_dtype)

            if hps.axis == 0:
                y = tf.transpose(y)

            # keep only the first nvocab columns when nvocab is not a multiple of 32
            if hps.nvocab % 32:
                y = tf.slice(y, [0, 0], [-1, hps.nvocab])

            self.outputs = y * g + b

            outputs = tf.stop_gradient(self.outputs)

        return outputs
Esempio n. 8
0
def print_act_stats(x, _str="", flatten=False):
    """Attach a ``tf.Print`` op reporting mean/std statistics of ``x``.

    The tensor is cast to float32 (and flattened when ``flatten`` is true),
    its moments are taken over axis 0 for rank 1 or 2 and over axes
    ``[0, 2, 3]`` for rank 4, and the min / mean / max of the resulting means
    and standard deviations are printed whenever the returned tensor is
    evaluated.  The label is also printed once at graph-construction time.

    Args:
        x: tensor to inspect.
        _str: tag placed in brackets in front of the tensor name.
        flatten: reshape to rank 1 before computing the statistics.

    Returns:
        ``x`` passed through ``tf.Print``.

    Raises:
        ValueError: if the (possibly flattened) tensor's rank is not 1, 2
            or 4.  Previously this surfaced as an ``UnboundLocalError``.
    """
    _x = ew.float_cast(x, dtype=tf.float32)
    if flatten:
        _x = tf.reshape(_x, [-1])

    rank = len(_x.get_shape())
    if rank in (1, 2):
        axes = [0]
    elif rank == 4:
        axes = [0, 2, 3]
    else:
        raise ValueError(
            "print_act_stats supports tensors of rank 1, 2 or 4, got rank %d"
            % rank)

    x_mean, x_var = tf.nn.moments(_x, axes, keep_dims=True)
    x_std = tf.sqrt(x_var)

    stats = [
        tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
        tf.reduce_min(x_std), tf.reduce_mean(x_std), tf.reduce_max(x_std),
    ]
    label = "[" + _str + "] " + x.name
    print(label)
    return tf.Print(x, stats, label)
Esempio n. 9
0
    def forward(self, x, ema=None):
        """Build the embedding lookup and return gradient-stopped output groups.

        The embedding matrix is l2-normalized along dim 1 and scaled by a
        per-row gain before the lookup.  The looked-up tensor is split into
        ``hps.nsteps // hps.x_group_size`` groups along axis 0, each reshaped
        to ``[-1, hps.nembd]`` (transposed when ``hps.axis == 0``) and cast to
        ``hps.dtype``.

        Args:
            x: indices passed to ``tf.nn.embedding_lookup``.
            ema: optional object whose ``average(var)`` result replaces each
                variable when given.

        Returns:
            List with ``tf.stop_gradient`` applied to every tensor in
            ``self.outputs``.
        """
        hps = self.hps

        assert hps.nsteps % hps.x_group_size == 0
        num_groups = hps.nsteps // hps.x_group_size

        with tf.variable_scope(self.scope):

            w = hps.get_variable("w", [hps.nvocab, hps.nembd],
                                 ortho_initializer())
            g = hps.get_variable("g", [hps.nvocab, 1], ones_initializer())

            self.params = [w, g]

            if ema is not None:
                w, g = ema.average(w), ema.average(g)

            w = tf.nn.l2_normalize(w, dim=1) * g

            # x (nsteps, nbatch)
            # w (nvocab, nembd)
            # o (nsteps, nbatch, nembd)
            words = tf.nn.embedding_lookup(w, x)

            use_dropout = self.train and hps.dropout > 0 and hps.dropout_input > 0
            if use_dropout:
                keep_prob = 1. - hps.dropout
                noise_shape = [hps.nsteps, hps.batch_size, 1]
                words = tf.nn.dropout(words, keep_prob, noise_shape)

            # (x_group_size x nbatch, nembd) * xgroups
            chunks = tf.split(words, num_groups, 0)
            outputs = [tf.reshape(chunk, [-1, hps.nembd]) for chunk in chunks]

            if hps.axis == 0:
                outputs = [tf.transpose(chunk) for chunk in outputs]

            # potentially down cast to fp16 to save memory and speed things up
            self.outputs = [
                ew.float_cast(chunk, dtype=hps.dtype) for chunk in outputs
            ]

            outputs = [tf.stop_gradient(chunk) for chunk in self.outputs]

        return outputs
Esempio n. 10
0
    def apply(self, params, qspec=None):
        """Create the EMA shadow variables for ``params`` and their update op.

        A non-trainable ``ema`` variable is created per parameter,
        initialized from the parameter's initial value, and stored in
        ``self.averages`` keyed by ``param.value()``.  It is float16 when
        ``self.fp16 == 2`` or when ``self.fp16`` is truthy and
        ``is_param_casted(param)`` holds, float32 otherwise.

        Args:
            params: iterable of variables to track; it is traversed twice.
            qspec: optional quantization spec; when given each update is
                passed through ``quantize`` and assigned back to the shadow
                variable.

        Returns:
            A ``tf.group`` of all update ops.
        """
        with tf.device("/gpu:0"), tf.control_dependencies(None):
            for param in params:
                # only use fp16 for params that are explicitly cast to fp16 before use
                use_fp16 = self.fp16 == 2 or (self.fp16 and is_param_casted(param))

                init = param.initialized_value()
                dtype = tf.float32
                if use_fp16:
                    init = float_cast(init, dtype=tf.float16)
                    dtype = tf.float16

                scope_name = param.op.name + "/" + self.name
                with tf.variable_scope(None, scope_name):
                    shadow = tf.get_variable(
                        "ema", dtype=dtype, initializer=init, trainable=False)

                # use the Identity read op output as the key
                # this lets us lookup ema vars by Cast op outputs
                self.averages[param.value()] = shadow

                ops.add_to_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                                      param)

        ema_ops = []
        for param in params:
            shadow = self.averages[param.value()]

            # the gate is forwarded only when gating is enabled and the param has one
            gate = getattr(param, "gate", None)
            gates = [gate] if (self.gated and gate is not None) else []

            update = ema_op(shadow, param, gates, decay=self.decay)

            if qspec is not None:
                update = shadow.assign(
                    quantize(update, qspec, name="ema_" + param.op.name))

            ema_ops.append(update)

        return tf.group(*ema_ops)
Esempio n. 11
0
    def forward(self, inputs, states, ema=None):
        """Build the recurrent layer over ``inputs`` and return outputs and state.

        Creates (or fetches) weight, gain and bias variables for each name in
        ``self.param_names``, runs ``self.cell`` over every time slice of
        every input group, and, when ``hps.recompute`` and ``self.train``
        are both set, additionally builds a second copy of each group's
        computation starting from gradient-stopped state (collected in
        ``self.segments``).

        Args:
            inputs: iterable of input groups; each group is fed to ``bsmm.x``
                and the result split into ``hps.x_group_size`` pieces.
            states: tensor that unstacks into the pair ``(c, h)``.
            ema: optional object whose ``average(var)`` result replaces each
                variable when given.

        Returns:
            ``(outputs, states)`` where ``outputs`` is the list of per-step
            ``h`` tensors wrapped in ``tf.stop_gradient`` and ``states`` is
            the final ``(c, h)`` cast to float32 and stacked on axis 0.
        """

        hps = self.hps
        bsmm = hps.bsmm

        with tf.variable_scope(self.scope) as scope:

            self.param_names = ['xi', 'xf', 'xo', 'xu', 'hi', 'hf', 'ho', 'hu']
            self.params = dict()

            for p in self.param_names:

                # 'x*' params use the input projection, 'h*' params the hidden one
                if 'x' in p:
                    bsmm_p, size = (bsmm.x, hps.nproj_in)

                elif 'h' in p:
                    bsmm_p, size = (bsmm.h, hps.nhidden)

                # only the 'hf' bias starts at hps.forget_bias; all others start at zero
                b_init = ones_initializer(
                    hps.forget_bias) if p == 'hf' else zeros_initializer()

                w = hps.get_variable("w_" + p, bsmm_p.w_shape,
                                     bsmm_p.identity_init())
                g = hps.get_variable("g_" + p, [size], ones_initializer())
                b = hps.get_variable("b_" + p, [size], b_init)

                if ema is not None:
                    w = ema.average(w)
                    g = ema.average(g)
                    b = ema.average(b)

                wc = ew.float_cast(w, dtype=hps.dtype)

                # (cast weight, gain, bias, uncast weight)
                self.params[p] = (wc, g, b, w)

            c, h = tf.unstack(states, num=2)
            c = ew.float_cast(c, dtype=hps.dtype)
            h = ew.float_cast(h, dtype=hps.dtype)

            # Only the 'x*' params are consumed directly here; the 'h*' entries
            # are presumably read by self.cell -- TODO confirm.
            xi_w, xi_g, xi_b = self.params["xi"][0:3]
            xf_w, xf_g, xf_b = self.params["xf"][0:3]
            xo_w, xo_g, xo_b = self.params["xo"][0:3]
            xu_w, xu_g, xu_b = self.params["xu"][0:3]

            self.inputs = inputs
            self.outputs = []
            self.segments = []

            for xgroup in inputs:

                if hps.recompute and self.train:
                    # We compute gradient one segment at a time, so prevent tf.gradients from going too far.
                    # We also want to add control inputs to the start of the segment so having wrappers
                    # around the segment inputs is handy.
                    seg = [(tf.stop_gradient(c), tf.stop_gradient(h))]
                    self.segments.append(seg)

                # delay input expansion to just prior to use (saves memory)
                with tf.control_dependencies([h]):
                    xwi = bsmm.x(xgroup, xi_w, dw_dtype=hps.dw_dtype)
                    xwf = bsmm.x(xgroup, xf_w, dw_dtype=hps.dw_dtype)
                    xwo = bsmm.x(xgroup, xo_w, dw_dtype=hps.dw_dtype)
                    xwu = bsmm.x(xgroup, xu_w, dw_dtype=hps.dw_dtype)

                # one slice per time step within the group
                xwi = tf.split(xwi, hps.x_group_size, 1 - hps.axis)
                xwf = tf.split(xwf, hps.x_group_size, 1 - hps.axis)
                xwo = tf.split(xwo, hps.x_group_size, 1 - hps.axis)
                xwu = tf.split(xwu, hps.x_group_size, 1 - hps.axis)

                # masks[step][lstep] holds the mask returned by each cell call,
                # so the recompute pass below can feed the same masks back in
                masks = []
                for xi, xf, xo, xu in zip(xwi, xwf, xwo, xwu):
                    xi = layer_norm(xi, xi_g, xi_b, axis=hps.axis)
                    xf = layer_norm(xf, xf_g, xf_b, axis=hps.axis)
                    xo = layer_norm(xo, xo_g, xo_b, axis=hps.axis)
                    xu = layer_norm(xu, xu_g, xu_b, axis=hps.axis)

                    c, h, mask = self.cell(c, h, xi, xf, xo, xu)
                    _masks = [mask]
                    # extra cell applications per time step, without input terms
                    for _ in range(1, hps.lsteps):
                        c, h, mask = self.cell(c, h, None, None, None, None)
                        _masks.append(mask)
                    masks.append(_masks)

                    self.outputs.append(h)

                if hps.recompute and self.train:
                    # second copy of this group's computation, starting from the
                    # gradient-stopped state recorded at the top of the segment
                    with tf.name_scope("f_seg_%04d_%d" %
                                       (len(self.segments) - 1, len(seg) - 1)):

                        c_seg, h_seg = seg[0]

                        with tf.control_dependencies([h_seg]):
                            xwi = bsmm.x(xgroup, xi_w, dw_dtype=hps.dw_dtype)
                            xwf = bsmm.x(xgroup, xf_w, dw_dtype=hps.dw_dtype)
                            xwo = bsmm.x(xgroup, xo_w, dw_dtype=hps.dw_dtype)
                            xwu = bsmm.x(xgroup, xu_w, dw_dtype=hps.dw_dtype)

                        xwi = tf.split(xwi, hps.x_group_size, 1 - hps.axis)
                        xwf = tf.split(xwf, hps.x_group_size, 1 - hps.axis)
                        xwo = tf.split(xwo, hps.x_group_size, 1 - hps.axis)
                        xwu = tf.split(xwu, hps.x_group_size, 1 - hps.axis)

                        for xi, xf, xo, xu, mask in zip(
                                xwi, xwf, xwo, xwu, masks):
                            xi = layer_norm(xi, xi_g, xi_b, axis=hps.axis)
                            xf = layer_norm(xf, xf_g, xf_b, axis=hps.axis)
                            xo = layer_norm(xo, xo_g, xo_b, axis=hps.axis)
                            xu = layer_norm(xu, xu_g, xu_b, axis=hps.axis)

                            # here `mask` is the list of masks saved for this step
                            c_seg, h_seg, _ = self.cell(
                                c_seg, h_seg, xi, xf, xo, xu, mask[0])
                            for i in range(1, hps.lsteps):
                                c_seg, h_seg, _ = self.cell(
                                    c_seg, h_seg, None, None, None, None,
                                    mask[i])

                            seg.append((c_seg, h_seg))

            c = ew.float_cast(c, dtype=tf.float32)
            h = ew.float_cast(h, dtype=tf.float32)
            states = tf.stack([c, h], 0)

            # We calculate the gradient internally.
            # Don't let other layer's gradients flow into here.
            # This is possible because the last cell has free c and h
            # params that are populated with zeros in the gradients pass.
            outputs = [tf.stop_gradient(x) for x in self.outputs]

        return outputs, states
    def testTopK(self):
        """Check ``trans.masked_softmax`` (forward and gradient) on the GPU.

        For every entry of the module-level ``shapes`` the op is run without
        a mask and with masks covering the last 2 and last 3 dimensions.
        Unless ``bench`` is non-zero, the results are compared against
        ``trans.masked_softmax_test`` / ``trans.masked_softmax_grad_test`` and
        the fraction of elements whose absolute error exceeds ``rtol`` is
        printed.  When the module-level ``out`` flag is set the arrays are
        also dumped to text files in the working directory.
        """
        config = tf.ConfigProto(
            intra_op_parallelism_threads=1,
            inter_op_parallelism_threads=1)

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):

            for shape in shapes:

                # only used by the disabled top-k variant below
                topK = shape[-1] // 4 # 25% sparsity

                np.random.seed(int(time()))
                cpuX = np.random.uniform(-1.0, 1.0, shape).astype(np.float32)
                cpuE = np.random.uniform(-1.0, 1.0, shape).astype(np.float32)

                X = tf.placeholder(tf.float32, cpuX.shape)
                E = tf.placeholder(tf.float32, cpuE.shape)

                for mask_dims in (0, 2, 3):

                    if mask_dims == 0:
                        mask = M = m_shape = None
                        feed_dict = { X: cpuX, E: cpuE }

                    else:
                        # mask has size 1 in every dim except the trailing mask_dims
                        m_shape = [1 for n in shape]
                        m_shape[-mask_dims:] = shape[-mask_dims:]
                        mask = np.zeros(m_shape, dtype=np.float32)

                        # non-zero only where x <= y
                        if mask_dims == 2:
                            for y, x in np.ndindex(mask.shape[-2:]):
                                if x <= y: mask[:,:,y,x] = 3.0
                        elif mask_dims == 3:
                            for z, y, x in np.ndindex(mask.shape[-3:]):
                                if x <= y: mask[:,z,y,x] = (z+1)*3.0

                        M = tf.placeholder(tf.float32, mask.shape)
                        feed_dict = { X: cpuX, E: cpuE, M: mask }

                    for dtype in (tf.float32, ):  #tf.float16, tf.bfloat16

                        rtol = 1e-4 if dtype is tf.float32 else 1e-1

                        Y = ew.float_cast(X, dtype=dtype)

                        #Y = trans.masked_top_k_softmax(Y, topK, mask=M, scale=2.0)

                        Y = trans.masked_softmax(Y, mask=M, scale=2.0, bench=bench)

                        Y = ew.float_cast(Y, dtype=tf.float32, dx_dtype=dtype)
                        D = tf.gradients(Y, [X], E)

                        devY, (devDX,) = sess.run( [Y, D], feed_dict)

                        # gradient_checker tests are insanely slow, so the
                        # gradient is compared against the reference
                        # implementation instead.

                        if bench == 0:

                            # cpuY  = trans.masked_top_k_softmax_test(cpuX, topK, mask=mask, scale=2.0)

                            cpuY  = trans.masked_softmax_test(cpuX, mask=mask, scale=2.0)
                            cpuDX = trans.masked_softmax_grad_test(cpuE, cpuY, mask=mask, scale=2.0)
                            difY  = np.abs(cpuY -  devY)
                            difDX = np.abs(cpuDX - devDX)
                            # Fraction of elements over tolerance.  Summing the
                            # boolean array directly avoids the ``np.int``
                            # alias, which was removed in NumPy 1.24.
                            cntY  = (difY  > rtol).sum() / difY.size
                            cntDX = (difDX > rtol).sum() / difDX.size

                            print("%s, shape:%18s, mask:%18s, errY:%.5f, errDX:%.5f" % (dtype.name, str(shape), str(m_shape), cntY, cntDX))

                            if out:
                                np.savetxt( "cpuY.txt",  cpuY.reshape(-1,shape[-1]), fmt="%6.3f")
                                np.savetxt( "devY.txt",  devY.reshape(-1,shape[-1]), fmt="%6.3f")
                                np.savetxt("cpuDX.txt", cpuDX.reshape(-1,shape[-1]), fmt="%6.3f")
                                np.savetxt("devDX.txt", devDX.reshape(-1,shape[-1]), fmt="%6.3f")
                                np.savetxt("difDX.txt", difDX.reshape(-1,shape[-1]), fmt="%6.3f")
Esempio n. 13
0
    def testCWiseLinear(self):
        """Compare ``cwise_linear`` and its gradients on GPU against CPU.

        Three variants are built per device -- gain and bias, gain only, bias
        only -- for float32 (with and without relu) and float16 (without
        relu).  The relative max error and relative L2 error of every output
        and gradient are printed; nothing is asserted.
        """
        session_config = tf.ConfigProto(
            intra_op_parallelism_threads=1,
            inter_op_parallelism_threads=1)

        labels = ["y0", "y1", "y2", "dx0", "dg0", "db0", "dx1", "dg1", "dx2", "db2"]

        with self.test_session(config=session_config) as sess:

            for shape in shapes:

                # gain/bias have size 1 everywhere except dimension 1
                bshape    = [1] * len(shape)
                bshape[1] = shape[1]

                if ones:
                    cpuX = np.ones(shape,  dtype=np.float32)
                    cpuE = np.ones(shape,  dtype=np.float32)
                    cpuG = np.ones(bshape, dtype=np.float32)
                    cpuB = np.ones(bshape, dtype=np.float32)
                else:
                    np.random.seed(int(time()))
                    cpuX = np.random.uniform(-1.0, 1.0, shape).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shape).astype(np.float32)
                    cpuG = np.random.uniform(-1.0, 1.0, bshape).astype(np.float32)
                    cpuB = np.random.uniform(-1.0, 1.0, bshape).astype(np.float32)

                for dtype in (tf.float32, tf.float16, ):  # tf.float32, tf.float16, tf.bfloat16
                    # relu is only exercised for float32
                    relus = (True, False) if dtype is tf.float32 else (False,)
                    for relu in relus:

                        per_device = []
                        for device in ("gpu", "cpu"):

                            # only the GPU path runs in reduced precision
                            cast = device == "gpu" and dtype is not tf.float32

                            with tf.device("/%s:0" % device), tf.name_scope(device):

                                x = tf.placeholder(tf.float32, cpuX.shape, name="x")
                                e = tf.placeholder(tf.float32, cpuE.shape, name="e")
                                g = tf.placeholder(tf.float32, cpuG.shape, name="g")
                                b = tf.placeholder(tf.float32, cpuB.shape, name="b")

                                feed_dict = {x: cpuX, e: cpuE, g: cpuG, b: cpuB}

                                xf = x
                                if cast:
                                    xf = float_cast(x, dtype=dtype)

                                outs = [
                                    cwise_linear(xf, gain=g, bias=b, relu=relu),
                                    cwise_linear(xf, gain=g, relu=relu),
                                    cwise_linear(xf, bias=b, relu=relu),
                                ]
                                if cast:
                                    outs = [float_cast(y, dtype=tf.float32) for y in outs]
                                y0, y1, y2 = outs

                                # fetch order must line up with `labels`
                                grads = (tf.gradients(y0, [x, g, b], e)
                                         + tf.gradients(y1, [x, g], e)
                                         + tf.gradients(y2, [x, b], e))

                                per_device.append(
                                    sess.run([y0, y1, y2] + grads, feed_dict))

                        gpu_out, cpu_out = per_device
                        for op, dev, cpu in zip(labels, gpu_out, cpu_out):

                            dif     = np.abs(cpu - dev)
                            maxdif  = dif.max()
                            avgval  = np.average(np.abs(cpu))
                            max_err = maxdif / avgval if avgval != 0 else maxdif
                            l2_err  = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum())

                            print("%s, shape:%16s, op: %3s, relu:%d, err:%17.12f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, int(relu), max_err, l2_err))
Esempio n. 14
0
def fp32(x):
    """Return ``x`` passed through ``float_cast`` with ``dtype=tf.float32``."""
    target_dtype = tf.float32
    return float_cast(x, dtype=target_dtype)
Esempio n. 15
0
    def testEdgeBias(self):
        """Check the GPU ``ConvEdgeBias`` op against its own reference methods.

        For every entry of ``shapes`` and for both the "NCHW" and "NHWC"
        layouts, the op and its gradients w.r.t. input, gain and bias are
        evaluated on ``/gpu:0``.  Unless ``bench`` is non-zero, the results
        are compared with ``edge_bias_test`` / ``edge_bias_grad_test`` and the
        max-abs difference relative to the mean reference magnitude is
        printed.  Nothing is asserted; the errors are only reported.
        """

        config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):

            # `test` is a 1-based index of the shape entry, used only in the report.
            for test, (N, K, RS, HW, strides) in enumerate(shapes, 1):
                # Per-axis strided spatial dims, computed with ceil_div.
                PQ = [ceil_div(x, std) for x, std in zip(HW, strides)]

                for layout in (
                        "NCHW",
                        "NHWC",
                ):  # "NCHW","NHWC"

                    if layout == "NHWC":
                        y_shape = [N] + PQ + [K]
                        x_shape = [N] + HW + [K]
                        w_shape = RS + [K, K]
                    else:
                        y_shape = [N] + [K] + PQ
                        x_shape = [N] + [K] + HW
                        w_shape = [K, K] + RS

                    eb = ConvEdgeBias(y_shape,
                                      x_shape,
                                      w_shape,
                                      strides=strides,
                                      data_format=layout)

                    # The tensor fed to the op has y_shape; gain and bias
                    # take the shape reported by the op object (eb.shape).
                    if ones:
                        cpuX = np.ones(y_shape).astype(np.float32)
                        cpuE = np.ones(y_shape).astype(np.float32)
                        cpuG = np.ones(eb.shape).astype(np.float32)
                        cpuB = np.ones(eb.shape).astype(np.float32)
                    else:
                        cpuX = np.random.uniform(-1.0, 1.0,
                                                 y_shape).astype(np.float32)
                        cpuE = np.random.uniform(-1.0, 1.0,
                                                 y_shape).astype(np.float32)
                        cpuG = np.random.uniform(-1.0, 1.0,
                                                 eb.shape).astype(np.float32)
                        cpuB = np.random.uniform(-1.0, 1.0,
                                                 eb.shape).astype(np.float32)

                    x = tf.placeholder(tf.float32, cpuX.shape)
                    e = tf.placeholder(tf.float32, cpuE.shape)
                    g = tf.placeholder(tf.float32, cpuG.shape)
                    b = tf.placeholder(tf.float32, cpuB.shape)

                    feed_dict = {x: cpuX, e: cpuE, g: cpuG, b: cpuB}

                    for dtype in (tf.float32,
                                  ):  # tf.float32, tf.float16, tf.bfloat16

                        # Cast in, run the op, cast back to fp32 for comparison.
                        xf = ew.float_cast(x, dtype=dtype)
                        y = eb(xf, g, b, bench=bench)
                        y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtype)

                        devY, (devDX, devDG, devDB) = sess.run(
                            [y, tf.gradients(y, [x, g, b], e)], feed_dict)

                        if bench == 0:

                            cpuY = eb.edge_bias_test(cpuX, cpuG, cpuB)
                            cpuDX, cpuDG, cpuDB = eb.edge_bias_grad_test(
                                cpuE, cpuX, cpuG)

                            for op, devT, cpuT in (
                                (" devY", devY, cpuY),
                                ("devDX", devDX, cpuDX),
                                ("devDG", devDG, cpuDG),
                                ("devDB", devDB, cpuDB),
                            ):

                                devT = np.array(devT)
                                difA = cpuT - devT

                                avgval = abs(cpuT).sum() / cpuT.size
                                maxdif = abs(difA).max()
                                # Fall back to the absolute error when the
                                # reference is all zeros (avoids inf/nan).
                                ratio = maxdif if avgval == 0 else maxdif / avgval

                                print(
                                    "%8s, test:%2d layout: %s op:%s err:%17.12f"
                                    % (dtype.name, test, layout, op, ratio))
    def atestBlocksparseMatMulGated(self):
        """Run the gated block-sparse matmul in bfloat16 and report its errors.

        NOTE(review): the leading ``a`` in the name presumably keeps the test
        runner from collecting this method -- confirm.

        A Barabasi-Albert graph (plus the identity and a dense top-left
        corner) provides the block layout.  The op is run with an all-ones
        gate and ``bench=10000``; its output and the gradients w.r.t. input
        and weights are compared with ``fprop_test`` / ``bprop_test`` /
        ``updat_test`` of the same object.  Errors are printed, not asserted.
        """

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            minibatch = 128
            feat_dim = 8 * 56 * 2 * 4
            nodes = feat_dim // 8
            attach = 30
            lowp = tf.bfloat16
            bench_iters = 10000

            graph = networkx.generators.barabasi_albert_graph(nodes, attach)
            adjacency = networkx.adjacency_matrix(graph).toarray().astype(np.int32)
            layout = adjacency + np.eye(nodes, dtype=np.int32)
            layout[0:attach, 0:attach] = 1

            # Report density (percent of active blocks) and the busiest column.
            blocks = layout.sum()
            nodes = layout.shape[0]
            print(100 * blocks / nodes**2)
            print(layout.sum(axis=0).max())

            bsmm = BlocksparseMatMul(
                layout, block_size=8, feature_axis=0, name="test")

            if one:
                X = np.ones(bsmm.i_shape(minibatch), dtype=np.float32)
                E = np.ones(bsmm.o_shape(minibatch), dtype=np.float32)
                W = np.ones(bsmm.w_shape, dtype=np.float32)
                G = np.ones(bsmm.blocks, dtype=np.float32)
            else:
                in_shape = bsmm.i_shape(minibatch)
                X = np.random.uniform(-1.0, 1.0, in_shape).astype(np.float32)
                out_shape = bsmm.o_shape(minibatch)
                E = np.random.uniform(-1.0, 1.0, out_shape).astype(np.float32)
                W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32)
                G = np.random.uniform(0.0, 1.0, bsmm.blocks).astype(np.float32)

            # The gate is unconditionally reset to all ones here, so the gate
            # drawn above is discarded.
            G = np.ones(bsmm.blocks, dtype=np.float32)

            x = tf.constant(X)
            e = tf.constant(E)
            w = tf.constant(W)
            g = tf.constant(G)

            # Low-precision forward pass, then back to fp32 for comparison.
            w_lowp = ew.float_cast(w, dtype=lowp)
            x_lowp = ew.float_cast(x, dtype=lowp)
            y_lowp = bsmm(x_lowp, w_lowp, gate=g, bench=bench_iters)
            y_op = ew.float_cast(y_lowp, dtype=tf.float32, dx_dtype=lowp)

            grad_ops = tf.gradients(y_op, [x, w], e)

            y, (dx, dw) = sess.run([y_op, grad_ops])

            # Reference results from the op object's own test methods.
            Y = bsmm.fprop_test(X, W, gate=G)
            DX = bsmm.bprop_test(E, W, gate=G)
            DW = bsmm.updat_test(X, E, gate=G)

            comparisons = (
                (" y:", Y, y),
                ("dx:", DX, dx),
                ("dw:", DW, dw),
            )
            for op, ref, dev in comparisons:

                delta = np.abs(ref - dev)

                mean_mag = np.average(np.abs(ref))
                worst = delta.max()
                if mean_mag == 0:
                    max_err = worst
                else:
                    max_err = worst / mean_mag

                l2_num = np.sqrt(np.square(delta).sum())
                l2_den = np.sqrt(np.square(ref).sum() + 1e-12)
                l2_err = l2_num / l2_den

                print("%s max_err%%:%11.8f L2_err: %12.10f" %
                      (op, 100 * max_err, l2_err))

                if out:
                    # Dump the first compared tensor to disk and stop.
                    dim = feat_dim if op == "dw:" else minibatch
                    dumps = (
                        ("out.txt", delta),
                        ("outC.txt", ref),
                        ("outD.txt", dev),
                    )
                    for path, arr in dumps:
                        np.savetxt(path, arr.reshape((-1, dim)), fmt='%5.1f')
                    exit()
    def testBlocksparseMatMul(self):
        """Compare ``BlocksparseMatMul`` on the GPU with its reference methods.

        A Barabasi-Albert graph (plus the identity and a dense top-left
        ``m`` x ``m`` corner) is used as the block layout.  For each
        (block size, feature axis) pair, minibatch size and (forward,
        backward) dtype pair from ``dtypes``, the op is chained ``depth``
        times and its output and gradients w.r.t. input and weights are
        evaluated in a session.  Unless ``bench`` is set, the same chain is
        recomputed with ``fprop_test`` / ``bprop_test`` / ``updat_test`` and
        the relative max error and L2 error are printed.

        Nothing is asserted (the ``assertAllClose`` call is commented out).
        The free names ``conf``, ``one``, ``dtypes``, ``l2norm``, ``depth``,
        ``bench``, ``am`` and ``out`` are resolved outside this method.
        """

        # layout = np.zeros((2,2), dtype=np.int32)
        # layout[0,0] = 1

        n, m = 56 * 8, 8
        layout = networkx.generators.barabasi_albert_graph(n, m)
        #layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5)
        # Adjacency matrix plus identity, as an int32 0/1-style block mask.
        layout = networkx.adjacency_matrix(layout).toarray().astype(
            np.int32) + np.eye(n, dtype=np.int32)
        layout[0:m, 0:m] = 1

        #layout[0:60,0:60] = 1
        #layout = np.zeros((4,4), dtype=np.int32)
        #layout = np.ones((28*12,28*12), dtype=np.int32)
        #layout[0,0] = 1

        # Print the percentage of active blocks and the largest column sum.
        blocks = layout.sum()
        n = layout.shape[0]
        print(100 * blocks / n**2)
        print(layout.sum(axis=0).max())
        #exit()

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            for bsize, axis in (
                (32, 1),
                (32, 0),
                (16, 0),
                (8, 0),
            ):  # (32,1), (32,0), (16,0), (8,0)

                bsmm = BlocksparseMatMul(layout,
                                         block_size=bsize,
                                         feature_axis=axis,
                                         name="test")

                if one:
                    W = np.ones(bsmm.w_shape, dtype=np.float32)
                    #W[:] += np.arange(8, dtype=np.float32).reshape(1,8)
                else:
                    W = np.random.uniform(-1.0, 1.0,
                                          bsmm.w_shape).astype(np.float32)

                # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
                # for w, (c, k) in enumerate(bsmm.updat_list):
                #     WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:]

                w = tf.constant(W)

                # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) )
                # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape)
                # print("identity_init: ", (s1 - s2).max())

                for N in (64, ):  # 128,64,32,16,1,

                    if one:
                        X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                        E = np.ones(bsmm.o_shape(N), dtype=np.float32)
                        #X[:] += np.arange(8, dtype=np.float32).reshape(8,1)
                    else:
                        X = np.random.uniform(
                            -1.0, 1.0, bsmm.i_shape(N)).astype(np.float32)
                        E = np.random.uniform(
                            -1.0, 1.0, bsmm.o_shape(N)).astype(np.float32)

                    x = tf.constant(X)
                    e = tf.constant(E)

                    # dtF: dtype of the forward pass; dtB: dtype requested for
                    # the incoming gradient (dx_dtype of the final cast).
                    for dtF, dtB in dtypes:

                        print("Axis:%d Bsize:%2d N:%d F:%s B:%s Params:%d" %
                              (axis, bsize, N, dtF.name, dtB.name,
                               bsize * bsize * blocks))

                        # compute in tensorflow
                        if l2norm:
                            w2 = bsmm.l2_normalize(w, dtype=dtF)
                        else:
                            w2 = ew.float_cast(w, dtype=dtF)

                        y = ew.float_cast(x, dtype=dtF)

                        # Chain the op `depth` times; only the last layer is
                        # given the benchmark repeat count.
                        for j in range(depth):
                            repeat = bench if bench and j == depth - 1 else 0
                            y = bsmm(
                                y, w2, dw_dtype=dtF, bench=repeat
                            )  # (bench and j==depth-1) (bench and j==0)

                        y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtB)
                        # In benchmark mode the forward pass is run once on its
                        # own before the combined forward/backward run below.
                        if bench: sess.run(y)
                        #y = sess.run( y )

                        d = tf.gradients(y, [x, w], e, aggregation_method=am)
                        if depth > 1:
                            d[1] = group_param_grads(d[1], 8)

                        # From here on `y` holds the evaluated array, not the op.
                        y, (dx, dw) = sess.run([y, d])

                        if not bench:
                            # compute in numpy
                            if l2norm:
                                W2 = bsmm.l2_normalize_test(W)
                            else:
                                W2 = W

                            # YY = np.dot(WW.T, X)
                            # ZZ = np.dot(WW  , E)
                            # uu = np.dot( X  , E.T)
                            # UU = np.zeros(bsmm.w_shape, dtype=np.float32)
                            # for w, (c, k) in enumerate(bsmm.updat_list):
                            #     UU[w,:,:] = uu[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize]

                            # Forward reference: keep every layer input on a
                            # stack so the backward loop can pop them in
                            # reverse order.
                            Ys = [X]
                            for j in range(depth):
                                Ys.append(bsmm.fprop_test(Ys[-1], W2))
                            Y = Ys.pop()

                            # Backward reference: accumulate the weight
                            # gradient over all layers (the weights are
                            # shared) while propagating DX back.
                            DW = np.zeros(bsmm.w_shape, dtype=np.float32)
                            DX = E
                            for j in range(depth):
                                DW += bsmm.updat_test(Ys.pop(), DX)
                                DX = bsmm.bprop_test(DX, W2)
                            if l2norm:
                                DW = bsmm.l2_normalize_grad_test(W, DW)

                            for op, cpuA, devA in (
                                    # ("YY:", YY,  y),
                                    # ("ZZ:", ZZ, dx),
                                    # ("UU:", UU, dw),
                                (" y:", Y, y),
                                ("dx:", DX, dx),
                                ("dw:", DW, dw),
                            ):

                                difA = abs(cpuA - devA)

                                # Max error relative to the mean reference
                                # magnitude (absolute if that mean is zero).
                                avgval = np.average(abs(cpuA))
                                maxdif = difA.max()
                                max_err = maxdif if avgval == 0 else maxdif / avgval

                                l2_err = np.sqrt(
                                    np.square(difA).sum()) / np.sqrt(
                                        np.square(cpuA).sum())

                                #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err))

                                print("%s max_err%%:%11.8f L2_err: %12.10f" %
                                      (op, 100 * max_err, l2_err))

                                # rtol = 1e-4 if dtF is tf.float32 else 1e-1
                                # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol)
                                # Debug dump: writes the first compared tensor
                                # to text files and terminates the process.
                                if out:
                                    dim = bsmm.K if op == "dw:" else N
                                    np.savetxt("out.txt",
                                               difA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    np.savetxt("outC.txt",
                                               cpuA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    np.savetxt("outD.txt",
                                               devA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    exit()
                            print("")
    def testLSTMGates(self):
        """Compare ``lstm.fused_lstm_gates`` between the GPU and CPU devices.

        For every entry of ``shapes`` and for fp32 / fp16, random inputs are
        fed to identical graphs built on ``/gpu:0`` and ``/cpu:0`` (the
        low-precision cast is applied on the GPU only).  When ``layernorm`` is
        set, ``norms.layer_norm`` is applied to ``h`` first and the bias is
        folded into it.  The outputs and gradients of both devices are then
        compared and the relative max error and the L2 error are printed.
        Nothing is asserted.
        """

        config = tf.ConfigProto(
            intra_op_parallelism_threads=1,
            inter_op_parallelism_threads=1)

        with self.test_session(config=config) as sess:

            for shape1 in shapes:
                # The gate input is four times as wide as the cell state.
                shape4 = [shape1[0], shape1[1]*4]

                for dtype in (tf.float32, tf.float16):  #tf.float16, tf.bfloat16

                    np.random.seed(int(time()))
                    cpuC = np.random.uniform(-1.0, 1.0, shape1    ).astype(np.float32)
                    cpuH = np.random.uniform(-1.0, 1.0, shape4    ).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shape1    ).astype(np.float32)
                    cpuB = np.random.uniform(-1.0, 1.0, shape4[1:]).astype(np.float32)
                    cpuG = np.random.uniform(-1.0, 1.0, shape4[1:]).astype(np.float32)

                    # results[0] holds the GPU values, results[1] the CPU values.
                    results = []
                    for device in ("gpu", "cpu"):

                        # Low precision is exercised on the GPU only.
                        cast = device == "gpu" and dtype is not tf.float32

                        with tf.device("/%s:0" % device), tf.name_scope(device):

                            c = tf.placeholder(tf.float32, cpuC.shape, name="c")
                            h = tf.placeholder(tf.float32, cpuH.shape, name="h")
                            e = tf.placeholder(tf.float32, cpuE.shape, name="e")
                            b = tf.placeholder(tf.float32, cpuB.shape, name="b")
                            # Shape taken from the array actually fed to g.
                            g = tf.placeholder(tf.float32, cpuG.shape, name="g")

                            feed_dict = {
                                c : cpuC,
                                h : cpuH,
                                e : cpuE,
                                b : cpuB,
                                g : cpuG,
                            }

                            if cast:
                                cf = ew.float_cast(c, dtype=dtype)
                                hf = ew.float_cast(h, dtype=dtype)
                            else:
                                cf, hf = c, h

                            if layernorm:
                                hf = norms.layer_norm(hf, g, b, axis=1, segments=4)
                                bias = None
                            else:
                                bias = b

                            cf, hf = lstm.fused_lstm_gates(cf, hf, bias=bias, forget_bias=1.0)

                            if cast:
                                cf = ew.float_cast(cf, dtype=tf.float32, dx_dtype=dtype)
                                hf = ew.float_cast(hf, dtype=tf.float32, dx_dtype=dtype)

                            # Only the h output receives an explicit incoming
                            # gradient; None is passed for the c output.
                            if layernorm:
                                dc, dh, dg, db = tf.gradients([cf, hf], [c, h, g, b], [None, e])
                                results.append( sess.run( [ cf, hf, dc, dh, dg, db ], feed_dict ) )
                                labels = [" c", " h", "dc", "dh", "dg", "db"]
                            else:
                                dc, dh, db = tf.gradients([cf, hf], [c, h, b], [None, e])
                                results.append( sess.run( [ cf, hf, dc, dh, db ], feed_dict ) )
                                labels = [" c", " h", "dc", "dh", "db"]


                    for op, dev, cpu in zip(labels, results[0], results[1]):

                        dif     = np.abs(cpu - dev)
                        avgval  = np.average(abs(cpu))
                        maxdif  = dif.max()
                        # Relative to the mean CPU magnitude; absolute if that is zero.
                        max_err = maxdif if avgval == 0 else maxdif / avgval
                        l2_err  = np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum())


                        # Report the relative error computed above (it was
                        # previously computed but the raw maxdif was printed).
                        print("%s, shape:%12s, op:%s, err:%17.12f, l2_err:%17.12f" % (dtype.name, str(cpu.shape), op, max_err, l2_err))
    def testEwOps(self):
        """Compare the ``ew`` element-wise ops with TensorFlow's ops on the GPU.

        For each (forward, backward) dtype pair the inputs are either all
        ones (``ones`` set) or uniform in [0.01, 1.0), rounded through the
        test dtype.  The TensorFlow reference always runs on fp32 constants
        (``X``, ``Y``, ``E``, ``B``) while the ``ew`` op runs on constants of
        the test dtype (``x``, ``y``, ``e``, ``b``).  Covered are binary ops,
        ``add_n``, unary ops, broadcast (bias) ops and up/down casts, each
        forward and backward.  For every result the max-abs difference
        relative to the mean reference magnitude is printed; nothing is
        asserted.  When ``out`` is set the first result is dumped to text
        files and the process exits.
        """

        with self.test_session() as sess, tf.device("/gpu:0"):

            for shape in ((32, 1024), ):  # (31,31*4), (11,1023), (33,33),
                for dtypeF, dtypeB in (
                    (np.float16, np.float16), (np.float32, np.float32)
                ):  #, (np.float32, np.float32), (np.float16, np.float16), (np.float16, np.float32),
                    dtypeF = np.dtype(dtypeF)  # Forward
                    dtypeB = np.dtype(dtypeB)  # Backwards

                    # NOTE(review): rtol is not used anywhere below.
                    rtol = 1e-4 if dtypeF.type is np.float32 else 1e-1

                    with tf.name_scope("S%dx%dF%dB%d" %
                                       (shape[0], shape[1], dtypeF.itemsize,
                                        dtypeB.itemsize)):

                        if ones:
                            np_X = np.ones(shape, dtype=np.float32)
                            np_Y = np.ones(shape, dtype=np.float32)
                            np_E = np.ones(shape, dtype=np.float32)
                            np_B = np.ones((1, shape[1]), dtype=np.float32)
                        else:
                            # np_X = np.random.normal(0.0, 10.0, shape).astype(dtypeF).astype(np.float32)
                            # np_E = np.random.normal(0.0, 10.0, shape).astype(dtypeF).astype(np.float32)
                            # np_X.fill(10.0)

                            # Round through the test dtype so both paths see
                            # exactly representable values.
                            np_X = np.random.uniform(
                                0.01, 1.0,
                                shape).astype(dtypeF).astype(np.float32)
                            np_Y = np.random.uniform(
                                0.01, 1.0,
                                shape).astype(dtypeF).astype(np.float32)
                            np_E = np.random.uniform(
                                0.01, 1.0,
                                shape).astype(dtypeB).astype(np.float32)
                            np_B = np.random.uniform(
                                0.01, 1.0, (1, shape[1])).astype(np.float32)

                        # Lower-case constants feed the ew ops ...
                        x = tf.constant(np_X.astype(dtypeF))
                        y = tf.constant(np_Y.astype(dtypeF))
                        e = tf.constant(np_E.astype(dtypeB))
                        b = tf.constant(np_B)

                        # ... upper-case fp32 constants feed the TF reference.
                        X = tf.constant(np_X)
                        Y = tf.constant(np_Y)
                        E = tf.constant(np_E)
                        B = tf.constant(np_B)

                        # Collected (label, reference, ew result) triples.
                        tests = list()

                        # xx = tf.ones(shape, dtype=tf.float32)
                        # ee = tf.ones(shape, dtype=tf.float32)
                        # ew_op1 = ew.dropout(xx, keep_prob=0.5,  scale=2.0)
                        # ew_op2 = ew.dropout(xx, mask=ew_op1[1], scale=2.0)
                        # dx_op  = tf.gradients(ew_op1[0], [xx], ee)
                        # (z1, m), z2, (dx,) = sess.run( [ew_op1, ew_op2, dx_op] )
                        # #print(dx[0,0:8])
                        # print(z1.sum()/z1.size, dx.sum()/dx.size, (z1 - z2).sum(), (z1 - dx).sum())

                        # z = sess.run( ew.sparse_relu(x) )
                        # Z = ew.sparse_relu_test(np_X)
                        # tests.append(("sps_relu: Z ",  Z,  z))

                        # Non-Broadcast Binary Ops
                        for name, tf_fn, ew_fn in (
                            ("     add", tf.add, ew.add),
                            ("     mul", tf.multiply, ew.multiply),
                            ("     sub", tf.subtract, ew.subtract),
                            ("     div", tf.divide, ew.divide),
                            ("     max", tf.maximum, ew.maximum),
                            ("     min", tf.minimum, ew.minimum),
                        ):

                            # I think tf doesn't use fmaxf/fminf and hence has different behaviour for equal numbers.
                            # In fp32 the chance for equality is very small, but not so in fp16
                            if name[-3:] in ("max", "min"
                                             ) and dtypeF.type is np.float16:
                                continue

                            tf_op = tf_fn(X, Y)
                            ew_op = ew_fn(x, y)
                            Z, z = sess.run([tf_op, ew_op])
                            DX, DY = sess.run(tf.gradients(tf_op, [X, Y], E))
                            dx, dy = sess.run(tf.gradients(ew_op, [x, y], e))
                            tests.append((name + ": Z ", Z, z))
                            tests.append((name + ": DX", DX, dx))
                            tests.append((name + ": DY", DY, dy))

                        # N-ary add with two and three inputs (forward only).
                        for name, tf_fn, ew_fn in (("   add_n", tf.add_n,
                                                    ew.add_n8_op), ):

                            tf_op2 = tf_fn([X, Y])
                            ew_op2 = ew_fn([x, y])
                            tf_op3 = tf_fn([X, Y, E])
                            ew_op3 = ew_fn([x, y, e])
                            Z2, z2 = sess.run([tf_op2, ew_op2])
                            Z3, z3 = sess.run([tf_op3, ew_op3])
                            tests.append((name + ": Z2", Z2, z2))
                            tests.append((name + ": Z3", Z3, z3))

                        # Unary Ops
                        for name, tf_fn, ew_fn in (
                            ("      sig", tf.sigmoid, ew.sigmoid),
                            ("     tanh", tf.tanh, ew.tanh),
                            ("      neg", tf.negative, ew.negative),
                            ("      rcp", tf.reciprocal, ew.reciprocal),
                            ("      sqr", tf.square, ew.square),
                            ("     sqrt", tf.sqrt, ew.sqrt),
                            ("      exp", tf.exp, ew.exp),
                            ("      log", tf.log, ew.log),
                            ("     relu", tf.nn.relu, ew.relu),
                            ("      elu", tf.nn.elu, ew.elu),
                            ("     gelu", gelu, ew.gelu),
                            ("    swish", swish, ew.swish),
                            ("fast_gelu", fast_gelu, ew.fast_gelu),
                        ):

                            tf_op = tf_fn(X)
                            ew_op = ew_fn(x)
                            Z, z = sess.run([tf_op, ew_op])
                            DX, = sess.run(tf.gradients(tf_op, [X], E))
                            dx, = sess.run(tf.gradients(ew_op, [x], e))
                            tests.append((name + ": Z ", Z, z))
                            tests.append((name + ": DX", DX, dx))

                        # Broadcast Binary Ops
                        for name, tf_fn, ew_fn in (
                            ("bias_add", tf.add, ew.add),
                            ("bias_mul", tf.multiply, ew.multiply),
                        ):

                            tf_op = tf_fn(X, B)
                            ew_op = ew_fn(x, b)
                            Z, z = sess.run([tf_op, ew_op])
                            DX, DB = sess.run(tf.gradients(tf_op, [X, B], E))
                            dx, db = sess.run(tf.gradients(ew_op, [x, b], e))
                            tests.append((name + ": Z ", Z, z))
                            tests.append((name + ": DX", DX, dx))
                            tests.append((name + ": DB", DB, db))

                        # Up Cast
                        ew_op = ew.float_cast(x,
                                              dtype=tf.float32,
                                              dx_dtype=dtypeB.type)
                        z = sess.run(ew_op)
                        dx, = sess.run(tf.gradients(ew_op, [x], e))
                        tests.append(("  upCast: Z ", np_X, z))
                        tests.append(("  upCast: DX", np_E, dx))

                        #Down Cast
                        if dtypeF.type is np.float32:
                            Z = np_X.astype(np.float16)
                            DX = np_E.astype(np.float16)
                            e16 = tf.constant(DX)
                            ew_op = ew.float_cast(x, dtype=tf.float16)
                            z = sess.run(ew_op)
                            dx, = sess.run(tf.gradients(ew_op, [x], e16))
                            tests.append(("downCast: Z ", Z, z))
                            tests.append(("downCast: DX", DX, dx))

                        for op, tfT, ewT in (tests):

                            dif = tfT - ewT

                            avgval = abs(tfT).sum() / tfT.size
                            maxdif = abs(dif).max()
                            # An all-zero reference (e.g. "sub" with `ones`
                            # set) would otherwise give 0/0 = nan.
                            ratio = maxdif if avgval == 0 else maxdif / avgval

                            print(
                                "dtypeF:f%d, dtypeB:f%d, shape:%s, op:%s err:%17.12f"
                                % (dtypeF.itemsize, dtypeB.itemsize,
                                   str(shape), op, ratio))

                            # print(ewT[0,0,:,:])
                            # print(tfT[0,0,:,:])
                            # exit()

                            if out:  # and ratio > 1.0:
                                np.savetxt("out.txt", dif, fmt='%5.2f')
                                np.savetxt("outC.txt", tfT, fmt='%5.2f')
                                np.savetxt("outD.txt", ewT, fmt='%5.2f')
                                exit()
    def testBlocksparseTransformerDense(self):
        """Compare block-sparse attention with a fully dense layout to dense matmuls.

        For block sizes 16, 32 and 64 an all-true layout is passed to
        ``trans.BlocksparseTransformer``.  Its ``query_key_op`` -> ``softmax``
        -> ``weight_value_op`` pipeline is run in fp16 next to a reference
        built from ``tf.matmul`` and ``trans.softmax`` on the same fp16
        tensors.  Unless ``bench`` is set, the output and the gradients w.r.t.
        q, k and v of both paths are handed to ``self.compare_results``.
        """

        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            for bsize in (16, 32, 64):

                # Builtin bool instead of the np.bool alias, which was
                # deprecated in NumPy 1.20 and removed in 1.24.
                layout = np.ones([heads, ctx, ctx], dtype=bool)
                bst = trans.BlocksparseTransformer(layout, block_size=bsize)

                shape = (batch, ctx * bsize, heads * state)

                if ones:
                    cpuQ = np.ones(shape, dtype=np.float32)
                    cpuK = np.ones(shape, dtype=np.float32)
                    cpuV = np.ones(shape, dtype=np.float32)
                    cpuE = np.ones(shape, dtype=np.float32)
                else:
                    # Round through fp16 so the values survive the cast below.
                    cpuQ = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    cpuK = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    cpuV = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)

                q = tf.placeholder(tf.float32, shape)
                k = tf.placeholder(tf.float32, shape)
                v = tf.placeholder(tf.float32, shape)
                e = tf.placeholder(tf.float32, shape)

                feed_dict = {q: cpuQ, k: cpuK, v: cpuV, e: cpuE}

                qf = ew.float_cast(q, dtype=tf.float16)
                kf = ew.float_cast(k, dtype=tf.float16)
                vf = ew.float_cast(v, dtype=tf.float16)

                # Block-sparse path (lower-case names).
                w = bst.query_key_op(qf, kf)
                w = bst.softmax(w, scale=scale)
                y = bst.weight_value_op(w, vf)

                # Dense reference path (upper-case names): split out the head
                # axis, swap it with the sequence axis, then use plain matmuls.
                qf = trans.transpose_0213(
                    tf.reshape(qf, [batch, ctx * bsize, heads, state]))
                kf = trans.transpose_0213(
                    tf.reshape(kf, [batch, ctx * bsize, heads, state]))
                vf = trans.transpose_0213(
                    tf.reshape(vf, [batch, ctx * bsize, heads, state]))
                W = tf.matmul(qf, kf, transpose_b=True)
                W = trans.softmax(W, scale=scale)
                Y = tf.matmul(W, vf)
                Y = tf.reshape(trans.transpose_0213(Y),
                               [batch, ctx * bsize, heads * state])

                y = ew.float_cast(y, dtype=tf.float32)
                Y = ew.float_cast(Y, dtype=tf.float32)

                # After these runs y / Y hold evaluated arrays, not ops.
                y, (dq, dk,
                    dv) = sess.run([y, tf.gradients(y, [q, k, v], e)],
                                   feed_dict)
                Y, (DQ, DK,
                    DV) = sess.run([Y, tf.gradients(Y, [q, k, v], e)],
                                   feed_dict)

                print("testBlocksparseTransformerDense", bsize)
                if not bench:
                    for op, dev, cpu in [
                        [" Y", y, Y],
                        ["DV", dv, DV],
                        ["DK", dk, DK],
                        ["DQ", dq, DQ],
                    ]:
                        self.compare_results(op, dev, cpu)
    def testBlocksparseTransformerSparse(self):
        """Check the masked block-sparse attention ops against reference values.

        For every block size a lower-triangular block layout is built and the
        ``query_key_op`` -> ``masked_softmax`` -> ``weight_value_op`` pipeline
        is run in float16 on ``/gpu:0``.  The output and the gradients w.r.t.
        q, k and v are then compared (unless ``bench`` is set) with values
        produced by the transformer object's ``*_test`` methods.
        """
        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            for bsize in (16, 32, 64):

                # Keep block (row, col) only when col <= row, i.e. the lower
                # triangle of the last two axes.  The builtin ``bool`` is used
                # because the ``np.bool`` alias was removed in NumPy 1.24.
                layout = np.tril(np.ones([heads, ctx, ctx], dtype=bool))
                bst = trans.BlocksparseTransformer(layout,
                                                   block_size=bsize,
                                                   mask_callback=mask_callback)

                shape = (batch, ctx * bsize, heads * state)

                if ones:
                    cpuQ = np.ones(shape, dtype=np.float32)
                    cpuK = np.ones(shape, dtype=np.float32)
                    cpuV = np.ones(shape, dtype=np.float32)
                    cpuE = np.ones(shape, dtype=np.float32)
                else:
                    # Round-trip through float16 so the float32 host data is
                    # exactly representable in the float16 device tensors.
                    cpuQ = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    cpuK = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    cpuV = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shape).astype(
                        np.float16).astype(np.float32)

                q = tf.placeholder(tf.float32, shape)
                k = tf.placeholder(tf.float32, shape)
                v = tf.placeholder(tf.float32, shape)
                e = tf.placeholder(tf.float32, shape)

                feed_dict = {q: cpuQ, k: cpuK, v: cpuV, e: cpuE}

                qf = ew.float_cast(q, dtype=tf.float16)
                kf = ew.float_cast(k, dtype=tf.float16)
                vf = ew.float_cast(v, dtype=tf.float16)

                # Device path under test.
                w = bst.query_key_op(qf, kf)
                w = bst.masked_softmax(w, scale=scale)
                y = bst.weight_value_op(w, vf)

                y = ew.float_cast(y, dtype=tf.float32)

                dq, dk, dv = tf.gradients(y, [q, k, v], e)
                y, dq, dk, dv = sess.run([y, dq, dk, dv], feed_dict)

                # Reference forward pass.
                W = bst.nt_test(cpuQ, cpuK)
                W = bst.masked_softmax_test(W, scale=scale)
                Y = bst.nn_test(W, cpuV)

                # Reference backward pass, seeded with the same error cpuE.
                DV = bst.tn_test(W, cpuE)
                DW = bst.nt_test(cpuE, cpuV)

                DW = bst.masked_softmax_grad_test(DW, W, scale=scale)

                DQ = bst.nn_test(DW, cpuK)
                DK = bst.tn_test(DW, cpuQ)

                print("testBlocksparseTransformerSparse", bsize)
                if not bench:
                    for op, dev, cpu in [
                        [" Y", y, Y],
                        ["DV", dv, DV],
                        ["DK", dk, DK],
                        ["DQ", dq, DQ],
                    ]:
                        self.compare_results(op, dev, cpu)
Esempio n. 22
0
    def forward(self, inputs, states, ema=None):
        """Build the forward graph of this recurrent layer over grouped inputs.

        Creates (or fetches through ``hps.get_variable``) a weight, gain and
        bias per parameter name, then runs ``self.cell`` over every time step
        obtained by splitting each input group.  When ``hps.recompute`` and
        ``self.train`` are both set, a second copy of each group's forward
        pass is built from gradient-stopped start states and recorded in
        ``self.segments``.

        Side effects: sets ``self.param_names``, ``self.params``,
        ``self.inputs``, ``self.outputs`` and ``self.segments``.

        Args:
            inputs: iterable of input groups; each group is passed to
                ``bsmm["x"]`` and split into ``hps.x_group_size`` pieces.
            states: tensor unstacked into exactly two parts, ``c`` and ``h``.
            ema: optional object exposing ``average(var)``; when given, the
                averaged values are used in place of the raw variables.

        Returns:
            ``(outputs, states)``: the per-step ``h`` tensors, each wrapped in
            ``tf.stop_gradient``, and the final ``c``/``h`` cast to float32
            and stacked along axis 0.
        """

        hps = self.hps
        bsmm = hps.bsmm

        with tf.variable_scope(self.scope) as scope:

            # One parameter set per letter of "amifou", plus one "h<i>" set per
            # internal step (a single shared one when hps.share_isteps is set).
            self.param_names = list("amifou")
            for i in range(1 if hps.share_isteps else hps.isteps):
                self.param_names.append("h%d" % i)

            self.params = dict()

            for p in self.param_names:

                # "a" and "m" use the input projection matmul and width; every
                # other name uses its own bsmm entry and the hidden width.
                bsmm_p, size = (bsmm["x"],
                                hps.nproj_in) if p in "am" else (bsmm[p],
                                                                 hps.nhidden)

                # Bias of "f" starts at one (presumably the forget gate -- TODO
                # confirm); all other biases start at zero.
                b_init = ones_initializer() if p == 'f' else zeros_initializer(
                )

                w = hps.get_variable("w_" + p, bsmm_p.w_shape,
                                     bsmm_p.identity_init())
                g = hps.get_variable("g_" + p, [size], ones_initializer())
                b = hps.get_variable("b_" + p, [size], b_init)

                if ema is not None:
                    w = ema.average(w)
                    g = ema.average(g)
                    b = ema.average(b)

                wc = ew.float_cast(w, dtype=hps.dtype)

                # Stored as (cast weight, gain, bias, uncast weight).
                self.params[p] = (wc, g, b, w)

            c, h = tf.unstack(states, num=2)
            c = ew.float_cast(c, dtype=hps.dtype)
            h = ew.float_cast(h, dtype=hps.dtype)

            wm, gm, bm = self.params["m"][0:3]
            wa, ga, ba = self.params["a"][0:3]

            self.inputs = inputs
            self.outputs = []
            self.segments = []
            for xgroup in inputs:

                if hps.recompute and self.train:
                    # We compute gradient one segment at a time, so prevent tf.gradients from going too far.
                    # We also want to add control inputs to the start of the segment so having wrappers
                    # around the segment inputs is handy.
                    seg = [(tf.stop_gradient(c), tf.stop_gradient(h))]
                    self.segments.append(seg)

                # delay input expansion to just prior to use (saves memory)
                with tf.control_dependencies([h]):
                    xwm = bsmm["x"](xgroup, wm, dw_dtype=hps.dw_dtype)
                    xwa = bsmm["x"](xgroup, wa, dw_dtype=hps.dw_dtype)

                xwm = tf.split(xwm, hps.x_group_size, 1 - hps.axis)
                xwa = tf.split(xwa, hps.x_group_size, 1 - hps.axis)

                # masks[t][s] is the mask returned by the cell at time step t,
                # internal step s; it is replayed in the recompute pass below.
                masks = []
                for m, a in zip(xwm, xwa):
                    m = layer_norm(m, gm, bm, axis=hps.axis)
                    a = layer_norm(a, ga, ba, axis=hps.axis)

                    c, h, mask = self.cell(c, h, m, a)
                    _masks = [mask]
                    # Remaining internal steps run without fresh input.
                    for _ in range(1, hps.lsteps):
                        c, h, mask = self.cell(c, h, None, None)
                        _masks.append(mask)
                    masks.append(_masks)

                    self.outputs.append(h)

                if hps.recompute and self.train:
                    # Rebuild this group's forward pass from the gradient-stopped
                    # start states, reusing the masks recorded above.
                    with tf.name_scope("f_seg_%04d_%d" %
                                       (len(self.segments) - 1, len(seg) - 1)):

                        c_seg, h_seg = seg[0]

                        with tf.control_dependencies([h_seg]):
                            xwm = bsmm["x"](xgroup, wm, dw_dtype=hps.dw_dtype)
                            xwa = bsmm["x"](xgroup, wa, dw_dtype=hps.dw_dtype)
                        xwm = tf.split(xwm, hps.x_group_size, 1 - hps.axis)
                        xwa = tf.split(xwa, hps.x_group_size, 1 - hps.axis)

                        for m, a, mask in zip(xwm, xwa, masks):
                            m = layer_norm(m, gm, bm, axis=hps.axis)
                            a = layer_norm(a, ga, ba, axis=hps.axis)

                            c_seg, h_seg, _ = self.cell(
                                c_seg, h_seg, m, a, mask[0])
                            for i in range(1, hps.lsteps):
                                c_seg, h_seg, _ = self.cell(
                                    c_seg, h_seg, None, None, mask[i])

                            # One (c, h) pair is recorded per time step.
                            seg.append((c_seg, h_seg))

            c = ew.float_cast(c, dtype=tf.float32)
            h = ew.float_cast(h, dtype=tf.float32)
            states = tf.stack([c, h], 0)

            # We calculate the gradient internally.
            # Don't let other layer's gradients flow into here.
            # This is possible because the last cell has free c and h
            # params that are populated with zeros in the gradients pass.
            outputs = [tf.stop_gradient(x) for x in self.outputs]

        return outputs, states
Esempio n. 23
0
def fp16(x):
    """Cast ``x`` to float16 with ``float_cast``, passing float16 as ``dx_dtype`` too."""
    # no need to cast the gradients back to fp32 as the all-reduce and optimizers handle fp16/fp32 mixed precision
    half = tf.float16
    return float_cast(x, dtype=half, dx_dtype=half)
Esempio n. 24
0
    def testAdafactor(self):
        """Compare the fused Adafactor ops with a NumPy reference update.

        For float32 and float16 gradients and several gradient shapes, one
        optimizer step is run on ``/gpu:0`` through ``adafactor2d_op`` (more
        than one row) or ``adafactor1d_op`` (a single row).  The resulting
        column/row statistics, the update ``x`` and the parameter ``p`` are
        compared with the same step computed in NumPy.
        """
        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            for dtype in (tf.float32, tf.float16):
                for shape_g in (
                    (1024, 1024 * 2),
                    (1, 1024 * 2),
                    (1024, 1023 * 1),
                    (1, 1023 * 1),
                ):

                    shape_c = (1, shape_g[1])
                    shape_r = (shape_g[0], 1)

                    if ones:
                        G = np.ones(shape_g, dtype=np.float32)
                        P = np.ones(shape_g, dtype=np.float32)
                        C = np.zeros(shape_c, dtype=np.float32)
                        R = np.zeros(shape_r, dtype=np.float32)
                    else:
                        # Round-trip through float16 so the host data is
                        # exactly representable when dtype is float16.
                        G = np.random.uniform(-1.0, 1.0, shape_g).astype(
                            np.float16).astype(np.float32)
                        P = np.random.uniform(-1.0, 1.0, shape_g).astype(
                            np.float16).astype(np.float32)
                        C = np.random.uniform(0.0, 1.0, shape_c).astype(
                            np.float16).astype(np.float32)
                        R = np.random.uniform(0.0, 1.0, shape_r).astype(
                            np.float16).astype(np.float32)

                    g_in = tf.placeholder(tf.float32, G.shape)
                    p = tf.Variable(initial_value=P, name="p")
                    c = tf.Variable(initial_value=C, name="c")
                    r = tf.Variable(initial_value=R, name="r")
                    sess.run(tf.global_variables_initializer())

                    g = ew.float_cast(g_in, dtype=dtype)

                    # adafactor has its own fused infinity filtering but quick test of this standalone op here.
                    g = ew.filter_infinity(g)

                    # Feed the placeholder, not the derived tensor `g`: feeding
                    # `g` would override it and the cast/filter ops above would
                    # never execute.
                    feed_dict = {g_in: G}

                    if shape_g[0] > 1:

                        p, c, r, x, _ = sess.run(adafactor2d_op(
                            p,
                            c,
                            r,
                            g,
                            beta2,
                            learn_rate,
                            grad_scale,
                            clip_thresh,
                            epsilon=epsilon,
                            zero_nans=True),
                                                 feed_dict=feed_dict)

                        # NumPy reference for the factored (row/column) update.
                        C = beta2 * C + (1.0 - beta2) * np.mean(
                            np.square(G) + epsilon, axis=0, keepdims=True)
                        R = beta2 * R + (1.0 - beta2) * np.mean(
                            np.square(G) + epsilon, axis=1, keepdims=True)
                        LTM = np.mean(R, keepdims=True)
                        X = G / (np.sqrt(R / LTM) * np.sqrt(C))
                        RMS_X = np.sqrt(np.mean(np.square(X), keepdims=True))

                    else:

                        # The 1d op has no row statistic; compare R with itself.
                        r = R
                        p, c, x, _ = sess.run(adafactor1d_op(p,
                                                             c,
                                                             g,
                                                             beta2,
                                                             learn_rate,
                                                             grad_scale,
                                                             clip_thresh,
                                                             epsilon=epsilon,
                                                             zero_nans=True),
                                              feed_dict=feed_dict)

                        # NumPy reference for the unfactored update.
                        C = beta2 * C + (1.0 - beta2) * (np.square(G) +
                                                         epsilon)
                        X = G / np.sqrt(C)
                        RMS_X = np.sqrt(np.mean(np.square(X), keepdims=True))

                    # Clip the update by its RMS, then apply it.
                    P -= learn_rate * X / np.maximum(1.0, RMS_X / clip_thresh)

                    print("testAdafactor", dtype)
                    for op, dev, cpu in [
                        ["C", c, C],
                        ["R", r, R],
                        ["X", x, X],
                        ["P", p, P],
                    ]:
                        self.compare_results(op, dev, cpu)
Esempio n. 25
0
    def forward(self, inputs, states, ema=None):
        """Build the forward graph of this recurrent layer over grouped inputs.

        Creates (or fetches through ``hps.get_variable``) a weight, gain and
        bias for the parameter names "a", "m" and "h<i>", then applies
        ``self.cell`` to the hidden state for every time step obtained by
        splitting each input group.  The ``c`` half of ``states`` is passed
        through unchanged.

        Side effects: sets ``self.param_names``, ``self.params``,
        ``self.inputs``, ``self.outputs`` and ``self.segments`` (the latter is
        left empty here).

        Args:
            inputs: iterable of input groups; each group is passed to
                ``bsmm["x"]`` and split into ``hps.x_group_size`` pieces.
            states: tensor unstacked into exactly two parts, ``c`` and ``h``.
            ema: optional object exposing ``average(var)``; when given, the
                averaged values are used in place of the raw variables.

        Returns:
            ``(outputs, states)``: the per-step ``h`` tensors, each wrapped in
            ``tf.stop_gradient``, and ``c`` stacked with the final ``h`` (cast
            to float32) along axis 0.
        """
        hps = self.hps
        bsmm = hps.bsmm

        with tf.variable_scope(self.scope):

            # "a" and "m" plus one "h<i>" set per internal step (a single
            # shared one when hps.share_isteps is set).
            self.param_names = list("am")
            for i in range(1 if hps.share_isteps else hps.isteps):
                self.param_names.append("h%d" % i)

            self.params = dict()

            for p in self.param_names:

                # "a" and "m" use the input projection matmul and width; the
                # "h<i>" names use their own bsmm entry and the hidden width.
                bsmm_p, size = (bsmm["x"],
                                hps.nproj_in) if p in "am" else (bsmm[p],
                                                                 hps.nhidden)

                w = hps.get_variable("w_" + p, bsmm_p.w_shape,
                                     bsmm_p.identity_init())
                g = hps.get_variable("g_" + p, [size], ones_initializer())
                b = hps.get_variable("b_" + p, [size], zeros_initializer())

                if ema is not None:
                    w = ema.average(w)
                    g = ema.average(g)
                    b = ema.average(b)

                wc = ew.float_cast(w, dtype=hps.dtype)

                # Stored as (cast weight, gain, bias, uncast weight).
                self.params[p] = (wc, g, b, w)

            c, h = tf.unstack(states, num=2)
            h = ew.float_cast(h, dtype=hps.dtype)

            wm, gm, bm = self.params["m"][0:3]
            wa, ga, ba = self.params["a"][0:3]

            self.inputs = inputs
            self.outputs = []
            self.segments = []
            for xgroup in inputs:

                # delay input expansion to just prior to use (saves memory)
                with tf.control_dependencies([h]):
                    xwm = bsmm["x"](xgroup, wm, dw_dtype=hps.dw_dtype)
                    xwa = bsmm["x"](xgroup, wa, dw_dtype=hps.dw_dtype)

                xwm = tf.split(xwm, hps.x_group_size, 1 - hps.axis)
                xwa = tf.split(xwa, hps.x_group_size, 1 - hps.axis)

                for m, a in zip(xwm, xwa):
                    m = layer_norm(m, gm, bm, axis=hps.axis)
                    a = layer_norm(a, ga, ba, axis=hps.axis)
                    h = self.cell(h, m, a)

                    self.outputs.append(h)

            h = ew.float_cast(h, dtype=tf.float32)
            states = tf.stack([c, h], 0)

            # We calculate the gradient internally.
            # Don't let other layer's gradients flow into here.
            # This is possible because the last cell has free c and h
            # params that are populated with zeros in the gradients pass.
            outputs = [tf.stop_gradient(x) for x in self.outputs]

        return outputs, states
Esempio n. 26
0
    def testLayerNorm(self):
        """Run ``layer_norm`` on the GPU and print its error against the reference.

        For every shape and dtype the forward output and the gradients w.r.t.
        the input, gain and bias are compared with ``layer_norm_test`` /
        ``layer_norm_grad_test``.  Only error statistics are printed; nothing
        is asserted.  When ``out`` is set, the first comparison is dumped to
        text files and the process exits.
        """
        # multi-threading screws up benchmarking
        session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                      inter_op_parallelism_threads=1)

        def uniform32(dims):
            # float32 samples drawn uniformly from [-1, 1)
            return np.random.uniform(-1.0, 1.0, dims).astype(np.float32)

        with self.test_session(config=session_conf) as sess, tf.device("/gpu:0"):
            for shape in shapes:
                # the feature axis is pinned to 1
                axis = 1
                K, N = shape[axis], shape[1 - axis]

                if one:
                    X = np.ones(shape, dtype=np.float32)
                    E = np.ones(shape, dtype=np.float32)
                    G = np.ones(K, dtype=np.float32)
                    B = np.ones(K, dtype=np.float32)
                else:
                    # draw order is X, E, G, B
                    X = uniform32(shape)
                    E = uniform32(shape)
                    G = uniform32((K,))
                    B = uniform32((K,))

                x, e, g, b = [tf.constant(host) for host in (X, E, G, B)]

                for dtype in dtypes:

                    # just test relu on floats (it's hard to match low precision relu with high precision behavior)
                    relu = dtype is tf.float32

                    print("K:%d N:%d Axis:%d Relu:%d dtype:%s" % (K, N, axis, relu, dtype.name))

                    norm_kwargs = dict(axis=axis, segments=segments, relu=relu)

                    # reference values
                    Y = layer_norm_test(X, G, B, **norm_kwargs)
                    DX, DG, DB = layer_norm_grad_test(E, X, G, B, **norm_kwargs)

                    # device graph: cast in, normalize, cast back to float32
                    y = layer_norm(ew.float_cast(x, dtype=dtype), g, b,
                                   bench=bench, **norm_kwargs)
                    y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtype)

                    grads = tf.gradients(y, [x, g, b], e)

                    dev_y, (dev_dx, dev_dg, dev_db) = sess.run([y, grads])

                    if bench == 0:
                        labels = (" y:", "dx:", "dg:", "db:")
                        references = (Y, DX, DG, DB)
                        measured = (dev_y, dev_dx, dev_dg, dev_db)

                        for op, ref, dev in zip(labels, references, measured):

                            abs_diff = np.abs(ref - dev)
                            mean_ref = np.average(np.abs(ref))
                            peak = abs_diff.max()

                            # relative to the mean magnitude unless that is zero
                            if mean_ref == 0:
                                max_err = peak
                            else:
                                max_err = peak / mean_ref

                            diff_norm = np.sqrt(np.square(abs_diff).sum())
                            ref_norm = np.sqrt(np.square(ref).sum())
                            l2_err = diff_norm / ref_norm

                            print("%s max_err%%:%10.8f L2_err: %12.10f" % (op, 100*max_err, l2_err))

                            if out:
                                rows = (-1, N)
                                np.savetxt("out.txt", abs_diff.reshape(rows), fmt='%7.3f')
                                np.savetxt("outC.txt", ref.reshape(rows), fmt='%7.3f')
                                np.savetxt("outD.txt", dev.reshape(rows), fmt='%7.3f')
                                exit()
                    print("")
    def testBlocksparseTransformerMatmul(self):
        """Check the block-sparse ``nt``/``nn``/``tn`` matmul ops against reference values.

        For every block size a lower-triangular block layout (shared across
        heads via ``heads=heads``) is built, the three ops are run in float16
        on ``/gpu:0``, and, unless ``bench`` is set, their outputs are compared
        with the transformer object's ``nt_test``/``nn_test``/``tn_test``.
        """
        with self.test_session(config=config) as sess, tf.device("/gpu:0"):
            for bsize in (16, 32, 64):

                # Keep block (row, col) only when col <= row, i.e. the lower
                # triangle of the last two axes.  The builtin ``bool`` is used
                # because the ``np.bool`` alias was removed in NumPy 1.24.
                layout = np.tril(np.ones([1, ctx, ctx], dtype=bool))
                bst = trans.BlocksparseTransformer(layout,
                                                   heads=heads,
                                                   block_size=bsize)

                q_shape = (batch, ctx * bsize, heads * state)
                w_shape = (batch, heads, bst.blocks, bsize, bsize)

                if ones:
                    cpuQ = np.ones(q_shape, dtype=np.float32)
                    cpuK = np.ones(q_shape, dtype=np.float32)
                    cpuW = np.ones(w_shape, dtype=np.float32)
                else:
                    # Round-trip through float16 so the float32 host data is
                    # exactly representable in the float16 device tensors.
                    cpuQ = np.random.uniform(-1.0, 1.0, q_shape).astype(
                        np.float16).astype(np.float32)
                    cpuK = np.random.uniform(-1.0, 1.0, q_shape).astype(
                        np.float16).astype(np.float32)
                    cpuW = np.random.uniform(-1.0, 1.0, w_shape).astype(
                        np.float16).astype(np.float32)

                q = tf.placeholder(tf.float32, cpuQ.shape)
                k = tf.placeholder(tf.float32, cpuK.shape)
                w = tf.placeholder(tf.float32, cpuW.shape)

                feed_dict = {q: cpuQ, k: cpuK, w: cpuW}

                qf = ew.float_cast(q, dtype=tf.float16)
                kf = ew.float_cast(k, dtype=tf.float16)
                wf = ew.float_cast(w, dtype=tf.float16)

                # Device ops under test.
                nt = bst.nt_op(qf, kf, bench=bench)
                nn = bst.nn_op(wf, kf, bench=bench)
                tn = bst.tn_op(wf, qf, bench=bench)

                nt = ew.float_cast(nt, dtype=tf.float32)
                nn = ew.float_cast(nn, dtype=tf.float32)
                tn = ew.float_cast(tn, dtype=tf.float32)

                print("testBlocksparseTransformerMatmul", bsize)

                nt, nn, tn = sess.run([nt, nn, tn], feed_dict)

                if not bench:

                    # Reference values for the same operands.
                    NT = bst.nt_test(cpuQ, cpuK)
                    NN = bst.nn_test(cpuW, cpuK)
                    TN = bst.tn_test(cpuW, cpuQ)

                    for op, dev, cpu in [
                        ["NT", nt, NT],
                        ["NN", nn, NN],
                        ["TN", tn, TN],
                    ]:
                        self.compare_results(op, dev, cpu)
Esempio n. 28
0
def group_param_grads(param_grad, group_size=8, cast32=False):
    """Fuse the BlocksparseMatmulDW ops behind ``param_grad`` into grouped ops.

    The graph is walked backwards from ``param_grad`` to collect its
    ``BlocksparseMatmulDW`` parents.  They are processed ``group_size`` at a
    time: the first group allocates a new dw tensor with
    ``blocksparse_matmul_dw`` and every later group accumulates into it with
    ``blocksparse_matmul_dwa``.  Each grouped op is added as a control input
    of the ``BlocksparseMatmulDX`` op consuming the group's last gradient, so
    it runs before further time steps are processed.

    Args:
        param_grad: gradient tensor/op whose BlocksparseMatmulDW parents are
            regrouped.
        group_size: maximum number of ops fused per grouped op (at most 8).
        cast32: when true and the dw dtype is not float32, cast the result
            to float32.

    Returns:
        The output of the last grouped op, optionally cast to float32.
    """
    assert group_size <= 8

    # backward walk param grad to find BlocksparseMatmulDW ops
    # this should only hit BlocksparseMatmulDWs or AddNs or FloatCasts
    ops = get_parents(param_grad, "BlocksparseMatmulDW")

    # this sorting is dependent on the op names being correctly ordered.
    ops.sort(key=lambda op: op.name.split('/')[-1], reverse=True)

    # use the parent scope for the new ops
    scope = ops[-1].name.split('/')
    scope = '/'.join(scope[0:-1])

    # we're going to be using absolute names, so clear name_scope
    with tf.name_scope(None):
        offset = 0
        while offset < len(ops):

            group = ops[offset:offset + group_size]
            xs = [member.inputs[0] for member in group]
            gs = [member.inputs[1] for member in group]

            # Get the corresponding activation grad op for the last param grad op in the group
            # (if several consumers match, the last one wins).
            bprop = None
            for consumer in gs[-1].consumers():
                if consumer.type == "BlocksparseMatmulDX":
                    bprop = consumer
            assert bprop is not None

            # get attributes of first op in group
            up = ops[offset]
            blocks = up.get_attr("blocks")
            bshift = up.get_attr("bshift")
            axis = up.get_attr("axis")
            dtype_dw = up.get_attr("dtype_dw")
            gated_dw = up.get_attr("gated_dw")
            C = up.get_attr("C")
            K = up.get_attr("K")
            bench = up.get_attr("bench") // len(xs)
            lut = up.inputs[2]
            name = "%s/matmul_concat_updat_%03d" % (scope, offset)
            # The gate is an optional fourth input.  Test the op that is being
            # indexed (`up`), not a leftover loop variable from the consumer
            # search above.
            gate = [up.inputs[3]] if len(up.inputs) > 3 else []

            # The first op needs to allocate a new dw tensor
            if offset == 0:
                grad = blocksparse_matmul_dw(xs,
                                             gs,
                                             lut,
                                             gate,
                                             dtype_dw=dtype_dw,
                                             gated_dw=gated_dw,
                                             blocks=blocks,
                                             bshift=bshift,
                                             axis=axis,
                                             C=C,
                                             K=K,
                                             bench=bench,
                                             name=name)
            # subsequent ops can just accumulate in place
            else:
                grad = blocksparse_matmul_dwa(xs,
                                              gs,
                                              lut,
                                              grad,
                                              gate,
                                              gated_dw=gated_dw,
                                              blocks=blocks,
                                              bshift=bshift,
                                              axis=axis,
                                              C=C,
                                              K=K,
                                              bench=bench,
                                              name=name)

            # force the dw op before any more time steps are processed
            add_control_input(bprop, grad.op)

            offset += group_size

    # get the grad back to float32 if requested
    # TODO: splice the graph instead of this hack
    if cast32 and dtype_dw != tf.float32:
        grad = ew.float_cast(grad, dtype=tf.float32)

    return grad
Esempio n. 29
0
    def testEmbeddingLookup(self):
        """Run ``embedding_lookup`` on GPU and CPU and print how far they differ.

        For every (weight shape, index shape) pair, dtype and ``sort_grad``
        setting, the lookup and its weight gradient are evaluated on both
        devices (GPU only when ``bench`` is set).  Unless benchmarking, the
        max and L2 errors of the GPU results relative to the CPU results are
        printed; nothing is asserted.
        """
        session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                      inter_op_parallelism_threads=1)

        with self.test_session(config=session_conf) as sess:

            for shapeW, shapeI in shapes:

                C = shapeW[0]
                shapeY = shapeI + shapeW[1:]

                np.random.seed(int(time()))
                cpuI = np.random.randint(0, C, size=shapeI, dtype=np.int32)
                cpuW = np.random.uniform(-1.0, 1.0, shapeW).astype(np.float32)
                cpuE = np.random.uniform(-1.0, 1.0, shapeY).astype(np.float32)

                for dtype in (tf.float32, tf.float16):
                    for sort in (True, False):

                        results = []
                        for device in ("gpu", "cpu"):

                            if bench and device == "cpu":
                                break

                            # Only the GPU path runs in reduced precision; there
                            # the indices are narrowed too when C fits.
                            castW = device == "gpu" and dtype is not tf.float32
                            castI = None
                            if castW and C <= 256:
                                castI = tf.uint8
                            elif castW and C <= 65536:
                                castI = tf.uint16

                            with tf.device("/%s:0" % device), tf.name_scope(device):

                                idx_ph = tf.placeholder(tf.int32, cpuI.shape, name="i")
                                w_ph = tf.placeholder(tf.float32, cpuW.shape, name="w")
                                e_ph = tf.placeholder(tf.float32, cpuE.shape, name="e")

                                feed_dict = {idx_ph: cpuI, w_ph: cpuW, e_ph: cpuE}

                                if castW:
                                    table = ew.float_cast(w_ph, dtype=dtype)
                                else:
                                    table = w_ph

                                if castI is not None:
                                    idx = tf.cast(idx_ph, dtype=castI)
                                else:
                                    idx = idx_ph

                                y = embedding_lookup(table, idx, sort_grad=sort, bench=bench)

                                if castW:
                                    y = ew.float_cast(y, dtype=tf.float32)

                                dw, = tf.gradients(y, [w_ph], e_ph)

                                results.append(sess.run([y, dw], feed_dict))

                        if not bench:

                            gpu_out, cpu_out = results[0], results[1]
                            for op, dev, cpu in zip(["y", "dw"], gpu_out, cpu_out):

                                dif = np.abs(cpu - dev)
                                avgval = np.average(np.abs(cpu))
                                maxdif = dif.max()

                                # relative to the mean magnitude unless that is zero
                                if avgval == 0:
                                    max_err = maxdif
                                else:
                                    max_err = maxdif / avgval

                                dif_norm = np.sqrt(np.square(dif).sum())
                                cpu_norm = np.sqrt(np.square(cpu).sum())
                                l2_err = dif_norm / cpu_norm

                                print(
                                    "%s, shape:%22s, op:%3s, err:%17.12f, l2_err:%17.12f"
                                    % (dtype.name, str(cpu.shape), op, max_err, l2_err))
    def testFancyGather(self):
        """Compare ``ew.fancy_gather`` with its ``use_tf=True`` variant on the GPU.

        For every shape, random data and random indices are generated and the
        gather plus its gradient w.r.t. the input are evaluated twice: once
        through the default ``ew.fancy_gather`` path and once with
        ``use_tf=True``.  The maximum absolute difference, the number of
        elements differing by more than .001 and the flat position of the
        largest difference are printed; nothing is asserted.
        """

        config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)

        with self.test_session(config=config) as sess:

            for shape in shapes:

                # The first two dims give the index shape, the third is the
                # dimension indexed into, and the rest carry over to the output.
                idx_shape = shape[0:2]
                idx_dim = shape[2]
                out_shape = idx_shape + shape[3:]

                for dtype in (tf.float32, ):  #tf.float16, tf.bfloat16

                    #rtol = 1e-4 if dtype is tf.float32 else 1e-1

                    #tf.reset_default_graph()
                    # Time-based seed: the data differs from run to run.
                    np.random.seed(int(time()))
                    cpuX = np.random.uniform(-1.0, 1.0,
                                             shape).astype(np.float32)
                    cpuA = np.random.randint(0,
                                             idx_dim,
                                             size=idx_shape,
                                             dtype=np.int32)
                    cpuE = np.random.uniform(-1.0, 1.0,
                                             out_shape).astype(np.float32)

                    with tf.device("/gpu:0"):

                        x = tf.placeholder(tf.float32, cpuX.shape)
                        a = tf.placeholder(tf.int32, cpuA.shape)
                        e = tf.placeholder(tf.float32, cpuE.shape)

                        feed_dict = {x: cpuX, a: cpuA, e: cpuE}

                        # Path under test: cast in, gather, cast back to float32.
                        xf = ew.float_cast(x, dtype=dtype)
                        y = ew.float_cast(ew.fancy_gather(xf, a),
                                          dtype=tf.float32,
                                          dx_dtype=dtype)

                        devY, (devB, ) = sess.run(
                            [y, tf.gradients(y, [x], e)], feed_dict)

                        # Comparison path (use_tf=True); also placed on /gpu:0
                        # despite the "cpu" variable names.
                        y = ew.fancy_gather(x, a, use_tf=True)

                        cpuY, (cpuB, ) = sess.run(
                            [y, tf.gradients(y, [x], e)], feed_dict)

                    for op, devT, cpuT in (("devY", devY, cpuY), ("devB", devB,
                                                                  cpuB)):

                        difA = np.abs(cpuT - devT)
                        maxdif = difA.max()
                        # count of elements whose difference exceeds .001
                        sumerr = (difA > .001).sum()
                        # flat index of the largest difference
                        poserr = np.argmax(np.abs(difA).reshape(-1))

                        print(
                            "%s, shape:%22s, op:%s, err:%17.12f, sum_err: %d, pos_err:%d"
                            % (dtype.name, str(shape), op, maxdif, sumerr,
                               poserr))