    def transformer_block(self, x, masks, scope, train=False):
        """
        Core transformer block: multi-head self-attention followed by a
        position-wise MLP, each with dropout, a residual connection, and
        layer normalization.
        """
        n_state = x.shape[-1].value

        with tf.variable_scope(scope):

            with tf.variable_scope('self_attention'):
                q = self.conv1d(x, 'proj_q', n_state)
                k = self.conv1d(x, 'proj_k', n_state)
                v = self.conv1d(x, 'proj_v', n_state)

                q = split_heads(q, self.config.n_head)
                k = split_heads(k, self.config.n_head)
                v = split_heads(v, self.config.n_head)

                qk = tf.matmul(q, k, transpose_b=True)  # [bs, head, len, len]
                qk += tf.cast(-10000. * (1 - masks), qk.dtype)
                qk = bs.softmax(qk, scale=1.0 / np.sqrt(n_state / self.config.n_head))
                qkv = tf.matmul(qk, v)  # [bs, head, len, dim]
                att = merge_heads(qkv)  # [bs, len, dim*head]

                # Apply dropout to the merged multi-head attention output
                # during training.
                if train and self.config.dropout > 0.0:
                    att = tf.nn.dropout(att, rate=self.config.dropout)

            # Project the attention output back to `n_state`, then add a
            # residual connection to the block input and apply layer norm.
            with tf.variable_scope('attention_output'):
                att = self.conv1d(att, 'proj_a', n_state)
                if train and self.config.dropout > 0.0:
                    att = tf.nn.dropout(att, rate=self.config.dropout)
                x1 = bs.add(att, x)
                x1 = self.layer_norm(x1, name='LayerNorm_att')

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope('intermediate'):
                x2 = self.conv1d(x1, 'proj_m1', n_state * self.config.mlp_ratio, fast_gelu=True)

            with tf.variable_scope('output'):
                x2 = self.conv1d(x2, 'proj_m2', n_state)
                if train and self.config.dropout > 0.0:
                    x2 = tf.nn.dropout(x2, rate=self.config.dropout)
                x = bs.add(x2, x1)
                x = self.layer_norm(x, name='LayerNorm_output')

            return x
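
# NOTE (not part of the original file): `split_heads` / `merge_heads` are used
# above but not defined in this fragment. Below is a minimal sketch of the
# assumed behaviour, mirroring the reshape + transpose (perm [0, 2, 1, 3])
# pattern that the dense reference in the test below builds with
# bs.transpose_0213; the real helpers may differ.
def split_heads_sketch(x, n_head):
    # [batch, length, n_head * head_dim] -> [batch, n_head, length, head_dim]
    *_, length, n_state = x.shape.as_list()
    x = tf.reshape(x, [-1, length, n_head, n_state // n_head])
    return tf.transpose(x, [0, 2, 1, 3])

def merge_heads_sketch(x):
    # [batch, n_head, length, head_dim] -> [batch, length, n_head * head_dim]
    x = tf.transpose(x, [0, 2, 1, 3])
    *_, length, n_head, head_dim = x.shape.as_list()
    return tf.reshape(x, [-1, length, n_head * head_dim])

# Hypothetical usage of transformer_block (layer count and mask shape are
# assumptions, not from the original file): `masks` must broadcast against the
# [batch, heads, len, len] logits, with 1 = attend and 0 = masked, e.g. a
# causal mask of shape [1, 1, len, len].
#   h = embedded_inputs
#   for i in range(n_layer):
#       h = self.transformer_block(h, masks, 'h%d' % i, train=train)
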
    def testBlocksparseTransformerDense(self):
        # `config` is presumably a module-level tf.ConfigProto in the original
        # test file.
        with self.test_session(config=config) as sess, tf.device("/gpu:0"):

            batch = 2
            heads = 2
            state = 64*2
            scale = 1.0 / np.sqrt(state/heads)

            for bsize in (8, 16, 32, 64):

                ctxQ = 16
                ctxK = 16

                layout = np.ones([heads, ctxQ, ctxK], dtype=bool)
                bst = bs.BlocksparseTransformer(layout, block_size=bsize)

                shapeQ = (batch, ctxQ*bsize, heads*state)
                shapeK = (batch, ctxK*bsize, heads*state)

                # `ones` and `bench` are presumably module-level flags in the
                # original test file (all-ones debug inputs / benchmark mode).
                if ones:
                    cpuQ = np.ones(shapeQ, dtype=np.float32)
                    cpuK = np.ones(shapeK, dtype=np.float32)
                    cpuV = np.ones(shapeK, dtype=np.float32)
                    cpuE = np.ones(shapeQ, dtype=np.float32)
                else:
                    cpuQ = np.random.uniform(-1.0, 1.0, shapeQ).astype(np.float16).astype(np.float32)
                    cpuK = np.random.uniform(-1.0, 1.0, shapeK).astype(np.float16).astype(np.float32)
                    cpuV = np.random.uniform(-1.0, 1.0, shapeK).astype(np.float16).astype(np.float32)
                    cpuE = np.random.uniform(-1.0, 1.0, shapeQ).astype(np.float16).astype(np.float32)

                q = tf.placeholder(tf.float32, shapeQ)
                k = tf.placeholder(tf.float32, shapeK)
                v = tf.placeholder(tf.float32, shapeK)
                e = tf.placeholder(tf.float32, shapeQ)

                feed_dict = { q: cpuQ, k: cpuK, v: cpuV, e: cpuE }

                qf = bs.float_cast(q, dtype=tf.float16)
                kf = bs.float_cast(k, dtype=tf.float16)
                vf = bs.float_cast(v, dtype=tf.float16)

                w = bst.query_key_op(qf, kf, bench=bench)
                w = bst.softmax(w, scale=scale)
                y = bst.weight_value_op(w, vf, bench=bench)

                qf = bs.transpose_0213(tf.reshape(qf, [batch, ctxQ*bsize, heads, state]))
                kf = bs.transpose_0213(tf.reshape(kf, [batch, ctxK*bsize, heads, state]))
                vf = bs.transpose_0213(tf.reshape(vf, [batch, ctxK*bsize, heads, state]))
                W = tf.matmul(qf, kf, transpose_b=True)
                W = bs.softmax(W, scale=scale)
                Y = tf.matmul(W, vf)
                Y = tf.reshape(bs.transpose_0213(Y), [batch, ctxQ*bsize, heads*state])

                y = bs.float_cast(y, dtype=tf.float32)
                Y = bs.float_cast(Y, dtype=tf.float32)

                y, (dq, dk, dv) = sess.run( [ y, tf.gradients(y, [q, k, v], e) ], feed_dict )
                Y, (DQ, DK, DV) = sess.run( [ Y, tf.gradients(Y, [q, k, v], e) ], feed_dict )

                print("testBlocksparseTransformerDense", bsize)
                if not bench:
                    for op, dev, cpu in [
                        [ " Y",  y,  Y ],
                        [ "DV", dv, DV ],
                        [ "DK", dk, DK ],
                        [ "DQ", dq, DQ ],
                    ]:
                        self.compare_results(op, dev, cpu)
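
                # Note (illustration, not from the original test): the all-ones
                # layout above makes every query block attend to every key
                # block, which is what allows the exact comparison against the
                # dense tf.matmul reference. A sparse pattern would be built
                # the same way, e.g. a block-level causal (lower-triangular)
                # layout:
                #
                #   causal = np.tril(np.ones([ctxQ, ctxK], dtype=bool))
                #   layout = np.repeat(causal[None, :, :], heads, axis=0)
                #   bst = bs.BlocksparseTransformer(layout, block_size=bsize)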

def attention_softmax(qk_scores, scale):
    """Scaled softmax over raw attention logits, using the blocksparse kernel."""
    return bs.softmax(qk_scores, scale=scale)
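
# Hypothetical usage sketch (the helper and names below are illustrative, not
# from the original file), following the 1 / sqrt(head_dim) scaling convention
# used in the code above:
def scaled_attention_probs(q, k, head_dim):
    # q, k: [batch, heads, length, head_dim] -> probs: [batch, heads, len_q, len_k]
    logits = tf.matmul(q, k, transpose_b=True)
    return attention_softmax(logits, scale=1.0 / np.sqrt(head_dim))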