def transformer_block(self, x, masks, scope, train=False):
    """
    Core transformer block: self-attention + residual MLP + layer normalization.
    """
    n_state = x.shape[-1].value

    with tf.variable_scope(scope):

        with tf.variable_scope('self_attention'):
            # Project the input into query, key and value tensors.
            q = self.conv1d(x, 'proj_q', n_state)
            k = self.conv1d(x, 'proj_k', n_state)
            v = self.conv1d(x, 'proj_v', n_state)

            # Reshape to [bs, head, len, dim] for per-head attention.
            q = split_heads(q, self.config.n_head)
            k = split_heads(k, self.config.n_head)
            v = split_heads(v, self.config.n_head)

            qk = tf.matmul(q, k, transpose_b=True)  # [bs, head, len, len]

            # Apply the attention mask as a large negative bias before the softmax.
            qk += tf.cast(-10000. * (1 - masks), qk.dtype)
            qk = bs.softmax(qk, scale=1.0 / np.sqrt(n_state / self.config.n_head))

            qkv = tf.matmul(qk, v)  # [bs, head, len, dim]
            att = merge_heads(qkv)  # [bs, len, dim*head]

            # This is actually dropping out entire tokens to attend to, which might
            # seem a bit unusual, but is taken from the original Transformer paper.
            if train and self.config.dropout > 0.0:
                att = tf.nn.dropout(att, rate=self.config.dropout)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope('attention_output'):
            att = self.conv1d(att, 'proj_a', n_state)
            if train and self.config.dropout > 0.0:
                att = tf.nn.dropout(att, rate=self.config.dropout)
            x1 = bs.add(att, x)
            x1 = self.layer_norm(x1, name='LayerNorm_att')

        # The activation is only applied to the "intermediate" hidden layer.
        with tf.variable_scope('intermediate'):
            x2 = self.conv1d(x1, 'proj_m1', n_state * self.config.mlp_ratio, fast_gelu=True)

        with tf.variable_scope('output'):
            x2 = self.conv1d(x2, 'proj_m2', n_state)
            if train and self.config.dropout > 0.0:
                x2 = tf.nn.dropout(x2, rate=self.config.dropout)
            x = bs.add(x2, x1)
            x = self.layer_norm(x, name='LayerNorm_output')

    return x
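# A minimal usage sketch (not part of the original code): stacking transformer_block
# over `n_layer` layers. The names `model`, `h`, and `attn_masks` are hypothetical;
# `h` is assumed to be [batch, length, n_embd] embeddings and `attn_masks` is assumed
# to broadcast against the [batch, head, length, length] attention logits.
def forward_sketch(model, h, attn_masks, train=False):
    for layer in range(model.config.n_layer):
        h = model.transformer_block(h, attn_masks, 'h%d' % layer, train=train)
    return h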
def testBlocksparseTransformerDense(self):
    # `config`, `ones` and `bench` are assumed to be module-level test settings
    # (TF session config plus debug/benchmark flags).
    with self.test_session(config=config) as sess, tf.device("/gpu:0"):

        batch = 2
        heads = 2
        state = 64*2
        scale = 1.0 / np.sqrt(state/heads)

        for bsize in (8, 16, 32, 64):

            ctxQ = 16
            ctxK = 16

            # Dense layout: every query block attends to every key block.
            layout = np.ones([heads, ctxQ, ctxK], dtype=np.bool)
            bst = bs.BlocksparseTransformer(layout, block_size=bsize)

            shapeQ = (batch, ctxQ*bsize, heads*state)
            shapeK = (batch, ctxK*bsize, heads*state)

            if ones:
                cpuQ = np.ones(shapeQ, dtype=np.float32)
                cpuK = np.ones(shapeK, dtype=np.float32)
                cpuV = np.ones(shapeK, dtype=np.float32)
                cpuE = np.ones(shapeQ, dtype=np.float32)
            else:
                cpuQ = np.random.uniform(-1.0, 1.0, shapeQ).astype(np.float16).astype(np.float32)
                cpuK = np.random.uniform(-1.0, 1.0, shapeK).astype(np.float16).astype(np.float32)
                cpuV = np.random.uniform(-1.0, 1.0, shapeK).astype(np.float16).astype(np.float32)
                cpuE = np.random.uniform(-1.0, 1.0, shapeQ).astype(np.float16).astype(np.float32)

            q = tf.placeholder(tf.float32, shapeQ)
            k = tf.placeholder(tf.float32, shapeK)
            v = tf.placeholder(tf.float32, shapeK)
            e = tf.placeholder(tf.float32, shapeQ)

            feed_dict = { q: cpuQ, k: cpuK, v: cpuV, e: cpuE }

            qf = bs.float_cast(q, dtype=tf.float16)
            kf = bs.float_cast(k, dtype=tf.float16)
            vf = bs.float_cast(v, dtype=tf.float16)

            # Blocksparse path.
            w = bst.query_key_op(qf, kf, bench=bench)
            w = bst.softmax(w, scale=scale)
            y = bst.weight_value_op(w, vf, bench=bench)

            # Dense reference path using plain matmuls.
            qf = bs.transpose_0213(tf.reshape(qf, [batch, ctxQ*bsize, heads, state]))
            kf = bs.transpose_0213(tf.reshape(kf, [batch, ctxK*bsize, heads, state]))
            vf = bs.transpose_0213(tf.reshape(vf, [batch, ctxK*bsize, heads, state]))

            W = tf.matmul(qf, kf, transpose_b=True)
            W = bs.softmax(W, scale=scale)
            Y = tf.matmul(W, vf)
            Y = tf.reshape(bs.transpose_0213(Y), [batch, ctxQ*bsize, heads*state])

            y = bs.float_cast(y, dtype=tf.float32)
            Y = bs.float_cast(Y, dtype=tf.float32)

            y, (dq, dk, dv) = sess.run([ y, tf.gradients(y, [q, k, v], e) ], feed_dict)
            Y, (DQ, DK, DV) = sess.run([ Y, tf.gradients(Y, [q, k, v], e) ], feed_dict)

            print("testBlocksparseTransformerDense", bsize)

            if not bench:
                for op, dev, cpu in [
                    [ " Y",  y,  Y ],
                    [ "DV", dv, DV ],
                    [ "DK", dk, DK ],
                    [ "DQ", dq, DQ ],
                ]:
                    self.compare_results(op, dev, cpu)
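# Sketch (assumption, not from the original test): the constructor above accepts any
# boolean block layout, so an autoregressive variant only needs a lower-triangular
# layout in place of the dense all-ones one. The head/context sizes are reused from
# the test purely for illustration.
def causal_layout_sketch(heads=2, ctxQ=16, ctxK=16):
    # Keep block (q, k) only when the key block does not lie in the future.
    layout = np.tril(np.ones([ctxQ, ctxK], dtype=np.bool))
    return np.tile(layout[None, :, :], [heads, 1, 1])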
def attention_softmax(qk_scores, scale):
    return bs.softmax(qk_scores, scale=scale)
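# Reference sketch (assumption): a NumPy equivalent of the scaled softmax, handy for
# eyeballing what bs.softmax(x, scale=...) computes on small inputs. `x` stands for the
# raw query-key logits; the scale is applied before the exponent, matching the calls above.
def attention_softmax_reference(x, scale):
    z = x * scale
    z = z - z.max(axis=-1, keepdims=True)  # subtract the row max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)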