def version_1(cls, node, tensor_dict, **kwargs):
    # Fetch the input tensor, read the node's 'perm' attribute,
    # and register the transposed tensor under the node's output name.
    x = tensor_dict[node.input_tensor_names[0]]
    perm = node.get_attr_value('perm')
    y = ad.transpose_op(x, perm=perm)
    tensor_dict[node.output_tensor_names[0]] = y
    return y
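# Note (added illustration, not part of the original code): version_1 above reads
# like an importer handler that maps a graph node with a 'perm' attribute onto
# ad.transpose_op. Numerically it is a plain axis permutation; a minimal NumPy
# sketch of the same math (the node/tensor_dict plumbing is importer-specific):
import numpy as np

x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
perm = [2, 0, 1]
y = np.transpose(x, perm)     # axis i of y is axis perm[i] of x
assert y.shape == (4, 2, 3)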
def test_Transpose():
    X = ad.Variable(name="X")
    y = ad.transpose_op(X, [2, 0, 1])
    executor = ad.Executor([y], ctx=ctx)
    X_val = rand.normal(scale=0.1, size=(3, 2, 5)).astype(np.float32)
    res = executor.run(feed_dict={X: X_val})
    Check(executor, res, [X], [y], [X_val])
    print(sys._getframe().f_code.co_name, 'pass!')
def transpose_for_scores(input_tensor):
    # Split the hidden dimension into (num_heads, d_model / num_heads)
    # and move the head axis in front of the sequence axis.
    output_tensor = ad.array_reshape_op(input_tensor, [
        config.batch_size, -1, config.num_heads,
        config.d_model // config.num_heads
    ])
    output_tensor = ad.transpose_op(output_tensor, [0, 2, 1, 3])
    return output_tensor
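# Note (added illustration, not part of the original code): the reshape/transpose
# pair above is the standard "split heads" trick. A plain-NumPy sketch of the
# same shape manipulation, assuming batch_size=2, seq_len=3, num_heads=4, d_model=8:
import numpy as np

x2d = np.zeros((2 * 3, 8), dtype=np.float32)       # (N * T, d_model)
x4d = x2d.reshape(2, -1, 4, 8 // 4)                # (N, T, h, d_model/h)
x_scores = np.transpose(x4d, (0, 2, 1, 3))         # (N, h, T, d_model/h)
assert x_scores.shape == (2, 4, 3, 2)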
def test_transpose(shape=(2, 3, 4, 5), perm=None):
    ctx = ndarray.gpu(1)
    x = np.random.random(shape).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_y = ad.transpose_op(ath_x, perm)
    ath_grad = ad.gradients(ath_y, [ath_x])[0]
    executor = ad.Executor([ath_y, ath_grad], ctx=ctx, enable_lazy=False)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.transpose(tf_x, perm)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])

    np.testing.assert_allclose(ath_results[0], tf_results[0])
    np.testing.assert_allclose(ath_results[1],
                               np.reshape(tf_results[1], ath_results[1].shape))
    print('Passed transpose op test with shape', shape, 'and perm', perm)
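# Note (added illustration, not part of the original code): the test above checks
# ad.transpose_op's gradient against TensorFlow. For y = transpose(x, perm), the
# gradient w.r.t. x is the output gradient transposed by the inverse permutation,
# which is np.argsort(perm). A minimal NumPy check of that round trip:
import numpy as np

perm = (2, 0, 3, 1)
x = np.random.random((2, 3, 4, 5)).astype(np.float32)
y = np.transpose(x, perm)
inv_perm = np.argsort(perm)                          # inverse permutation
np.testing.assert_allclose(np.transpose(y, inv_perm), x)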
def multihead_attention(queries,
                        keys,
                        values,
                        config,
                        query_act=None,
                        key_act=None,
                        value_act=None,
                        attention_mask=None,
                        causality=False):
    def transpose_for_scores(input_tensor):
        output_tensor = ad.array_reshape_op(input_tensor, [
            config.batch_size, -1, config.num_heads,
            config.d_model // config.num_heads
        ])
        output_tensor = ad.transpose_op(output_tensor, [0, 2, 1, 3])
        return output_tensor

    batch_size = config.batch_size
    hidden_size = config.d_model
    num_attention_heads = config.num_heads
    caus_len = config.maxlen2 - 1
    attention_probs_dropout_prob = config.dropout_rate
    size_per_head = hidden_size // num_attention_heads

    # reshape to 2d
    queries2d = ad.array_reshape_op(queries, [-1, hidden_size])  # (N * T_q, d_model)
    keys2d = ad.array_reshape_op(keys, [-1, hidden_size])  # (N * T_k, d_model)
    values2d = ad.array_reshape_op(values, [-1, hidden_size])  # (N * T_k, d_model)

    # linear transformation
    query_layer = dense(queries2d, hidden_size, hidden_size, query_act)  # (N * T_q, d_model)
    key_layer = dense(keys2d, hidden_size, hidden_size, key_act)  # (N * T_k, d_model)
    value_layer = dense(values2d, hidden_size, hidden_size, value_act)  # (N * T_k, d_model)

    # transpose
    query_layer = transpose_for_scores(query_layer)  # (N, h, T_q, d_model/h)
    key_layer = transpose_for_scores(key_layer)  # (N, h, T_k, d_model/h)
    value_layer = transpose_for_scores(value_layer)  # (N, h, T_k, d_model/h)

    # score
    attention_scores = ad.batch_matmul_op(query_layer, key_layer, trans_B=True)  # (N, h, T_q, T_k)
    attention_scores = attention_scores * (1.0 / np.sqrt(float(size_per_head)))

    # mask
    if attention_mask is not None:
        zeros = ad.Variable('no_mask',
                            value=np.array((0, ), dtype=np.float32),
                            trainable=False)
        adder = ad.Variable('attention_mask',
                            value=np.array((-2**32 + 1, ), dtype=np.float32),
                            trainable=False)
        zeros = ad.broadcastto_op(zeros, attention_mask)
        adder = ad.broadcastto_op(adder, attention_mask)
        attention_mask = ad.where_op(attention_mask, zeros, adder)  # (N, T)
        attention_mask = ad.array_reshape_op(attention_mask, [batch_size, 1, 1, -1])
        attention_scores = attention_scores + ad.broadcastto_op(
            attention_mask, attention_scores)

    if causality:
        tril = ad.Variable(name='tril',
                           value=np.tril(np.ones((caus_len, caus_len))),
                           trainable=False)  # (T, T)
        future_masks = ad.broadcast_shape_op(
            tril, [batch_size, num_attention_heads, caus_len, caus_len])
        adder = ad.Variable('future_mask',
                            value=np.array((-2**32 + 1, ), dtype=np.float32),
                            trainable=False)
        adder = ad.broadcastto_op(adder, future_masks)
        attention_scores = ad.where_op(future_masks, attention_scores, adder)  # (N, h, T, T)

    # probs
    attention_probs = ad.softmax_op(attention_scores)
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    context_layer = ad.batch_matmul_op(attention_probs, value_layer)
    context_layer = ad.transpose_op(context_layer, [0, 2, 1, 3])
    outputs = ad.array_reshape_op(
        context_layer, [batch_size, -1, num_attention_heads * size_per_head])

    # Residual connection
    outputs = outputs + queries  # (N, T_q, d_model)

    # Normalize
    outputs = layer_norm(outputs, hidden_size)  # (N, T_q, d_model)

    return outputs
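# Note (added illustration, not part of the original code): the graph built by
# multihead_attention above computes scaled dot-product attention,
# softmax(Q K^T / sqrt(d_k)) V, per head. A minimal NumPy sketch of that core
# computation for a single head (masking, dropout, residual and layer norm omitted):
import numpy as np

def scaled_dot_product_attention(q, k, v):
    d_k = q.shape[-1]
    scores = q @ k.transpose(0, 2, 1) / np.sqrt(d_k)     # (N, T_q, T_k)
    scores -= scores.max(axis=-1, keepdims=True)         # numerically stable softmax
    probs = np.exp(scores)
    probs /= probs.sum(axis=-1, keepdims=True)
    return probs @ v                                      # (N, T_q, d_k)

q = np.random.rand(2, 5, 8)
k = np.random.rand(2, 7, 8)
v = np.random.rand(2, 7, 8)
assert scaled_dot_product_attention(q, k, v).shape == (2, 5, 8)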