Example #1
    def version_1(cls, node, tensor_dict, **kwargs):
        # Fetch the node's input tensor and its 'perm' attribute, emit the
        # corresponding transpose op, and register it as the node's output.
        x = tensor_dict[node.input_tensor_names[0]]
        perm = node.get_attr_value('perm')
        y = ad.transpose_op(x, perm=perm)
        tensor_dict[node.output_tensor_names[0]] = y

        return y
Example #2
def test_Transpose():
    X = ad.Variable(name="X")
    y = ad.transpose_op(X, [2, 0, 1])
    executor = ad.Executor([y], ctx=ctx)
    # Random input of shape (3, 2, 5); with perm [2, 0, 1] the output has shape (5, 3, 2).
    X_val = rand.normal(scale=0.1, size=(3, 2, 5)).astype(np.float32)
    res = executor.run(feed_dict={X: X_val})
    Check(executor, res, [X], [y], [X_val])
    print(sys._getframe().f_code.co_name, 'pass!')
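For reference, the permutation used in this test can be reproduced with plain NumPy; np.transpose below is only an illustrative stand-in for ad.transpose_op, with the same shape and perm as the test:

import numpy as np

# Same layout as the test above: input shape (3, 2, 5), perm [2, 0, 1].
x = np.random.normal(scale=0.1, size=(3, 2, 5)).astype(np.float32)
y = np.transpose(x, (2, 0, 1))

# Axis i of the output is axis perm[i] of the input, so the result has shape (5, 3, 2)
# and y[k, i, j] == x[i, j, k] for every index.
assert y.shape == (5, 3, 2)
assert y[4, 1, 0] == x[1, 0, 4]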
Example #3
    def transpose_for_scores(input_tensor):
        # Split the hidden dimension into attention heads:
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, d_model / num_heads).
        output_tensor = ad.array_reshape_op(input_tensor, [
            config.batch_size, -1, config.num_heads,
            config.d_model // config.num_heads
        ])

        output_tensor = ad.transpose_op(output_tensor, [0, 2, 1, 3])
        return output_tensor
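The reshape/transpose pair above is the standard head-splitting step in multi-head attention. A minimal NumPy sketch of the same shape manipulation (NumPy stands in for the ad graph ops here; the sizes are illustrative, not taken from the example):

import numpy as np

batch_size, seq_len, num_heads, d_model = 2, 7, 4, 16  # illustrative sizes

x = np.random.rand(batch_size, seq_len, d_model).astype(np.float32)

# (batch, seq, d_model) -> (batch, seq, heads, d_model // heads)
x = x.reshape(batch_size, -1, num_heads, d_model // num_heads)
# (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim), i.e. perm [0, 2, 1, 3]
x = np.transpose(x, (0, 2, 1, 3))

assert x.shape == (batch_size, num_heads, seq_len, d_model // num_heads)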
Example #4
def test_transpose(shape=(2, 3, 4, 5), perm=None):
    ctx = ndarray.gpu(1)
    x = np.random.random(shape).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_y = ad.transpose_op(ath_x, perm)
    ath_grad = ad.gradients(ath_y, [ath_x])[0]
    # Evaluate both the forward output and the gradient w.r.t. x.
    executor = ad.Executor([ath_y, ath_grad], ctx=ctx, enable_lazy=False)
    ath_results = [var.asnumpy() for var in executor.run()]

    # Reference forward value and gradient computed with TensorFlow (1.x graph API).
    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.transpose(tf_x, perm)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])
    
    np.testing.assert_allclose(ath_results[0], tf_results[0])
    np.testing.assert_allclose(ath_results[1], np.reshape(tf_results[1], ath_results[1].shape))
    print('Passed transpose shape op test with shape ', shape, ' and perm ', perm)
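The test compares the gradient of the transpose against TensorFlow. The underlying identity is simple: since a transpose only reindexes elements, its backward pass transposes the incoming gradient with the inverse permutation. A small NumPy sketch of that identity (it mirrors what the test checks; it is not the library's implementation, and the shape and perm below are illustrative):

import numpy as np

shape, perm = (2, 3, 4, 5), (3, 1, 0, 2)
x = np.random.random(shape).astype(np.float32)
grad_y = np.random.random(np.transpose(x, perm).shape).astype(np.float32)

# Backward pass of y = transpose(x, perm): apply the inverse permutation to grad_y.
inv_perm = np.argsort(perm)
grad_x = np.transpose(grad_y, inv_perm)

# Because transpose is a pure reindexing, pairing x with grad_x gives the same
# inner product as pairing y with grad_y.
assert grad_x.shape == x.shape
assert np.allclose((np.transpose(x, perm) * grad_y).sum(), (x * grad_x).sum())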
Example #5
def multihead_attention(queries,
                        keys,
                        values,
                        config,
                        query_act=None,
                        key_act=None,
                        value_act=None,
                        attention_mask=None,
                        causality=False):
    def transpose_for_scores(input_tensor):
        output_tensor = ad.array_reshape_op(input_tensor, [
            config.batch_size, -1, config.num_heads,
            config.d_model // config.num_heads
        ])

        output_tensor = ad.transpose_op(output_tensor, [0, 2, 1, 3])
        return output_tensor

    batch_size = config.batch_size
    hidden_size = config.d_model
    num_attention_heads = config.num_heads
    caus_len = config.maxlen2 - 1
    attention_probs_dropout_prob = config.dropout_rate

    size_per_head = hidden_size // num_attention_heads

    # reshape to 2d
    queries2d = ad.array_reshape_op(queries,
                                    [-1, hidden_size])  # (N * T_q, d_model)
    keys2d = ad.array_reshape_op(keys, [-1, hidden_size])  # (N * T_k, d_model)
    values2d = ad.array_reshape_op(values,
                                   [-1, hidden_size])  # (N * T_k, d_model)

    # linear transformation
    query_layer = dense(queries2d, hidden_size, hidden_size,
                        query_act)  # (N * T_q, d_model)
    key_layer = dense(keys2d, hidden_size, hidden_size,
                      key_act)  # (N * T_k, d_model)
    value_layer = dense(values2d, hidden_size, hidden_size,
                        value_act)  # (N * T_k, d_model)

    # transpose
    query_layer = transpose_for_scores(query_layer)  # (N, h, T_q, d_model/h)
    key_layer = transpose_for_scores(key_layer)  # (N, h, T_k, d_model/h)
    value_layer = transpose_for_scores(value_layer)  # (N, h, T_k, d_model/h)

    # score
    attention_scores = ad.batch_matmul_op(query_layer, key_layer,
                                          trans_B=True)  # (N, h, T_q, T_k)
    attention_scores = attention_scores * (1.0 / np.sqrt(float(size_per_head)))  # scale by 1/sqrt(d_k)

    # mask
    if attention_mask is not None:
        zeros = ad.Variable('no_mask',
                            value=np.array((0, ), dtype=np.float32),
                            trainable=False)
        adder = ad.Variable('attention_mask',
                            value=np.array((-2**32 + 1, ), dtype=np.float32),
                            trainable=False)
        zeros = ad.broadcastto_op(zeros, attention_mask)
        adder = ad.broadcastto_op(adder, attention_mask)
        attention_mask = ad.where_op(attention_mask, zeros, adder)  # (N, T)
        attention_mask = ad.array_reshape_op(attention_mask,
                                             [batch_size, 1, 1, -1])
        attention_scores = attention_scores + ad.broadcastto_op(
            attention_mask, attention_scores)
    if causality:
        tril = ad.Variable(name='tril',
                           value=np.tril(np.ones((caus_len, caus_len))),
                           trainable=False)  # (T, T)
        future_masks = ad.broadcast_shape_op(
            tril, [batch_size, num_attention_heads, caus_len, caus_len])
        adder = ad.Variable('future_mask',
                            value=np.array((-2**32 + 1, ), dtype=np.float32),
                            trainable=False)
        adder = ad.broadcastto_op(adder, future_masks)
        attention_scores = ad.where_op(future_masks, attention_scores,
                                       adder)  # (N, h, T, T)

    # probs
    attention_probs = ad.softmax_op(attention_scores)
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
    context_layer = ad.batch_matmul_op(attention_probs, value_layer)
    # Merge the heads back: (N, h, T_q, d_model/h) -> (N, T_q, d_model).
    context_layer = ad.transpose_op(context_layer, [0, 2, 1, 3])
    outputs = ad.array_reshape_op(
        context_layer, [batch_size, -1, num_attention_heads * size_per_head])

    # Residual connection
    outputs = outputs + queries  # (N, T_q, d_model)

    # Normalize
    outputs = layer_norm(outputs, hidden_size)  # (N, T_q, d_model)
    return outputs
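For orientation, here is a compact NumPy sketch of the scaled dot-product attention that the graph above assembles: scores scaled by 1/sqrt(d_k), an optional causal mask applied as a large negative bias, a softmax over the key axis, and a weighted sum of the values. The sizes are illustrative and the code is a reference description, not the library's ops:

import numpy as np

def scaled_dot_product_attention(q, k, v, causal=False):
    # q, k, v: (batch, heads, seq, head_dim)
    d_k = q.shape[-1]
    scores = np.matmul(q, np.swapaxes(k, -1, -2)) / np.sqrt(d_k)  # (N, h, T_q, T_k)
    if causal:
        # Mask out future positions with a large negative value, as in the graph above.
        t_q, t_k = scores.shape[-2:]
        mask = np.tril(np.ones((t_q, t_k)))
        scores = np.where(mask > 0, scores, -2.0 ** 32 + 1)
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs = probs / probs.sum(axis=-1, keepdims=True)  # softmax over T_k
    return np.matmul(probs, v)  # (N, h, T_q, head_dim)

q = k = v = np.random.rand(2, 4, 7, 16).astype(np.float32)
out = scaled_dot_product_attention(q, k, v, causal=True)
assert out.shape == (2, 4, 7, 16)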