コード例 #1
0
ファイル: attention.py プロジェクト: haixpham/cntkx
    def attention(query, key, value):
        """Scaled dot-product attention of every ``query`` step over ``key``/``value``.

        Computes ``softmax(query @ key.T / sqrt(dk)) @ value`` where ``dk`` is the
        static size of ``query``. ``obey_sequence_order`` and ``max_seq_len`` are
        read from the enclosing scope (not visible here).

        Arguments:
            query: sequence tensor; the original annotations give [#, *] [q_dim]
            key: sequence tensor; the original annotations give [#, *] [key_dim]
            value: sequence tensor; the original annotations give [#, *] [value_dim]

        Returns:
            The attended values, one vector per query step ([#, *] [value_dim]).

        Raises:
            ValueError: if ``obey_sequence_order`` is truthy while ``max_seq_len`` is not.
        """
        # Fail fast: validate the configuration before any graph node is created.
        if obey_sequence_order and not max_seq_len:
            raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

        # Scaling factor sqrt(dk). Summing ones_like(query) yields the query
        # dimension; the square root is required by scaled dot-product attention
        # (dividing by dk itself over-damps the logits).
        dk = C.sqrt(C.reduce_sum(C.ones_like(query)))  # cannot use sequence.last, will conflict with recurrence
        # dk: [#, *] [1, ] and value = sqrt(dim_of_query)

        unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
        unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]
        # NOTE(review): padded key steps are zero-filled and not masked out before the
        # softmax below - confirm this is acceptable for batches of unequal length.

        broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
        scaled = C.times_transpose(query, broadcasted_key) / dk
        # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
        # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score

        # mask out invalid temporal connections to obey the sequence order
        if obey_sequence_order:
            unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs
            # unpacked_scaled: [#] [-3, -3]  <== matrix will be top right diagonally zero-ed
            # scaled_mask: [#] [-3,]

            minus_inf = C.constant(-1e+30)
            # Lower-triangular ones (diagonal included) mark the connections that are kept.
            valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
            valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled)  # [#] [max_seq, max_seq]
            valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0)  # [#] [-3, -3]
            unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf)  # [#] [-3, -3]
            scaled = C.to_sequence_like(unpacked_scaled, query)  # [#, *] [-3]

        attended = C.times(C.softmax(scaled, axis=-1), C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
        return attended
コード例 #2
0
ファイル: sequence_test.py プロジェクト: wjfeima/CNTK
def test_lstm_over_lstm_thought_vectors_2(device_id):
    """Compare per-step losses of an LSTM-over-LSTM model with reference values.

    The graph is Embedding -> LSTM recurrence -> ``sequence.last`` ->
    ``UtteranceBatchReshape`` user function (defined elsewhere) ->
    ``to_sequence_like`` the labels -> LSTM recurrence -> Dense. The loss is
    obtained through ``ce.grad`` and checked against hard-coded numbers.
    """
    device = cntk_device(device_id)
    vocab_dim = 3
    embedding_dim = 2
    lstm_dim = 2
    label_dim = 2

    utterances_input = C.sequence.input_variable(vocab_dim, is_sparse=True, name='utterances')
    conversation_lengths_input = C.input_variable((), name='conversation_sequence_lengths')
    label_input = C.sequence.input_variable(label_dim,
                                            is_sparse=True,
                                            sequence_axis=C.Axis('label_sequence'),
                                            name='labels')

    with C.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(embedding_dim, name='embed')(utterances_input)
        utterance_states = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(embedded)
        thought_vectors = C.sequence.last(utterance_states)
        regrouped = C.user_function(UtteranceBatchReshape(thought_vectors, conversation_lengths_input))
        conversations = C.to_sequence_like(regrouped, label_input)
        conversation_states = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(conversations)
        z = C.layers.Dense(label_dim, name='classify')(conversation_states)

    ce = C.cross_entropy_with_softmax(z, label_input)

    def sparse_utterance(rows):
        # One utterance as a sparse CSR NDArrayView living on the CPU.
        return C.NDArrayView.from_csr(_to_csr(rows), device=C.cpu())

    # The same sentinel view fills the unused utterance slots of the batch.
    sentinel = sparse_utterance([[0, 0, 1]])
    utterance_batch = [
        sparse_utterance([[0, 1, 1], [0, 1, 0], [1, 0, 0]]),
        sparse_utterance([[0, 1, 0], [0, 1, 1]]),
        sparse_utterance([[0, 1, 1], [0, 1, 0]]),
        sparse_utterance([[0, 1, 1]]),
        sentinel,
        sentinel,
        sparse_utterance([[0, 1, 0], [0, 1, 1], [1, 0, 0]]),
        sparse_utterance([[0, 1, 0]]),
        sentinel,
    ]
    all_utt_data = C.Value.create(C.sequence.input_variable(vocab_dim, is_sparse=True),
                                  utterance_batch,
                                  device=C.cpu()).data

    conversation_lengths_data = np.asarray([3, 1, 2], dtype=np.float32)
    label_data = [
        _to_csr([[0, 1], [0, 1], [1, 0]]),
        _to_csr([[1, 0]]),
        _to_csr([[1, 0], [0, 1]]),
    ]

    feed = {
        utterances_input: all_utt_data,
        label_input: label_data,
        conversation_lengths_input: conversation_lengths_data,
    }
    param_grads, loss_value = ce.grad(feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)
    losses = loss_value.as_sequences()

    absolute_tolerance = 0.01
    expected_losses = [
        [[0.678914], [0.668076], [0.728129]],
        [[0.679029]],
        [[0.705393], [0.674243]],
    ]
    for seq_idx, expected in enumerate(expected_losses):
        assert np.allclose(losses[seq_idx], expected, atol=absolute_tolerance)
コード例 #3
0
ファイル: sequence_test.py プロジェクト: delpart/CNTK
def test_lstm_over_lstm_thought_vectors_2(device_id):
    """Compare per-step losses of an LSTM-over-LSTM model with reference values.

    The graph is Embedding -> LSTM recurrence -> ``sequence.last`` ->
    ``UtteranceBatchReshape`` user function (defined elsewhere) ->
    ``to_sequence_like`` the labels -> LSTM recurrence -> Dense. The loss is
    obtained through ``ce.grad`` and checked against hard-coded numbers.
    """
    device = cntk_device(device_id)
    vocab_dim = 3
    embedding_dim = 2
    lstm_dim = 2
    label_dim = 2

    utterances_input = C.sequence.input_variable(vocab_dim, is_sparse=True, name='utterances')
    conversation_lengths_input = C.input_variable((), name='conversation_sequence_lengths')
    label_input = C.sequence.input_variable(label_dim,
                                            is_sparse=True,
                                            sequence_axis=C.Axis('label_sequence'),
                                            name='labels')

    with C.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(embedding_dim, name='embed')(utterances_input)
        utterance_states = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(embedded)
        thought_vectors = C.sequence.last(utterance_states)
        regrouped = C.user_function(UtteranceBatchReshape(thought_vectors, conversation_lengths_input))
        conversations = C.to_sequence_like(regrouped, label_input)
        conversation_states = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(conversations)
        z = C.layers.Dense(label_dim, name='classify')(conversation_states)

    ce = C.cross_entropy_with_softmax(z, label_input)

    def sparse_utterance(rows):
        # One utterance as a sparse CSR NDArrayView living on the CPU.
        return C.NDArrayView.from_csr(_to_csr(rows), device=C.cpu())

    # The same sentinel view fills the unused utterance slots of the batch.
    sentinel = sparse_utterance([[0, 0, 1]])
    utterance_batch = [
        sparse_utterance([[0, 1, 1], [0, 1, 0], [1, 0, 0]]),
        sparse_utterance([[0, 1, 0], [0, 1, 1]]),
        sparse_utterance([[0, 1, 1], [0, 1, 0]]),
        sparse_utterance([[0, 1, 1]]),
        sentinel,
        sentinel,
        sparse_utterance([[0, 1, 0], [0, 1, 1], [1, 0, 0]]),
        sparse_utterance([[0, 1, 0]]),
        sentinel,
    ]
    all_utt_data = C.Value.create(C.sequence.input_variable(vocab_dim, is_sparse=True),
                                  utterance_batch,
                                  device=C.cpu()).data

    conversation_lengths_data = np.asarray([3, 1, 2], dtype=np.float32)
    label_data = [
        _to_csr([[0, 1], [0, 1], [1, 0]]),
        _to_csr([[1, 0]]),
        _to_csr([[1, 0], [0, 1]]),
    ]

    feed = {
        utterances_input: all_utt_data,
        label_input: label_data,
        conversation_lengths_input: conversation_lengths_data,
    }
    param_grads, loss_value = ce.grad(feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)
    losses = loss_value.as_sequences()

    absolute_tolerance = 0.01
    expected_losses = [
        [[0.678914], [0.668076], [0.728129]],
        [[0.679029]],
        [[0.705393], [0.674243]],
    ]
    for seq_idx, expected in enumerate(expected_losses):
        assert np.allclose(losses[seq_idx], expected, atol=absolute_tolerance)
コード例 #4
0
ファイル: sequence_test.py プロジェクト: delpart/CNTK
def test_lstm_over_lstm_thought_vectors(device_id):
    """Compare per-step losses of an LSTM-over-LSTM model with reference values.

    The features input carries a free static dimension in front of the
    vocabulary axis. The graph is Embedding -> LSTM recurrence ->
    ``sequence.last`` -> ``to_sequence_like`` the labels -> LSTM recurrence ->
    Dense; the loss is obtained through ``ce.grad``.
    """
    device = cntk_device(device_id)
    vocab_dim = 3
    embedding_dim = 2
    lstm_dim = 2
    label_dim = 2

    x_seq_input = C.sequence.input_variable((C.FreeDimension, vocab_dim),
                                            is_sparse=True,
                                            name='features')
    label_seq_input = C.sequence.input_variable(label_dim,
                                                is_sparse=True,
                                                sequence_axis=C.Axis('label_sequence'),
                                                name='labels')

    with C.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(embedding_dim, name='embed')(x_seq_input)
        inner_states = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(embedded)
        thought_vectors = C.sequence.last(inner_states)
        thought_sequence = C.to_sequence_like(thought_vectors, label_seq_input)
        outer_states = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(thought_sequence)
        z = C.layers.Dense(label_dim, name='classify')(outer_states)

    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    # (dense data, explicit NDArrayView shape) for each input sequence.
    feature_specs = [
        ([[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
          [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
          [[1, 0, 0], [0, 0, 1], [1, 1, 0]]], (3, 3, 3)),
        ([[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
          [[0, 1, 0], [1, 0, 1], [0, 0, 0]]], (2, 3, 3)),
    ]
    feature_views = [
        C.NDArrayView.from_csr(_to_csr(dense), shape=view_shape, device=C.cpu())
        for dense, view_shape in feature_specs
    ]
    x_seq_data = C.Value.create(C.sequence.input_variable((3, 3), is_sparse=True),
                                feature_views,
                                device=C.cpu()).data

    label_seq_data = [
        _to_csr([[0, 1], [0, 1], [1, 0]]),
        _to_csr([[1, 0], [0, 1]]),
    ]

    feed = {x_seq_input: x_seq_data, label_seq_input: label_seq_data}
    param_grads, loss_value = ce.grad(feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)
    losses = loss_value.as_sequences()

    absolute_tolerance = 0.02
    expected_losses = [
        [[0.67126], [0.676331], [0.765814]],
        [[0.685199], [0.681736]],
    ]
    for seq_idx, expected in enumerate(expected_losses):
        assert np.allclose(losses[seq_idx], expected, atol=absolute_tolerance)
コード例 #5
0
ファイル: sequence_test.py プロジェクト: wjfeima/CNTK
def test_lstm_over_lstm_thought_vectors(device_id):
    """Compare per-step losses of an LSTM-over-LSTM model with reference values.

    The features input carries a free static dimension in front of the
    vocabulary axis. The graph is Embedding -> LSTM recurrence ->
    ``sequence.last`` -> ``to_sequence_like`` the labels -> LSTM recurrence ->
    Dense; the loss is obtained through ``ce.grad``.
    """
    device = cntk_device(device_id)
    vocab_dim = 3
    embedding_dim = 2
    lstm_dim = 2
    label_dim = 2

    x_seq_input = C.sequence.input_variable((C.FreeDimension, vocab_dim),
                                            is_sparse=True,
                                            name='features')
    label_seq_input = C.sequence.input_variable(label_dim,
                                                is_sparse=True,
                                                sequence_axis=C.Axis('label_sequence'),
                                                name='labels')

    with C.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(embedding_dim, name='embed')(x_seq_input)
        inner_states = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(embedded)
        thought_vectors = C.sequence.last(inner_states)
        thought_sequence = C.to_sequence_like(thought_vectors, label_seq_input)
        outer_states = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(thought_sequence)
        z = C.layers.Dense(label_dim, name='classify')(outer_states)

    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    # (dense data, explicit NDArrayView shape) for each input sequence.
    feature_specs = [
        ([[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
          [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
          [[1, 0, 0], [0, 0, 1], [1, 1, 0]]], (3, 3, 3)),
        ([[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
          [[0, 1, 0], [1, 0, 1], [0, 0, 0]]], (2, 3, 3)),
    ]
    feature_views = [
        C.NDArrayView.from_csr(_to_csr(dense), shape=view_shape, device=C.cpu())
        for dense, view_shape in feature_specs
    ]
    x_seq_data = C.Value.create(C.sequence.input_variable((3, 3), is_sparse=True),
                                feature_views,
                                device=C.cpu()).data

    label_seq_data = [
        _to_csr([[0, 1], [0, 1], [1, 0]]),
        _to_csr([[1, 0], [0, 1]]),
    ]

    feed = {x_seq_input: x_seq_data, label_seq_input: label_seq_data}
    param_grads, loss_value = ce.grad(feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)
    losses = loss_value.as_sequences()

    absolute_tolerance = 0.02
    expected_losses = [
        [[0.67126], [0.676331], [0.765814]],
        [[0.685199], [0.681736]],
    ]
    for seq_idx, expected in enumerate(expected_losses):
        assert np.allclose(losses[seq_idx], expected, atol=absolute_tolerance)
コード例 #6
0
ファイル: sequence_test.py プロジェクト: ondrocks/CNTK
def test_lstm_over_lstm_thought_vectors(device_id):
    """Compare per-step losses of an LSTM-over-LSTM model with reference values.

    The global CNTK random seed is reset to 0 for the duration of the test and
    restored afterwards. The restore runs in a ``finally`` clause so that a
    failing assertion (or any other exception) cannot leak the fixed seed into
    tests that run later in the same process.
    """
    previous_random_seed = C.cntk_py.get_random_seed()
    C.cntk_py.reset_random_seed(0)
    try:
        dev = cntk_device(device_id)
        input_vocab_size = 3
        emb_dim = 2
        hidden_dim = 2
        num_labels = 2
        x_seq_input = C.sequence.input((C.FreeDimension, input_vocab_size),
                                       is_sparse=True,
                                       name='features')
        label_seq_input = C.sequence.input(num_labels,
                                           is_sparse=True,
                                           sequence_axis=Axis('label_sequence'),
                                           name='labels')
        with C.default_options(initial_state=0.1):
            model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
            model = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                        go_backwards=False)(model)
            # Keep only the last state of the inner recurrence and lay those
            # states out along the label sequence axis for the outer LSTM.
            model = C.sequence.last(model)
            model = C.to_sequence_like(model, label_seq_input)
            model = C.layers.Recurrence(C.layers.LSTM(hidden_dim),
                                        go_backwards=False)(model)
            model = C.layers.Dense(num_labels, name='classify')(model)

        z = model
        ce = C.cross_entropy_with_softmax(z, label_seq_input)

        seq1_data = [[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
                     [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
                     [[1, 0, 0], [0, 0, 1], [1, 1, 0]]]
        csr_seq1 = _to_csr(seq1_data)
        ndarrayview1 = C.NDArrayView.from_csr(csr_seq1,
                                              shape=(3, 3, 3),
                                              device=C.cpu())
        seq2_data = [[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
                     [[0, 1, 0], [1, 0, 1], [0, 0, 0]]]
        csr_seq2 = _to_csr(seq2_data)
        ndarrayview2 = C.NDArrayView.from_csr(csr_seq2,
                                              shape=(2, 3, 3),
                                              device=C.cpu())
        x_seq_data = C.Value.create(C.sequence.input((3, 3), is_sparse=True),
                                    [ndarrayview1, ndarrayview2],
                                    device=C.cpu()).data

        seq1_label_data = [[0, 1], [0, 1], [1, 0]]
        seq2_label_data = [[1, 0], [0, 1]]
        label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]
        param_grads, loss_result = ce.grad(
            {
                x_seq_input: x_seq_data,
                label_seq_input: label_seq_data
            },
            wrt=ce.parameters,
            outputs=[ce],
            as_numpy=False)

        loss_result = loss_result.as_sequences()

        # TODO: The tolerance here is inordinately high due to the non-determinism in initialization
        # of parameters as the individual tests are not run in separate processes resulting in the
        # addition or removal of tests to affect the random initialization of parameters in all other
        # tests that do not explicitly specify the random seed. The tolerance should be lowered to
        # 0.01 after this issue in the test infrastructure has been fixed.
        absolute_tolerance = 0.02
        assert np.allclose(loss_result[0], [[0.63504], [0.673343], [0.698446]],
                           atol=absolute_tolerance)
        assert np.allclose(loss_result[1], [[0.772344], [0.64295]],
                           atol=absolute_tolerance)
    finally:
        # Always hand the previous seed back, even when the test fails.
        C.cntk_py.reset_random_seed(previous_random_seed)
コード例 #7
0
ファイル: __init__.py プロジェクト: newoneincntk/cntkx
def batchmatmul(left,
                right,
                output_rank=1,
                infer_input_rank_to_map=C.TIMES_NO_INFERRED_INPUT_RANK,
                name=''):
    """ Batched matrix product of two tensors sharing a leading static batch axis.

    Comparable to tensorflow.matmul. The first static axis of both operands is
    treated as the batch axis; only a single static batch axis is supported.
    Internally the batch axis is folded into a dynamic sequence axis, a regular
    ``C.times`` is applied per element, and the result is unfolded again.

    Example:
        a = C.sequence.input_variable((3, 4, 5))     # batch matrix
        b = C.sequence.input_variable((3, 5, 6))     # batch matrix
        c = Cx.batchmatmul(a, b)
        assert c.shape == (3, 4, 6)                  # 3 is treated as a batch axis

        a = C.sequence.input_variable((3, 4, 5))     # batch matrix
        b = C.sequence.input_variable((3, 5, 6, 7))  # batch tensor
        c = Cx.batchmatmul(a, b, output_rank=2)
        assert c.shape == (3, 4, 6, 7)               # 3 is treated as a batch axis

        a = C.input_variable((3, 4, 5))              # batch matrix
        b = C.input_variable((3, 5, 6, 7))           # batch tensor
        c = Cx.batchmatmul(a, b, output_rank=2)
        assert c.shape == (3, 4, 6, 7)

    Arguments:
        left: left side matrix or tensor
        right: right side matrix or tensor
        output_rank (int): forwarded to ``C.times``; for tensor operands, the
            number of axes collapsed to turn the tensors into matrices before
            multiplying (they are expanded again afterwards)
        infer_input_rank_to_map (int): internal use only, keep the default
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Raises:
        ValueError: if the leading axes of both operands differ, or if the
            batch axis is a free axis while a sequence axis is present.
    """
    lhs_shape = left.shape
    rhs_shape = right.shape

    # Two dynamic axes on the left operand mean a sequence axis is present.
    has_seq_axis = len(left.dynamic_axes) == 2
    batch_dim = lhs_shape[0]  # the leading static axis is taken as the batch axis

    if lhs_shape[0] != rhs_shape[0]:
        raise ValueError(
            "first axis of left operand and right operand must be the same")

    if has_seq_axis and (lhs_shape[0] < 0 or rhs_shape[0] < 0):
        raise ValueError(
            "Static batch axis cannot be a free axis when dynamic sequence axis is also present"
        )

    if has_seq_axis:
        # Merge the dynamic sequence axis with the static batch axis.
        lhs_flat = C.sequence.unpack(left, padding_value=0, no_mask_output=True)
        rhs_flat = C.sequence.unpack(right, padding_value=0, no_mask_output=True)

        lhs_flat = C.reshape(lhs_flat, (-1, ) + lhs_shape[1:])
        rhs_flat = C.reshape(rhs_flat, (-1, ) + rhs_shape[1:])
    else:
        lhs_flat, rhs_flat = left, right

    # Turn the (merged) batch axis into a sequence axis. No explicit length is
    # given for the left side; the right side is tied to the same axis.
    lhs_seq = C.to_sequence(lhs_flat)
    rhs_seq = C.to_sequence_like(rhs_flat, lhs_seq)

    # Ordinary product, now free of any static batch axis.
    product = C.times(lhs_seq,
                      rhs_seq,
                      output_rank=output_rank,
                      infer_input_rank_to_map=infer_input_rank_to_map)

    # Undo the folding: recover the static batch axis and, when there was one,
    # the original sequence axis.
    product_flat = C.sequence.unpack(product, padding_value=0, no_mask_output=True)
    if has_seq_axis:
        unfolded = C.reshape(product_flat, (-1, batch_dim) + product.shape)
        restored = C.to_sequence_like(unfolded, left)
    else:
        restored = C.reshape(product_flat, (batch_dim, ) + product.shape)

    return _inject_name(restored, name)
コード例 #8
0
    def attention_layer(self, context, query, dim):
        """Build a two-way attention block between ``context`` and ``query``.

        Both inputs are projected by separate bias-free, relu-activated Dense
        layers of ``self.hidden_dim`` units. An attention sub-graph is written
        once against two generic placeholders and then cloned twice with shared
        parameters: once attending between the projected query and context,
        and once with the projected context substituted for both placeholders.

        Arguments:
            context: tensor bound to the first placeholder of shape ``(dim,)``
            query: tensor bound to the second placeholder of shape ``(dim,)``
            dim: static size of both inputs

        Returns:
            A block named 'attention_layer' that splices the raw context
            placeholder with the two cloned attention outputs.

        The inline shape annotations are the original author's; ``c`` and ``q``
        presumably denote the context and query sequence axes and ``d`` the
        hidden size - TODO confirm.
        """
        input_ph = C.placeholder(shape=(dim, ))
        input_mem = C.placeholder(shape=(dim, ))
        with C.layers.default_options(bias=False, activation=C.relu):
            attn_proj_enc = C.layers.Dense(self.hidden_dim,
                                           init=glorot_uniform(),
                                           input_rank=1,
                                           name="Wqu")
            attn_proj_dec = C.layers.Dense(self.hidden_dim,
                                           init=glorot_uniform(),
                                           input_rank=1)

        inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
        memory_ = attn_proj_dec(input_mem)  # [#,q][d]

        # Generic placeholders: the sub-graph below is defined once against
        # these and substituted with concrete tensors via clone() further down.
        cln_mem_ph = C.placeholder()  # [#,q][?=d]
        cln_inp_ph = C.placeholder()  # [#,c][?=d]
        unpack_inputs, inputs_mask = C.sequence.unpack(
            cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
        expand_inputs = C.sequence.broadcast_as(unpack_inputs,
                                                cln_mem_ph)  # [#,q][*=c,d]
        # Dot-product scores scaled by sqrt(hidden_dim).
        matrix = C.reshape(
            C.times_transpose(cln_mem_ph, expand_inputs) /
            (self.hidden_dim**0.5), (-1, ))  # [#,q][*=c]
        # Positions where the unpack mask is 0 get a large negative score.
        matrix = C.element_select(
            C.sequence.broadcast_as(inputs_mask, cln_mem_ph), matrix,
            C.constant(-1e30))
        logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]
        trans_expand_inputs = C.transpose(expand_inputs,
                                          [1, 0])  # [#,q][d,*=c]
        # Weighted sum over the unpacked axis, scaled again by sqrt(hidden_dim).
        q_over_c = C.reshape(
            C.reduce_sum(logits * trans_expand_inputs, axis=1),
            (-1, )) / (self.hidden_dim**0.5)  # [#,q][d]
        new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]
        # Second level: attention in the opposite direction, over the
        # transposed score matrix.
        unpack_matrix, matrix_mask = C.sequence.unpack(
            matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
        inputs_mask_s = C.to_sequence(C.reshape(inputs_mask,
                                                (-1, 1)))  # [#,c'][1]
        trans_matrix = C.to_sequence_like(C.transpose(unpack_matrix, [1, 0]),
                                          inputs_mask_s)  # [#,c'][*=q]
        # Keep only the steps flagged by the mask sequence.
        trans_matrix = C.sequence.gather(trans_matrix,
                                         inputs_mask_s)  # [#,c2][*=q]
        trans_matrix = C.element_select(
            C.sequence.broadcast_as(matrix_mask, trans_matrix), trans_matrix,
            C.constant(-1e30))
        # NOTE(review): the original annotation here read [#,c2][*=c]; given the
        # annotation on trans_matrix above it is presumably [*=q] - confirm.
        logits2 = C.softmax(trans_matrix, axis=0,
                            name='level 2 weight')  # [#,c2][*=q]
        unpack_new_q, new_q_mask = C.sequence.unpack(
            new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
        expand_new_q = C.transpose(
            C.sequence.broadcast_as(unpack_new_q, trans_matrix),
            [1, 0])  # [#,c2][2d,*=q]
        c_over_q = C.reshape(C.reduce_sum(logits2 * expand_new_q, axis=1),
                             (-1, )) / (2 * self.hidden_dim)**0.5  # [#,c2][2d]
        # Re-attach the gathered sequence to the dynamic axes of cln_inp_ph.
        c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)

        # Instantiate the sub-graph twice, sharing parameters between clones.
        weighted_q = c_over_q.clone(C.CloneMethod.share, {
            cln_mem_ph: memory_,
            cln_inp_ph: inputs_
        })  # [#,c][2d]
        # NOTE(review): q_over_c is annotated [d] above, so the [2d] annotation
        # on c2c below looks inconsistent - confirm.
        c2c = q_over_c.clone(C.CloneMethod.share, {
            cln_mem_ph: inputs_,
            cln_inp_ph: inputs_
        })  # [#,c][2d]

        # NOTE(review): the first spliced operand is the raw placeholder of size
        # dim, so the "2d+2d+2d" total below only holds for particular sizes -
        # confirm the intended output dimension.
        att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d

        return C.as_block(att_context, [(input_ph, context),
                                        (input_mem, query)], 'attention_layer',
                          'attention_layer')
コード例 #9
0
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    """Build a single-head dot-product attention layer over a sequence placeholder.

    The query is always a Dense projection of the input placeholder. Key and
    value are either Dense projections of the same placeholder (``k_ph`` and
    ``v_ph`` both False) or fresh placeholders on a 'kv_seq' axis (both True).

    Arguments:
        in_dims: static size of the input placeholder
        out_dims: size of the query/key/value projections; its square root
            scales the scores
        name: prefix for the names of the created nodes
        as_block: wrap the result in a 'self_attention' block
        k_ph: take the key from a placeholder instead of a projection
        v_ph: take the value from a placeholder instead of a projection
        mask_opt: apply the triangular mask built by ``triangular_matrix_seq``
            (defined elsewhere) to the scores before the softmax

    Returns:
        The attention output as a sequence shaped like the input placeholder,
        optionally wrapped in a block.

    Raises:
        ValueError: if ``k_ph`` and ``v_ph`` are not both True or both False.
    """
    # Validate the flag combination once, before any graph node is created.
    # Identity checks are kept so that only real booleans are accepted.
    if k_ph is False and v_ph is False:
        external_kv = False
    elif k_ph is True and v_ph is True:
        external_kv = True
    else:
        raise ValueError(f'k_ph:{k_ph}, v_ph:{v_ph}')

    sq_sa_dims = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(
        in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    q = C.layers.Dense(out_dims, name=name + '_q')(X)
    if external_kv:
        k = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_k_ph')
        v = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_v_ph')
    else:
        k = C.layers.Dense(out_dims, name=name + '_k')(X)
        v = C.layers.Dense(out_dims, name=name + '_v')(X)

    # Unpack the sequences into static matrices so the whole score matrix can
    # be computed with one matrix product.
    q_ = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_ = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_ = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    scores = C.times_transpose(q_, k_, name=name + '_score_matrix')
    scaled = scores / sq_sa_dims  # divide by sqrt(out_dims)

    if mask_opt:
        mask = triangular_matrix_seq(2)(X)
        # (mask - 0.5) is positive where mask == 1 and negative where mask == 0,
        # so the product is -inf / +inf and element_min forces -inf at mask == 1
        # while leaving the other scores untouched.
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    softmax = C.softmax(scaled, name=name + '_softmax')
    attention = C.times(softmax, v_, name=name + '_attention')

    result = C.to_sequence_like(attention, X)

    if not as_block:
        return result

    # Externally supplied key/value placeholders must stay visible as block
    # arguments.
    block_arguments = [(X, X), (k, k), (v, v)] if external_kv else [(X, X)]
    return C.as_block(result, block_arguments, 'self_attention',
                      'self_attention_')
コード例 #10
0
def gpt2_self_attention(token_dims: int,
                        head_dims: int,
                        mask_opt: bool = False,
                        as_block: bool = False,
                        name: str = 'self_attention'):
    """Build a multi-head masked self-attention layer over a sequence placeholder.

    One Dense layer produces query, key and value together; each is reshaped to
    ``(head_dims, -1)`` and split along the first axis, so ``head_dims`` acts as
    the number of heads. Every head computes scaled dot-product attention with
    the triangular mask from ``triangular_matrix_seq`` (defined elsewhere); the
    heads are spliced and projected back to ``token_dims``.

    Arguments:
        token_dims: static size of the input placeholder and of the output
        head_dims: number of slices the q/k/v projections are split into
        mask_opt: NOTE(review): accepted but never read - the mask is always
            applied; confirm whether that is intended
        as_block: wrap the result in a 'gpt2_self_attention' block
        name: name given to the input placeholder

    Returns:
        The projected attention output, optionally wrapped in a block.
    """
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(),
                                    C.Axis.default_dynamic_axis()),
                      name=name)

    # Joint projection: axis 0 of the result selects query / key / value.
    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    head_shape = (head_dims, -1)
    q_mh = C.reshape(q_seq, head_shape, name='multi_head_q')
    k_mh = C.reshape(k_seq, head_shape, name='multi_head_k')
    v_mh = C.reshape(v_seq, head_shape, name='multi_head_v')

    def split_heads(stacked):
        # The node name (typo and '_q' suffix included) is shared by the q, k
        # and v slices; it is a runtime string and therefore kept as is.
        return [
            C.squeeze(stacked[h], name='simgle_head_q' + str(h))
            for h in range(head_dims)
        ]

    q_heads = split_heads(q_mh)
    k_heads = split_heads(k_mh)
    v_heads = split_heads(v_mh)

    head_outputs = []
    for idx, (q_head, k_head, v_head) in enumerate(
            zip(q_heads, k_heads, v_heads)):
        suffix = str(idx)

        # Unpack each per-head sequence into a static matrix.
        q_mat = C.sequence.unpack(q_head, 0, True, name='seq_q' + suffix)
        k_mat = C.sequence.unpack(k_head, 0, True, name='seq_k' + suffix)
        v_mat = C.sequence.unpack(v_head, 0, True, name='seq_v' + suffix)

        # Scores scaled by 1 / sqrt(size of the last value axis).
        raw_scores = C.times_transpose(q_mat, k_mat)
        scaled_scores = raw_scores * (1 / C.sqrt(v_mat.shape[-1]))

        # (mask - 0.5) flips sign between mask values 1 and 0, giving -inf /
        # +inf; element_min then forces -inf wherever the mask is 1.
        tri_mask = triangular_matrix_seq(2)(X)
        neg_inf_mask = C.as_block(-np.inf * (tri_mask - 0.5), [(X, X)],
                                  'mask', 'mask')
        masked_scores = C.element_min(scaled_scores, neg_inf_mask)

        weights = C.softmax(masked_scores)
        weighted_values = C.times(weights, v_mat)
        head_outputs.append(C.to_sequence_like(weighted_values, X))

    # Concatenate the heads and project back to the token dimension.
    merged = C.splice(*head_outputs, name='merged_attention')
    project = C.layers.Dense(token_dims, name='project')(merged)

    if not as_block:
        return project

    return C.as_block(project, [(X, X)], 'gpt2_self_attention',
                      'gpt2_self_attention')