Example #1
def test_to_sequence_basic(device_id):
    dev = cntk_device(device_id)
    x = C.input_variable((C.FreeDimension, 2))
    x_seq = C.to_sequence(x)
    assert len(x_seq.dynamic_axes) == 2

    x_data = np.asarray([[[1, 2], [-1000, -1000]], [[3, 4], [5, 6]]], dtype=np.float32)
    result = x_seq.eval({x : x_data}, device=dev)
    assert np.array_equal(result, x_data)

    x = C.input_variable((C.FreeDimension, 2, 3), is_sparse=True)
    x_seq_lens = C.input_variable(())
    x_seq = C.to_sequence(x, x_seq_lens)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    csr_seq1 = _to_csr(seq1_data)
    ndarrayview1 = C.NDArrayView.from_csr(csr_seq1, shape=(2, 2, 3), device=C.cpu())
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    csr_seq2 = _to_csr([seq2_data, [[0, 0, 0], [0, 0, 0]]])
    ndarrayview2 = C.NDArrayView.from_csr(csr_seq2, shape=(2, 2, 3), device=C.cpu())

    x_data = C.Value.create(C.input_variable((2, 2, 3), is_sparse=True), [ndarrayview1, ndarrayview2], device=dev).data
    x_seq_lens_data = np.asarray([2, 1], dtype=np.float32)
    result = x_seq.eval({x : x_data, x_seq_lens : x_seq_lens_data}, device=dev, as_numpy=False)
    result_dense = _to_dense(result, True)
    assert np.array_equal(result_dense[0], seq1_data)
    assert np.array_equal(result_dense[1], [seq2_data])
Example #2
    def inner(a):
        values, valid = C.sequence.unpack(a, padding_value=0).outputs
        values_reversed = C.slice(values, 0, 0, 0, -1)
        valid_reversed = C.slice(valid, 0, 0, 0, -1)

        values_seq = C.to_sequence(values_reversed)
        valid_seq = C.to_sequence(C.expand_dims(valid_reversed, axis=-1))
        a_reversed = C.sequence.gather(values_seq, valid_seq)
        return a_reversed
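
The helper above appears to reverse a sequence: it unpacks the sequence to a padded tensor, flips the values and the validity mask along the unpacked axis with a negative-stride slice, and re-packs only the valid steps via sequence.gather. A minimal, hedged usage sketch follows; it assumes the nested helper `inner` has been lifted to module scope and uses an illustrative 1-dimensional input.

import cntk as C
import numpy as np

# Hypothetical driver for the reversal helper `inner` shown above.
a = C.sequence.input_variable(1)
a_reversed = inner(a)
data = [np.asarray([[1.0], [2.0], [3.0]], dtype=np.float32)]  # one sequence of length 3
print(a_reversed.eval({a: data}))  # expected under these assumptions: [[3.], [2.], [1.]]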
Example #3
    def inner(a, b):
        a_unpacked, a_mask = C.sequence.unpack(a, padding_value=0).outputs
        b_unpacked, b_mask = C.sequence.unpack(b, padding_value=0).outputs

        ab_unpacked = C.splice(a_unpacked, b_unpacked, axis=0)
        ab_mask = C.expand_dims(C.splice(a_mask, b_mask), axis=-1)

        ab_w_pad = C.to_sequence(ab_unpacked)
        ab_condition = C.to_sequence(ab_mask)

        ab = C.sequence.gather(ab_w_pad, ab_condition)
        return ab
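
This variant concatenates two sequences along the dynamic axis: both are unpacked, spliced along the unpacked (free) axis, and re-packed while keeping only the steps marked valid by the joined masks. A hedged usage sketch, again assuming `inner` is available at module scope and using illustrative shapes and data:

import cntk as C
import numpy as np

a = C.sequence.input_variable(2)
b = C.sequence.input_variable(2)
ab = inner(a, b)
a_data = [np.asarray([[1, 2], [3, 4]], dtype=np.float32)]  # one sequence of length 2
b_data = [np.asarray([[5, 6]], dtype=np.float32)]          # one sequence of length 1
print(ab.eval({a: a_data, b: b_data}))  # expected: a single sequence of length 3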
Example #4
def test_sequence_unpack_basic(device_id):
    dev = cntk_device(device_id)

    x = C.input_variable((C.FreeDimension, 2, 3), is_sparse=False)
    x_seq_lens = C.input_variable(())
    x_seq = C.to_sequence(x, x_seq_lens)
    x_seq_unpacked = C.sequence.unpack(x_seq, padding_value=-1000.0)
    x_seq_unpacked_value_output = x_seq_unpacked.outputs[0]
    x_seq_unpacked_mask_output = x_seq_unpacked.outputs[1]
    assert len(x_seq_unpacked_value_output.dynamic_axes) == 1
    assert x_seq_unpacked_value_output.shape == (C.FreeDimension, 2, 3)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    x_data = [
        np.asarray(seq1_data, dtype=np.float32),
        np.asarray(
            [seq2_data, [[-100.0, -100.0, -100.0], [-100.0, -100.0, -100.0]]],
            dtype=np.float32)
    ]
    x_seq_lens_data = np.asarray([2, 1], dtype=np.float32)
    result = x_seq_unpacked.eval({x : x_data, x_seq_lens : x_seq_lens_data}, device=dev)
    value = result[x_seq_unpacked_value_output]
    mask = result[x_seq_unpacked_mask_output]
    assert np.array_equal(value[0], seq1_data)
    assert np.array_equal(value[1], [
        seq2_data, [[-1000.0, -1000.0, -1000.0], [-1000.0, -1000.0, -1000.0]]
    ])
    assert np.array_equal(mask, [[1, 1], [1, 0]])
Example #5
def test_sequence_unpack_basic(device_id):
    dev = cntk_device(device_id)

    # Unpack a placeholder
    p = C.placeholder()
    p_unpacked_outputs = C.sequence.unpack(p, padding_value=0).outputs
    assert len(p_unpacked_outputs) == 2

    x = C.input_variable((C.FreeDimension, 2, 3), is_sparse=False)
    x_seq_lens = C.input_variable(())
    x_seq = C.to_sequence(x, x_seq_lens)
    x_seq_unpacked = C.sequence.unpack(x_seq, padding_value=-1000.0)
    x_seq_unpacked_value_output = x_seq_unpacked.outputs[0]
    x_seq_unpacked_mask_output = x_seq_unpacked.outputs[1]
    assert len(x_seq_unpacked_value_output.dynamic_axes) == 1
    assert x_seq_unpacked_value_output.shape == (C.FreeDimension, 2, 3)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    x_data = [np.asarray(seq1_data, dtype=np.float32), np.asarray([seq2_data, [[-100.0, -100.0, -100.0], [-100.0, -100.0, -100.0]]], dtype=np.float32)]
    x_seq_lens_data = np.asarray([2, 1], dtype=np.float32)
    result = x_seq_unpacked.eval({x : x_data, x_seq_lens : x_seq_lens_data}, device=dev)
    value = result[x_seq_unpacked_value_output]
    mask = result[x_seq_unpacked_mask_output]
    assert np.array_equal(value[0], seq1_data)
    assert np.array_equal(value[1], [seq2_data, [[-1000.0, -1000.0, -1000.0], [-1000.0, -1000.0, -1000.0]]])
    assert np.array_equal(mask, [[1, 1], [1, 0]])
Example #6
def pad(x, pattern, mode=C.CONSTANT_PAD, constant_value=0, name=''):
    """
    Pads a tensor in the sequence axis according to the specified patterns.
    Three padding modes are supported: CONSTANT / REFLECT / SYMMETRIC.

    Arguments:
        x: tensor to be padded.
        pattern (tuple with 2 integers): how many values to add before and after the contents in the sequence axis.
        mode (int): padding mode: C.ops.CONSTANT_PAD, C.ops.REFLECT_PAD and C.ops.SYMMETRIC_PAD
        constant_value: the value used to fill the padding cells, only meaningful under CONSTANT mode.
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    if not isinstance(pattern, tuple) or len(pattern) != 2 or not all(isinstance(i, int) for i in pattern):
        raise ValueError(f"pattern {pattern} must be a tuple with 2 integers")

    ndim = len(x.shape)
    null_pattern = [(0, 0)] * ndim
    final_pattern = [pattern] + null_pattern

    b, valid = C.sequence.unpack(x, padding_value=0).outputs
    c = C.pad(b, final_pattern, mode=mode, constant_value=constant_value)
    seq_length = C.reduce_sum(valid, axis=0) + C.Constant(sum(pattern))
    d = C.to_sequence(c, seq_length, name=name)
    return d
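
A hedged usage sketch for pad: the input variable, pattern, and data below are illustrative assumptions, not part of the original. With pattern (1, 2) the output sequence should gain one constant step before and two after the original content.

import cntk as C
import numpy as np

s = C.sequence.input_variable(3)
padded = pad(s, (1, 2), mode=C.CONSTANT_PAD, constant_value=0)
data = [np.arange(6, dtype=np.float32).reshape(2, 3)]  # one sequence of length 2
out = padded.eval({s: data})
print(out[0].shape)  # expected under these assumptions: (5, 3)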
Example #7
def test_gather_op(device_id, precision):
    a_data = [AA([[0],[1]], dtype=PRECISION_TO_TYPE[precision]),
              AA([[3],[4]], dtype=PRECISION_TO_TYPE[precision])]
    a = C.input_variable((2,1))
    r_data = np.arange(12).reshape(6,2).astype('f')
    r = C.parameter(shape=r_data.shape, init=r_data)
    res = C.gather(r, a).eval({a:a_data})
    expected = np.asarray([[[[0., 1.]],[[2., 3.]]],[[[6., 7.]],[[8.,9.]]]])
    assert np.array_equal(res, expected)

    grads = C.gather(r, a).grad({a:a_data}, [r])
    expected_grad = np.asarray([[1,1],[1,1],[0,0],[1,1],[1,1],[0,0]], dtype=np.float32)
    assert np.array_equal(grads, expected_grad)

    # gather with indices from a learnable parameter (no gradients should be passed through the indices -- 0s should be passed)
    indices_params = C.parameter(shape=(1,), init=1.0)
    grads = C.gather(r, (indices_params * a)).grad({a:a_data}, [r, indices_params])
    assert np.array_equal(grads[r], expected_grad)
    assert np.array_equal(grads[indices_params], np.asarray([0.0], dtype=np.float32))


    b_data = [AA([[0,2],[1,3]], dtype=PRECISION_TO_TYPE[precision]),
              AA([[2,4],[3,5]], dtype=PRECISION_TO_TYPE[precision])]
    b = C.input_variable((2,2))
    res2 = C.gather(r, b).eval({b:b_data})

    expected2 = np.asarray([[[[0., 1.],[4.,5.]],[[2., 3.],[6., 7.]]],[[[4., 5.],[8.,9.]],[[6., 7.], [10., 11.]]]])
    assert np.array_equal(res2, expected2)

    # the following small model tests the memory-reuse issue of the gather node.
    x = C.input_variable((3, 4))
    x1 = C.to_sequence(x)
    w = C.parameter((5, 6), init=1)
    z = C.gather(w, x1)
    assert z.shape == (4, 6)
    #need the unpack node to trigger memory reuse.
    f = C.sequence.unpack(z, 0, no_mask_output=True)
    y = C.input_variable((3, 4, 6))
    loss = C.reduce_mean(C.square(f - y), axis=-1)
    loss = C.reduce_mean(loss, axis=C.Axis.all_axes())

    g = C.constant(0, shape=w.shape)
    u = C.assign(w, g + 1)
    learner = C.cntk_py.universal_learner([w], [g], u)
    trainer = C.trainer.Trainer(loss, [loss], [learner])
    indices = np.asarray([[[1, 2, 1, 2]]])
    input = np.repeat(np.repeat(indices, 3, axis=1), 10, axis=0)
    label = np.full((10, 3, 4, 6), 2)
    trainer.train_minibatch({x: input, y: label})
    # the 2nd and 3rd rows should be updated by gradients.
    assert np.mean(w.value[1, :]) < 1
    assert np.mean(w.value[2, :]) < 1
    # the other three rows should remain 1
    assert np.isclose(np.mean(w.value[0, :]), 1)
    assert np.isclose(np.mean(w.value[3, :]), 1)
    assert np.isclose(np.mean(w.value[4, :]), 1)
Example #8
def zeros_like(x, seq_length: int):
    """ helper function to construct a sequence of zeros """
    if seq_length > 1:
        b = C.zeros_like(C.sequence.slice(x, 0, seq_length))
    elif seq_length == 1:
        b = C.to_sequence(
            C.expand_dims(C.zeros_like(C.sequence.first(x)),
                          axis=C.Axis.new_leading_axis()))
    else:
        raise ValueError(f"length ({seq_length}) must be larger than 0")

    return b
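
A hedged usage sketch for zeros_like: it should yield a sequence of seq_length all-zero steps whose element shape matches x. The concrete shapes and data below are illustrative assumptions.

import cntk as C
import numpy as np

x = C.sequence.input_variable(3)
z = zeros_like(x, seq_length=2)
data = [np.ones((5, 3), dtype=np.float32)]  # one input sequence of length 5
print(z.eval({x: data}))  # expected: two all-zero steps, each of width 3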
Example #9
def test_to_sequence_backprop(device_id):
    dev = cntk_device(device_id)
    input_vocab_size=3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2
    x_seq_input = C.sequence.input_variable(input_vocab_size, is_sparse=True, name='features')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    label_seq_input = C.sequence.input_variable(num_labels, is_sparse=True, name='labels')
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    seq2_data = [[0, 0, 1], [0, 1, 1]]
    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]
    param_grads_1, loss_result_1 = ce.grad({x_seq_input : [_to_csr(seq1_data), _to_csr(seq2_data)], label_seq_input : label_seq_data},
                                           wrt=ce.parameters, outputs=[ce], as_numpy=False)

    # Create a clone of the model that uses a non-sequence input
    # and converts it to a sequence using to_sequence
    x_non_seq_input = C.input_variable((C.FreeDimension, input_vocab_size), is_sparse=True, name='non_seq_features')
    x_seq_lens = C.input_variable((), name='sequence_lengths')
    x_seq = C.to_sequence(x_non_seq_input, x_seq_lens)
    x_seq = C.reconcile_dynamic_axes(C.times(x_seq, np.eye(input_vocab_size, dtype=np.float32)), label_seq_input)
    ce_clone = ce.clone('share', {x_seq_input : x_seq})

    x_non_seq_data = C.NDArrayView.from_csr(_to_csr([seq1_data, seq2_data + [[0, 0, 0]]]), shape=(2, 3, 3))
    x_seq_lens_data = np.asarray([3, 2], dtype=np.float32)

    x_non_seq_input = next(argument for argument in ce_clone.arguments if argument.name == 'non_seq_features')
    label_seq_input = next(argument for argument in ce_clone.arguments if argument.name == 'labels')
    x_seq_lens = next(argument for argument in ce_clone.arguments if argument.name == 'sequence_lengths')
    param_grads_2, loss_result_2 = ce_clone.grad({x_non_seq_input : x_non_seq_data, x_seq_lens : x_seq_lens_data, label_seq_input : label_seq_data},
                                                 wrt=ce_clone.parameters, outputs=[ce_clone], as_numpy=False)


    assert np.array_equal(loss_result_1.as_sequences()[0], loss_result_2.as_sequences()[0])
    assert np.array_equal(loss_result_1.as_sequences()[1], loss_result_2.as_sequences()[1])

    for param in param_grads_1:
        if not param_grads_1[param].is_sparse:
            reference_grad_value = param_grads_1[param].asarray()
            grad_value = param_grads_2[param].asarray()
            assert np.array_equal(reference_grad_value, grad_value)
Example #10
def test_to_sequence_error_for_operand_with_sequence_axis():
    x = C.sequence.input_variable((C.FreeDimension, 2))
    with pytest.raises(ValueError):
        op = C.to_sequence(x)
Example #11
def batchmatmul(left,
                right,
                output_rank=1,
                infer_input_rank_to_map=C.TIMES_NO_INFERRED_INPUT_RANK,
                name=''):
    """ Batch Matrix Multiplication

    The output of this operation is the matrix product of the two input batch matrices.

    This implementation is similar to tensorflow.matmul.

    Currently assumes the first axis to be the static batch axis. Does not accept multiple static batch axes.

    Example:
        a = C.sequence.input_variable((3, 4, 5))     # batch matrix
        b = C.sequence.input_variable((3, 5, 6))     # batch matrix
        c = Cx.batchmatmul(a, b)
        assert c.shape == (3, 4, 6)                  # 3 is treated as a batch axis


        a = C.sequence.input_variable((3, 4, 5))     # batch matrix
        b = C.sequence.input_variable((3, 5, 6, 7))  # batch tensor
        c = Cx.batchmatmul(a, b, output_rank=2)
        assert c.shape == (3, 4, 6, 7)               # 3 is treated as a batch axis


        a = C.input_variable((3, 4, 5))              # batch matrix
        b = C.input_variable((3, 5, 6, 7))           # batch tensor
        c = Cx.batchmatmul(a, b, output_rank=2)
        assert c.shape == (3, 4, 6, 7)


    Arguments:
        left: left side matrix or tensor
        right: right side matrix or tensor
        output_rank (int): in case we have tensors as arguments, output_rank represents
            the number of axes to be collapsed in order to transform the tensors
            into matrices, perform the operation and then reshape back (explode the axes)
        infer_input_rank_to_map (int): meant for internal use only. Always use default value
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`
    """

    left_shape = left.shape
    right_shape = right.shape

    seq_axis_present = len(left.dynamic_axes) == 2
    static_batch_axis = left_shape[0]  # assumes the first axis to be the static batch axis.

    if left_shape[0] != right_shape[0]:
        raise ValueError(
            "first axis of left operand and right operand must be the same")

    if (left_shape[0] < 0 or right_shape[0] < 0) and seq_axis_present:
        raise ValueError(
            "Static batch axis cannot be a free axis when dynamic sequence axis is also present"
        )

    # Combine dynamic sequence axis and static batch axis
    if not seq_axis_present:
        left_unpacked = left
        right_unpacked = right
    else:
        left_unpacked = C.sequence.unpack(left,
                                          padding_value=0,
                                          no_mask_output=True)
        right_unpacked = C.sequence.unpack(right,
                                           padding_value=0,
                                           no_mask_output=True)

        left_unpacked = C.reshape(left_unpacked, (-1, ) + left_shape[1:])
        right_unpacked = C.reshape(right_unpacked, (-1, ) + right_shape[1:])

    # Fold static batch axis into dynamic sequence axis
    # do not set sequence length as the batch axis has been folded in
    left_folded = C.to_sequence(left_unpacked)
    # seq_length / axis set here to tell cntk they have the same seq axis
    right_folded = C.to_sequence_like(right_unpacked, left_folded)

    # Matrix Multiply when no static batch axis is present
    result = C.times(left_folded,
                     right_folded,
                     output_rank=output_rank,
                     infer_input_rank_to_map=infer_input_rank_to_map)

    # Split dynamic sequence axis back to original dynamic sequence and static batch axis
    result_unpacked = C.sequence.unpack(result,
                                        padding_value=0,
                                        no_mask_output=True)
    if not seq_axis_present:
        result_packed = C.reshape(result_unpacked,
                                  (static_batch_axis, ) + result.shape)
    else:
        result_unfolded = C.reshape(result_unpacked,
                                    (-1, static_batch_axis) + result.shape)
        result_packed = C.to_sequence_like(result_unfolded, left)

    return _inject_name(result_packed, name)
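
A hedged numeric check for batchmatmul, assuming its module-level dependencies (such as _inject_name) are importable and that it behaves like numpy.matmul applied per batch entry; the shapes and random data below are illustrative.

import cntk as C
import numpy as np

a = C.sequence.input_variable((3, 4, 5))
b = C.sequence.input_variable((3, 5, 6))
c = batchmatmul(a, b)
a_data = [np.random.random((1, 3, 4, 5)).astype(np.float32)]  # one sequence of length 1
b_data = [np.random.random((1, 3, 5, 6)).astype(np.float32)]
out = c.eval({a: a_data, b: b_data})
print(np.allclose(out[0], np.matmul(a_data[0], b_data[0]), atol=1e-5))  # expected: True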
Example #12
    def attention_layer(self, context, query, dim):
        input_ph = C.placeholder(shape=(dim, ))
        input_mem = C.placeholder(shape=(dim, ))
        with C.layers.default_options(bias=False, activation=C.relu):
            attn_proj_enc = C.layers.Dense(self.hidden_dim,
                                           init=glorot_uniform(),
                                           input_rank=1,
                                           name="Wqu")
            attn_proj_dec = C.layers.Dense(self.hidden_dim,
                                           init=glorot_uniform(),
                                           input_rank=1)

        inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
        memory_ = attn_proj_dec(input_mem)  # [#,q][d]

        cln_mem_ph = C.placeholder()  # [#,q][?=d]
        cln_inp_ph = C.placeholder()  # [#,c][?=d]
        unpack_inputs, inputs_mask = C.sequence.unpack(
            cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
        expand_inputs = C.sequence.broadcast_as(unpack_inputs,
                                                cln_mem_ph)  # [#,q][*=c,d]
        matrix = C.reshape(
            C.times_transpose(cln_mem_ph, expand_inputs) /
            (self.hidden_dim**0.5), (-1, ))  # [#,q][*=c]
        matrix = C.element_select(
            C.sequence.broadcast_as(inputs_mask, cln_mem_ph), matrix,
            C.constant(-1e30))
        logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]
        trans_expand_inputs = C.transpose(expand_inputs,
                                          [1, 0])  # [#,q][d,*=c]
        q_over_c = C.reshape(
            C.reduce_sum(logits * trans_expand_inputs, axis=1),
            (-1, )) / (self.hidden_dim**0.5)  # [#,q][d]
        new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]
        # over
        unpack_matrix, matrix_mask = C.sequence.unpack(
            matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
        inputs_mask_s = C.to_sequence(C.reshape(inputs_mask,
                                                (-1, 1)))  # [#,c'][1]
        trans_matrix = C.to_sequence_like(C.transpose(unpack_matrix, [1, 0]),
                                          inputs_mask_s)  # [#,c'][*=q]
        trans_matrix = C.sequence.gather(trans_matrix,
                                         inputs_mask_s)  # [#,c2][*=q]
        trans_matrix = C.element_select(
            C.sequence.broadcast_as(matrix_mask, trans_matrix), trans_matrix,
            C.constant(-1e30))
        logits2 = C.softmax(trans_matrix, axis=0,
                            name='level 2 weight')  # [#,c2][*=c]
        unpack_new_q, new_q_mask = C.sequence.unpack(
            new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
        expand_new_q = C.transpose(
            C.sequence.broadcast_as(unpack_new_q, trans_matrix),
            [1, 0])  # [#,c2][2d,*=q]
        c_over_q = C.reshape(C.reduce_sum(logits2 * expand_new_q, axis=1),
                             (-1, )) / (2 * self.hidden_dim)**0.5  # [#,c2][2d]
        c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)

        weighted_q = c_over_q.clone(C.CloneMethod.share, {
            cln_mem_ph: memory_,
            cln_inp_ph: inputs_
        })  # [#,c][2d]
        c2c = q_over_c.clone(C.CloneMethod.share, {
            cln_mem_ph: inputs_,
            cln_inp_ph: inputs_
        })  # [#,c][2d]

        att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d

        return C.as_block(att_context, [(input_ph, context),
                                        (input_mem, query)], 'attention_layer',
                          'attention_layer')