예제 #1
0
def test_cos_distane_backward():
    x = sequence.input(shape=(2, ),
                       sequence_axis=Axis("B"),
                       needs_gradient=True)
    y = sequence.input(shape=(2, ),
                       sequence_axis=Axis("B"),
                       needs_gradient=True)
    z = cosine_distance(x, y)
    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-0.5, 1.5, -0.3, -1]), (1, 2, 2))
    bwd, fwd = z.forward({x: a, y: b}, [z.output], set([z.output]))
    value = list(fwd.values())[0]
    expected = [[0.707107, -0.981665]]
    assert np.allclose(value, expected)
    grad = z.backward(bwd, {z.output: np.ones_like(value)}, set([x, y]))
    x_driv_expected = np.ndarray(
        (1, 2, 2),
        dtype=np.float32,
        buffer=np.float32([-1.131371, 0.565686, -0.188727, 0.018873]))
    y_driv_expected = np.ndarray(
        (1, 2, 2),
        dtype=np.float32,
        buffer=np.float32([0.424264, 0.141421, -0.174876, 0.052463]))
    assert (np.all(np.absolute(grad[x] - x_driv_expected) < 1e-6))
    assert (np.all(np.absolute(grad[y] - y_driv_expected) < 1e-6))
예제 #2
0
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input_variable, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_vocab_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=label_vocab_dim,  is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
            input_map={raw_input : mbs.streams.features},
            device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
            [3, 4, 5, 4, 7, 12, 1],
            [60, 61]
            ]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in
            one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
예제 #3
0
def test_rank0_output():
  x = sequence.input(shape=(768,), sequence_axis=Axis("B"), needs_gradient=True)
  y = sequence.input(shape=(768,), sequence_axis=Axis("B"), needs_gradient=True)
  z = cosine_distance(x, y)
  batch_num = 2
  batch_size = 30
  a = np.float32(np.random.rand(batch_num*batch_size,1500,768))
  b = np.float32(np.random.rand(batch_num*batch_size,1500,768))
  for i in range(batch_num):
    bwd, fwd = z.forward({x:a[i*batch_size:(i+1)*batch_size], y:b[i*batch_size:(i+1)*batch_size]}, [z.output], set([z.output]))
    grad = z.backward(bwd, {z.output:np.ones_like(fwd[z.output])}, set([x, y]))
예제 #4
0
def test_recurrence():
    inputAxis = Axis('inputAxis')
    stateAxis = Axis('stateAxis')
    InputSequence = SequenceOver[inputAxis]
    StateSequence = SequenceOver[stateAxis]

    # input and expected for both tests below
    x = np.reshape(np.arange(0, 25, dtype=np.float32), (1, 5, 5))
    exp = [[0.239151, 0.239151, 0.239151, 0.239151, 0.239151],
           [0.338713, 0.338713, 0.338713, 0.338713, 0.338713],
           [0.367456, 0.367456, 0.367456, 0.367456, 0.367456],
           [0.375577, 0.375577, 0.375577, 0.375577, 0.375577],
           [0.377891, 0.377891, 0.377891, 0.377891, 0.377891]]

    ####################################################
    # Test 1: Recurrence(): initial state is constant
    ####################################################
    # Note: We cannot use random init of the GRU parameters because random numbers will
    # depend on what previous tests were run. Hence, use a constant (which is not realistic).
    # TODO: Find out how to reset the random generator, then remove the constant init.
    R = Recurrence(GRU(5, init=0.05), go_backwards=False, initial_state=0.1)

    @Function
    @Signature(InputSequence[Tensor[5]])
    def F(x):
        return R(x)

    rt = F(x)
    np.testing.assert_array_almost_equal(
        rt[0], exp, decimal=6, err_msg='Error in Recurrence(GRU()) forward')

    ####################################################
    # Test 2: RecurrenceFrom(): initial state is data input
    ####################################################
    RF = RecurrenceFrom(GRU(5, init=0.05), go_backwards=False)

    @Function
    @Signature(s=StateSequence[Tensor[5]], x=InputSequence[Tensor[5]])
    def FF(s, x):
        return RF(s, x)

    s = np.ones(
        (1, 5, 5)
    ) * 0.1  # we pass the same value as the constant in the previous test to make the result the same
    rt = FF(s, x)
    np.testing.assert_array_almost_equal(
        rt[0],
        exp,
        decimal=6,
        err_msg='Error in RecurrenceFrom(GRU()) forward')
def create_inputs(vocab_dim):
    input_seq_axis = Axis('inputAxis')
    input_sequence = sequence.input_variable(shape=vocab_dim,
                                             sequence_axis=input_seq_axis)
    label_sequence = sequence.input_variable(shape=vocab_dim,
                                             sequence_axis=input_seq_axis)

    return input_sequence, label_sequence
예제 #6
0
def test_cos_distane_backward2():
  x = sequence.input(shape=(100,), sequence_axis=Axis("B"), needs_gradient=True)
  y = sequence.input(shape=(100,), sequence_axis=Axis("B"), needs_gradient=True)
  z = cosine_distance(x, y);
  np.random.seed(0)
  a = np.float32(np.random.rand(10,50,100))
  b = np.float32(np.random.rand(10,50,100))
  bwd, fwd = z.forward({x:a, y:b}, [z.output], set([z.output]))
  value = list(fwd.values())[0]
  expected_cos = numpy_cos(a,b)
  expected = expected_cos.forward()
  assert np.allclose(value, expected)
  grad = z.backward(bwd, {z.output:np.ones_like(value)}, set([x, y]))
  bwd = expected_cos.backward()
  x_driv_expected = bwd['a']
  y_driv_expected = bwd['b']
  assert (np.all(np.absolute(grad[x]-x_driv_expected) < 1e-6))
  assert (np.all(np.absolute(grad[y]-y_driv_expected) < 1e-6))
예제 #7
0
def create_inputs(vocab_dim):
    input_seq_axis = Axis('inputAxis')
    input_sequence = sequence.input(shape=vocab_dim,
                                    sequence_axis=input_seq_axis,
                                    is_sparse=use_sparse)
    label_sequence = sequence.input(shape=vocab_dim,
                                    sequence_axis=input_seq_axis,
                                    is_sparse=use_sparse)

    return input_sequence, label_sequence
예제 #8
0
def create_inputs(hidden_dim, sv_dim, vocab_dim):
    input_seq_axis = Axis('inputAxis')
    input_sequence = sequence.input_variable(shape=vocab_dim, sequence_axis=input_seq_axis)
    # state in model, including sv vector, h vector, c vector
    sv_pair = C.input_variable(shape=sv_dim)
    inputH = C.input_variable(shape=hidden_dim)
    inputC = C.input_variable(shape=hidden_dim)
    label_sequence = sequence.input_variable(shape=vocab_dim, sequence_axis=input_seq_axis)
    
    return input_sequence, sv_pair, label_sequence, inputH, inputC
예제 #9
0
def create_inputs(vocab_dim):
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    input_sequence = input_variable(shape=vocab_dim,
                                    dynamic_axes=input_dynamic_axes)
    label_sequence = input_variable(shape=vocab_dim,
                                    dynamic_axes=input_dynamic_axes)

    return input_sequence, label_sequence
예제 #10
0
def test_cosine_distance():
    a = np.reshape(np.arange(25.0, dtype=np.float32), (5, 5))
    b = np.reshape(np.arange(0, 5, dtype=np.float32), (1, 5))

    src = sequence.input(shape=(5), sequence_axis=Axis("Seq"))
    tgt = input(shape=(5))
    tgt_br = sequence.broadcast_as(tgt, src)
    cos_seq = cosine_distance(src, tgt_br)
    assert len(cos_seq.dynamic_axes) == 2
    assert cos_seq.dynamic_axes[1].name == "Seq"
    val = cos_seq.eval({src: [a], tgt: [b]})
    expected = [[1., 0.914659, 0.878459, 0.86155, 0.851852]]
    assert np.allclose(val, expected)
예제 #11
0
def test_recurrent_block(block_type, block_outputs_count, block_size, W_mult, H_mult, expected_res):
    input_shape = 4

    sequenceAxis = Axis('sequenceAxis')

    y = input(input_shape, dynamic_axes=[Axis.default_batch_axis(), sequenceAxis])
    data = np.reshape(np.arange(0,16, dtype=np.float32), (1,4,4))

    rnn_block = block_type(block_size, init=0.1)

    assert len(rnn_block.outputs) == block_outputs_count
    rnn_net = Recurrence(rnn_block)(y)

    assert rnn_net.b.shape == (W_mult*block_size,)
    assert rnn_net.W.shape == (input_shape, W_mult*block_size)
    assert rnn_net.H.shape == (block_size, H_mult*block_size)

    res = rnn_net.eval(data)
    expected = np.asarray(expected_res, dtype=np.float32)

    np.testing.assert_array_almost_equal(res[0], expected, decimal=6)
예제 #12
0
                                          shape=input_vocab_dim,
                                          is_sparse=True),
                       labels=StreamDef(field='S1',
                                        shape=label_vocab_dim,
                                        is_sparse=True))),
        randomize=is_training,
        epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)


########################
# define the model     #
########################

# type annotations for the two sequence types; later use InputSequence[Tensor[input_vocab_dim]]
# CNTK considers these two different types since they run over different sequence indices.
inputAxis = Axis('inputAxis')
labelAxis = Axis('labelAxis')
InputSequence = SequenceOver[inputAxis]
LabelSequence = SequenceOver[labelAxis]


# create the s2s model
def create_model():  # :: (history*, input*) -> logP(w)*
    # Embedding: (input*) --> embedded_input*
    # Right now assumes shared embedding and shared vocab size.
    embed = Embedding(embedding_dim,
                      name='embed') if use_embedding else identity

    # Encoder: (input*) --> (h0, c0)
    # Create multiple layers of LSTMs by passing the output of the i-th layer
    # to the (i+1)th layer as its input
def sequence_to_sequence_translator(debug_output=False, run_test=False):

    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes,
                               name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes,
                                name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(raw_labels, 1,
                                    0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH,
         encoder_outputC) = LSTMP_component_with_self_stabilization(
             encoder_outputH.output, hidden_dim, hidden_dim, future_value,
             future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH,
                                                      label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC,
                                                      label_sequence)

    # Decoder
    decoder_history_hook = alias(
        label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label,
                                   label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH,
         encoder_outputC) = LSTMP_component_with_self_stabilization(
             decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH,
             recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share,
                 {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr_per_minibatch = learning_rate_schedule(0.5, UnitType.minibatch)
    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(
        z.parameters,
        lr_per_minibatch,
        momentum_time_constant,
        gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
        gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..",
                              "Data", "cmudict-0.7b.train-dev-20-21.ctf")
    valid_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..",
                              "Data", "tiny.ctf")

    # readers
    randomize_data = True
    if run_test:
        randomize_data = False  # because we want to get an exact error

    train_reader = create_reader(train_path, randomize_data, input_vocab_dim,
                                 label_vocab_dim)
    train_bind = {
        raw_input: train_reader.streams.features,
        raw_labels: train_reader.streams.labels
    }

    # get the vocab for printing output sequences in plaintext
    vocab_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..",
                              "Data", "cmudict-0.7b.mapping")
    vocab = [w.strip() for w in open(vocab_path).readlines()]
    i2w = {i: ch for i, ch in enumerate(vocab)}

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    minibatch_size = 72
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    valid_reader = create_reader(valid_path, False, input_vocab_dim,
                                 label_vocab_dim)
    valid_bind = {
        find_arg_by_name('raw_input', ng): valid_reader.streams.features,
        find_arg_by_name('raw_labels', ng): valid_reader.streams.labels
    }

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        while i < (epoch + 1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size,
                                                   input_map=train_bind)
            trainer.train_minibatch(mb_train)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size,
                                                       input_map=valid_bind)
                e = ng.eval(mb_valid)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs,
                                    training_progress_output_freq)
            i += mb_train[raw_labels].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    error1 = translator_test_error(z, trainer, input_vocab_dim,
                                   label_vocab_dim)

    save_model(z, "seq2seq.dnn")
    z = load_model("seq2seq.dnn")

    label_seq_axis = Axis('labelAxis')
    label_sequence = sequence.slice(find_arg_by_name('raw_labels', z), 1, 0)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)
    trainer = Trainer(z, ce, errs, [
        momentum_sgd(z.parameters, lr_per_minibatch, momentum_time_constant,
                     clipping_threshold_per_sample,
                     gradient_clipping_with_truncation)
    ])

    error2 = translator_test_error(z, trainer, input_vocab_dim,
                                   label_vocab_dim)

    assert error1 == error2

    return error1
def create_network(input_vocab_dim, label_vocab_dim):
    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')
    raw_input = sequence.input(shape=(input_vocab_dim), sequence_axis=input_seq_axis, name='raw_input')
    raw_labels = sequence.input(shape=(label_vocab_dim), sequence_axis=label_seq_axis, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(raw_labels, 1, 0) # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)        # <s>

    is_first_label = sequence.is_first(label_sequence)       # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(
        decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output : net_output.output})

    return {
        'raw_input' : raw_input,
        'raw_labels' : raw_labels,
        'ce' : ce,
        'pe' : errs,
        'ng' : ng,
        'output': z
    }
예제 #15
0
ix_to_char = { i:ch for i,ch in enumerate(chars) }


minibatch_size=100
def sample(p):
    xi = [char_to_ix[ch] for ch in data[p:p+minibatch_size]]
    yi = [char_to_ix[ch] for ch in data[p+1:p+minibatch_size+1]]
    
    X = np.eye(vocab_size, dtype=np.float32)[xi]
    Y = np.eye(vocab_size, dtype=np.float32)[yi]

    return [X], [Y]
sample(0)


input_seq_axis = Axis('inputAxis')
input_sequence = sequence.input_variable(shape=vocab_size, sequence_axis=input_seq_axis)
label_sequence = sequence.input_variable(shape=vocab_size, sequence_axis=input_seq_axis)

# model = Sequential([Dense(300),Dense(vocab_size)])

model = Sequential([
        For(range(2), lambda:
                   Sequential([Stabilizer(), Recurrence(LSTM(256), go_backwards=False)])),
        Dense(vocab_size)])

z = model(input_sequence)

ce = cross_entropy_with_softmax(z, label_sequence)
errs = classification_error(z, label_sequence)
예제 #16
0
def sequence_to_sequence_translator(debug_output=False):

    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes)

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(
        shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(
        decoder_history_from_ground_truth))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH
    decoder_dim = hidden_dim

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(z, ce, errs, [momentum_sgd(z.parameters, lr, m_schedule, clipping_threshold_per_sample, gradient_clipping_with_truncation)])                   

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000)
    features_si = mb_source[feature_stream_name]
    labels_si = mb_source[labels_stream_name]

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 30
    if debug_output:
        training_progress_output_freq = training_progress_output_freq/3

    while True:
        mb = mb_source.next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {raw_input: mb[features_si],
                     raw_labels: mb[labels_si]}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)
        i += 1

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    test_mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000, False)
    features_si = test_mb_source[feature_stream_name]
    labels_si = test_mb_source[labels_stream_name]

    # choose this to be big enough for the longest sentence
    train_minibatch_size = 1024 

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_mb_source.next_minibatch(train_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {raw_input: mb[features_si],
                     raw_labels: mb[labels_si]}
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
예제 #17
0
def train_sequence_to_sequence_translator():

    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    input_dynamic_axes = [ Axis('inputAxis'), Axis.default_batch_axis() ]
    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes = input_dynamic_axes)

    label_dynamic_axes = [ Axis('labelAxis'), Axis.default_batch_axis() ]
    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes = label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_dynamic_axes[0], 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(encoder_outputH, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)
    
    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(decoder_history_from_ground_truth))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i == 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(decoder_outputH, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH
    decoder_dim = hidden_dim

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    mb_source = text_format_minibatch_source(path, [ 
                    StreamConfiguration( feature_stream_name, input_vocab_dim, True, 'S0' ), 
                    StreamConfiguration( labels_stream_name, label_vocab_dim, True, 'S1') ], 10000)
    features_si = mb_source.stream_info(feature_stream_name)
    labels_si = mb_source.stream_info(labels_stream_name)

    # Instantiate the trainer object to drive the model training
    lr = learning_rates_per_sample(0.007)
    momentum_time_constant = 1100
    momentum_per_sample = momentums_per_sample(math.exp(-1.0 / momentum_time_constant))
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(z, ce, errs, [momentum_sgd_learner(z.owner.parameters(), lr, momentum_per_sample, clipping_threshold_per_sample, gradient_clipping_with_truncation)])                   

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 10
    while True:
        mb = mb_source.get_next_minibatch(minibatch_size)
        if  len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
        arguments = {raw_input : mb[features_si].m_data, raw_labels : mb[labels_si].m_data}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)

        i += 1
예제 #18
0
def sequence_to_sequence_translator(debug_output=False, run_test=False):

    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes,
                               name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes,
                                name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1,
                           0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH,
         encoder_outputC) = LSTMP_component_with_self_stabilization(
             encoder_outputH.output, hidden_dim, hidden_dim, future_value,
             future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH,
                                                      label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC,
                                                      label_sequence)

    # Decoder
    decoder_history_hook = alias(
        label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label,
                                   label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH,
         encoder_outputC) = LSTMP_component_with_self_stabilization(
             decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH,
             recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share,
                 {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    minibatch_size = 72
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr, m_schedule,
                           clipping_threshold_per_sample,
                           gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              rel_path)
    valid_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "tiny.ctf")

    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    # readers
    randomize_data = True
    if run_test:
        randomize_data = False  # because we want to get an exact error
    train_reader = text_format_minibatch_source(train_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ],
                                                randomize=randomize_data)
    features_si_tr = train_reader.stream_info(feature_stream_name)
    labels_si_tr = train_reader.stream_info(labels_stream_name)

    valid_reader = text_format_minibatch_source(valid_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ],
                                                randomize=False)
    features_si_va = valid_reader.stream_info(feature_stream_name)
    labels_si_va = valid_reader.stream_info(labels_stream_name)

    # get the vocab for printing output sequences in plaintext
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping"
    vocab_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              rel_path)
    vocab = [w.strip() for w in open(vocab_path).readlines()]
    i2w = {i: ch for i, ch in enumerate(vocab)}

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        while i < (epoch + 1) * epoch_size:

            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size)

            train_args = {
                'raw_input': mb_train[features_si_tr],
                'raw_labels': mb_train[labels_si_tr]
            }
            trainer.train_minibatch(train_args)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size)
                valid_args = {
                    'raw_input': mb_valid[features_si_va],
                    'raw_labels': mb_valid[labels_si_va]
                }

                e = ng.eval(valid_args)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs,
                                    training_progress_output_freq)
            i += mb_train[labels_si_tr].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    # now setup a test run
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    test_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             rel_path)

    test_reader = text_format_minibatch_source(test_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ],
                                               10000,
                                               randomize=False)
    features_si_te = test_reader.stream_info(feature_stream_name)
    labels_si_te = test_reader.stream_info(labels_stream_name)

    test_minibatch_size = 1024

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_reader.next_minibatch(test_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {
            raw_input: mb[features_si_te],
            raw_labels: mb[labels_si_te]
        }
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
예제 #19
0
        features=StreamDef(field="S0", shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field="S1", shape=label_vocab_dim, is_sparse=True),
        ),
    ),
    randomize=is_training,
    max_sweeps=INFINITELY_REPEAT if is_training else 1,
  )


########################
# define the model     #
########################

# type annotations for the two sequence types; later use InputSequence[Tensor[input_vocab_dim]]
# CNTK considers these two different types since they run over different sequence indices.
inputAxis = Axis("inputAxis")
labelAxis = Axis("labelAxis")
InputSequence = SequenceOver[inputAxis]
LabelSequence = SequenceOver[labelAxis]

# create the s2s model
def create_model():  # :: (history*, input*) -> logP(w)*
    # Embedding: (input*) --> embedded_input*
    # Right now assumes shared embedding and shared vocab size.
    embed = Embedding(embedding_dim, name="embed") if use_embedding else identity

    # Encoder: (input*) --> (h0, c0)
    # Create multiple layers of LSTMs by passing the output of the i-th layer
    # to the (i+1)th layer as its input
    # This is the plain s2s encoder. The attention encoder will keep the entire sequence instead.
    # Note: We go_backwards for the plain model, but forward for the attention model.
예제 #20
0
# Train data reader
train_reader = create_reader(train_file, True)

# Validation/Test data reader
valid_reader = create_reader(valid_file, False)
model_dir = "."  # we downloaded our data to the local directory above # TODO check me

# model dimensions
input_vocab_dim = input_vocab_size
label_vocab_dim = label_vocab_size
hidden_dim = 128
num_layers = 1
# Source and target inputs to the model
batch_axis = Axis.default_batch_axis()
input_seq_axis = Axis('inputAxis')
label_seq_axis = Axis('labelAxis')

input_dynamic_axes = [batch_axis, input_seq_axis]
raw_input = input_variable(shape=(input_vocab_dim),
                           dynamic_axes=input_dynamic_axes,
                           name='raw_input')

label_dynamic_axes = [batch_axis, label_seq_axis]
raw_labels = input_variable(shape=(label_vocab_dim),
                            dynamic_axes=label_dynamic_axes,
                            name='raw_labels')
# Instantiate the sequence to sequence translation model
input_sequence = raw_input

# Drop the sentence start token from the label, for decoder training
예제 #21
0
def create_model():

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes,
                               name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes,
                                name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(
        raw_labels, 1, 0,
        name='label_sequence')  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    # Setup primer for decoder
    is_first_label = sequence.is_first(label_sequence)  # 1 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    stabilize = Stabilizer()
    encoder_output_h = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_output_h,
         encoder_output_c) = LSTM_layer(encoder_output_h.output, hidden_dim,
                                        future_value, future_value)

    # Prepare encoder output to be used in decoder
    thought_vector_h = sequence.first(encoder_output_h)
    thought_vector_c = sequence.first(encoder_output_c)

    thought_vector_broadcast_h = sequence.broadcast_as(thought_vector_h,
                                                       label_sequence)
    thought_vector_broadcast_c = sequence.broadcast_as(thought_vector_c,
                                                       label_sequence)

    # Decoder
    decoder_history_hook = alias(
        label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label,
                                   label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_output_h = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hook_h = past_value
            recurrence_hook_c = past_value
        else:
            recurrence_hook_h = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_h, past_value(operand)
            )
            recurrence_hook_c = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_c, past_value(operand)
            )

        (decoder_output_h,
         decoder_output_c) = LSTM_layer(decoder_output_h.output, hidden_dim,
                                        recurrence_hook_h, recurrence_hook_c)

    # Linear output layer
    W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim),
                  init=glorot_uniform())
    B = parameter(shape=(label_vocab_dim), init=0)
    z = plus(B, times(stabilize(decoder_output_h), W))

    return z