def test_cos_distance_backward():
    x = sequence.input(shape=(2,), sequence_axis=Axis("B"), needs_gradient=True)
    y = sequence.input(shape=(2,), sequence_axis=Axis("B"), needs_gradient=True)
    z = cosine_distance(x, y)

    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-0.5, 1.5, -0.3, -1]), (1, 2, 2))

    bwd, fwd = z.forward({x: a, y: b}, [z.output], set([z.output]))
    value = list(fwd.values())[0]
    expected = [[0.707107, -0.981665]]
    assert np.allclose(value, expected)

    grad = z.backward(bwd, {z.output: np.ones_like(value)}, set([x, y]))
    x_driv_expected = np.ndarray(
        (1, 2, 2), dtype=np.float32,
        buffer=np.float32([-1.131371, 0.565686, -0.188727, 0.018873]))
    y_driv_expected = np.ndarray(
        (1, 2, 2), dtype=np.float32,
        buffer=np.float32([0.424264, 0.141421, -0.174876, 0.052463]))
    assert np.all(np.absolute(grad[x] - x_driv_expected) < 1e-6)
    assert np.all(np.absolute(grad[y] - y_driv_expected) < 1e-6)
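# For reference, the expected values above can be reproduced analytically: for
# cos(x, y) = x.y / (|x||y|), the gradient w.r.t. x is
# y / (|x||y|) - cos(x, y) * x / |x|^2, and symmetrically for y. A minimal
# NumPy sketch, not part of the test suite (`cos_sim_grad` is a hypothetical
# helper name):
import numpy as np

def cos_sim_grad(x, y):
    nx, ny = np.linalg.norm(x), np.linalg.norm(y)
    cos = np.dot(x, y) / (nx * ny)
    gx = y / (nx * ny) - cos * x / nx ** 2
    gy = x / (nx * ny) - cos * y / ny ** 2
    return cos, gx, gy

# First sequence step of the test above:
# cos ~= 0.707107, gx ~= [-1.131371, 0.565686], gy ~= [0.424264, 0.141421]
cos, gx, gy = cos_sim_grad(np.float32([0.25, 0.5]), np.float32([-0.5, 1.5]))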
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input_variable, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0 |S0 3:1 |# <s> |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True)
    )), randomize=False, epoch_size=2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features},
                                  device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
        [3, 4, 5, 4, 7, 12, 1],
        [60, 61]
    ]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])
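# For reference, a sketch of the sparse inputs used above (assuming `csr` is
# scipy.sparse.csr_matrix, as is usual in CNTK's test utilities): each sequence
# becomes a (seq_len x vocab_dim) one-hot matrix, built by indexing an identity
# matrix with the token ids and converting to CSR.
import numpy as np
from scipy.sparse import csr_matrix

vocab_dim = 69
ids = [3, 4, 5]                                   # example token ids
dense = np.eye(vocab_dim, dtype=np.float32)[ids]  # (3, 69) dense one-hot rows
sparse = csr_matrix(dense)                        # same content, CSR-encoded
assert (sparse.toarray() == dense).all()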
def test_rank0_output():
    x = sequence.input(shape=(768,), sequence_axis=Axis("B"), needs_gradient=True)
    y = sequence.input(shape=(768,), sequence_axis=Axis("B"), needs_gradient=True)
    z = cosine_distance(x, y)

    batch_num = 2
    batch_size = 30
    a = np.float32(np.random.rand(batch_num * batch_size, 1500, 768))
    b = np.float32(np.random.rand(batch_num * batch_size, 1500, 768))
    for i in range(batch_num):
        bwd, fwd = z.forward({x: a[i * batch_size:(i + 1) * batch_size],
                              y: b[i * batch_size:(i + 1) * batch_size]},
                             [z.output], set([z.output]))
        grad = z.backward(bwd, {z.output: np.ones_like(fwd[z.output])}, set([x, y]))
def test_recurrence():
    inputAxis = Axis('inputAxis')
    stateAxis = Axis('stateAxis')
    InputSequence = SequenceOver[inputAxis]
    StateSequence = SequenceOver[stateAxis]

    # input and expected for both tests below
    x = np.reshape(np.arange(0, 25, dtype=np.float32), (1, 5, 5))
    exp = [[0.239151, 0.239151, 0.239151, 0.239151, 0.239151],
           [0.338713, 0.338713, 0.338713, 0.338713, 0.338713],
           [0.367456, 0.367456, 0.367456, 0.367456, 0.367456],
           [0.375577, 0.375577, 0.375577, 0.375577, 0.375577],
           [0.377891, 0.377891, 0.377891, 0.377891, 0.377891]]

    ####################################################
    # Test 1: Recurrence(): initial state is constant
    ####################################################
    # Note: We cannot use random init of the GRU parameters because random numbers will
    # depend on what previous tests were run. Hence, use a constant (which is not realistic).
    # TODO: Find out how to reset the random generator, then remove the constant init.
    R = Recurrence(GRU(5, init=0.05), go_backwards=False, initial_state=0.1)

    @Function
    @Signature(InputSequence[Tensor[5]])
    def F(x):
        return R(x)

    rt = F(x)
    np.testing.assert_array_almost_equal(
        rt[0], exp, decimal=6, err_msg='Error in Recurrence(GRU()) forward')

    ####################################################
    # Test 2: RecurrenceFrom(): initial state is data input
    ####################################################
    RF = RecurrenceFrom(GRU(5, init=0.05), go_backwards=False)

    @Function
    @Signature(s=StateSequence[Tensor[5]], x=InputSequence[Tensor[5]])
    def FF(s, x):
        return RF(s, x)

    # we pass the same value as the constant in the previous test to make the result the same
    s = np.ones((1, 5, 5)) * 0.1
    rt = FF(s, x)
    np.testing.assert_array_almost_equal(
        rt[0], exp, decimal=6, err_msg='Error in RecurrenceFrom(GRU()) forward')
def create_inputs(vocab_dim):
    input_seq_axis = Axis('inputAxis')
    input_sequence = sequence.input_variable(shape=vocab_dim, sequence_axis=input_seq_axis)
    label_sequence = sequence.input_variable(shape=vocab_dim, sequence_axis=input_seq_axis)

    return input_sequence, label_sequence
def test_cos_distance_backward2():
    x = sequence.input(shape=(100,), sequence_axis=Axis("B"), needs_gradient=True)
    y = sequence.input(shape=(100,), sequence_axis=Axis("B"), needs_gradient=True)
    z = cosine_distance(x, y)

    np.random.seed(0)
    a = np.float32(np.random.rand(10, 50, 100))
    b = np.float32(np.random.rand(10, 50, 100))

    bwd, fwd = z.forward({x: a, y: b}, [z.output], set([z.output]))
    value = list(fwd.values())[0]
    expected_cos = numpy_cos(a, b)
    expected = expected_cos.forward()
    assert np.allclose(value, expected)

    grad = z.backward(bwd, {z.output: np.ones_like(value)}, set([x, y]))
    bwd = expected_cos.backward()
    x_driv_expected = bwd['a']
    y_driv_expected = bwd['b']
    assert np.all(np.absolute(grad[x] - x_driv_expected) < 1e-6)
    assert np.all(np.absolute(grad[y] - y_driv_expected) < 1e-6)
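# `numpy_cos` is a NumPy reference helper defined elsewhere in the test module.
# A hedged sketch of the behaviour the test assumes (hypothetical
# implementation; forward() returns per-step cosine similarities over the last
# axis, backward() the analytic gradients w.r.t. both arguments):
import numpy as np

class numpy_cos_ref(object):
    def __init__(self, a, b):
        self.a, self.b = a, b
    def forward(self):
        self.na = np.linalg.norm(self.a, axis=-1, keepdims=True)
        self.nb = np.linalg.norm(self.b, axis=-1, keepdims=True)
        self.cos = np.sum(self.a * self.b, axis=-1, keepdims=True) / (self.na * self.nb)
        return np.squeeze(self.cos, axis=-1)
    def backward(self):  # assumes forward() was called first, as in the test
        ga = self.b / (self.na * self.nb) - self.cos * self.a / self.na ** 2
        gb = self.a / (self.na * self.nb) - self.cos * self.b / self.nb ** 2
        return {'a': ga, 'b': gb}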
def create_inputs(vocab_dim):
    # `use_sparse` is assumed to be defined at module scope
    input_seq_axis = Axis('inputAxis')
    input_sequence = sequence.input(shape=vocab_dim, sequence_axis=input_seq_axis,
                                    is_sparse=use_sparse)
    label_sequence = sequence.input(shape=vocab_dim, sequence_axis=input_seq_axis,
                                    is_sparse=use_sparse)

    return input_sequence, label_sequence
def create_inputs(hidden_dim, sv_dim, vocab_dim):
    input_seq_axis = Axis('inputAxis')
    input_sequence = sequence.input_variable(shape=vocab_dim, sequence_axis=input_seq_axis)

    # state in model, including sv vector, h vector, c vector
    sv_pair = C.input_variable(shape=sv_dim)
    inputH = C.input_variable(shape=hidden_dim)
    inputC = C.input_variable(shape=hidden_dim)

    label_sequence = sequence.input_variable(shape=vocab_dim, sequence_axis=input_seq_axis)

    return input_sequence, sv_pair, label_sequence, inputH, inputC
def create_inputs(vocab_dim):
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    input_sequence = input_variable(shape=vocab_dim, dynamic_axes=input_dynamic_axes)
    label_sequence = input_variable(shape=vocab_dim, dynamic_axes=input_dynamic_axes)

    return input_sequence, label_sequence
def test_cosine_distance():
    a = np.reshape(np.arange(25.0, dtype=np.float32), (5, 5))
    b = np.reshape(np.arange(0, 5, dtype=np.float32), (1, 5))

    src = sequence.input(shape=(5), sequence_axis=Axis("Seq"))
    tgt = input(shape=(5))
    tgt_br = sequence.broadcast_as(tgt, src)
    cos_seq = cosine_distance(src, tgt_br)
    assert len(cos_seq.dynamic_axes) == 2
    assert cos_seq.dynamic_axes[1].name == "Seq"

    val = cos_seq.eval({src: [a], tgt: [b]})
    expected = [[1., 0.914659, 0.878459, 0.86155, 0.851852]]
    assert np.allclose(val, expected)
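# Quick NumPy cross-check of the expected values above (not part of the test):
# each row of `a` scored against the single row of `b`.
import numpy as np

a_chk = np.reshape(np.arange(25.0, dtype=np.float32), (5, 5))
b_chk = np.arange(5.0, dtype=np.float32)
ref = [np.dot(r, b_chk) / (np.linalg.norm(r) * np.linalg.norm(b_chk)) for r in a_chk]
# ref ~= [1.0, 0.914659, 0.878459, 0.86155, 0.851852]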
def test_recurrent_block(block_type, block_outputs_count, block_size, W_mult, H_mult, expected_res):
    input_shape = 4

    sequenceAxis = Axis('sequenceAxis')
    y = input(input_shape, dynamic_axes=[Axis.default_batch_axis(), sequenceAxis])
    data = np.reshape(np.arange(0, 16, dtype=np.float32), (1, 4, 4))

    rnn_block = block_type(block_size, init=0.1)
    assert len(rnn_block.outputs) == block_outputs_count

    rnn_net = Recurrence(rnn_block)(y)
    assert rnn_net.b.shape == (W_mult * block_size,)
    assert rnn_net.W.shape == (input_shape, W_mult * block_size)
    assert rnn_net.H.shape == (block_size, H_mult * block_size)

    res = rnn_net.eval(data)
    expected = np.asarray(expected_res, dtype=np.float32)
    np.testing.assert_array_almost_equal(res[0], expected, decimal=6)
        shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))),
        randomize=is_training,
        epoch_size=INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)

########################
# define the model     #
########################

# type annotations for the two sequence types; later use InputSequence[Tensor[input_vocab_dim]]
# CNTK considers these two different types since they run over different sequence indices.
inputAxis = Axis('inputAxis')
labelAxis = Axis('labelAxis')
InputSequence = SequenceOver[inputAxis]
LabelSequence = SequenceOver[labelAxis]

# create the s2s model
def create_model():  # :: (history*, input*) -> logP(w)*
    # Embedding: (input*) --> embedded_input*
    # Right now assumes shared embedding and shared vocab size.
    embed = Embedding(embedding_dim, name='embed') if use_embedding else identity

    # Encoder: (input*) --> (h0, c0)
    # Create multiple layers of LSTMs by passing the output of the i-th layer
    # to the (i+1)th layer as its input
def sequence_to_sequence_translator(debug_output=False, run_test=False):
    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes, name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(raw_labels, 1, 0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr_per_minibatch = learning_rate_schedule(0.5, UnitType.minibatch)
    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters,
                           lr_per_minibatch, momentum_time_constant,
                           gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                           gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "..", "Data", "cmudict-0.7b.train-dev-20-21.ctf")
    valid_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "..", "Data", "tiny.ctf")

    # readers
    randomize_data = True
    if run_test:
        randomize_data = False  # because we want to get an exact error

    train_reader = create_reader(train_path, randomize_data, input_vocab_dim, label_vocab_dim)
    train_bind = {
        raw_input: train_reader.streams.features,
        raw_labels: train_reader.streams.labels
    }

    # get the vocab for printing output sequences in plaintext
    vocab_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "..", "Data", "cmudict-0.7b.mapping")
    vocab = [w.strip() for w in open(vocab_path).readlines()]
    i2w = {i: ch for i, ch in enumerate(vocab)}

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    minibatch_size = 72
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    valid_reader = create_reader(valid_path, False, input_vocab_dim, label_vocab_dim)
    valid_bind = {
        find_arg_by_name('raw_input', ng): valid_reader.streams.features,
        find_arg_by_name('raw_labels', ng): valid_reader.streams.labels
    }

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        while i < (epoch + 1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size, input_map=train_bind)
            trainer.train_minibatch(mb_train)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size, input_map=valid_bind)
                e = ng.eval(mb_valid)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs, training_progress_output_freq)
            i += mb_train[raw_labels].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    error1 = translator_test_error(z, trainer, input_vocab_dim, label_vocab_dim)

    save_model(z, "seq2seq.dnn")
    z = load_model("seq2seq.dnn")

    label_seq_axis = Axis('labelAxis')
    label_sequence = sequence.slice(find_arg_by_name('raw_labels', z), 1, 0)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)
    trainer = Trainer(z, ce, errs, [momentum_sgd(
        z.parameters, lr_per_minibatch, momentum_time_constant,
        clipping_threshold_per_sample, gradient_clipping_with_truncation)])

    error2 = translator_test_error(z, trainer, input_vocab_dim, label_vocab_dim)

    assert error1 == error2

    return error1
def create_network(input_vocab_dim, label_vocab_dim):
    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')
    raw_input = sequence.input(shape=(input_vocab_dim),
                               sequence_axis=input_seq_axis, name='raw_input')
    raw_labels = sequence.input(shape=(label_vocab_dim),
                                sequence_axis=label_seq_axis, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(raw_labels, 1, 0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output: net_output.output})

    return {
        'raw_input': raw_input,
        'raw_labels': raw_labels,
        'ce': ce,
        'pe': errs,
        'ng': ng,
        'output': z
    }
ix_to_char = {i: ch for i, ch in enumerate(chars)}

minibatch_size = 100

def sample(p):
    xi = [char_to_ix[ch] for ch in data[p:p + minibatch_size]]
    yi = [char_to_ix[ch] for ch in data[p + 1:p + minibatch_size + 1]]

    X = np.eye(vocab_size, dtype=np.float32)[xi]
    Y = np.eye(vocab_size, dtype=np.float32)[yi]

    return [X], [Y]

sample(0)  # quick smoke test of the sampler

input_seq_axis = Axis('inputAxis')
input_sequence = sequence.input_variable(shape=vocab_size, sequence_axis=input_seq_axis)
label_sequence = sequence.input_variable(shape=vocab_size, sequence_axis=input_seq_axis)

# model = Sequential([Dense(300), Dense(vocab_size)])
model = Sequential([
    For(range(2), lambda: Sequential([Stabilizer(), Recurrence(LSTM(256), go_backwards=False)])),
    Dense(vocab_size)])

z = model(input_sequence)
ce = cross_entropy_with_softmax(z, label_sequence)
errs = classification_error(z, label_sequence)
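# A hedged sketch of driving one training step with the pieces above (the
# learner settings here are illustrative, not from the original script):
from cntk import Trainer, learning_rate_schedule, UnitType
from cntk.learners import sgd

lr = learning_rate_schedule(0.01, UnitType.minibatch)  # hypothetical rate
trainer = Trainer(z, (ce, errs), [sgd(z.parameters, lr)])

X, Y = sample(0)  # one-hot minibatch starting at position 0
trainer.train_minibatch({input_sequence: X, label_sequence: Y})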
def sequence_to_sequence_translator(debug_output=False):
    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes)

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(
        shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_from_ground_truth))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH
    decoder_dim = hidden_dim

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    trainer = Trainer(z, ce, errs, [momentum_sgd(z.parameters, lr, m_schedule,
                                                 clipping_threshold_per_sample,
                                                 gradient_clipping_with_truncation)])

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000)
    features_si = mb_source[feature_stream_name]
    labels_si = mb_source[labels_stream_name]

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 30
    if debug_output:
        training_progress_output_freq = training_progress_output_freq // 3

    i = 0
    while True:
        mb = mb_source.next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {raw_input: mb[features_si],
                     raw_labels: mb[labels_si]}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)
        i += 1

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    test_mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000, False)
    features_si = test_mb_source[feature_stream_name]
    labels_si = test_mb_source[labels_stream_name]

    # choose this to be big enough for the longest sentence
    train_minibatch_size = 1024

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_mb_source.next_minibatch(train_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {raw_input: mb[features_si],
                     raw_labels: mb[labels_si]}
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
def train_sequence_to_sequence_translator():
    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    input_dynamic_axes = [Axis('inputAxis'), Axis.default_batch_axis()]
    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes)

    label_dynamic_axes = [Axis('labelAxis'), Axis.default_batch_axis()]
    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_dynamic_axes[0], 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_from_ground_truth))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            # first decoder layer is primed with the encoder's thought vector
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH
    decoder_dim = hidden_dim

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000)
    features_si = mb_source.stream_info(feature_stream_name)
    labels_si = mb_source.stream_info(labels_stream_name)

    # Instantiate the trainer object to drive the model training
    lr = learning_rates_per_sample(0.007)
    momentum_time_constant = 1100
    momentum_per_sample = momentums_per_sample(math.exp(-1.0 / momentum_time_constant))
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(z, ce, errs, [momentum_sgd_learner(z.owner.parameters(), lr,
                                                         momentum_per_sample,
                                                         clipping_threshold_per_sample,
                                                         gradient_clipping_with_truncation)])

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 10

    i = 0
    while True:
        mb = mb_source.get_next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
        arguments = {raw_input: mb[features_si].m_data,
                     raw_labels: mb[labels_si].m_data}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)
        i += 1
def sequence_to_sequence_translator(debug_output=False, run_test=False):
    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes, name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)         # <s>

    is_first_label = sequence.is_first(label_sequence)        # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, decoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    minibatch_size = 72
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr, m_schedule,
                           clipping_threshold_per_sample,
                           gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    valid_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tiny.ctf")

    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    # readers
    randomize_data = True
    if run_test:
        randomize_data = False  # because we want to get an exact error

    train_reader = text_format_minibatch_source(train_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ], randomize=randomize_data)
    features_si_tr = train_reader.stream_info(feature_stream_name)
    labels_si_tr = train_reader.stream_info(labels_stream_name)

    valid_reader = text_format_minibatch_source(valid_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ], randomize=False)
    features_si_va = valid_reader.stream_info(feature_stream_name)
    labels_si_va = valid_reader.stream_info(labels_stream_name)

    # get the vocab for printing output sequences in plaintext
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping"
    vocab_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    vocab = [w.strip() for w in open(vocab_path).readlines()]
    i2w = {i: ch for i, ch in enumerate(vocab)}

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        while i < (epoch + 1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size)
            train_args = {'raw_input': mb_train[features_si_tr],
                          'raw_labels': mb_train[labels_si_tr]}
            trainer.train_minibatch(train_args)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size)
                valid_args = {'raw_input': mb_valid[features_si_va],
                              'raw_labels': mb_valid[labels_si_va]}
                e = ng.eval(valid_args)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs, training_progress_output_freq)
            i += mb_train[labels_si_tr].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    # now setup a test run
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    test_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    test_reader = text_format_minibatch_source(test_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ], 10000, randomize=False)
    features_si_te = test_reader.stream_info(feature_stream_name)
    labels_si_te = test_reader.stream_info(labels_stream_name)

    test_minibatch_size = 1024

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_reader.next_minibatch(test_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {raw_input: mb[features_si_te],
                     raw_labels: mb[labels_si_te]}
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
features=StreamDef(field="S0", shape=input_vocab_dim, is_sparse=True), labels=StreamDef(field="S1", shape=label_vocab_dim, is_sparse=True), ), ), randomize=is_training, max_sweeps=INFINITELY_REPEAT if is_training else 1, ) ######################## # define the model # ######################## # type annotations for the two sequence types; later use InputSequence[Tensor[input_vocab_dim]] # CNTK considers these two different types since they run over different sequence indices. inputAxis = Axis("inputAxis") labelAxis = Axis("labelAxis") InputSequence = SequenceOver[inputAxis] LabelSequence = SequenceOver[labelAxis] # create the s2s model def create_model(): # :: (history*, input*) -> logP(w)* # Embedding: (input*) --> embedded_input* # Right now assumes shared embedding and shared vocab size. embed = Embedding(embedding_dim, name="embed") if use_embedding else identity # Encoder: (input*) --> (h0, c0) # Create multiple layers of LSTMs by passing the output of the i-th layer # to the (i+1)th layer as its input # This is the plain s2s encoder. The attention encoder will keep the entire sequence instead. # Note: We go_backwards for the plain model, but forward for the attention model.
# Train data reader
train_reader = create_reader(train_file, True)

# Validation/Test data reader
valid_reader = create_reader(valid_file, False)

model_dir = "."  # we downloaded our data to the local directory above # TODO check me

# model dimensions
input_vocab_dim = input_vocab_size
label_vocab_dim = label_vocab_size
hidden_dim = 128
num_layers = 1

# Source and target inputs to the model
batch_axis = Axis.default_batch_axis()
input_seq_axis = Axis('inputAxis')
label_seq_axis = Axis('labelAxis')

input_dynamic_axes = [batch_axis, input_seq_axis]
raw_input = input_variable(shape=(input_vocab_dim),
                           dynamic_axes=input_dynamic_axes, name='raw_input')

label_dynamic_axes = [batch_axis, label_seq_axis]
raw_labels = input_variable(shape=(label_vocab_dim),
                            dynamic_axes=label_dynamic_axes, name='raw_labels')

# Instantiate the sequence to sequence translation model
input_sequence = raw_input

# Drop the sentence start token from the label, for decoder training
def create_model():
    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes, name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(raw_labels, 1, 0,
                                    name='label_sequence')  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)       # <s>

    # Setup primer for decoder
    is_first_label = sequence.is_first(label_sequence)      # 1 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    stabilize = Stabilizer()
    encoder_output_h = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_output_h, encoder_output_c) = LSTM_layer(
            encoder_output_h.output, hidden_dim, future_value, future_value)

    # Prepare encoder output to be used in decoder
    thought_vector_h = sequence.first(encoder_output_h)
    thought_vector_c = sequence.first(encoder_output_c)

    thought_vector_broadcast_h = sequence.broadcast_as(thought_vector_h, label_sequence)
    thought_vector_broadcast_c = sequence.broadcast_as(thought_vector_c, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_output_h = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hook_h = past_value
            recurrence_hook_c = past_value
        else:
            recurrence_hook_h = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_h, past_value(operand))
            recurrence_hook_c = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_c, past_value(operand))

        (decoder_output_h, decoder_output_c) = LSTM_layer(
            decoder_output_h.output, hidden_dim, recurrence_hook_h, recurrence_hook_c)

    # Linear output layer
    W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim), init=glorot_uniform())
    B = parameter(shape=(label_vocab_dim), init=0)
    z = plus(B, times(stabilize(decoder_output_h), W))

    return z
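# A hedged usage sketch: training criteria can be built on top of the returned
# network by looking up the named slice node (find_by_name is part of the CNTK
# Function API; this mirrors the criterion setup used by the other seq2seq
# variants above):
model = create_model()
label_sequence = model.find_by_name('label_sequence')
ce = cross_entropy_with_softmax(model, label_sequence)
errs = classification_error(model, label_sequence)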