def implementing_1d_convnet_cntk(): max_features = 10000 # number of words to consider as features max_len = 500 # cut texts after this number of words (among top max_features most common words) x_train, y_train, x_test, y_test = load_data(max_features, max_len) model = build_model_cntk(max_features, max_len) x = cntk.input_variable(shape=(max_len, ), dtype=np.float32) y = cntk.input_variable(shape=(1, ), dtype=np.float32) model.replace_placeholders({model.placeholders[0]: x}) loss_function = cntk.binary_cross_entropy(model.output, y) round_predictions = cntk.round(model.output) equal_elements = cntk.equal(round_predictions, y) accuracy_function = cntk.reduce_mean(equal_elements, axis=0) max_epochs = 10 batch_size = 32 learner = cntk.adam(model.parameters, cntk.learning_parameter_schedule_per_sample(0.0001), cntk.learning_parameter_schedule_per_sample(0.99)) progress_printer = cntk.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs) trainer = cntk.Trainer(model, (loss_function, accuracy_function), [learner], progress_printer) evaluator = cntk.Evaluator(accuracy_function) cntk_train(x, y, x_train, y_train, max_epochs, batch_size, trainer, evaluator)
def learning_word_embeddings_with_the_embedding_layer_cntk():
    """Train a small learned-embedding + dense classifier with CNTK.

    Relies on sibling helpers: load_from_files, cntk_train.
    """
    x_train, y_train, x_test, y_test = load_from_files()
    max_features = 10000  # vocabulary size fed to one_hot
    maxlen = 20           # tokens per example
    embedding_dim = 8
    x = cntk.input_variable(shape=(maxlen, ), dtype=np.float32)
    y = cntk.input_variable(shape=(1, ), dtype=np.float32)
    # Word indices -> sparse one-hot -> learned embedding -> sigmoid head.
    model = cntk.one_hot(x, num_classes=max_features, sparse_output=True)
    model = cntk.layers.Embedding(embedding_dim)(model)
    model = cntk.layers.Dense(1, activation=cntk.sigmoid)(model)
    loss_function = cntk.binary_cross_entropy(model.output, y)
    # Accuracy: rounded prediction equals the label, averaged over the batch.
    round_predictions = cntk.round(model.output)
    equal_elements = cntk.equal(round_predictions, y)
    accuracy_function = cntk.reduce_mean(equal_elements, axis=0)
    max_epochs = 30
    batch_size = 32
    # NOTE(review): the second schedule is passed as adam's momentum argument;
    # presumably cntk.momentum_schedule_per_sample was intended — confirm.
    learner = cntk.adam(model.parameters,
                        cntk.learning_parameter_schedule_per_sample(0.0001),
                        cntk.learning_parameter_schedule_per_sample(0.99))
    progress_printer = cntk.logging.ProgressPrinter(tag='Training',
                                                    num_epochs=max_epochs)
    trainer = cntk.Trainer(model, (loss_function, accuracy_function),
                           [learner], progress_printer)
    evaluator = cntk.Evaluator(accuracy_function)
    cntk_train(x, y, x_train, y_train, max_epochs, batch_size, trainer, evaluator)
def build_graph(noise_shape, image_shape, G_progress_printer, D_progress_printer):
    """Assemble the GAN graph: generator, weight-sharing discriminator clone,
    losses, and one distributed trainer per model.

    Uses the free variable `lr` and sibling helpers generator/discriminator.
    Returns (X_real, X_fake, Z, G_trainer, D_trainer).
    """
    input_dynamic_axes = [C.Axis.default_batch_axis()]
    Z = C.input_variable(noise_shape, dynamic_axes=input_dynamic_axes)
    X_real = C.input_variable(image_shape, dynamic_axes=input_dynamic_axes)
    # Scale pixels from [0, 255] to [-1, 1] to match the generator's output range.
    X_real_scaled = 2*(X_real / 255.0) - 1.0

    # Create the model function for the generator and discriminator models
    X_fake = generator(Z)
    D_real = discriminator(X_real_scaled)
    # Share discriminator weights; only the input is swapped to the fake image.
    D_fake = D_real.clone(
        method = 'share',
        substitutions = {X_real_scaled.output: X_fake.output}
    )

    # Create loss functions and configure optimization algorithms
    G_loss = 1.0 - C.log(D_fake)
    D_loss = -(C.log(D_real) + C.log(1.0 - D_fake))

    G_learner = C.fsadagrad(
        parameters = X_fake.parameters,
        lr = C.learning_parameter_schedule_per_sample(lr),
        momentum = C.momentum_schedule_per_sample(0.9985724484938566)
    )
    D_learner = C.fsadagrad(
        parameters = D_real.parameters,
        lr = C.learning_parameter_schedule_per_sample(lr),
        momentum = C.momentum_schedule_per_sample(0.9985724484938566)
    )
    DistG_learner = C.train.distributed.data_parallel_distributed_learner(G_learner)

    # The following API marks a learner as the metric aggregator, which is used
    # by the trainer to determine the training progress.
    # It is required only when more than one learner is provided to a *single*
    # trainer. In this example, we use two trainers each with a single learner,
    # so it is not required and is set automatically by CNTK for each single
    # learner. However, if you plan to use both learners with a single trainer,
    # then it needs to be called before creating the trainer.
    #DistG_learner.set_as_metric_aggregator()

    DistD_learner = C.train.distributed.data_parallel_distributed_learner(D_learner)

    # Instantiate the trainers
    G_trainer = C.Trainer(
        X_fake,
        (G_loss, None),
        DistG_learner,
        G_progress_printer
    )
    D_trainer = C.Trainer(
        D_real,
        (D_loss, None),
        DistD_learner,
        D_progress_printer
    )

    return X_real, X_fake, Z, G_trainer, D_trainer
def run_cntk():
    """Train a character-level language model with CNTK and generate text.

    Relies on sibling helpers: get_data, build_model_cntk, cntk_train,
    generate_text_cntk.
    """
    text, chars, char_indices, x_train, y_train = get_data(one_hot_encode_features=False)
    alphabet_size = len(chars)
    print('alphabet_size=', alphabet_size)
    model = build_model_cntk(alphabet_size=alphabet_size)
    # Save/reload round-trip — presumably to validate serialization; confirm.
    model_filename = 'ch8-1_cntk.model'
    model.save(model_filename)
    model = None
    model = cntk.load_model(model_filename)
    x = cntk.sequence.input_variable(shape=(), dtype=np.float32)
    y = cntk.input_variable(shape=(), dtype=np.float32)
    model.replace_placeholders({model.placeholders[0]: x})
    # Labels are class indices; expand to one-hot for softmax cross-entropy.
    y_oneHot = cntk.one_hot(y, num_classes=alphabet_size)
    loss_function = cntk.cross_entropy_with_softmax(model.output, y_oneHot)
    # NOTE(review): the second schedule is passed as adam's momentum argument;
    # presumably cntk.momentum_schedule_per_sample was intended — confirm.
    learner = cntk.adam(model.parameters,
                        cntk.learning_parameter_schedule_per_sample(0.001),
                        cntk.learning_parameter_schedule_per_sample(0.9))
    # The loss doubles as the evaluation metric.
    trainer = cntk.Trainer(model, (loss_function, loss_function), [learner],)
    for epoch in range(1, 60):
        print('epoch', epoch)
        cntk_train(x, y, x_train, y_train, max_epochs=32, batch_size=128,
                   trainer=trainer)
        # Overwrite the final model after each outer epoch.
        model_filename = 'final_ch8-1_cntk.model'
        model.save(model_filename)
    generate_text_cntk(char_indices, chars, model, text)
def test_noise_injection_with_checkpointing():
    """Learners with Gaussian noise injection must replay identically after a
    checkpoint restore (the noise RNG state is part of the checkpoint)."""
    from cntk import initializer
    shape = (100,100)
    # Three identical parameters (same init seed) driven by three learners.
    w1 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    w2 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    w3 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    lr=C.learning_parameter_schedule_per_sample(0.5)
    m=C.momentum_schedule(0.99)
    learner1 = C.momentum_sgd([w1], lr, m, gaussian_noise_injection_std_dev=0.5)
    learner2 = C.momentum_sgd([w2], lr, m, gaussian_noise_injection_std_dev=0.5)
    learner3 = C.momentum_sgd([w3], lr, m, gaussian_noise_injection_std_dev=0.5)
    assert np.allclose(w1.value, w2.value) and np.allclose(w1.value, w3.value)
    for i in range(10):
        checkpoint = learner1.create_checkpoint()
        v = np.float32(np.random.rand(100,100))
        learner1.update({w1: v}, 1)
        learner2.update({w2: v}, 1)
        # Same gradient but different injected noise streams — presumably each
        # learner seeds its noise RNG independently, so the weights diverge.
        assert not np.allclose(w1.value, w2.value)
        # Restoring learner1's checkpoint into learner3 replays learner1's
        # noise, so w3 tracks w1 exactly.
        learner3.restore_from_checkpoint(checkpoint)
        learner3.update({w3: v}, 1)
        assert np.allclose(w1.value, w3.value)
def test_learner_logging():
    """Each learning-rate and momentum schedule change must be logged once to
    the attached progress writer, in interleaved order."""
    from cntk import Trainer
    from cntk.logging import ProgressPrinter
    from cntk import cross_entropy_with_softmax, classification_error
    features = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w_init = 1
    w = parameter(shape=(1,), init=w_init)
    z = features * w
    labels = C.input_variable(shape=(1,), name='b')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    writer = TestProgressWriter();
    # One schedule step per epoch (epoch_size=1 sample).
    lr_values = [0.3, 0.2, 0.1, 0]
    m_values = [0.6, 0.7, 0.8]
    learner = C.momentum_sgd(z.parameters,
                             C.learning_parameter_schedule_per_sample(lr_values, epoch_size=1),
                             C.momentum_schedule(m_values, epoch_size=1))
    trainer = Trainer(z, (ce, errs), [learner], writer)

    for i in range(10):
        trainer.train_minibatch({features: [[2.]], labels: [[1.]]})

    # Every lr/momentum change is logged exactly once.
    assert len(writer.log_output) == len(lr_values + m_values)

    # Changes interleave (lr, momentum) per epoch; the final lr 0 has no
    # momentum partner.
    values = [j for i in zip(lr_values,m_values) for j in i] + [0]
    for i in range(len(values)):
        assert (values[i] == writer.log_output[i])
def train_lm(testing=False):
    """Train the RNN language model and return the last measured average CE.

    Uses module-level configuration (paths, hidden_dim, learning_rate,
    num_epochs, sequence_length, sequences_per_batch, ...) and sibling helpers
    create_inputs/create_model/average_cross_entropy/print_progress.
    When `testing` is falsy, the model is saved after every epoch.
    """
    data = DataReader(token_to_id_path, segment_sepparator)

    # Create model nodes for the source and target inputs
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # Create the model. It has three output nodes
    # z: the input to softmax that provides the latent representation of the next token
    # cross_entropy: this is the training criterion
    # error: this is a binary indicator if the model predicts the correct token
    z, cross_entropy, error = create_model(input_sequence, label_sequence, data.vocab_dim, hidden_dim)

    # For measurement we use the (built-in) full softmax.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    # print out some useful training information
    log_number_of_parameters(z) ; print()

    # Run the training loop
    num_trained_samples = 0
    num_trained_samples_since_last_report = 0

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_parameter_schedule_per_sample(learning_rate)
    momentum_schedule = C.momentum_schedule_per_sample(momentum_per_sample)
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
                           gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                           gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (cross_entropy, error), learner)

    last_avg_ce = 0
    for epoch_count in range(num_epochs):
        for features, labels, token_count in data.minibatch_generator(train_file_path, sequence_length, sequences_per_batch):
            arguments = ({input_sequence : features, label_sequence : labels})

            t_start = timeit.default_timer()
            trainer.train_minibatch(arguments)
            t_end = timeit.default_timer()

            samples_per_second = token_count / (t_end - t_start)

            # Print progress report every num_samples_between_progress_report samples
            if num_trained_samples_since_last_report >= num_samples_between_progress_report or num_trained_samples == 0:
                av_ce = average_cross_entropy(full_ce, input_sequence, label_sequence, data)
                print_progress(samples_per_second, av_ce, num_trained_samples, t_start)
                num_trained_samples_since_last_report = 0
                last_avg_ce = av_ce

            num_trained_samples += token_count
            num_trained_samples_since_last_report += token_count

        if not testing:
            # after each epoch save the model
            model_filename = "models/lm_epoch%d.dnn" % epoch_count
            z.save(model_filename)
            print("Saved model to '%s'" % model_filename)

    return last_avg_ce
def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers):
    """Create a distributed trainer for `network`.

    network: dict with 'output', 'ce' (criterion) and 'pe' (metric) nodes.
    epoch_size: samples per epoch, used to advance the schedules.
    num_quantization_bits: gradient quantization for data-parallel training
        (32 means no quantization).
    block_size: block-momentum block size, or None for data-parallel SGD.
    warm_up: samples to train locally before going distributed.
    progress_writers: progress writer(s) passed through to the trainer.

    Raises RuntimeError when block momentum is combined with quantization.
    """
    # Piecewise-constant per-sample schedules, stepped once per epoch.
    lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
    lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size)
    mms = [0]*20 + [0.9983347214509387]*20 + [0.9991670137924583]
    mm_schedule = C.learners.momentum_schedule_per_sample(mms, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # Create learner. Block momentum and quantization are mutually exclusive.
    # (fix: compare against None with `is not` rather than `!=`, per PEP 8)
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    local_learner = C.learners.momentum_sgd(network['output'].parameters,
                                            lr_schedule, mm_schedule,
                                            l2_regularization_weight=l2_reg_weight)

    if block_size is not None:
        parameter_learner = C.train.distributed.block_momentum_distributed_learner(
            local_learner, block_size=block_size)
    else:
        parameter_learner = C.train.distributed.data_parallel_distributed_learner(
            local_learner, num_quantization_bits=num_quantization_bits,
            distributed_after=warm_up)

    # Create trainer
    return C.Trainer(network['output'], (network['ce'], network['pe']),
                     parameter_learner, progress_writers)
def create_distributed_learner(self, mode, config):
    """Create a distributed learner wrapping plain SGD on self.z.

    mode: 'data_parallel', 'block_momentum', or anything else for the plain
    local learner. config: a DataParallelConfig/BlockMomentumConfig, or None
    to use defaults. Returns None when CNTK raises RuntimeError (e.g. the
    build lacks distributed-training support).
    """
    local_learner = C.sgd(self.z.parameters,
                          C.learning_parameter_schedule_per_sample(0.01))
    try:
        if mode == 'data_parallel':
            if config is None:
                config = DataParallelConfig(num_quantization_bits=32,
                                            distributed_after=0)
            learner = C.data_parallel_distributed_learner(
                local_learner,
                num_quantization_bits=config.num_quantization_bits,
                distributed_after=config.distributed_after)
        elif mode == 'block_momentum':
            if config is None:
                # the default config to match data parallel SGD
                config = BlockMomentumConfig(
                    block_momentum_as_time_constant=0,
                    block_learning_rate=1,
                    block_size=NUM_WORKERS,
                    distributed_after=0)
            learner = C.block_momentum_distributed_learner(
                local_learner,
                block_momentum_as_time_constant=config.
                block_momentum_as_time_constant,
                block_learning_rate=config.block_learning_rate,
                block_size=config.block_size,
                distributed_after=config.distributed_after)
        else:
            learner = local_learner
    except RuntimeError:
        learner = None
    return learner
def create_trainer(self):
    """Build a trainer mixing a block-momentum learner (first three
    parameters) with a data-parallel learner (last parameter).

    Leaves self.trainer as None when CNTK raises RuntimeError (e.g. no
    distributed support). Uses the free variable progress_freq.
    """
    try:
        p = self.output.parameters
        # Three of four parameters are learned by block_momentum_distributed_learner.
        bmd_learner = cntk.block_momentum_distributed_learner(
            cntk.momentum_sgd(
                [p[0], p[1], p[2]],
                cntk.learning_parameter_schedule(0.0001),
                cntk.momentum_as_time_constant_schedule(1000)),
            block_size=1000,
            block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)

        # New API to mark which learner is used for metric aggregation.
        bmd_learner.set_as_metric_aggregator()

        # The last parameter is learned by the data_parallel_distributed_learner.
        momentum_schedule = cntk.momentum_schedule_per_sample(
            0.9990913221888589)
        lr_per_sample = cntk.learning_parameter_schedule_per_sample(0.007)
        dpd_learner = cntk.data_parallel_distributed_learner(
            cntk.momentum_sgd([p[3]], lr_per_sample, momentum_schedule, True))

        comm_rank = cntk.distributed.Communicator.rank()
        self.trainer = cntk.Trainer(
            self.output, (self.ce, self.err), [bmd_learner, dpd_learner], [
                cntk.logging.ProgressPrinter(
                    freq=progress_freq, tag="Training", rank=comm_rank)
            ])
    except RuntimeError:
        self.trainer = None
    return
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    """A training session with sweep-based progress frequency must emit one
    summary per 2 sweeps over 4 total sweeps."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    # Use a high learning rate: we don't need convergence, just to pass
    # through all the samples.
    t, feature, label = create_sample_model(device, writer,
                                            lr_per_sample=C.learning_parameter_schedule_per_sample(0.3))
    mbs = mb_source(tmpdir, "training",
                    #max_samples=INFINITELY_REPEAT,
                    max_sweeps = 4)

    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels
    }

    test_dir = str(tmpdir)

    C.training_session(
        trainer=t, mb_source=mbs,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=input_map,
        max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep)
    ).train(device)

    # 4 sweeps of 25 samples = 100 samples
    assert(t.total_number_of_samples_seen == 100)
    # output every 2 epoch sweeps; 4 sweeps in total, at the end 2 outputs are written:
    assert(writer.training_summary_counter == 2)
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    """Over 4 sweeps, a sweep-based progress frequency of 2 yields exactly two
    training summaries and consumes every sample."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    # A large learning rate is fine here: the goal is to consume samples,
    # not to converge.
    trainer, feature_var, label_var = create_sample_model(
        device, writer,
        lr_per_sample=C.learning_parameter_schedule_per_sample(0.3))

    source = mb_source(tmpdir, "training", max_sweeps=4)
    stream_map = {feature_var: source.streams.features,
                  label_var: source.streams.labels}

    test_dir = str(tmpdir)

    session = C.training_session(
        trainer=trainer,
        mb_source=source,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=stream_map,
        max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep))
    session.train(device)

    # 4 sweeps x 25 samples each = 100 samples seen in total.
    assert trainer.total_number_of_samples_seen == 100
    # Summaries fire every 2 sweeps -> 2 summaries over 4 sweeps.
    assert writer.training_summary_counter == 2
def test_htk_deserializers():
    """Smoke test: HTK feature + MLF label deserializers feed an LSTM trainer
    for a few minibatches without crashing."""
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]
    feature_dim = 33
    num_classes = 132
    context = 2  # frames of left/right acoustic context

    os.chdir(data_path)
    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(
        StreamDefs(amazing_features=StreamDef(
            shape=feature_dim, context=(context, context), scp=features_file)))
    ld = HTKMLFDeserializer(
        label_mapping_file,
        StreamDefs(
            awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))
    reader = MinibatchSource([fd, ld])

    # Input covers the stacked context window: (2*context+1) frames.
    features = C.sequence.input_variable(((2 * context + 1) * feature_dim))
    labels = C.sequence.input_variable((num_classes))

    model = Sequential(
        [For(range(3), lambda: Recurrence(LSTM(256))),
         Dense(num_classes)])
    z = model(features)
    ce = C.cross_entropy_with_softmax(z, labels)
    errs = C.classification_error(z, labels)

    learner = C.fsadagrad(
        z.parameters,
        lr=C.learning_parameter_schedule_per_sample(lr, epoch_size=epoch_size),
        momentum=C.momentum_schedule_per_sample(0.9990913221888589),
        gradient_clipping_threshold_per_sample=15,
        gradient_clipping_with_truncation=True)
    progress_printer = C.logging.ProgressPrinter(freq=0)
    trainer = C.Trainer(z, (ce, errs), learner, progress_printer)
    input_map = {
        features: reader.streams.amazing_features,
        labels: reader.streams.awesome_labels
    }

    # just run and verify it doesn't crash
    for i in range(3):
        mb_data = reader.next_minibatch(mbsize, input_map=input_map)
        trainer.train_minibatch(mb_data)
    assert True
    os.chdir(abs_path)
def adjust_lr_callback(index, average_error, cv_num_samples, cv_num_minibatches):
    """Cross-validation callback: halve the learning rate when the relative
    improvement over the previous metric is under 5%.

    Returns False (stop training) once the rate has been halved past
    lr_per_sample / 2**7; True to continue. Uses globals: prev_metric,
    learner, lr_per_sample.
    """
    global prev_metric
    if (prev_metric - average_error) / prev_metric < 0.05: # relative gain must reduce metric by at least 5% rel
        learner.reset_learning_rate(C.learning_parameter_schedule_per_sample(learner.learning_rate() / 2))
        if learner.learning_rate() < lr_per_sample / (2**7-0.1): # we are done after the 6-th LR cut
            print("Learning rate {} too small. Training complete.".format(learner.learning_rate()))
            return False # means we are done
        print("Improvement of metric from {:.3f} to {:.3f} insufficient. Halving learning rate to {}.".format(prev_metric, average_error, learner.learning_rate()))
    prev_metric = average_error
    return True # means continue
def create_sample_model(device, writer=None,
                        lr_per_sample=C.learning_parameter_schedule_per_sample([0.3, 0.2, 0.1, 0.0])):
    """Build the trivial sequence model used by the session tests.

    Returns a (trainer, feature_input, label_input) triple.
    """
    feature_seq = sequence.input_variable(shape=(input_dim,))
    label_seq = sequence.input_variable(shape=(input_dim,))
    weights = parameter(shape=(input_dim,), init=10, device=device)
    z = plus(feature_seq, reduce_sum(weights), name='z')
    criterion = cross_entropy_with_softmax(z, label_seq)
    metric = classification_error(z, label_seq)
    sgd_learner = C.sgd(z.parameters, lr_per_sample)
    trainer = C.Trainer(z, (criterion, metric), [sgd_learner], writer)
    return (trainer, feature_seq, label_seq)
def create_learner(model):
    """Create the optimizer: momentum SGD with per-sample gradient clipping."""
    optim = "momentum_sgd"
    base_lr = 0.001
    lr_schedule = C.learning_parameter_schedule_per_sample(base_lr)
    mom_schedule = C.momentum_schedule_per_sample(0.9990913221888589)
    if optim == 'momentum_sgd':
        clip_threshold = 5.0
        clip_with_truncation = True
        return C.momentum_sgd(
            model.parameters,
            lr_schedule,
            mom_schedule,
            gradient_clipping_threshold_per_sample=clip_threshold,
            gradient_clipping_with_truncation=clip_with_truncation)
def train_mse_cntk(x, y, model, train_gen, val_gen, epochs, val_steps):
    """Train `model` with squared-error loss via the sibling fit_generator
    helper, then plot the resulting history with plot_results."""
    loss_function = cntk.squared_error(model, y)
    # MSE doubles as the evaluation metric.
    accuracy_function = loss_function
    # NOTE(review): the second schedule is passed as adam's momentum argument;
    # presumably cntk.momentum_schedule_per_sample was intended — confirm.
    learner = cntk.adam(model.parameters,
                        cntk.learning_parameter_schedule_per_sample(0.001),
                        cntk.learning_parameter_schedule_per_sample(0.9))
    trainer = cntk.Trainer(model, (loss_function, accuracy_function), [learner])
    evaluator = cntk.Evaluator(accuracy_function)
    history = fit_generator(x, y, model=model, trainer=trainer,
                            evaluator=evaluator, train_gen=train_gen,
                            steps_per_epoch=500, epochs=epochs,
                            val_gen=val_gen, validation_steps=val_steps)
    plot_results(history)
def create_sample_model(device, writer=None,
                        lr_per_sample=C.learning_parameter_schedule_per_sample(
                            [0.3, 0.2, 0.1, 0.0])):
    """Tiny sequence model plus trainer for the training-session tests.

    Returns (trainer, feature_input, label_input).
    """
    xs = sequence.input_variable(shape=(input_dim, ))
    ys = sequence.input_variable(shape=(input_dim, ))
    bias = parameter(shape=(input_dim, ), init=10, device=device)
    model = plus(xs, reduce_sum(bias), name='z')
    loss = cross_entropy_with_softmax(model, ys)
    metric = classification_error(model, ys)
    trainer = C.Trainer(model, (loss, metric),
                        [C.sgd(model.parameters, lr_per_sample)], writer)
    return (trainer, xs, ys)
def use_glove_word_embeddings_cntk(preload_weights=False):
    """Text classifier with an Embedding layer, optionally initialized from a
    precomputed GloVe matrix.

    Relies on sibling helpers: from_raw_text_to_word_embeddings,
    compute_embedding_matrix, cntk_train, and the Constants namespace.
    """
    tokenizer, x_train, y_train, x_val, y_val = from_raw_text_to_word_embeddings()
    x = cntk.input_variable(shape=(Constants.maxlen, ), dtype=np.float32)
    y = cntk.input_variable(shape=(1, ), dtype=np.float32)
    model = cntk.one_hot(x, num_classes=Constants.max_words, sparse_output=True)
    if preload_weights is True:
        # Initialize the embedding from the precomputed GloVe matrix; one of
        # its two axes must match the configured embedding dimension.
        embedding_matrix = compute_embedding_matrix(tokenizer)
        assert (Constants.embedding_dim == embedding_matrix.shape[0]) or (Constants.embedding_dim == embedding_matrix.shape[1])
        model = cntk.layers.Embedding(weights=embedding_matrix)(model)
    else:
        model = cntk.layers.Embedding(Constants.embedding_dim)(model)
    model = cntk.layers.Dense(32, activation=cntk.relu)(model)
    model = cntk.layers.Dense(1, activation=cntk.sigmoid)(model)
    loss_function = cntk.binary_cross_entropy(model.output, y)
    # Accuracy: rounded prediction equals the label, averaged over the batch.
    round_predictions = cntk.round(model.output)
    equal_elements = cntk.equal(round_predictions, y)
    accuracy_function = cntk.reduce_mean(equal_elements, axis=0)
    max_epochs = 10
    batch_size = 32
    # NOTE(review): the second schedule is passed as adam's momentum argument;
    # presumably cntk.momentum_schedule_per_sample was intended — confirm.
    learner = cntk.adam(model.parameters,
                        cntk.learning_parameter_schedule_per_sample(0.0001),
                        cntk.learning_parameter_schedule_per_sample(0.99))
    progress_printer = cntk.logging.ProgressPrinter(tag='Training',
                                                    num_epochs=max_epochs)
    trainer = cntk.Trainer(model, (loss_function, accuracy_function),
                           [learner], progress_printer)
    evaluator = cntk.Evaluator(accuracy_function)
    cntk_train(x, y, x_train, y_train, max_epochs, batch_size, trainer, evaluator)
def create_trainer(self):
    """Build a data-parallel distributed trainer around plain SGD.

    Leaves self.trainer as None when CNTK raises RuntimeError (e.g. no
    distributed support). Uses the free variable progress_freq.
    """
    try:
        sgd_learner = cntk.sgd(self.output.parameters,
                               cntk.learning_parameter_schedule_per_sample(0.007))
        dist_learner = cntk.data_parallel_distributed_learner(sgd_learner)
        rank = cntk.distributed.Communicator.rank()
        printer = cntk.logging.ProgressPrinter(freq=progress_freq,
                                               tag="Training", rank=rank)
        self.trainer = cntk.Trainer(self.output, (self.ce, self.err),
                                    [dist_learner], [printer])
    except RuntimeError:
        self.trainer = None
    return
def create_trainer(use_sparse, device):
    """Build a single times-layer trainer over (possibly sparse) sequences.

    Returns (input_var, label_var, weight_param, trainer).
    NOTE(review): the parameter is placed on the global `dev`, not the
    `device` argument — confirm that is intentional.
    """
    inp = C.sequence.input_variable(shape=input_shape, is_sparse=use_sparse,
                                    name='input')
    weight = C.parameter(init=w_init, device=dev)
    projection = times(inp, weight)

    lbl = C.sequence.input_variable(shape=label_shape, is_sparse=use_sparse,
                                    name='label')
    loss = cross_entropy_with_softmax(projection, lbl, axis=-1)

    sgd_learner = C.sgd(projection.parameters,
                        lr=C.learning_parameter_schedule_per_sample(0.7))
    trainer = C.Trainer(projection, (loss, None), sgd_learner)
    return (inp, lbl, weight, trainer)
def train_sequence_classifier():
    """Train the LSTM sequence classifier on the CTF training set.

    Returns (evaluation_average, loss_average) from the last minibatch.
    """
    input_dim = 2000
    hidden_dim = 25
    embedding_dim = 50
    num_classes = 5

    # Input variables denoting the features and label data
    features = C.sequence.input_variable(shape=input_dim, is_sparse=True)
    label = C.input_variable(num_classes)

    # Instantiate the sequence classification model
    classifier_output = lstm_sequence_classifier(features, num_classes,
                                                 embedding_dim, hidden_dim)

    ce = C.cross_entropy_with_softmax(classifier_output, label)
    pe = C.classification_error(classifier_output, label)

    rel_path = r"../../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    reader = create_reader(path, True, input_dim, num_classes)

    input_map = {
        features: reader.streams.features,
        label: reader.streams.labels
    }

    lr_per_sample = C.learning_parameter_schedule_per_sample(0.1)

    # Instantiate the trainer object to drive the model training
    progress_printer = C.logging.ProgressPrinter(0)
    trainer = C.Trainer(classifier_output, (ce, pe),
                        C.sgd(classifier_output.parameters, lr=lr_per_sample),
                        progress_printer)

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 200

    for i in range(251):
        mb = reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)

    evaluation_average = copy.copy(
        trainer.previous_minibatch_evaluation_average)
    loss_average = copy.copy(trainer.previous_minibatch_loss_average)

    return evaluation_average, loss_average
def test_htk_deserializers():
    """Smoke test: run a few minibatches from the HTK feature/MLF
    deserializers through an LSTM trainer and verify nothing crashes."""
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]
    feature_dim, num_classes, context = 33, 132, 2

    os.chdir(data_path)

    feature_deserializer = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim,
                                   context=(context, context),
                                   scp="glob_0000.scp")))
    label_deserializer = HTKMLFDeserializer("state.list", StreamDefs(
        awesome_labels=StreamDef(shape=num_classes, mlf="glob_0000.mlf")))
    reader = MinibatchSource([feature_deserializer, label_deserializer])

    # Input width covers the stacked context window of (2*context+1) frames.
    features = C.sequence.input_variable(((2 * context + 1) * feature_dim))
    labels = C.sequence.input_variable((num_classes))

    network = Sequential([For(range(3), lambda: Recurrence(LSTM(256))),
                          Dense(num_classes)])
    z = network(features)
    criterion = C.cross_entropy_with_softmax(z, labels)
    metric = C.classification_error(z, labels)

    learner = C.fsadagrad(
        z.parameters,
        lr=C.learning_parameter_schedule_per_sample(lr, epoch_size=epoch_size),
        momentum=C.momentum_schedule_per_sample(0.9990913221888589),
        gradient_clipping_threshold_per_sample=15,
        gradient_clipping_with_truncation=True)
    trainer = C.Trainer(z, (criterion, metric), learner,
                        C.logging.ProgressPrinter(freq=0))

    stream_map = {features: reader.streams.amazing_features,
                  labels: reader.streams.awesome_labels}

    # Just run and verify it doesn't crash.
    for _ in range(3):
        minibatch = reader.next_minibatch(mbsize, input_map=stream_map)
        trainer.train_minibatch(minibatch)
    assert True
    os.chdir(abs_path)
def create_distributed_learner(self, mode, config):
    """Wrap plain SGD on self.z in the distributed learner selected by `mode`.

    Falls back to the local learner for unknown modes; returns None when
    CNTK raises RuntimeError (e.g. distributed training unavailable).
    """
    base = C.sgd(self.z.parameters,
                 C.learning_parameter_schedule_per_sample(0.01))
    try:
        if mode == 'data_parallel':
            if config is None:
                config = DataParallelConfig(num_quantization_bits=32,
                                            distributed_after=0)
            wrapped = C.data_parallel_distributed_learner(
                base,
                num_quantization_bits=config.num_quantization_bits,
                distributed_after=config.distributed_after)
        elif mode == 'block_momentum':
            if config is None:
                # Defaults chosen to match data-parallel SGD.
                config = BlockMomentumConfig(
                    block_momentum_as_time_constant=0,
                    block_learning_rate=1,
                    block_size=NUM_WORKERS,
                    distributed_after=0)
            wrapped = C.block_momentum_distributed_learner(
                base,
                block_momentum_as_time_constant=config.block_momentum_as_time_constant,
                block_learning_rate=config.block_learning_rate,
                block_size=config.block_size,
                distributed_after=config.distributed_after)
        else:
            wrapped = base
    except RuntimeError:
        wrapped = None
    return wrapped
def test_usermbsource_training(tmpdir, with_checkpoint_impl):
    """A UserMinibatchSource drives a training session end-to-end; with the
    checkpointing implementation, the CV source's restore is called once."""
    input_dim = 1000
    num_output_classes = 5

    mbs = MyDataSource(input_dim, num_output_classes)
    # Using this for testing the UserMinibatchSource checkpointing
    if with_checkpoint_impl:
        MBS_CV_CLASS = MyDataSourceWithCheckpoint
    else:
        MBS_CV_CLASS = MyDataSource

    mbs_cv = MBS_CV_CLASS(input_dim, num_output_classes)

    from cntk import sequence, parameter, plus, cross_entropy_with_softmax, \
        classification_error, learning_parameter_schedule_per_sample, sgd, Trainer, \
        training_session, times

    feature = sequence.input_variable(shape=(input_dim,))
    label = C.input_variable(shape=(num_output_classes,))
    p = parameter(shape=(input_dim, num_output_classes), init=10)
    z = times(sequence.reduce_sum(feature), p, name='z')
    ce = cross_entropy_with_softmax(z, label)
    errs = classification_error(z, label)

    # having a large learning rate to prevent the model from converging
    # earlier where not all the intended samples are fed
    # note that training session can end earlier if there is no updates
    lr_per_sample = learning_parameter_schedule_per_sample(0.3)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])
    input_map = {
        feature: mbs.fsi,
        label: mbs.lsi
    }
    session = training_session(
        trainer=trainer, mb_source=mbs,
        model_inputs_to_streams=input_map,
        mb_size=4, max_samples=20,
        cv_config = C.CrossValidationConfig(minibatch_source=mbs_cv,
                                            max_samples=10,
                                            minibatch_size=2)
    )
    session.train()

    assert trainer.total_number_of_samples_seen == 20
    if with_checkpoint_impl:
        assert mbs_cv._restore_from_checkpoint_calls == 1
def test_restore_constants(tmpdir):
    """Model save/restore and trainer checkpointing must round-trip both
    constants and parameters of a BatchNormalization function."""
    C.device.try_set_default_device(C.device.cpu())

    def _setvalue(x, v):
        # Broadcast v over the current value; scalars need an explicit array.
        x.value = 0 * x.value + v if len(x.shape) > 0 else np.array(
            v, dtype=np.float32)

    def _setall(f, v):
        # Set every constant and parameter of f to v.
        for x in f.constants + f.parameters:
            _setvalue(x, v)

    def _checkall(f, v):
        # Assert every constant and parameter of f equals v.
        for x in f.constants + f.parameters:
            assert (x.value == v).all()

    x = C.input_variable(10)
    f = C.layers.BatchNormalization()(x)
    trainer = C.Trainer(
        f, C.reduce_sum(f),
        C.sgd(f.parameters, C.learning_parameter_schedule_per_sample(0.1)))
    model_filename = str(tmpdir / 'function.out')
    checkpoint_filename = str(tmpdir / 'checkpoint.out')

    _setall(f, 1)
    f.save(model_filename)  # model file captures value 1
    _checkall(f, 1)

    _setall(f, 2)
    trainer.save_checkpoint(checkpoint_filename)  # checkpoint captures value 2
    _checkall(f, 2)

    _setall(f, 3)
    _checkall(f, 3)
    trainer.restore_from_checkpoint(checkpoint_filename)
    _checkall(f, 2)  # checkpoint restore brings back 2

    f2 = C.Function.load(model_filename)
    _checkall(f2, 1)  # loaded model carries the saved 1

    _setall(f, 4)
    _checkall(f, 4)
    f.restore(model_filename)
    _checkall(f, 1)  # in-place restore brings back 1

    _setall(f2, 5)
    _checkall(f2, 5)  # f2 is independent of f
def create_trainer():
    """Build the trainer over the free variables dec, y, criterion,
    epoch_size and num_epochs, masking out positions whose label argmax is 0."""
    # Zero out decoder output and loss where the label argmax clips to 0.
    masked_dec = dec * C.ops.clip(C.ops.argmax(y), 0, 1)
    loss, label_error = criterion(masked_dec, y)
    loss *= C.ops.clip(C.ops.argmax(y), 0, 1)

    # Stepped per-sample learning-rate schedule over the epochs.
    lr_schedule = C.learning_parameter_schedule_per_sample(
        [1e-3] * 2 + [5e-4] * 2 + [1e-4], epoch_size=int(epoch_size))
    mom_schedule = C.momentum_as_time_constant_schedule(1000)
    learner = C.adam(parameters=dec.parameters,
                     lr=lr_schedule,
                     momentum=mom_schedule,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True)

    printer = C.logging.ProgressPrinter(tag='Training', num_epochs=num_epochs)
    trainer = C.Trainer(dec, (loss, label_error), learner, printer)
    C.logging.log_number_of_parameters(dec)
    return trainer
def train_sequence_classifier():
    """Train the LSTM sequence classifier on the CTF training set and return
    (evaluation_average, loss_average) from the final minibatch."""
    input_dim, hidden_dim, embedding_dim, num_classes = 2000, 25, 50, 5

    # Feature/label variables: sparse word sequences, dense class labels.
    features = C.sequence.input_variable(shape=input_dim, is_sparse=True)
    label = C.input_variable(num_classes)

    classifier_output = lstm_sequence_classifier(features, num_classes,
                                                 embedding_dim, hidden_dim)
    criterion = C.cross_entropy_with_softmax(classifier_output, label)
    metric = C.classification_error(classifier_output, label)

    rel_path = r"../../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    reader = create_reader(path, True, input_dim, num_classes)
    stream_map = {features: reader.streams.features,
                  label: reader.streams.labels}

    trainer = C.Trainer(
        classifier_output,
        (criterion, metric),
        C.sgd(classifier_output.parameters,
              lr=C.learning_parameter_schedule_per_sample(0.1)),
        C.logging.ProgressPrinter(0))

    minibatch_size = 200
    for _ in range(251):
        trainer.train_minibatch(
            reader.next_minibatch(minibatch_size, input_map=stream_map))

    return (copy.copy(trainer.previous_minibatch_evaluation_average),
            copy.copy(trainer.previous_minibatch_loss_average))
def adjust_lr_callback(index, average_error, cv_num_samples, cv_num_minibatches):
    """Cross-validation callback that halves the LR when the metric improves
    by less than 5% relative; returns False to stop once the LR has been
    halved past lr_per_sample / 2**7. Uses globals prev_metric, learner,
    lr_per_sample."""
    global prev_metric
    relative_gain = (prev_metric - average_error) / prev_metric
    if relative_gain < 0.05:  # require at least a 5% relative improvement
        halved = C.learning_parameter_schedule_per_sample(
            learner.learning_rate() / 2)
        learner.reset_learning_rate(halved)
        if learner.learning_rate() < lr_per_sample / (2**7 - 0.1):
            # Done after the final LR cut.
            print("Learning rate {} too small. Training complete.".format(
                learner.learning_rate()))
            return False
        print(
            "Improvement of metric from {:.3f} to {:.3f} insufficient. Halving learning rate to {}."
            .format(prev_metric, average_error, learner.learning_rate()))
    prev_metric = average_error
    return True
def test_restore_constants(tmpdir):
    """Model save/restore and trainer checkpointing must round-trip both
    constants and parameters of a BatchNormalization function."""
    C.device.try_set_default_device(C.device.cpu())

    def _setvalue(x, v):
        # Broadcast v over the current value; scalars need an explicit array.
        x.value = 0 * x.value + v if len(x.shape)> 0 else np.array(v, dtype=np.float32)

    def _setall(f, v):
        # Set every constant and parameter of f to v.
        for x in f.constants + f.parameters:
            _setvalue(x, v)

    def _checkall(f, v):
        # Assert every constant and parameter of f equals v.
        for x in f.constants + f.parameters:
            assert (x.value == v).all()

    x = C.input_variable(10)
    f = C.layers.BatchNormalization()(x)
    trainer = C.Trainer(f, C.reduce_sum(f), C.sgd(f.parameters, C.learning_parameter_schedule_per_sample(0.1)))

    model_filename = str(tmpdir / 'function.out')
    checkpoint_filename = str(tmpdir / 'checkpoint.out')

    _setall(f, 1)
    f.save(model_filename)  # model file captures value 1
    _checkall(f, 1)

    _setall(f, 2)
    trainer.save_checkpoint(checkpoint_filename)  # checkpoint captures value 2
    _checkall(f, 2)

    _setall(f, 3)
    _checkall(f, 3)
    trainer.restore_from_checkpoint(checkpoint_filename)
    _checkall(f, 2)  # checkpoint restore brings back 2

    f2 = C.Function.load(model_filename)
    _checkall(f2, 1)  # loaded model carries the saved 1

    _setall(f, 4)
    _checkall(f, 4)
    f.restore(model_filename)
    _checkall(f, 1)  # in-place restore brings back 1

    _setall(f2, 5)
    _checkall(f2, 5)  # f2 is independent of f
def Loss(self):
    """Build the PPO clipped-surrogate loss graph and its trainer.

    Returns `(loss, chunk, trainer)` where `chunk` maps names to the
    intermediate loss nodes for logging/inspection.

    NOTE(review): assumes `self.policy.evaluate()` returns CNTK graph nodes
    `(logprobs, state_value, dist_entropy)` and that `self.eps_clip`,
    `self.lr`, `self.betas` are set — confirm against the enclosing class.
    """
    # Evaluating old actions and values:
    logprobs, state_value, dist_entropy = self.policy.evaluate()

    # Finding the ratio (pi_theta / pi_theta__old):
    # (importance sampling)
    c_old_logprobs = C.input_variable(logprobs.shape, name='old_log_probs')
    ratios = C.exp(logprobs - C.stop_gradient(c_old_logprobs))

    c_rewards = C.input_variable(1, name='rewards')
    # stop_gradient: advantages are treated as constants w.r.t. the critic.
    advantages = c_rewards - C.stop_gradient(state_value)

    # Finding Surrogate Loss:
    surr1 = ratios * advantages
    surr2 = C.clip(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
    neglog_loss = -C.element_min(surr1, surr2)
    entropy_loss = -0.01 * dist_entropy  # entropy bonus, weight 0.01
    actor_loss = C.reduce_mean(neglog_loss + entropy_loss)
    critic_loss = 0.5 * C.reduce_mean(C.square(state_value - c_rewards))
    loss = actor_loss + critic_loss

    # Expose the pieces of the loss for diagnostics.
    chunk = {
        'neglog_loss': neglog_loss,
        'entropy_loss': entropy_loss,
        'actor_loss': actor_loss,
        'critic_loss': critic_loss
    }

    trainer = C.Trainer(
        loss, (loss, None),
        C.adam(loss.parameters,
               C.learning_parameter_schedule_per_sample(self.lr),
               C.momentum_schedule_per_sample(self.betas[0]),
               variance_momentum=C.momentum_schedule_per_sample(self.betas[1])))
    # trainer = C.Trainer(loss, (loss, None), C.adam(loss.parameters, C.learning_parameter_schedule(10), C.momentum_schedule(0.9), variance_momentum=C.momentum_schedule(0.999)))  # higher learning rate

    return loss, chunk, trainer
def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers):
    """Create a distributed trainer for the network.

    Chooses block-momentum distribution when `block_size` is given, otherwise
    data-parallel distribution (optionally with gradient quantization).

    Args:
        network: dict with 'output', 'ce' (loss) and 'pe' (metric) nodes.
        epoch_size: samples per epoch, used to index the LR/momentum schedules.
        num_quantization_bits: gradient quantization bits (32 = none).
        block_size: block-momentum block size, or None for data-parallel SGD.
        warm_up: number of samples trained non-distributed before switching.
        progress_writers: progress writer(s) passed to the Trainer.

    Raises:
        RuntimeError: if block momentum is combined with quantization.
    """
    # Set learning parameters (piecewise-constant per-sample schedules).
    lr_per_sample = [0.0015625] * 20 + [0.00046875] * 20 + [
        0.00015625
    ] * 20 + [0.000046875] * 10 + [0.000015625]
    lr_schedule = C.learning_parameter_schedule_per_sample(
        lr_per_sample, epoch_size=epoch_size)
    mms = [0] * 20 + [0.9983347214509387] * 20 + [0.9991670137924583]
    mm_schedule = C.learners.momentum_schedule_per_sample(
        mms, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # Block momentum and quantization are mutually exclusive.
    # (Fixed `!= None` -> `is not None`, PEP 8 identity comparison.)
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError(
            "Block momentum cannot be used with quantization, please remove quantized_bits option."
        )

    local_learner = C.learners.momentum_sgd(
        network['output'].parameters,
        lr_schedule,
        mm_schedule,
        l2_regularization_weight=l2_reg_weight)

    if block_size is not None:
        parameter_learner = C.train.distributed.block_momentum_distributed_learner(
            local_learner, block_size=block_size)
    else:
        parameter_learner = C.train.distributed.data_parallel_distributed_learner(
            local_learner,
            num_quantization_bits=num_quantization_bits,
            distributed_after=warm_up)

    # Create trainer
    return C.Trainer(network['output'], (network['ce'], network['pe']),
                     parameter_learner, progress_writers)
def create_trainer(use_sparse, device):
    """Build a minimal recurrent projection network and its SGD trainer.

    Args:
        use_sparse: whether the input/label variables are sparse.
        device: CNTK device on which the weight parameters are created.

    Returns:
        (input_var, label_var, w_input, w_hidden, trainer).

    NOTE(review): relies on module-level `input_shape`, `label_shape`,
    `w_init_i`, `w_init_h`, `times`, `reshape`, `cross_entropy_with_softmax`.
    """
    a = C.sequence.input_variable(shape=input_shape,
                                  is_sparse=use_sparse,
                                  name='input')
    # FIX: the original referenced an undefined name `dev` (NameError);
    # the parameter is called `device`.
    w_i = C.parameter(init=w_init_i, device=device)
    a_projection = times(a, w_i)

    p_o = C.placeholder()
    h = C.sequence.past_value(p_o)
    w_h = C.parameter(init=w_init_h, device=device)
    h_projection = times(h, w_h)

    z = a_projection + h_projection
    # Close the recurrence: substitute z itself for the placeholder p_o.
    z = z.replace_placeholder(z)
    z = reshape(z, label_shape)

    l = C.sequence.input_variable(shape=label_shape,
                                  is_sparse=use_sparse,
                                  name='label')
    loss = cross_entropy_with_softmax(z, l, axis=-1)

    trainer = C.Trainer(
        z, (loss, None),
        C.sgd(z.parameters, lr=C.learning_parameter_schedule_per_sample(0.7)))
    return (a, l, w_i, w_h, trainer)
def create_trainer(self):
    """Create a multi-learner distributed trainer, storing it on self.trainer.

    Splits the model's four parameters across two data-parallel distributed
    learners (exercising the multi-learner code path) and marks the first as
    the metric aggregator. On RuntimeError (e.g. distributed environment not
    available), self.trainer is set to None instead of raising.

    NOTE(review): assumes self.output has exactly four parameters and that
    module-level `progress_freq` is defined — confirm against callers.
    """
    try:
        lr_per_sample = cntk.learning_parameter_schedule_per_sample(0.007)
        p = self.output.parameters
        # Three of four parameters are learned by first data_parallel_distributed_learner.
        learner1 = cntk.data_parallel_distributed_learner(
            cntk.sgd([p[0], p[1], p[2]], lr_per_sample))

        # New API to mark which learner is to use for metric aggregation.
        learner1.set_as_metric_aggregator()

        # The last parameter is learned by another data_parallel_distributed_learner.
        learner2 = cntk.data_parallel_distributed_learner(
            cntk.sgd([p[3]], lr_per_sample))

        comm_rank = cntk.distributed.Communicator.rank()
        self.trainer = cntk.Trainer(
            self.output, (self.ce, self.err), [learner1, learner2], [
                cntk.logging.ProgressPrinter(
                    freq=progress_freq, tag="Training", rank=comm_rank)
            ])
    except RuntimeError:
        # Deliberate best-effort: signal failure via a None trainer.
        self.trainer = None
    return
def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_epochs=80):
    """Train and evaluate a NIN-style (1x1-conv) CIFAR-10 network.

    Trains for `max_epochs` epochs of `epoch_size` samples, saving the model
    each epoch, then evaluates on 10000 test samples and returns the average
    test error rate (fraction).

    NOTE(review): relies on module-level `num_channels`, `image_height`,
    `image_width`, `num_classes`, `model_path`, `GlobalAveragePooling`,
    `_cntk_py`, `os` and `C` (cntk) — confirm they are defined in this file.
    """
    _cntk_py.set_computation_network_trace_level(0)

    # Input variables denoting the features and label data
    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    # input normalization: 1/256 = 0.00390625
    scaled_input = C.element_times(C.constant(0.00390625), input_var)
    # NOTE(review): `f` is built and signed but never applied below — looks
    # like leftover experimentation; verify before removing.
    f = GlobalAveragePooling()
    f.update_signature((1, 8, 8))

    with C.layers.default_options():
        z = C.layers.Sequential([
            # Stage 1: 3x3 conv -> 1x1 conv -> pool -> dropout
            C.layers.For(range(1), lambda: [
                C.layers.Convolution2D((3, 3), 32, strides=(1, 1), pad=True),
                C.layers.Activation(activation=C.relu),
                C.layers.Convolution2D((1, 1), 64, strides=(1, 1), pad=False),
                C.layers.MaxPooling((3, 3), strides=(2, 2), pad=True),
                C.layers.Dropout(0.5)
            ]),
            # Stage 2: wider conv stack
            C.layers.For(range(1), lambda: [
                C.layers.Convolution2D((3, 3), 128, strides=(1, 1), pad=True),
                C.layers.Activation(activation=C.relu),
                C.layers.Convolution2D((1, 1), 160, strides=(1, 1), pad=False),
                C.layers.Activation(activation=C.relu),
                C.layers.MaxPooling((3, 3), strides=(2, 2), pad=True),
                C.layers.Dropout(0.5)
            ]),
            # Stage 3: 1x1 convs down to 10 maps + global 8x8 average pool
            C.layers.For(range(1), lambda: [
                C.layers.Convolution2D((3, 3), 192, strides=(1, 1), pad=True),
                C.layers.Activation(activation=C.relu),
                C.layers.Convolution2D((1, 1), 256, strides=(1, 1), pad=False),
                C.layers.Activation(activation=C.relu),
                C.layers.Convolution2D((1, 1), 10, strides=(1, 1), pad=False),
                C.layers.Activation(activation=C.relu),
                C.layers.AveragePooling((8, 8), strides=(1, 1), pad=False)
            ])
        ])(scaled_input)

    print('z.shape', z.shape)
    z = C.flatten(z)  # flatten pooled maps to the class-score vector
    print('z.shape now', z.shape)

    # loss and metric
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    # training config
    minibatch_size = 64

    # Set learning parameters
    # learning rate
    lr_per_sample = [0.0015625] * 20 + [0.00046875] * 20 + [
        0.00015625
    ] * 20 + [0.000046875] * 10 + [0.000015625]
    lr_schedule = C.learning_parameter_schedule_per_sample(
        lr_per_sample, epoch_size=epoch_size)
    # momentum
    mms = [0] * 20 + [0.9983347214509387] * 20 + [0.9991670137924583]
    mm_schedule = C.learners.momentum_schedule_per_sample(
        mms, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # trainer object
    learner = C.learners.momentum_sgd(z.parameters,
                                      lr_schedule,
                                      mm_schedule,
                                      unit_gain=True,
                                      l2_regularization_weight=l2_reg_weight)
    progress_printer = C.logging.ProgressPrinter(tag='Training',
                                                 num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z)
    print()

    # perform model training
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(
                min(minibatch_size, epoch_size - sample_count),
                input_map=input_map)  # fetch minibatch.
            trainer.train_minibatch(data)  # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far
        trainer.summarize_training_progress()

        # save model (fixed name, so each epoch overwrites the last snapshot)
        modelname = "NIN_test1.dnn"
        z.save(os.path.join(model_path, modelname))

    ### Evaluation action
    epoch_size = 10000
    minibatch_size = 16

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)
        data = reader_test.next_minibatch(current_minibatch,
                                          input_map=input_map)
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        sample_count += current_minibatch
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom,
        metric_denom))
    print("")

    return metric_numer / metric_denom
def train_and_test(reader_train, reader_test, model_func):
    """Train and evaluate an MNIST autoencoder built by `model_func`.

    Returns `(model, train_error, test_error)` where the errors are percentage
    classification-error averages over the training run and the test pass.

    NOTE(review): relies on module-level `input_dim`, `isFast`, `C` (cntk)
    and `np` (numpy) — confirm they are defined earlier in this file.
    """
    ###############################################
    # Training the model
    ###############################################

    # Instantiate the input and the label variables
    input = C.input_variable(input_dim)
    label = C.input_variable(input_dim)

    # Create the model function
    model = model_func(input)

    # The labels for this network is same as the input MNIST image.
    # Note: Inside the model we are scaling the input to 0-1 range
    # Hence we rescale the label to the same range
    # We show how one can use their custom loss function
    # loss = -(y* log(p)+ (1-y) * log(1-p)) where p = model output and y = target
    # We have normalized the input between 0-1. Hence we scale the target to same range
    target = label / 255.0
    loss = -(target * C.log(model) + (1 - target) * C.log(1 - model))
    label_error = C.classification_error(model, target)

    # training config
    epoch_size = 30000  # 30000 samples is half the dataset size
    minibatch_size = 64
    num_sweeps_to_train_with = 5 if isFast else 100
    num_samples_per_sweep = 60000
    num_minibatches_to_train = (num_samples_per_sweep *
                                num_sweeps_to_train_with) // minibatch_size

    # Instantiate the trainer object to drive the model training
    lr_per_sample = [0.00003]
    lr_schedule = C.learning_parameter_schedule_per_sample(
        lr_per_sample, epoch_size)

    # Momentum which is applied on every minibatch_size = 64 samples
    momentum_schedule = C.momentum_schedule(0.9126265014311797, minibatch_size)

    # We use a variant of the Adam optimizer which is known to work well on this dataset
    # Feel free to try other optimizers from
    # https://www.cntk.ai/pythondocs/cntk.learner.html#module-cntk.learner
    learner = C.fsadagrad(model.parameters,
                          lr=lr_schedule,
                          momentum=momentum_schedule)

    # Instantiate the trainer
    progress_printer = C.logging.ProgressPrinter(0)
    trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)

    # Map the data streams to the input and labels.
    # Note: for autoencoders input == label
    input_map = {
        input: reader_train.streams.features,
        label: reader_train.streams.features
    }

    aggregate_metric = 0
    for i in range(num_minibatches_to_train):
        # Read a mini batch from the training data file
        data = reader_train.next_minibatch(minibatch_size, input_map=input_map)

        # Run the trainer on and perform model training
        trainer.train_minibatch(data)
        samples = trainer.previous_minibatch_sample_count
        aggregate_metric += trainer.previous_minibatch_evaluation_average * samples

    train_error = (aggregate_metric * 100.0) / (
        trainer.total_number_of_samples_seen)
    print("Average training error: {0:0.2f}%".format(train_error))

    #############################################################################
    # Testing the model
    # Note: we use a test file reader to read data different from a training data
    #############################################################################

    # Test data for trained model
    test_minibatch_size = 32
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size

    # Test error metric calculation
    metric_numer = 0
    metric_denom = 0

    test_input_map = {
        input: reader_test.streams.features,
        label: reader_test.streams.features
    }

    for i in range(0, int(num_minibatches_to_test)):
        # We are loading test data in batches specified by test_minibatch_size
        # Each data point in the minibatch is a MNIST digit image of 784 dimensions
        # with one pixel per dimension that we will encode / decode with the
        # trained model.
        data = reader_test.next_minibatch(test_minibatch_size,
                                          input_map=test_input_map)

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        eval_error = trainer.test_minibatch(data)  # minibatch data to be trained with
        metric_numer += np.abs(eval_error * test_minibatch_size)
        metric_denom += test_minibatch_size

    # Average of evaluation errors of all test minibatches
    test_error = (metric_numer * 100.0) / (metric_denom)
    print("Average test error: {0:0.2f}%".format(test_error))

    return model, train_error, test_error
G_fake = dcgan_generator(z) D_real = dcgan_discriminator(x_real) D_fake = D_real.clone(method="share", substitutions={x_real.output: G_fake.output}) # # loss function # G_loss = -C.log(D_fake) D_loss = -(C.log(D_real) + C.log(1.0 - D_fake)) # # optimizer # G_learner = C.adam(G_fake.parameters, lr=C.learning_parameter_schedule_per_sample(1e-4), momentum=0.5, gradient_clipping_threshold_per_sample=minibatch_size, gradient_clipping_with_truncation=True) D_learner = C.adam(D_real.parameters, lr=C.learning_parameter_schedule_per_sample(1e-4), momentum=0.5, gradient_clipping_threshold_per_sample=minibatch_size, gradient_clipping_with_truncation=True) G_progress_printer = C.logging.ProgressPrinter(tag="Generator") D_progress_printer = C.logging.ProgressPrinter(tag="Discriminator") if not os.path.exists("./dcgan_image"): os.mkdir("./dcgan_image") G_trainer = C.Trainer(G_fake, (G_loss, None), [G_learner],
def convnet_mnist(debug_output=False, epoch_size=60000, minibatch_size=64, max_epochs=40):
    """Train and evaluate a small CNN on 64x64 single-channel images (4 classes).

    Trains for `max_epochs` epochs (saving a model snapshot per epoch), then
    evaluates on 5000 test samples and returns the average error rate.

    NOTE(review): despite the name, this variant reads 64x64/4-class data from
    custom 'Data-train/test-...' files, not MNIST proper. Relies on
    module-level `data_path`, `model_path`, `create_reader`, `os`, `np`, `C`.
    `debug_output` is currently unused.
    """
    image_height = 64
    image_width = 64
    num_channels = 1
    input_dim = image_height * image_width * num_channels
    num_output_classes = 4

    # Input variables denoting the features and label data
    input_var = C.ops.input_variable((num_channels, image_height, image_width), np.float32)
    label_var = C.ops.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    # (scale pixel values by 1/256 = 0.00390625 into [0, 1))
    scaled_input = C.ops.element_times(C.ops.constant(0.00390625), input_var)

    with C.layers.default_options(activation=C.ops.relu, pad=False):
        conv1 = C.layers.Convolution2D((5,5), 32, pad=True)(scaled_input)
        pool1 = C.layers.MaxPooling((3,3), (2,2))(conv1)
        conv2 = C.layers.Convolution2D((3,3), 48)(pool1)
        pool2 = C.layers.MaxPooling((3,3), (2,2))(conv2)
        conv3 = C.layers.Convolution2D((3,3), 64)(pool2)
        f4 = C.layers.Dense(96)(conv3)
        drop4 = C.layers.Dropout(0.5)(f4)
        z = C.layers.Dense(num_output_classes, activation=None)(drop4)

    ce = C.losses.cross_entropy_with_softmax(z, label_var)
    pe = C.metrics.classification_error(z, label_var)

    reader_train = create_reader(os.path.join(data_path, 'Data-train-15000_20180720_070615.txt'), True, input_dim, num_output_classes)

    # Set learning parameters
    lr_per_sample = [0.001]*10 + [0.0005]*10 + [0.0001]
    lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size)
    mms = [0]*5 + [0.9990239141819757]
    mm_schedule = C.learners.momentum_schedule_per_sample(mms, epoch_size=epoch_size)

    # Instantiate the trainer object to drive the model training
    learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var : reader_train.streams.features,
        label_var : reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z) ; print()

    # Get minibatches of images to train with and perform model training
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map)  # fetch minibatch.
            trainer.train_minibatch(data)  # update model with it
            sample_count += data[label_var].num_samples  # count samples processed so far
        trainer.summarize_training_progress()
        # One snapshot per epoch.
        z.save(os.path.join(model_path, "ConvNet_MNIST_{}.dnn".format(epoch)))

    # Load test data
    reader_test = create_reader(os.path.join(data_path, 'Data-test-5000_20180720_070615.txt'), False, input_dim, num_output_classes)

    input_map = {
        input_var : reader_test.streams.features,
        label_var : reader_test.streams.labels
    }

    # Test data for trained model
    epoch_size = 5000
    minibatch_size = 250

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)

        # Fetch next test min batch.
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)

        # minibatch data to be tested with
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch

        # Keep track of the number of samples processed so far.
        sample_count += data[label_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom
def convnet_mnist(debug_output=False, epoch_size=60000, minibatch_size=64, max_epochs=40):
    """Train and evaluate a small CNN on MNIST (28x28, 10 classes).

    Trains for `max_epochs` epochs (saving a model snapshot per epoch), then
    evaluates on the 10000-sample test set and returns the average error rate.

    NOTE(review): relies on module-level `data_path`, `model_path`,
    `create_reader`, `os`, `np` and `C` (cntk). `debug_output` is unused.
    """
    image_height = 28
    image_width = 28
    num_channels = 1
    input_dim = image_height * image_width * num_channels
    num_output_classes = 10

    # Input variables denoting the features and label data
    input_var = C.ops.input_variable((num_channels, image_height, image_width), np.float32)
    label_var = C.ops.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    # (scale pixel values by 1/256 = 0.00390625 into [0, 1))
    scaled_input = C.ops.element_times(C.ops.constant(0.00390625), input_var)

    with C.layers.default_options(activation=C.ops.relu, pad=False):
        conv1 = C.layers.Convolution2D((5,5), 32, pad=True)(scaled_input)
        pool1 = C.layers.MaxPooling((3,3), (2,2))(conv1)
        conv2 = C.layers.Convolution2D((3,3), 48)(pool1)
        pool2 = C.layers.MaxPooling((3,3), (2,2))(conv2)
        conv3 = C.layers.Convolution2D((3,3), 64)(pool2)
        f4 = C.layers.Dense(96)(conv3)
        drop4 = C.layers.Dropout(0.5)(f4)
        z = C.layers.Dense(num_output_classes, activation=None)(drop4)

    ce = C.losses.cross_entropy_with_softmax(z, label_var)
    pe = C.metrics.classification_error(z, label_var)

    reader_train = create_reader(os.path.join(data_path, 'Train-28x28_cntk_text.txt'), True, input_dim, num_output_classes)

    # Set learning parameters
    lr_per_sample = [0.001]*10 + [0.0005]*10 + [0.0001]
    lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size)
    mms = [0]*5 + [0.9990239141819757]
    mm_schedule = C.learners.momentum_schedule_per_sample(mms, epoch_size=epoch_size)

    # Instantiate the trainer object to drive the model training
    learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var : reader_train.streams.features,
        label_var : reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z) ; print()

    # Get minibatches of images to train with and perform model training
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map)  # fetch minibatch.
            trainer.train_minibatch(data)  # update model with it
            sample_count += data[label_var].num_samples  # count samples processed so far
        trainer.summarize_training_progress()
        # One snapshot per epoch.
        z.save(os.path.join(model_path, "ConvNet_MNIST_{}.dnn".format(epoch)))

    # Load test data
    reader_test = create_reader(os.path.join(data_path, 'Test-28x28_cntk_text.txt'), False, input_dim, num_output_classes)

    input_map = {
        input_var : reader_test.streams.features,
        label_var : reader_test.streams.labels
    }

    # Test data for trained model
    epoch_size = 10000
    minibatch_size = 1024

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)

        # Fetch next test min batch.
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)

        # minibatch data to be tested with
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch

        # Keep track of the number of samples processed so far.
        sample_count += data[label_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom
def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_epochs=80):
    """Train and evaluate a CIFAR-10 convnet with Local Response Normalization.

    Trains for `max_epochs` epochs (saving a model per epoch), evaluates on
    10000 test samples, and returns the average test error rate (fraction).

    NOTE(review): relies on module-level `num_channels`, `image_height`,
    `image_width`, `num_classes`, `model_path`, `LocalResponseNormalization`,
    `_cntk_py`, `os` and `C` (cntk) — confirm they are defined in this file.
    """
    _cntk_py.set_computation_network_trace_level(0)

    # Input variables denoting the features and label data
    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    # apply model to input (scale pixel values by 1/256 = 0.00390625)
    scaled_input = C.element_times(C.constant(0.00390625), input_var)

    with C.layers.default_options(activation=C.relu, pad=True):
        z = C.layers.Sequential([
            # Two conv-conv-LRN-pool stages
            C.layers.For(range(2), lambda: [
                C.layers.Convolution2D((3, 3), 64),
                C.layers.Convolution2D((3, 3), 64),
                LocalResponseNormalization(1.0, 4, 0.001, 0.75),
                C.layers.MaxPooling((3, 3), (2, 2))
            ]),
            # Two dense+dropout stages (256 then 128 units)
            C.layers.For(range(2), lambda i: [C.layers.Dense([256, 128][i]),
                                              C.layers.Dropout(0.5)]),
            C.layers.Dense(num_classes, activation=None)
        ])(scaled_input)

    # loss and metric
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    # training config
    minibatch_size = 64

    # Set learning parameters
    lr_per_sample = [0.0015625] * 20 + [0.00046875] * 20 + [
        0.00015625
    ] * 20 + [0.000046875] * 10 + [0.000015625]
    lr_schedule = C.learning_parameter_schedule_per_sample(
        lr_per_sample, epoch_size=epoch_size)
    mms = [0] * 20 + [0.9983347214509387] * 20 + [0.9991670137924583]
    mm_schedule = C.learners.momentum_schedule_per_sample(
        mms, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # trainer object
    learner = C.learners.momentum_sgd(z.parameters,
                                      lr_schedule,
                                      mm_schedule,
                                      unit_gain=True,
                                      l2_regularization_weight=l2_reg_weight)
    progress_printer = C.logging.ProgressPrinter(tag='Training',
                                                 num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z)
    print()

    # perform model training
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(
                min(minibatch_size, epoch_size - sample_count),
                input_map=input_map)  # fetch minibatch.
            trainer.train_minibatch(data)  # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far
        trainer.summarize_training_progress()
        z.save(
            os.path.join(model_path,
                         "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))

    ### Evaluation action
    epoch_size = 10000
    minibatch_size = 16

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)

        # Fetch next test min batch.
        data = reader_test.next_minibatch(current_minibatch,
                                          input_map=input_map)

        # minibatch data to be tested with
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch

        # Keep track of the number of samples processed so far.
        sample_count += data[label_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom,
        metric_denom))
    print("")

    return metric_numer / metric_denom
def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
    """Train and evaluate a C3D-style 3D convnet on UCF11 video clips.

    Reader objects supply frame geometry, sequence length and label count;
    returns the average test error rate (fraction).

    NOTE(review): relies on module-level `set_computation_network_trace_level`,
    `ProgressPrinter`, `log_number_of_parameters`, `np` and `C` (cntk).
    """
    # Replace 0 with 1 to get detailed log.
    set_computation_network_trace_level(0)

    # These values must match for both train and test reader.
    image_height = train_reader.height
    image_width = train_reader.width
    num_channels = train_reader.channel_count
    sequence_length = train_reader.sequence_length
    num_output_classes = train_reader.label_count

    # Input variables denoting the features and label data
    input_var = C.input_variable(
        (num_channels, sequence_length, image_height, image_width), np.float32)
    label_var = C.input_variable(num_output_classes, np.float32)

    # Instantiate simple 3D Convolution network inspired by VGG network
    # and http://vlg.cs.dartmouth.edu/c3d/c3d_video.pdf
    with C.default_options(activation=C.relu):
        z = C.layers.Sequential([
            C.layers.Convolution3D((3,3,3), 64, pad=True),
            # Pool only spatially in the first stage (keep temporal extent).
            C.layers.MaxPooling((1,2,2), (1,2,2)),
            C.layers.For(range(3), lambda i: [
                C.layers.Convolution3D((3,3,3), [96, 128, 128][i], pad=True),
                C.layers.Convolution3D((3,3,3), [96, 128, 128][i], pad=True),
                C.layers.MaxPooling((2,2,2), (2,2,2))
            ]),
            C.layers.For(range(2), lambda : [
                C.layers.Dense(1024),
                C.layers.Dropout(0.5)
            ]),
            C.layers.Dense(num_output_classes, activation=None)
        ])(input_var)

    # loss and classification error.
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    # training config
    train_epoch_size = train_reader.size()
    train_minibatch_size = 2

    # Set learning parameters
    lr_per_sample = [0.01]*10+[0.001]*10+[0.0001]
    lr_schedule = C.learning_parameter_schedule_per_sample(
        lr_per_sample, epoch_size=train_epoch_size)
    momentum_per_sample = 0.9997558891748972
    mm_schedule = C.momentum_schedule_per_sample([momentum_per_sample])

    # Instantiate the trainer object to drive the model training
    learner = C.momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    log_number_of_parameters(z) ; print()

    # Get minibatches of images to train with and perform model training
    for epoch in range(max_epochs):  # loop over epochs
        train_reader.reset()
        while train_reader.has_more():
            videos, labels, current_minibatch = train_reader.next_minibatch(
                train_minibatch_size)
            trainer.train_minibatch({input_var : videos, label_var : labels})
        trainer.summarize_training_progress()

    # Test data for trained model
    epoch_size = test_reader.size()
    test_minibatch_size = 2

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    test_reader.reset()
    while test_reader.has_more():
        videos, labels, current_minibatch = test_reader.next_minibatch(
            test_minibatch_size)
        # minibatch data to be tested with
        metric_numer += trainer.test_minibatch(
            {input_var : videos, label_var : labels}) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of minibatches processed so far.
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom
def test_learner_empy_parameters_list():
    """An SGD learner constructed with an empty parameter list must raise ValueError."""
    schedule = C.learning_parameter_schedule_per_sample(0.1)
    with pytest.raises(ValueError):
        C.sgd([], schedule)
def test_sweep_based_schedule(tmpdir, device_id):
    """Verify that a learning-rate schedule keyed by sweeps advances exactly
    at data-sweep boundaries of a non-randomized minibatch source.

    Two sequences make up one sweep; after each sweep is consumed the learner
    must step to the next rate in [0.3, 0.2, 0.1, 0.0].
    """
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk import cross_entropy_with_softmax, classification_error, plus, reduce_sum, sequence
    from cntk import Trainer

    input_dim = 69

    # Two sequences (ids 0 and 2) in CTF format; one sweep = both sequences.
    ctf_data = '''\
0 |S0 3:1 |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    # randomize=False so sweep boundaries are deterministic.
    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features = StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels = StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )), randomize=False)

    in1 = sequence.input_variable(shape=(input_dim,))
    labels = sequence.input_variable(shape=(input_dim,))
    p = parameter(shape=(input_dim,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    lr_per_sample = C.learning_parameter_schedule_per_sample([0.3, 0.2, 0.1, 0.0])
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])

    input_map = {
        in1 : mbs.streams.features,
        labels : mbs.streams.labels
    }

    # fetch minibatch (first sequence)
    data = mbs.next_minibatch(1, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.3

    # fetch minibatch (second sequence, sweep ends at this point)
    data = mbs.next_minibatch(1, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.2

    # fetch minibatch (both sequences -- entire sweep in one go)
    data = mbs.next_minibatch(9, input_map=input_map)
    trainer.train_minibatch(data)
    assert learner.learning_rate() == 0.1

    # fetch minibatch (multiple sweeps)
    data = mbs.next_minibatch(30, input_map=input_map)
    trainer.train_minibatch(data, outputs=[z.output])
    assert learner.learning_rate() == 0.0
def train_and_test(reader_train, reader_test, model_func):
    """Train and evaluate an MNIST autoencoder built by `model_func`.

    Returns `(model, train_error, test_error)` where the errors are percentage
    averages over the training run and the test pass.

    Fixes vs. original: locals `input`/`label` no longer shadow builtins,
    dead local `test_result = 0` removed, and the minibatch count uses
    integer division instead of float division + int().

    NOTE(review): relies on module-level `input_dim`, `isFast`, `C` (cntk)
    and `np` (numpy) — confirm they are defined earlier in this file.
    """
    ###############################
    # Training the model
    ###############################
    input_var = C.input_variable(input_dim)   # renamed from `input` (shadowed builtin)
    label_var = C.input_variable(input_dim)   # renamed from `label` for symmetry
    model = model_func(input_var)

    # Autoencoder target: the raw image rescaled to [0, 1].
    target = label_var / 255.0
    # Custom binary cross-entropy: -(y*log(p) + (1-y)*log(1-p))
    loss = -(target * C.log(model) + (1 - target) * C.log(1 - model))
    label_error = C.classification_error(model, target)

    epoch_size = 30000
    minibatch_size = 64
    num_sweeps_to_train_with = 5 if isFast else 100
    num_samples_per_sweep = 60000
    num_minibatches_to_train = (num_samples_per_sweep *
                                num_sweeps_to_train_with) // minibatch_size

    lr_per_sample = [3e-4]
    lr_schedule = C.learning_parameter_schedule_per_sample(
        lr_per_sample, epoch_size)
    momentum_schedule = C.momentum_schedule(0.9126265014311797, minibatch_size)
    learner = C.fsadagrad(model.parameters,
                          lr=lr_schedule,
                          momentum=momentum_schedule)
    progress_printer = C.logging.ProgressPrinter(0)
    trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)

    # For an autoencoder the features stream feeds both input and label.
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.features
    }

    aggregate_metric = 0
    for _ in range(num_minibatches_to_train):
        data = reader_train.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(data)
        samples = trainer.previous_minibatch_sample_count
        aggregate_metric += trainer.previous_minibatch_evaluation_average * samples

    train_error = (aggregate_metric * 100) / (
        trainer.total_number_of_samples_seen)
    print("Average training error: {0:0.2f}%".format(train_error))

    #############################################################################
    # Testing the model
    # Note: we use a test file reader to read data different from a training data
    #############################################################################
    test_minibatch_size = 32
    num_samples = 10000
    # FIX: integer division; the original used `/` and wrapped in int() later.
    num_minibatches_to_test = num_samples // test_minibatch_size

    # Test error metric calculation
    metric_numer = 0
    metric_denom = 0
    test_input_map = {
        input_var: reader_test.streams.features,
        label_var: reader_test.streams.features
    }

    for _ in range(num_minibatches_to_test):
        data = reader_test.next_minibatch(test_minibatch_size,
                                          input_map=test_input_map)
        eval_error = trainer.test_minibatch(data)
        metric_numer += np.abs(eval_error * test_minibatch_size)
        metric_denom += test_minibatch_size

    test_error = (metric_numer * 100) / (metric_denom)
    print("Average test error: {0:0.2f}%".format(test_error))

    return model, train_error, test_error
def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_epochs = 80):
    """Train a CIFAR-10 ConvNet with local response normalization on
    augmented data, then evaluate it on the test reader.

    Parameters
    ----------
    reader_train, reader_test : minibatch sources with `features`/`labels` streams.
    epoch_size : number of training samples per epoch.
    max_epochs : number of training epochs.

    Returns
    -------
    Test error rate (fraction of misclassified test samples).
    """
    _cntk_py.set_computation_network_trace_level(0)

    # Input variables denoting the features and label data
    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    # apply model to input; 0.00390625 = 1/256 scales pixels to ~[0, 1]
    scaled_input = C.element_times(C.constant(0.00390625), input_var)

    with C.layers.default_options (activation=C.relu, pad=True):
        z = C.layers.Sequential([
            C.layers.For(range(2), lambda : [
                C.layers.Convolution2D((3,3), 64),
                C.layers.Convolution2D((3,3), 64),
                LocalResponseNormalization (1.0, 4, 0.001, 0.75),
                C.layers.MaxPooling((3,3), (2,2))
            ]),
            C.layers.For(range(2), lambda i: [
                C.layers.Dense([256,128][i]),
                C.layers.Dropout(0.5)
            ]),
            C.layers.Dense(num_classes, activation=None)
        ])(scaled_input)

    # loss and metric
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    # training config
    minibatch_size = 64

    # Set learning parameters (per-sample schedules, stepped by epoch)
    lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
    lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=epoch_size)
    mms = [0]*20 + [0.9983347214509387]*20 + [0.9991670137924583]
    mm_schedule = C.learners.momentum_schedule_per_sample(mms, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # trainer object
    learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
                                      unit_gain = True, l2_regularization_weight = l2_reg_weight)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z) ; print()

    # perform model training
    for epoch in range(max_epochs):       # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(min(minibatch_size, epoch_size-sample_count), input_map=input_map) # fetch minibatch.
            trainer.train_minibatch(data)                                   # update model with it
            sample_count += trainer.previous_minibatch_sample_count         # count samples processed so far
        trainer.summarize_training_progress()
        z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))

    ### Evaluation action
    epoch_size = 10000
    minibatch_size = 16

    # BUGFIX: the evaluation loop previously reused the *training* input map,
    # whose stream descriptors belong to reader_train. Rebind the network
    # inputs to reader_test's streams for the test pass.
    input_map = {
        input_var: reader_test.streams.features,
        label_var: reader_test.streams.labels
    }

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)
        # Fetch next test min batch.
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
        # minibatch data to be trained with
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far.
        sample_count += data[label_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom
results = re.findall("Completed successfully.", str_out) if len(results) != 2: print(str_out) assert False if __name__=='__main__': in1 = C.input_variable(shape=1) labels = C.input_variable(shape=1) p1 = parameter(shape=1) p2 = parameter(shape=1) n = plus(in1, p1, name='n') z = plus(n, p2, name='z') ce = squared_error(z, labels) momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589) lr_per_sample = C.learning_parameter_schedule_per_sample(0.007) dist_learners = [ C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p1], lr_per_sample, momentum_schedule, True)), C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p2], lr_per_sample, momentum_schedule, True)) ] trainer = C.Trainer(z, ce, dist_learners) in1_value = [[1]] label_value = [[0]] arguments = {in1: in1_value, labels: label_value} z_output = z.output def check_samples(learners, expected_number_of_samples): for learner in learners: if learner.total_number_of_samples_seen != expected_number_of_samples: print("Completed with exception.")
def convnet_mnist(max_epochs, output_dir, data_dir, debug_output=False, epoch_size=60000, minibatch_size=64):
    """Creates and trains a feedforward classification model for MNIST images."""
    # MNIST geometry: 28x28 single-channel images, 10 digit classes.
    image_height, image_width, num_channels = 28, 28, 1
    input_dim = image_height * image_width * num_channels
    num_output_classes = 10

    # Network inputs for features and labels.
    input_var = C.ops.input_variable((num_channels, image_height, image_width), np.float32)
    label_var = C.ops.input_variable(num_output_classes, np.float32)

    # Scale raw pixels by 1/256 (= 0.00390625), then stack three convolution
    # stages and a dense classifier head.
    scaled_input = C.ops.element_times(C.ops.constant(0.00390625), input_var)
    with C.layers.default_options(activation=C.ops.relu, pad=False):
        h = C.layers.Convolution2D((5, 5), 32, pad=True)(scaled_input)
        h = C.layers.MaxPooling((3, 3), (2, 2))(h)
        h = C.layers.Convolution2D((3, 3), 48)(h)
        h = C.layers.MaxPooling((3, 3), (2, 2))(h)
        h = C.layers.Convolution2D((3, 3), 64)(h)
        h = C.layers.Dense(96)(h)
        h = C.layers.Dropout(0.5)(h)
        z = C.layers.Dense(num_output_classes, activation=None)(h)

    # Training criterion and evaluation metric.
    ce = C.losses.cross_entropy_with_softmax(z, label_var)
    pe = C.metrics.classification_error(z, label_var)

    # Data sources: the train reader supplies max_epochs sweeps of samples,
    # the test reader sweeps its data exactly once.
    reader_train = create_reader(os.path.join(data_dir, 'Train-28x28_cntk_text.txt'), True,
                                 input_dim, num_output_classes, max_epochs * epoch_size)
    reader_test = create_reader(os.path.join(data_dir, 'Test-28x28_cntk_text.txt'), False,
                                input_dim, num_output_classes, C.io.FULL_DATA_SWEEP)

    # Per-sample learning-rate and momentum schedules, stepped by epoch.
    lr_schedule = C.learning_parameter_schedule_per_sample(
        [0.001] * 10 + [0.0005] * 10 + [0.0001], epoch_size=epoch_size)
    mm_schedule = C.learners.momentum_schedule_per_sample(
        [0] * 5 + [0.9990239141819757], epoch_size=epoch_size)

    # Wrap a local momentum-SGD learner for data-parallel distributed training.
    local_learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    progress_printer = C.logging.ProgressPrinter(
        tag='Training',
        rank=C.train.distributed.Communicator.rank(),
        num_epochs=max_epochs,
    )
    learner = C.train.distributed.data_parallel_distributed_learner(local_learner)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # Bind reader streams to the network inputs.
    input_map_train = {input_var: reader_train.streams.features,
                       label_var: reader_train.streams.labels}
    input_map_test = {input_var: reader_test.streams.features,
                      label_var: reader_test.streams.labels}

    C.logging.log_number_of_parameters(z)
    print()

    # Drive training (with per-epoch checkpointing) followed by the test pass.
    session = C.train.training_session(
        trainer=trainer,
        mb_source=reader_train,
        model_inputs_to_streams=input_map_train,
        mb_size=minibatch_size,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig(frequency=epoch_size,
                                           filename=os.path.join(output_dir, "ConvNet_MNIST")),
        test_config=TestConfig(reader_test, minibatch_size=minibatch_size,
                               model_inputs_to_streams=input_map_test)
    )
    session.train()
    return
def test_lattice_deserializer(device_id):
    """End-to-end test of the lattice deserializer: trains a small LSTM
    acoustic model on the AN4 corpus with a lattice-based sequence criterion
    and checks the final trainer statistics against known-good values."""
    # Lattice training requires a GPU device.
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    try_set_default_device(cntk_device(device_id))

    # Locate the external speech test data (AN4 corpus).
    data_dir = ''
    if 'CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY' in os.environ:
        data_dir = os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY']
    else:
        print('CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY environment variable is not defined')
    print(data_dir)
    data_dir = os.path.join(data_dir, "Speech", "AN4Corpus", "v0")
    # NOTE: subsequent relative paths (e.g. "GlobalStats") resolve against
    # this directory.
    os.chdir(data_dir)

    # Sequence inputs: acoustic features, frame labels, and the lattice on
    # its own dynamic axis.
    feature_dimension = 33
    feature = C.sequence.input_variable(feature_dimension)
    label_dimension = 133
    label = C.sequence.input_variable(label_dimension)
    axis_lattice = C.Axis.new_unique_dynamic_axis('lattice_axis')
    lattice = C.sequence.input_variable(1, sequence_axis=axis_lattice)

    train_feature_filepath = os.path.join(data_dir,"glob_0000.scp")
    train_label_filepath = os.path.join(data_dir,"glob_0000.mlf")
    train_lattice_index_path = os.path.join(data_dir,"latticeIndex.txt")
    mapping_filepath = os.path.join(data_dir,"state.list")

    # One deserializer per stream: HTK features, HTK MLF labels, lattices.
    train_feature_stream = C.io.HTKFeatureDeserializer(
    C.io.StreamDefs(speech_feature = C.io.StreamDef(shape = feature_dimension, scp = train_feature_filepath)))
    train_label_stream = C.io.HTKMLFDeserializer(
    mapping_filepath, C.io.StreamDefs(speech_label = C.io.StreamDef(shape = label_dimension, mlf = train_label_filepath)), True)
    train_lattice_stream = C.io.LatticeDeserializer(train_lattice_index_path,C.io.StreamDefs(speech_lattice = C.io.StreamDef()))
    # frame_mode=False: read whole utterances (sequences), not single frames.
    train_data_reader = C.io.MinibatchSource([train_feature_stream, train_label_stream, train_lattice_stream], frame_mode = False)
    train_input_map = {feature: train_data_reader.streams.speech_feature, label: train_data_reader.streams.speech_label, lattice: train_data_reader.streams.speech_lattice}

    # Per-dimension mean / inverse stddev for input normalization.
    # NOTE(review): dtype=float reads 64-bit values -- assumes the stats
    # files were written as float64; confirm against the data generator.
    feature_mean = np.fromfile(os.path.join("GlobalStats", "mean.363"), dtype=float, count=feature_dimension)
    feature_inverse_stddev = np.fromfile(os.path.join("GlobalStats", "var.363"), dtype=float, count=feature_dimension)
    feature_normalized = (feature - feature_mean) * feature_inverse_stddev

    # Acoustic model: 3 stacked LSTM(1024) recurrences + dense output layer.
    with C.default_options(activation=C.sigmoid):
        z = C.layers.Sequential([
            C.layers.For(range(3), lambda: C.layers.Recurrence(C.layers.LSTM(1024))),
            C.layers.Dense(label_dimension)
        ])(feature_normalized)

    mbsize = 1024
    mbs_per_epoch = 10
    max_epochs = 2

    symListPath = os.path.join(data_dir,"CY2SCH010061231_1369712653.numden.lats.symlist")
    phonePath = os.path.join(data_dir,"model.overalltying")
    stateListPath = os.path.join(data_dir,"state.list")
    transProbPath = os.path.join(data_dir,"model.transprob")

    # Lattice-based sequence training criterion plus a frame classification
    # error metric.
    criteria = C.lattice_sequence_with_softmax(label, z, z, lattice, symListPath, phonePath, stateListPath, transProbPath)
    err = C.classification_error(label,z)

    lr = C.learning_parameter_schedule_per_sample([(3, .01), (1,.001)])
    mm = C.momentum_schedule([(1000, 0.9), (0, 0.99)], mbsize)
    learner = C.momentum_sgd(z.parameters, lr, mm)
    trainer = C.Trainer(z, (criteria, err), learner)

    C.logging.log_number_of_parameters(z)
    progress_printer = C.logging.progress_print.ProgressPrinter(tag='Training', num_epochs = max_epochs)

    # Fixed-size training run: 2 epochs x 10 minibatches.
    for epoch in range(max_epochs):
        for mb in range(mbs_per_epoch):
            minibatch = train_data_reader.next_minibatch(mbsize, input_map = train_input_map)
            trainer.train_minibatch(minibatch)
            progress_printer.update_with_trainer(trainer, with_metric = True)
        progress_printer.epoch_summary(with_metric = True)

    # Regression checks against known-good values for this data and config.
    assert np.allclose(trainer.previous_minibatch_evaluation_average, 0.15064, atol=TOLERANCE_ABSOLUTE)
    assert np.allclose(trainer.previous_minibatch_loss_average, 0.035923, atol=TOLERANCE_ABSOLUTE)
    assert (trainer.previous_minibatch_sample_count == 218)
    assert (trainer.total_number_of_samples_seen == 5750)
    print("Completed successfully.")
def train(train_reader, valid_reader, vocab, i2w, s2smodel, max_epochs, epoch_size):
    """Train the sequence-to-sequence model and save a model file per run.

    Periodically decodes a validation example with a greedy decoder so
    progress can be inspected visually during training.
    """
    model_train = create_model_train(s2smodel)
    criterion = create_criterion_function(model_train)

    # also wire in a greedy decoder so that we can properly log progress on a validation example
    # This is not used for the actual training process.
    model_greedy = create_model_test(s2smodel)

    # Instantiate the trainer object to drive the model training
    minibatch_size = 72
    # Attention models train with a smaller learning rate.
    lr = 0.001 if use_attention else 0.005
    learner = C.fsadagrad(model_train.parameters,
                          #apply the learning rate as if it is a minibatch of size 1
                          lr = C.learning_parameter_schedule_per_sample([lr]*2+[lr/2]*3+[lr/4], epoch_size),
                          momentum = C.momentum_schedule(0.9366416204111472, minibatch_size=minibatch_size),
                          gradient_clipping_threshold_per_sample=2.3,
                          gradient_clipping_with_truncation=True)
    trainer = C.Trainer(None, criterion, learner)

    # records
    total_samples = 0
    mbs = 0
    eval_freq = 100  # decode a validation example every 100 minibatches

    # print out some useful training information
    C.logging.log_number_of_parameters(model_train) ; print()
    progress_printer = C.logging.ProgressPrinter(freq=30, tag='Training')

    # a hack to allow us to print sparse vectors
    sparse_to_dense = create_sparse_to_dense(input_vocab_dim)

    for epoch in range(max_epochs):
        # Epoch boundaries are defined by cumulative sample count, not by
        # reader sweeps.
        while total_samples < (epoch+1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size)

            # do the training
            trainer.train_minibatch({criterion.arguments[0]: mb_train[train_reader.streams.features],
                                     criterion.arguments[1]: mb_train[train_reader.streams.labels]})

            progress_printer.update_with_trainer(trainer, with_metric=True) # log progress

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % eval_freq == 0:
                mb_valid = valid_reader.next_minibatch(1)

                # run an eval on the decoder output model (i.e. don't use the groundtruth)
                e = model_greedy(mb_valid[valid_reader.streams.features])
                print(format_sequences(sparse_to_dense(mb_valid[valid_reader.streams.features]), i2w))
                print("->")
                print(format_sequences(e, i2w))

                # visualizing attention window
                if use_attention:
                    debug_attention(model_greedy, mb_valid[valid_reader.streams.features])

            total_samples += mb_train[train_reader.streams.labels].num_samples
            mbs += 1

        # log a summary of the stats for the epoch
        progress_printer.epoch_summary(with_metric=True)

    # done: save the final model
    # NOTE: `epoch` here is the last loop index, so the file is named after
    # the final epoch (max_epochs - 1).
    model_path = "model_%d.cmf" % epoch
    print("Saving final model to '%s'" % model_path)
    s2smodel.save(model_path)
    print("%d epochs complete." % max_epochs)