def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, profiling=False):
    """Train the network in a CNTK training session with checkpointing and cross-validation.

    Args:
        network: dict exposing the model's 'feature' and 'label' input variables.
        trainer: cntk.Trainer driving the parameter updates.
        train_source: minibatch source for training data.
        test_source: minibatch source used for cross-validation.
        progress_printer: progress writer(s) passed through to the session.
        minibatch_size: training minibatch size in samples.
        epoch_size: samples per epoch; also used as checkpoint/progress frequency.
        profiling: when True, wrap the training run in the CNTK profiler.
    """
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        model_inputs_to_mb_source_mapping=input_map,
        checkpoint_frequency=epoch_size,
        checkpoint_filename="ResNet_CIFAR10_DataAug",
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)

    # Qualify the profiler helpers with the cntk module, consistent with every
    # other cntk.* call in this function; the bare names are not otherwise
    # guaranteed to be in scope.
    if profiling:
        cntk.start_profiler(sync_gpu=True)

    training_session.train()

    if profiling:
        cntk.stop_profiler()
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):
    """Run a checkpointed CNTK training session with cross-validation on test_source."""
    # Wire the reader's feature/label streams to the model's input variables.
    stream_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    # One fixed schedule serves both training and cross-validation minibatches.
    schedule = cntk.minibatch_size_schedule(minibatch_size)

    session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=stream_map,
        mb_size_schedule=schedule,
        progress_printer=progress_printer,
        checkpoint_filename=os.path.join(model_path, model_name),
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=schedule,
        restore=restore)

    # Consume every minibatch the source yields.
    session.train()
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):
    """Train the model via a CNTK training session, checkpointing under model_path."""
    # Associate each network input with its corresponding reader stream.
    input_map = {network['feature']: train_source.streams.features,
                 network['label']: train_source.streams.labels}

    # Collect the session configuration in one place, then build and run it.
    session_config = dict(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=input_map,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        checkpoint_filename=os.path.join(model_path, model_name),
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        restore=restore)

    # Train over all minibatches.
    cntk.training_session(**session_config).train()
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, profiling=False):
    """Train with checkpointing and cross-validation; optionally profile the run.

    Args:
        network: dict exposing the model's 'feature' and 'label' input variables.
        trainer: cntk.Trainer driving the parameter updates.
        train_source / test_source: minibatch sources for training / cross-validation.
        progress_printer: progress writer(s) passed through to the session.
        minibatch_size: training minibatch size in samples.
        epoch_size: samples per epoch; checkpoint and progress frequency.
        profiling: when True, wrap the training run in the CNTK profiler.
    """
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        model_inputs_to_mb_source_mapping=input_map,
        checkpoint_frequency=epoch_size,
        checkpoint_filename="ResNet_CIFAR10_DataAug",
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
        restore=False)

    # Use the cntk-qualified profiler helpers, consistent with the rest of this
    # function's cntk.* usage; the bare names are not otherwise in scope here.
    if profiling:
        cntk.start_profiler(sync_gpu=True)

    training_session.train()

    if profiling:
        cntk.stop_profiler()
def train_and_test(network, trainer, train_source, test_source, progress_writers, minibatch_size, epoch_size, restore, profiling=False):
    """Train with checkpointing and cross-validation; optionally profile the run."""
    # Bind the source's feature/label streams to the model inputs.
    stream_bindings = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    # Same fixed schedule for both training and cross-validation.
    mb_schedule = cntk.minibatch_size_schedule(minibatch_size)

    session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=stream_bindings,
        mb_size_schedule=mb_schedule,
        progress_printer=progress_writers,
        checkpoint_frequency=epoch_size,
        checkpoint_filename=os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=mb_schedule,
        restore=restore)

    # Profile the whole training run when requested.
    if profiling:
        cntk.start_profiler(sync_gpu=True)

    session.train()

    if profiling:
        cntk.stop_profiler()
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    """A summary should be written once per 2-sweep progress interval."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    # The learning rate only needs to be big enough to march through all the
    # samples; convergence is irrelevant for this progress-printing test.
    trainer, feature, label = create_sample_model(
        device, writer,
        lr_per_sample=C.learning_parameter_schedule_per_sample(0.3))
    mbs = mb_source(tmpdir, "training", max_sweeps=4)
    input_map = {feature: mbs.streams.features,
                 label: mbs.streams.labels}
    test_dir = str(tmpdir)

    C.training_session(
        trainer=trainer,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=input_map,
        max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep)
    ).train(device)

    # 4 sweeps x 25 samples each = 100 samples in total.
    assert trainer.total_number_of_samples_seen == 100
    # Output every 2 sweeps; 4 sweeps in total, so 2 summaries are written.
    assert writer.training_summary_counter == 2
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    """Progress summaries should be emitted once per 2-sweep interval."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    # Use the current per-sample schedule API, consistent with the sibling
    # sweep-unit test; learning_rate_schedule(..., UnitType.sample) is the
    # deprecated spelling of the same schedule. The rate is set high because
    # we only need to consume all samples, not converge.
    t, feature, label = create_sample_model(
        device, writer,
        lr_per_sample=C.learning_parameter_schedule_per_sample(0.3))
    mbs = mb_source(
        tmpdir,
        "training",
        max_sweeps=4)
    input_map = {feature: mbs.streams.features, label: mbs.streams.labels}
    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=input_map,
        max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep)).train(device)
    # 4 sweeps of 25 samples = 100 samples
    assert (t.total_number_of_samples_seen == 100)
    # output every 2 epoch sweeps; 4 sweeps in total, at the end 2 outputs are written:
    assert (writer.training_summary_counter == 2)
def train_and_test(network, trainer, train_source, test_source, progress_printer, epoch_size, minibatch_size=64, cv_minibatch_size=16):
    """Train the network in a CNTK training session with cross-validation.

    Args:
        network: dict exposing the model's 'feature' and 'label' input variables.
        trainer: cntk.Trainer driving the parameter updates.
        train_source / test_source: minibatch sources for training / cross-validation.
        progress_printer: progress writer(s) passed through to the session.
        epoch_size: samples per epoch; used as progress frequency.
        minibatch_size: training minibatch size (was hard-coded to 64; the
            default keeps existing callers unchanged).
        cv_minibatch_size: cross-validation minibatch size (was hard-coded
            to 16; the default keeps existing callers unchanged).
    """
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        training_minibatch_source=train_source,
        trainer=trainer,
        model_inputs_to_mb_source_mapping=input_map,
        mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        checkpoint_filename=os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
        progress_frequency=epoch_size,
        cv_source=test_source,
        cv_mb_size_schedule=cntk.minibatch_size_schedule(cv_minibatch_size),
        restore=False)

    # Train all minibatches
    training_session.train()
def test_session_progress_print(tmpdir, device_id):
    """Training 60 samples with a summary every 10 samples yields 6 summaries."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    trainer, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels,
    }
    test_dir = str(tmpdir)

    session = C.training_session(
        trainer=trainer,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=10)
    session.train(device)

    assert writer.training_summary_counter == 6
def train_and_test(network, trainer, train_source, test_source, progress_printer, epoch_size):
    """Train via a CNTK training session, then evaluate on test_source.

    Returns:
        The sample-weighted average test error (metric_numer / metric_denom).
    """
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        train_source, trainer,
        cntk.minibatch_size_schedule(64),
        progress_printer, input_map,
        "ConvNet_CIFAR10_DataAug_", epoch_size)
    training_session.train()

    ### TODO: Stay tuned for an upcoming simpler EvalSession API for test/validation.
    ### Evaluation action
    minibatch_size = 16

    # process minibatches and evaluate the model, weighting each minibatch's
    # error by its actual sample count
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0
    while True:
        data = test_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data:
            break
        local_mb_samples = data[network['label']].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    # minibatch_index already equals the number of minibatches processed, so it
    # is reported directly; the previous "minibatch_index + 1" over-counted by one.
    fin_msg = "Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index, (metric_numer * 100.0) / metric_denom, metric_denom)
    progress_printer.end_progress_print(fin_msg)
    print("")
    print(fin_msg)
    print("")

    return metric_numer / metric_denom
def test_session_progress_print_on_minibatch_unit(tmpdir, device_id):
    """Minibatch-unit progress frequency: 15 minibatches / every 5 = 3 summaries."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    trainer, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    input_map = {
        feature: mbs.streams.features,
        label: mbs.streams.labels,
    }
    test_dir = str(tmpdir)

    session = C.training_session(
        trainer=trainer,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=(5, C.train.DataUnit.minibatch))
    session.train(device)

    # mb size = 4, so 60/4 = 15 minibatches; output every 5 mb => 3 summaries.
    assert writer.training_summary_counter == 3
def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore):
    """Train with checkpointing and a final test pass, via config objects."""
    # Connect the model's inputs to the reader's streams.
    stream_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    schedule = cntk.minibatch_size_schedule(minibatch_size)

    # A single session handles training, checkpointing and the test sweep.
    session = training_session(
        trainer=trainer,
        mb_source=train_source,
        model_inputs_to_streams=stream_map,
        mb_size_schedule=schedule,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig(
            filename=os.path.join(model_path, model_name),
            restore=restore),
        test_config=TestConfig(source=test_source, mb_size=schedule))
    session.train()
def test_session_progress_print(tmpdir, device_id):
    """Default (sample-based) progress frequency: 60 samples / every 10 = 6 summaries."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    trainer, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    input_map = {feature: mbs.streams.features,
                 label: mbs.streams.labels}
    test_dir = str(tmpdir)

    # With no explicit data unit, progress_frequency counts samples.
    C.training_session(
        trainer=trainer,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=10).train(device)

    assert writer.training_summary_counter == 6
def test_session_progress_print_on_minibatch_unit(tmpdir, device_id):
    """Progress frequency expressed in minibatches produces one summary per 5 mbs."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    trainer, feature, label = create_sample_model(device, writer)
    mbs = mb_source(tmpdir, "training", max_samples=INFINITELY_REPEAT)
    input_map = {feature: mbs.streams.features,
                 label: mbs.streams.labels}
    test_dir = str(tmpdir)

    C.training_session(
        trainer=trainer,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(4),
        model_inputs_to_streams=input_map,
        max_samples=60,
        progress_frequency=(5, C.train.DataUnit.minibatch)
    ).train(device)

    # 60 samples at mb size 4 => 15 minibatches; every 5 mbs => 3 summaries.
    assert writer.training_summary_counter == 3
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size):
    """Train via a CNTK training session, then evaluate on test_source.

    Returns:
        The sample-weighted average test error (metric_numer / metric_denom).
    """
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    training_session = cntk.training_session(
        train_source, trainer,
        cntk.minibatch_size_schedule(minibatch_size),
        progress_printer, input_map,
        os.path.join(model_path, "AlexNet_"), epoch_size)
    training_session.train()

    # process minibatches and evaluate the model, weighting each minibatch's
    # error by its actual sample count
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0
    while True:
        data = test_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data:
            break
        local_mb_samples = data[network['label']].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    # minibatch_index already equals the number of minibatches processed, so it
    # is reported directly; the previous "minibatch_index + 1" over-counted by one.
    fin_msg = "Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index, (metric_numer * 100.0) / metric_denom, metric_denom)
    progress_printer.end_progress_print(fin_msg)
    print("")
    print(fin_msg)
    print("")

    return metric_numer / metric_denom
def simple_mnist():
    """Train and evaluate a one-hidden-layer MNIST classifier.

    Returns:
        The average classification error over the test minibatches.
    """
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    # Input variables denoting the features and label data.
    # (renamed from `input` so the builtin is not shadowed)
    features = input_variable(input_dim, np.float32)
    label = input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model; pixel values are
    # scaled by 1/256 (0.00390625) into [0, 1).
    scaled_input = element_times(constant(0.00390625), features)
    z = fully_connected_classifier_net(scaled_input, num_output_classes,
                                       hidden_layers_dim, num_hidden_layers, relu)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST")
    path = os.path.normpath(os.path.join(data_dir, "Train-28x28_cntk_text.txt"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)
    input_map = {
        features: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    lr_per_minibatch = learning_rate_schedule(0.2, UnitType.minibatch)

    # Instantiate the trainer object to drive the model training
    trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch))

    # Get minibatches of images to train with and perform model training
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    progress_printer = ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)

    session = training_session(
        training_minibatch_source=reader_train,
        trainer=trainer,
        mb_size_schedule=minibatch_size_schedule(minibatch_size),
        progress_printer=progress_printer,
        model_inputs_to_mb_source_mapping=input_map,
        progress_frequency=num_samples_per_sweep,
        max_training_samples=num_samples_per_sweep * num_sweeps_to_train_with)
    session.train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)
    input_map = {
        features: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    # Only full minibatches are evaluated, so the average must divide by the
    # integer count actually run (previously the float 10000/1024 was used as
    # the denominator, understating the reported error).
    num_minibatches_to_test = num_samples // test_minibatch_size

    test_result = 0.0
    for i in range(num_minibatches_to_test):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
# Configure distributed training. # For this, we wrap the learner in a distributed_learner object. # This specific example implements the BlockMomentum method. The Python script must be run # using mpiexec in order to have effect. For example, under Windows, the command is: # mpiexec -n 4 -lines python -u MNIST_Complex_Training.py learner = C.train.distributed.data_parallel_distributed_learner(learner) # For distributed training, we must maximize the minibatch size, as to minimize # communication cost and GPU underutilization. Hence, we use a "schedule" # that increases the minibatch size after a few epochs. By specifying the learning rate # as UnitType.sample, the contribution per sample maintains the same scale without # having to fix up the learning rate. # For this MNIST model, larger minibatch sizes make it faster, because the # model is too small to utilize a full GPU. Hence data-parallel training cannot # be expected to lead to speed-ups. minibatch_size_schedule = C.minibatch_size_schedule([256]*6 + [512]*9 + [1024]*7 + [2048]*8 + [4096], epoch_size=epoch_size) # Train and test, with checkpointing and learning-rate adjustment. progress = criterion.train((X_train, Y_train), minibatch_size=minibatch_size_schedule, max_epochs=50, parameter_learners=[learner], callbacks=[progress_writer, checkpoint_callback_config, cv_callback_config, test_callback_config]) # Get progress statistics. final_loss = progress.epoch_summaries[-1].loss final_metric = progress.epoch_summaries[-1].metric final_samples = progress.epoch_summaries[-1].samples test_metric = progress.test_summary.metric # Inspect predictions on one minibatch, for illustration. # For evaluation, we map the output of the network between 0-1 and convert them into probabilities # for the two classes. We use a softmax function to get the probabilities of each of the class.
# For this, we wrap the learner in a distributed_learner object. # This specific example implements the BlockMomentum method. The Python script must be run # using mpiexec in order to have effect. For example, under Windows, the command is: # mpiexec -n 4 -lines python -u MNIST_Complex_Training.py learner = C.train.distributed.data_parallel_distributed_learner(learner) # For distributed training, we must maximize the minibatch size, as to minimize # communication cost and GPU underutilization. Hence, we use a "schedule" # that increases the minibatch size after a few epochs. By specifying the learning rate # as per sample, the contribution per sample maintains the same scale without # having to fix up the learning rate. # For this MNIST model, larger minibatch sizes make it faster, because the # model is too small to utilize a full GPU. Hence data-parallel training cannot # be expected to lead to speed-ups. minibatch_size_schedule = C.minibatch_size_schedule( [256] * 6 + [512] * 9 + [1024] * 7 + [2048] * 8 + [4096], epoch_size=epoch_size) # Train and test, with checkpointing and learning-rate adjustment. progress = criterion.train((X_train, Y_train), minibatch_size=minibatch_size_schedule, max_epochs=50, parameter_learners=[learner], callbacks=[ progress_writer, checkpoint_callback_config, cv_callback_config, test_callback_config ]) # Get progress statistics. final_loss = progress.epoch_summaries[-1].loss final_metric = progress.epoch_summaries[-1].metric