def test_noise_injection_with_checkpointing():
    from cntk import initializer

    shape = (100, 100)

    w1 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    w2 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    w3 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))

    lr = learning_rate_schedule(0.5, UnitType.sample)
    m = C.momentum_schedule(0.99)

    learner1 = C.momentum_sgd([w1], lr, m, gaussian_noise_injection_std_dev=0.5)
    learner2 = C.momentum_sgd([w2], lr, m, gaussian_noise_injection_std_dev=0.5)
    learner3 = C.momentum_sgd([w3], lr, m, gaussian_noise_injection_std_dev=0.5)

    assert np.allclose(w1.value, w2.value) and np.allclose(w1.value, w3.value)

    for i in range(10):
        checkpoint = learner1.create_checkpoint()

        v = np.float32(np.random.rand(100, 100))

        learner1.update({w1: v}, 1)
        learner2.update({w2: v}, 1)
        assert not np.allclose(w1.value, w2.value)

        learner3.restore_from_checkpoint(checkpoint)
        learner3.update({w3: v}, 1)
        assert np.allclose(w1.value, w3.value)
def create_trainer(self):
    try:
        p = self.output.parameters
        # Three of the four parameters are learned by the block_momentum_distributed_learner.
        bmd_learner = cntk.block_momentum_distributed_learner(
            cntk.momentum_sgd([p[0], p[1], p[2]],
                              cntk.learning_parameter_schedule(0.0001),
                              cntk.momentum_as_time_constant_schedule(1000)),
            block_size=1000,
            block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)

        # New API to mark which learner is to be used for metric aggregation.
        bmd_learner.set_as_metric_aggregator()

        # The last parameter is learned by the data_parallel_distributed_learner.
        momentum_schedule = cntk.momentum_schedule_per_sample(0.9990913221888589)
        lr_per_sample = cntk.learning_parameter_schedule_per_sample(0.007)
        dpd_learner = cntk.data_parallel_distributed_learner(
            cntk.momentum_sgd([p[3]], lr_per_sample, momentum_schedule, True))

        comm_rank = cntk.distributed.Communicator.rank()
        self.trainer = cntk.Trainer(
            self.output, (self.ce, self.err), [bmd_learner, dpd_learner],
            [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
    except RuntimeError:
        self.trainer = None
    return
def test_trainer(tmpdir, no_eval_function):
    in1 = C.input_variable(shape=(1,))
    labels = C.input_variable(shape=(1,))
    p = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    if no_eval_function:
        errs = None
    else:
        errs = classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(z, (ce, errs),
                        [C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)])
    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    p = str(tmpdir / 'checkpoint.dat')
    external_state = {"additional external state": math.pi, "nested dict": {"a": "b"}, "list": [1, 2, 3]}
    trainer.save_checkpoint(p, external_state)
    restored_state = trainer.restore_from_checkpoint(p)

    assert external_state == restored_state
    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], C.Learner)
def test_learner_logging():
    from cntk import Trainer
    from cntk.logging import ProgressPrinter
    from cntk import cross_entropy_with_softmax, classification_error

    features = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w_init = 1
    w = parameter(shape=(1,), init=w_init)
    z = features * w
    labels = C.input_variable(shape=(1,), name='b')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    writer = TestProgressWriter()
    lr_values = [0.3, 0.2, 0.1, 0]
    m_values = [0.6, 0.7, 0.8]
    learner = C.momentum_sgd(z.parameters,
                             learning_rate_schedule(lr_values, UnitType.sample, 1),
                             C.momentum_schedule(m_values, 1))
    trainer = Trainer(z, (ce, errs), [learner], writer)

    for i in range(10):
        trainer.train_minibatch({features: [[2.]], labels: [[1.]]})

    assert len(writer.log_output) == len(lr_values + m_values)

    values = [j for i in zip(lr_values, m_values) for j in i] + [0]
    for i in range(len(values)):
        assert (values[i] == writer.log_output[i])
def create_network(para, verbose=False):
    with cntk.layers.default_options(init=cntk.glorot_uniform(), activation=cntk.ops.relu):
        # To keep the debugging step fast, we choose a simple structure whose size is
        # controlled by a few tunable parameters.
        h = cntk.layers.Convolution2D(filter_shape=(5, 5), num_filters=para[0], strides=(1, 1),
                                      pad=True, name='C1')(network_input / 255.0)
        h = cntk.layers.MaxPooling(filter_shape=(5, 5), strides=(2, 2))(h)
        h = cntk.layers.Convolution2D(filter_shape=(5, 5), num_filters=para[1], strides=(1, 1),
                                      pad=True, name='C2')(h)
        h = cntk.layers.MaxPooling(filter_shape=(5, 5), strides=(2, 2))(h)
        h = cntk.layers.Convolution2D(filter_shape=(3, 3), num_filters=para[2], strides=(1, 1),
                                      pad=True, name='C3')(h)
        h = cntk.layers.Dense(para[3])(h)
        h = cntk.layers.Dropout(0.25)(h)
        z = cntk.layers.Dense(10, activation=None, name='R')(h)

    loss = cntk.cross_entropy_with_softmax(z, network_label)
    label_error = cntk.classification_error(z, network_label)
    lr_schedule = cntk.learning_rate_schedule(0.1, cntk.UnitType.minibatch)
    learner = cntk.momentum_sgd(z.parameters, lr_schedule, cntk.momentum_schedule(0.9))
    trainer = cntk.Trainer(z, (loss, label_error), [learner])

    if verbose:
        log = cntk.logging.ProgressPrinter(100)

    for _ in range(20000):
        data = train_reader.next_minibatch(100, input_map=mapping(train_reader))
        trainer.train_minibatch(data)
        if verbose:
            log.update_with_trainer(trainer)

    return trainer
def init_model(m):
    progress_writers = [
        cntk.logging.ProgressPrinter(
            freq=int(BATCHSIZE / 2),
            rank=cntk.train.distributed.Communicator.rank(),
            num_epochs=EPOCHS)
    ]

    # Loss (dense labels); check whether sparse labels are supported
    loss = cntk.cross_entropy_with_softmax(m, labels)

    # Momentum SGD
    # https://github.com/Microsoft/CNTK/blob/master/Manual/Manual_How_to_use_learners.ipynb
    # unit_gain=False: momentum_direction = momentum * old_momentum_direction + gradient
    # unit_gain=True:  momentum_direction = momentum * old_momentum_direction + (1 - momentum) * gradient
    local_learner = cntk.momentum_sgd(
        m.parameters,
        lr=cntk.learning_rate_schedule(LR, cntk.UnitType.minibatch),
        momentum=cntk.momentum_schedule(MOMENTUM),
        unit_gain=False)

    distributed_learner = cntk.train.distributed.data_parallel_distributed_learner(local_learner)

    trainer = cntk.Trainer(m, (loss, cntk.classification_error(m, labels)),
                           [distributed_learner], progress_writers)
    return trainer, distributed_learner
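# A minimal sketch, not part of the original snippets, illustrating the two momentum
# update rules referenced in the comments of init_model above; the momentum constant,
# previous direction, and gradient value below are invented purely for illustration.
momentum = 0.9
old_direction = 1.0
gradient = 0.5

# unit_gain=False: the raw gradient is added to the scaled momentum direction.
classic_direction = momentum * old_direction + gradient               # 1.4

# unit_gain=True: the gradient contribution is scaled by (1 - momentum).
unit_gain_direction = momentum * old_direction + (1 - momentum) * gradient  # 0.95

print(classic_direction, unit_gain_direction)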
def __init__(self, n_in, n_out, init_lr, momentum):
    self.param1 = 512
    self.param2 = 256

    self.n_in = int(n_in)
    self.n_out = int(n_out)

    self.input = C.sequence.input_variable(shape=(self.n_in,))
    self.label = C.sequence.input_variable(shape=(self.n_out,))

    self.three_dnn = C.layers.Sequential([
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_1'),
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_2'),
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_3')])
    self.final_dnn = C.layers.Dense(self.n_out, name='dnn_final')
    self.dnn_1 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_1')
    self.dnn_2 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_2')
    self.dnn_3 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_3')
    self.dnn_4 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_4')

    self.list_bias = []
    for i in range(16):
        self.list_bias.append(C.parameter(shape=(self.param2,), name='bias_' + str(i)))

    self.output = self.model(self.input)
    self.loss = loss_fun(self.output, self.label)
    self.eval_err = loss_fun(self.output, self.label)

    self.lr_s = C.learning_rate_schedule(init_lr, C.UnitType.sample)
    self.mom_s = C.momentum_schedule(momentum)
    self.learner = C.momentum_sgd(self.output.parameters, lr=self.lr_s, momentum=self.mom_s)
    self.trainer = C.Trainer(self.output, (self.loss, self.eval_err), [self.learner])
def create_trainer(self):
    try:
        learner = cntk.block_momentum_distributed_learner(
            cntk.momentum_sgd(self.output.parameters,
                              cntk.learning_parameter_schedule(0.0001),
                              cntk.momentum_as_time_constant_schedule(1000)),
            block_size=1000,
            block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)

        comm_rank = cntk.distributed.Communicator.rank()
        self.trainer = cntk.Trainer(
            self.output, (self.ce, self.err), [learner],
            [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
    except RuntimeError:
        self.trainer = None
    return
def fineTuneModel(folder_with_data, path_to_label_csv="label.csv",
                  original_model_path="../vgg13.model", max_epochs=10):
    trainingValues = getData(folder_with_data, path_to_label_csv)

    input_var = ct.input((1, height, width), np.float32)
    label_var = ct.input((num_classes), np.float32)

    print("cloning old model")
    z = clone_model(original_model_path, input_var)

    loss = ct.cross_entropy_with_softmax(z, label_var)
    metric = ct.classification_error(z, label_var)

    minibatch_size = 32
    epoch_size = trainingValues.getLengthOfData()

    lr_per_minibatch = [learning_rate] * 10 + [learning_rate / 2.0]
    mm_time_constant = -minibatch_size / np.log(0.9)
    lr_schedule = ct.learning_rate_schedule(lr_per_minibatch, unit=ct.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = ct.momentum_as_time_constant_schedule(mm_time_constant)

    learner = ct.momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    trainer = ct.Trainer(z, (loss, metric), learner)
    print("created trainer and learner")

    print("training started")
    epoch = 0  # epoch counter
    while epoch < max_epochs:
        trainingValues.reset()

        # Training
        start_time = time.time()
        training_loss = 0
        training_accuracy = 0

        # mini-batch learning
        while trainingValues.hasMoreMinibatches():
            # while there is data for a mini-batch:
            x, y, currBatchSize = trainingValues.getNextMinibatch(minibatch_size)  # x - images, y - labels/emotions
            trainer.train_minibatch({input_var: x, label_var: y})

            # maintain stats:
            training_loss += trainer.previous_minibatch_loss_average * currBatchSize
            training_accuracy += trainer.previous_minibatch_evaluation_average * currBatchSize

        training_accuracy /= trainingValues.getLengthOfData()
        training_accuracy = 1.0 - training_accuracy

        print("Epoch took:", time.time() - start_time, "seconds")
        print("training accuracy:\t\t{:.2f}%".format(training_accuracy * 100))
        epoch += 1

    # SAVE MODEL
    z.save("../vgg13.model")
def finalize_network(reader, model_details, max_amount_of_epochs, samples_per_epoch, samples_per_minibatch, pixel_dimensions, classes, learning_rate): features = input_variable(shape=(pixel_dimensions['depth'], pixel_dimensions['height'], pixel_dimensions['width'])) label = input_variable(shape=len(classes)) # speeds up training normalized_features = element_times(1.0 / 256.0, features) model = create_tf_model(model_details, num_classes=len(classes), input_features=normalized_features, freeze=True) loss = cross_entropy_with_softmax(model, label) metric = classification_error(model, label) learner = momentum_sgd(parameters=model.parameters, lr=learning_rate_schedule(learning_rate, UnitType.minibatch), momentum=0.9, l2_regularization_weight=0.0005) reporter = ProgressPrinter(tag='training', num_epochs=max_amount_of_epochs) trainer = Trainer(model=model, criterion=(loss, metric), parameter_learners=[learner], progress_writers=[reporter]) log_number_of_parameters(model) map_input_to_streams_train = { features: reader.streams.features, label: reader.streams.labels } training_session(trainer=trainer, mb_source=reader, model_inputs_to_streams=map_input_to_streams_train, mb_size=samples_per_minibatch, progress_frequency=samples_per_epoch, checkpoint_config=CheckpointConfig( frequency=samples_per_epoch, filename=os.path.join("./checkpoints", "ConvNet_Lego_VisiOn"), restore=True)).train() network = {'features': features, 'label': label, 'model': softmax(model)} model_name = f"CNN-3200-224-resnet-18.model" export_path = os.path.abspath( os.path.join("..", "..", "Final models", "CNN", model_name)) model.save(export_path) return network
def create_learner(model):
    '''Create the optimization method.'''
    optim = "momentum_sgd"
    lr = 0.001
    lr_per_sample = C.learning_parameter_schedule_per_sample(lr)
    momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589)
    if optim == 'momentum_sgd':
        clipping_threshold_per_sample = 5.0
        gradient_clipping_with_truncation = True
        return C.momentum_sgd(model.parameters, lr_per_sample, momentum_schedule,
                              gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                              gradient_clipping_with_truncation=gradient_clipping_with_truncation)
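# A hedged usage sketch, not part of the original snippets: how a learner returned by
# create_learner above might be wired into a Trainer. The toy model, loss, and data
# below are invented for illustration only.
import numpy as np
import cntk as C

x = C.input_variable(2)
y = C.input_variable(1)
model = C.layers.Dense(1)(x)                # toy single-layer model
loss = C.squared_error(model, y)

learner = create_learner(model)             # reuse the factory defined above
trainer = C.Trainer(model, (loss, loss), [learner])

trainer.train_minibatch({x: np.array([[1.0, 2.0]], dtype=np.float32),
                         y: np.array([[3.0]], dtype=np.float32)})
print(trainer.previous_minibatch_loss_average)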
def __init__(self, dim_x, dim_y):
    self.dim_x = int(dim_x)
    self.dim_y = int(dim_y)

    self.input = cntk.sequence.input_variable(shape=(self.dim_x,))
    self.label = cntk.sequence.input_variable(shape=(self.dim_y,))

    self.output = self.model(self.input)
    self.loss = loss_fun(self.output, self.label)
    self.eval = loss_fun(self.output, self.label)

    self.learner = cntk.momentum_sgd(parameters=self.output.parameters,
                                     momentum=cntk.momentum_schedule(0.5),
                                     lr=cntk.learning_rate_schedule(0.006, cntk.UnitType.sample))
    self.trainer = cntk.Trainer(self.output, (self.loss, self.eval), [self.learner])
def modelInit(self): #create output model folder: self.output_model_folder = os.path.join(self.base_folder, R'models') if not os.path.exists(self.output_model_folder): os.makedirs(self.output_model_folder) self.model = VGG13(self.num_classes) self.input_var = ct.input( (1, self.model.input_height, self.model.input_width), np.float32) self.label_var = ct.input((self.num_classes), np.float32) print("initialized model") self.genData() #ct.input_variables takes the no. of dimensions. and automatically creates #1-hot encoded. ct.input doesn't. #criterian of model: loss, metric: #loss = cross_entropy_with_softmax #metric = classification error self.z = self.model.model(self.input_var) loss = ct.cross_entropy_with_softmax(self.z, self.label_var) metric = ct.classification_error(self.z, self.label_var) """ pred = ct.softmax(z) loss = ct.negate(ct.reduce_sum(ct.element_times(label_var, ct.log(pred)), axis=-1)) """ minibatch_size = 32 epoch_size = self.trainingValues.getLengthOfData() #THROW MOMENTUM: lr_per_minibatch = [self.model.learning_rate ] * 20 + [self.model.learning_rate / 2.0] * 20 + [ self.model.learning_rate / 10.0 ] #use eta for 20 minibatches, then half of eta for other 20 batches then eta/10 for remaining minimaches mm_time_constant = -minibatch_size / np.log(0.9) lr_schedule = ct.learning_rate_schedule(lr_per_minibatch, unit=ct.UnitType.minibatch, epoch_size=epoch_size) mm_schedule = ct.momentum_as_time_constant_schedule(mm_time_constant) # construct the trainer #learner performs model updates. can be adam() or sgd() learner = ct.momentum_sgd(self.z.parameters, lr_schedule, mm_schedule) # The Trainer optimizes the loss by SGD, and logs the metric self.trainer = ct.Trainer(self.z, (loss, metric), learner) print("created trainer and learner")
def create_trainer(self):
    learner = cntk.block_momentum_distributed_learner(
        cntk.momentum_sgd(self.output.parameters,
                          cntk.learning_parameter_schedule(0.0001),
                          cntk.momentum_as_time_constant_schedule(1000)),
        block_size=1000,
        block_learning_rate=0.01,
        block_momentum_as_time_constant=1000)

    comm_rank = cntk.distributed.Communicator.rank()
    self.trainer = cntk.Trainer(
        self.output, (self.ce, self.err), [learner],
        [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
def test_output_to_retain():
    in1 = C.input_variable(shape=(1,))
    labels = C.input_variable(shape=(1,))
    p = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(z, (ce, errs),
                        [C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)])

    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])
    assert np.allclose(var_map[z_output], np.asarray(in1_value) + 20)
def test_ext_train(tmpdir):
    dim = 4

    p = C.parameter(shape=(dim,), init=10)
    i = C.sequence.input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(i, C.constant(3), 'my_plus')
    # keeping m unwrapped since we need to access its member variables
    z = C.user_function(m) + p

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(z, (z + 0, z + 0), [
        C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True, minibatch_size=0)
    ])

    i = 0
    while i < 100:
        i += 1
        input_data = np.random.rand(dim)
        trainer.train_minibatch([input_data])

    assert m.forward_calls == m.backward_calls == 100

    filepath = str(tmpdir / 'test_ext_train.dat')
    z.save(filepath)
    buf = open(filepath, 'rb').read()

    # this is only needed for Python 2.7
    # (which does not distinguish between bytes and strings)
    if isinstance(buf, str):
        buf = bytearray(buf)

    z1 = Function.load(buf)

    m1 = z1.find_by_name('my_plus')
    # m1 is an instance of UserFunction and cannot be downcast to MyPlus directly,
    # so serialize() is used as a workaround:
    state = m1.serialize()['state']
    assert state['forward_calls'] == state['backward_calls'] == 100
def test_ext_lambdafunc(tmpdir): dim = 4 class CallbackCounter(object): def __init__(self): self.count = 0 def inc(self, arg): self.count += 1 cb = CallbackCounter() p = C.parameter(shape=(dim,), init=1) i = C.input_variable(dim, needs_gradient=True, name='i_var') k = i * p m = LambdaFunc(k, when=lambda arg: np.sum(arg) > 1, execute=cb.inc) m = C.user_function(m) z0 = m + 0 filepath = str(tmpdir / 'test_ext_lambdafunc.dat') z0.save(filepath) Function.register_udf_deserialize_callback('conditional_exec_lambda', lambda x, *unused: LambdaFunc(x, when=lambda arg: np.sum(arg) > 1, execute=cb.inc)) z = Function.load(filepath) momentum_time_constant = C.momentum_as_time_constant_schedule(1100) lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size = 1) trainer = C.Trainer(z, (z + 0, z + 0), [C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)]) i = 0 input_data = 0.1 * np.ones(dim) trainer.train_minibatch([input_data]) assert cb.count == 0 input_data = 0.3 * np.ones(dim) trainer.train_minibatch([input_data]) assert cb.count == 1
def test_ext_lambdafunc(tmpdir): dim = 4 class CallbackCounter(object): def __init__(self): self.count = 0 def inc(self, arg): self.count += 1 cb = CallbackCounter() p = C.parameter(shape=(dim,), init=1) i = C.input_variable(dim, needs_gradient=True, name='i_var') k = i * p m = LambdaFunc(k, when=lambda arg: np.sum(arg) > 1, execute=cb.inc) m = C.user_function(m) z0 = m + 0 filepath = str(tmpdir / 'test_ext_lambdafunc.dat') z0.save(filepath) Function.register_udf_deserialize_callback('conditional_exec_lambda', lambda x, *unused: LambdaFunc(x, when=lambda arg: np.sum(arg) > 1, execute=cb.inc)) z = Function.load(filepath) momentum_time_constant = C.momentum_as_time_constant_schedule(1100) lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample) trainer = C.Trainer(z, (z + 0, z + 0), [C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)]) i = 0 input_data = 0.1 * np.ones(dim) trainer.train_minibatch([input_data]) assert cb.count == 0 input_data = 0.3 * np.ones(dim) trainer.train_minibatch([input_data]) assert cb.count == 1
def run_distributed_training(tmpdir, create_func): in1 = sequence.input_variable(shape=1) labels = sequence.input_variable(shape=1) p = parameter(shape=2, init=10) z = plus(in1, reduce_sum(p), name='z') ce = cross_entropy_with_softmax(z, labels) errs = classification_error(z, labels) momentum_time_constant = C.momentum_as_time_constant_schedule(1100) lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample) dist_learner = create_func( C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)) communicator = dist_learner.communicator() workers = communicator.workers() current_worker = communicator.current_worker() found_rank = False for wk in workers: if current_worker.global_rank == wk.global_rank: found_rank = True assert found_rank trainer = C.Trainer(z, (ce, errs), [dist_learner]) in1_value = [[1], [2]] label_value = [[0], [1]] arguments = {in1: in1_value, labels: label_value} z_output = z.output updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output]) p = str(tmpdir / 'checkpoint.dat') trainer.save_checkpoint(p) trainer.restore_from_checkpoint(p) communicator.barrier() assert trainer.model.name == 'z' # Ensure that Swig is not leaking raw types assert isinstance(trainer.model, Function) assert trainer.model.__doc__
def test_ext_train(tmpdir): dim = 4 p = C.parameter(shape=(dim,), init=10) i = C.sequence.input_variable(dim, needs_gradient=True, name='i_var') m = MyPlus(i, C.constant(3), 'my_plus') # keeping m unwrapped since we need to access its member variables z = C.user_function(m) + p momentum_time_constant = C.momentum_as_time_constant_schedule(1100) lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample) trainer = C.Trainer(z, (z + 0, z + 0), [C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)]) i = 0 while i < 100: i += 1 input_data = np.random.rand(dim) trainer.train_minibatch([input_data]) assert m.forward_calls == m.backward_calls == 100 filepath = str(tmpdir / 'test_ext_train.dat') z.save(filepath) buf = open(filepath, 'rb').read() # this is only need for Python 2.7 # (which does not distinguish between bytes and strings) if isinstance(buf, str): buf = bytearray(buf) z1 = Function.load(buf) m1 = z1.find_by_name('my_plus') # m1 is an instance of UserFunction, cannot directly downcast it to MyPlus, # using serialize as workaround: state = m1.serialize()['state'] assert state['forward_calls'] == state['backward_calls'] == 100
def run_distributed_training(tmpdir, create_func): in1 = sequence.input_variable(shape=1) labels = sequence.input_variable(shape=1) p = parameter(shape=2, init=10) z = plus(in1, reduce_sum(p), name='z') ce = cross_entropy_with_softmax(z, labels) errs = classification_error(z, labels) momentum_time_constant = C.momentum_as_time_constant_schedule(1100) lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample) dist_learner = create_func(C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)) communicator = dist_learner.communicator() workers = communicator.workers() current_worker = communicator.current_worker() found_rank = False for wk in workers: if current_worker.global_rank == wk.global_rank: found_rank = True assert found_rank trainer = C.Trainer(z, (ce, errs), [ dist_learner ]) in1_value = [[1],[2]] label_value = [[0], [1]] arguments = {in1: in1_value, labels: label_value} z_output = z.output updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output]) p = str(tmpdir / 'checkpoint.dat') trainer.save_checkpoint(p) trainer.restore_from_checkpoint(p) communicator.barrier() assert trainer.model.name == 'z' # Ensure that Swig is not leaking raw types assert isinstance(trainer.model, Function) assert trainer.model.__doc__
def create_trainer(network, minibatch_size, epoch_size, progress_printer):
    """ Create trainer """
    # Set learning parameters
    lr_per_sample = [0.0015625] * 10 + [0.00046875] * 10 + [0.00015625]
    momentum_time_constant = [0] * 20 + [-minibatch_size / np.log(0.9)]
    l2_reg_weight = 0.002

    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                           l2_regularization_weight=l2_reg_weight)

    return Trainer(network['output'], (network['ce'], network['pe']), learner, progress_printer)
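# A small hedged sketch, not part of the original snippets, showing how a piecewise
# per-sample schedule like the one in create_trainer above resolves to concrete rates.
# The epoch size of 50000 is an invented value used only for illustration.
import cntk as C

epoch_size = 50000
lr_per_sample = [0.0015625] * 10 + [0.00046875] * 10 + [0.00015625]
lr_schedule = C.learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=C.UnitType.sample)

# The schedule is indexed by the number of samples seen so far:
print(lr_schedule[0])                 # rate used for epochs 1-10
print(lr_schedule[10 * epoch_size])   # rate used for epochs 11-20
print(lr_schedule[20 * epoch_size])   # rate used from epoch 21 onward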
def __init__(self, n_in, n_out, init_lr, momentum): self.param1 = 512 self.param2 = 256 self.n_in = int(n_in) self.n_out = int(n_out) self.input = C.sequence.input_variable(shape=(self.n_in, )) self.label = C.sequence.input_variable(shape=(self.n_out, )) self.three_dnn = Sequential([ Dense(self.param1, activation=C.tanh), Dense(self.param1, activation=C.tanh), Dense(self.param1, activation=C.tanh) ]) self.rnn_layer1 = Sequential([(Recurrence(LSTM(self.param2)), Recurrence(LSTM(self.param2), go_backwards=True)), C.splice]) self.rnn_layer2 = Sequential([(Recurrence(LSTM(self.param2)), Recurrence(LSTM(self.param2), go_backwards=True)), C.splice]) self.final_dnn = Dense(self.n_out) self.output = self.model(self.input) self.loss = loss_fun(self.output, self.label) self.eval_err = loss_fun(self.output, self.label) self.lr_s = C.learning_rate_schedule(init_lr, C.UnitType.sample) self.mom_s = C.momentum_schedule(momentum) self.learner = C.momentum_sgd(self.output.parameters, lr=self.lr_s, momentum=self.mom_s) self.trainer = C.Trainer(self.output, (self.loss, self.eval_err), [self.learner])
def main(base_folder, training_mode='majority', model_name='VGG13', max_epochs = 100): # create needed folders. output_model_path = os.path.join(base_folder, R'models') output_model_folder = os.path.join(output_model_path, model_name + '_' + training_mode) if not os.path.exists(output_model_folder): os.makedirs(output_model_folder) # creating logging file logging.basicConfig(filename = os.path.join(output_model_folder, "train.log"), filemode = 'w', level = logging.INFO) logging.getLogger().addHandler(logging.StreamHandler()) logging.info("Starting with training mode {} using {} model and max epochs {}.".format(training_mode, model_name, max_epochs)) # create the model num_classes = len(emotion_table) model = build_model(num_classes, model_name) # set the input variables. input_var = ct.input((1, model.input_height, model.input_width), np.float32) label_var = ct.input((num_classes), np.float32) # read FER+ dataset. logging.info("Loading data...") train_params = FERPlusParameters(num_classes, model.input_height, model.input_width, training_mode, False) test_and_val_params = FERPlusParameters(num_classes, model.input_height, model.input_width, "majority", True) train_data_reader = FERPlusReader.create(base_folder, train_folders, "label.csv", train_params) val_data_reader = FERPlusReader.create(base_folder, valid_folders, "label.csv", test_and_val_params) test_data_reader = FERPlusReader.create(base_folder, test_folders, "label.csv", test_and_val_params) # print summary of the data. display_summary(train_data_reader, val_data_reader, test_data_reader) # get the probalistic output of the model. z = model.model((input_var - 127.5)/127.5) pred = ct.softmax(z) epoch_size = train_data_reader.size() minibatch_size = 32 # Training config lr_per_minibatch = [model.learning_rate]*20 + [model.learning_rate / 2.0]*20 + [model.learning_rate / 10.0] mm_time_constant = -minibatch_size/np.log(0.9) lr_schedule = ct.learning_rate_schedule(lr_per_minibatch, unit=ct.UnitType.minibatch, epoch_size=epoch_size) mm_schedule = ct.momentum_as_time_constant_schedule(mm_time_constant) # loss and error cost train_loss = cost_func(training_mode, pred, label_var) pe = ct.classification_error(z, label_var) # construct the trainer learner = ct.momentum_sgd(z.parameters, lr_schedule, mm_schedule) trainer = ct.Trainer(z, (train_loss, pe), learner) # Get minibatches of images to train with and perform model training max_val_accuracy = 0.0 final_test_accuracy = 0.0 best_test_accuracy = 0.0 logging.info("Start training...") epoch = 0 best_epoch = 0 while epoch < max_epochs: train_data_reader.reset() val_data_reader.reset() test_data_reader.reset() # Training start_time = time.time() training_loss = 0 training_accuracy = 0 while train_data_reader.has_more(): images, labels, current_batch_size = train_data_reader.next_minibatch(minibatch_size) # Specify the mapping of input variables in the model to actual minibatch data to be trained with trainer.train_minibatch({input_var : images, label_var : labels}) # keep track of statistics. 
training_loss += trainer.previous_minibatch_loss_average * current_batch_size training_accuracy += trainer.previous_minibatch_evaluation_average * current_batch_size training_accuracy /= train_data_reader.size() training_accuracy = 1.0 - training_accuracy # Validation val_accuracy = 0 while val_data_reader.has_more(): images, labels, current_batch_size = val_data_reader.next_minibatch(minibatch_size) val_accuracy += trainer.test_minibatch({input_var : images, label_var : labels}) * current_batch_size val_accuracy /= val_data_reader.size() val_accuracy = 1.0 - val_accuracy # if validation accuracy goes higher, we compute test accuracy test_run = False if val_accuracy > max_val_accuracy: best_epoch = epoch max_val_accuracy = val_accuracy trainer.save_checkpoint(os.path.join(output_model_folder, "model_{}".format(best_epoch))) test_run = True test_accuracy = 0 while test_data_reader.has_more(): images, labels, current_batch_size = test_data_reader.next_minibatch(minibatch_size) test_accuracy += trainer.test_minibatch({input_var : images, label_var : labels}) * current_batch_size test_accuracy /= test_data_reader.size() test_accuracy = 1.0 - test_accuracy final_test_accuracy = test_accuracy if final_test_accuracy > best_test_accuracy: best_test_accuracy = final_test_accuracy logging.info("Epoch {}: took {:.3f}s".format(epoch, time.time() - start_time)) logging.info(" training loss:\t{:e}".format(training_loss)) logging.info(" training accuracy:\t\t{:.2f} %".format(training_accuracy * 100)) logging.info(" validation accuracy:\t\t{:.2f} %".format(val_accuracy * 100)) if test_run: logging.info(" test accuracy:\t\t{:.2f} %".format(test_accuracy * 100)) epoch += 1 logging.info("") logging.info("Best validation accuracy:\t\t{:.2f} %, epoch {}".format(max_val_accuracy * 100, best_epoch)) logging.info("Test accuracy corresponding to best validation:\t\t{:.2f} %".format(final_test_accuracy * 100)) logging.info("Best test accuracy:\t\t{:.2f} %".format(best_test_accuracy * 100)) pred.save('ferplus.onnx', ct.ModelFormat.ONNX)
def train(train_x, train_y, seed, model_dir, loss_dir): input_dim = 600 output_dim = 3631 num_epochs = 100 hidden_layer_type = ['TANH', 'TANH'] hidden_layer_size = [1024, 1024] momentum = 0.9 finetune_lr = 0.01 l2_regularization_weight = 0.00001 C.cntk_py.set_fixed_random_seed(seed) print('Creating DNN model...') input = C.input_variable(input_dim) output = C.input_variable(output_dim) dnn_model = create_dnn_model(input, hidden_layer_type, hidden_layer_size, output_dim) epoch_num = 0 current_finetune_lr = finetune_lr current_momentum = momentum train_loss_output = [] print('Learning...') while (epoch_num < num_epochs): print('started epoch %i' % epoch_num) epoch_num += 1 sub_start_time = time.time() lr_schedule = C.learning_rate_schedule(current_finetune_lr, C.UnitType.minibatch) momentum_schedule = C.momentum_schedule(current_momentum) learner = C.momentum_sgd( dnn_model.parameters, lr_schedule, momentum_schedule, unit_gain=False, l1_regularization_weight=0, l2_regularization_weight=l2_regularization_weight) #learner = C.adadelta(dnn_model.parameters, lr_schedule, rho=0.95, epsilon=1e-8, l1_regularization_weight=0, # l2_regularization_weight= 0.00001 ) loss = C.cross_entropy_with_softmax(dnn_model, output) error = loss trainer = C.Trainer(dnn_model, (loss, error), [learner]) train_error = [] for i in range(len(train_x)): temp_train_x = np.float32(train_x[i]) temp_train_y = np.float32(train_y[i]) trainer.train_minibatch({ input: temp_train_x, output: temp_train_y }) train_error.append(trainer.previous_minibatch_loss_average) this_train_loss = np.mean(train_error) sub_end_time = time.time() print('time for 1 epoch is %.1f' % (sub_end_time - sub_start_time)) train_loss_output.append(this_train_loss) print('loss is %.4f' % this_train_loss) if np.remainder(epoch_num, 10) == 0: nnets_file_name = 'dnn_model_ep' + np.str(epoch_num) + '.model' if not os.path.isdir(model_dir): os.makedirs(model_dir) dnn_model.save(os.path.join(model_dir, nnets_file_name)) if not os.path.isdir(loss_dir): os.makedirs(loss_dir) np.savetxt( os.path.join(loss_dir, 'loss_curve_ep' + np.str(epoch_num) + '.csv'), train_loss_output) nnets_file_name = 'dnn_model_final.model' if not os.path.isdir(model_dir): os.makedirs(model_dir) dnn_model.save(os.path.join(model_dir, nnets_file_name)) if not os.path.isdir(loss_dir): os.makedirs(loss_dir) np.savetxt( os.path.join(loss_dir, 'loss_curve_final' + np.str(epoch_num) + '.csv'), train_loss_output)
def test_lattice_deserializer(device_id): if cntk_device(device_id).type() != DeviceKind_GPU: pytest.skip('test only runs on GPU') try_set_default_device(cntk_device(device_id)) data_dir = '' if 'CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY' in os.environ: data_dir = os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'] else: print('CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY environment variable is not defined') print(data_dir) data_dir = os.path.join(data_dir, "Speech", "AN4Corpus", "v0") os.chdir(data_dir) feature_dimension = 33 feature = C.sequence.input_variable(feature_dimension) label_dimension = 133 label = C.sequence.input_variable(label_dimension) axis_lattice = C.Axis.new_unique_dynamic_axis('lattice_axis') lattice = C.sequence.input_variable(1, sequence_axis=axis_lattice) train_feature_filepath = os.path.join(data_dir,"glob_0000.scp") train_label_filepath = os.path.join(data_dir,"glob_0000.mlf") train_lattice_index_path = os.path.join(data_dir,"latticeIndex.txt") mapping_filepath = os.path.join(data_dir,"state.list") train_feature_stream = C.io.HTKFeatureDeserializer( C.io.StreamDefs(speech_feature = C.io.StreamDef(shape = feature_dimension, scp = train_feature_filepath))) train_label_stream = C.io.HTKMLFDeserializer( mapping_filepath, C.io.StreamDefs(speech_label = C.io.StreamDef(shape = label_dimension, mlf = train_label_filepath)), True) train_lattice_stream = C.io.LatticeDeserializer(train_lattice_index_path,C.io.StreamDefs(speech_lattice = C.io.StreamDef())) train_data_reader = C.io.MinibatchSource([train_feature_stream, train_label_stream, train_lattice_stream], frame_mode = False) train_input_map = {feature: train_data_reader.streams.speech_feature, label: train_data_reader.streams.speech_label, lattice: train_data_reader.streams.speech_lattice} feature_mean = np.fromfile(os.path.join("GlobalStats", "mean.363"), dtype=float, count=feature_dimension) feature_inverse_stddev = np.fromfile(os.path.join("GlobalStats", "var.363"), dtype=float, count=feature_dimension) feature_normalized = (feature - feature_mean) * feature_inverse_stddev with C.default_options(activation=C.sigmoid): z = C.layers.Sequential([ C.layers.For(range(3), lambda: C.layers.Recurrence(C.layers.LSTM(1024))), C.layers.Dense(label_dimension) ])(feature_normalized) mbsize = 1024 mbs_per_epoch = 10 max_epochs = 2 symListPath = os.path.join(data_dir,"CY2SCH010061231_1369712653.numden.lats.symlist") phonePath = os.path.join(data_dir,"model.overalltying") stateListPath = os.path.join(data_dir,"state.list") transProbPath = os.path.join(data_dir,"model.transprob") criteria = C.lattice_sequence_with_softmax(label, z, z, lattice, symListPath, phonePath, stateListPath, transProbPath) err = C.classification_error(label,z) lr = C.learning_parameter_schedule_per_sample([(3, .01), (1,.001)]) mm = C.momentum_schedule([(1000, 0.9), (0, 0.99)], mbsize) learner = C.momentum_sgd(z.parameters, lr, mm) trainer = C.Trainer(z, (criteria, err), learner) C.logging.log_number_of_parameters(z) progress_printer = C.logging.progress_print.ProgressPrinter(tag='Training', num_epochs = max_epochs) for epoch in range(max_epochs): for mb in range(mbs_per_epoch): minibatch = train_data_reader.next_minibatch(mbsize, input_map = train_input_map) trainer.train_minibatch(minibatch) progress_printer.update_with_trainer(trainer, with_metric = True) progress_printer.epoch_summary(with_metric = True) assert np.allclose(trainer.previous_minibatch_evaluation_average, 0.15064, atol=TOLERANCE_ABSOLUTE) assert np.allclose(trainer.previous_minibatch_loss_average, 
0.035923, atol=TOLERANCE_ABSOLUTE) assert (trainer.previous_minibatch_sample_count == 218) assert (trainer.total_number_of_samples_seen == 5750) print("Completed successfully.")
def conv3d_ucf11(train_reader, test_reader, max_epochs=30): # Replace 0 with 1 to get detailed log. set_computation_network_trace_level(0) # These values must match for both train and test reader. image_height = train_reader.height image_width = train_reader.width num_channels = train_reader.channel_count sequence_length = train_reader.sequence_length num_output_classes = train_reader.label_count # Input variables denoting the features and label data input_var = C.input_variable((num_channels, sequence_length, image_height, image_width), np.float32) label_var = C.input_variable(num_output_classes, np.float32) # Instantiate simple 3D Convolution network inspired by VGG network # and http://vlg.cs.dartmouth.edu/c3d/c3d_video.pdf with C.default_options (activation=C.relu): z = C.layers.Sequential([ C.layers.Convolution3D((3,3,3), 64, pad=True), C.layers.MaxPooling((1,2,2), (1,2,2)), C.layers.For(range(3), lambda i: [ C.layers.Convolution3D((3,3,3), [96, 128, 128][i], pad=True), C.layers.Convolution3D((3,3,3), [96, 128, 128][i], pad=True), C.layers.MaxPooling((2,2,2), (2,2,2)) ]), C.layers.For(range(2), lambda : [ C.layers.Dense(1024), C.layers.Dropout(0.5) ]), C.layers.Dense(num_output_classes, activation=None) ])(input_var) # loss and classification error. ce = C.cross_entropy_with_softmax(z, label_var) pe = C.classification_error(z, label_var) # training config train_epoch_size = train_reader.size() train_minibatch_size = 2 # Set learning parameters lr_per_sample = [0.01]*10+[0.001]*10+[0.0001] lr_schedule = C.learning_rate_schedule(lr_per_sample, epoch_size=train_epoch_size, unit=C.UnitType.sample) momentum_time_constant = 4096 mm_schedule = C.momentum_as_time_constant_schedule([momentum_time_constant]) # Instantiate the trainer object to drive the model training learner = C.momentum_sgd(z.parameters, lr_schedule, mm_schedule, True) progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs) trainer = C.Trainer(z, (ce, pe), learner, progress_printer) log_number_of_parameters(z) ; print() # Get minibatches of images to train with and perform model training for epoch in range(max_epochs): # loop over epochs train_reader.reset() while train_reader.has_more(): videos, labels, current_minibatch = train_reader.next_minibatch(train_minibatch_size) trainer.train_minibatch({input_var : videos, label_var : labels}) trainer.summarize_training_progress() # Test data for trained model epoch_size = test_reader.size() test_minibatch_size = 2 # process minibatches and evaluate the model metric_numer = 0 metric_denom = 0 minibatch_index = 0 test_reader.reset() while test_reader.has_more(): videos, labels, current_minibatch = test_reader.next_minibatch(test_minibatch_size) # minibatch data to be trained with metric_numer += trainer.test_minibatch({input_var : videos, label_var : labels}) * current_minibatch metric_denom += current_minibatch # Keep track of the number of samples processed so far. minibatch_index += 1 print("") print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom)) print("") return metric_numer/metric_denom
1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), lambda params: C.nesterov(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8), lambda params: C.sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch)), lambda params: C.momentum_sgd(params, lr=learning_rate_schedule( 1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)) ] @pytest.mark.parametrize("params, expectation", LR_SCHEDULE_PARAMS) def test_learning_rate_schedule(params, expectation): l = learning_rate_schedule(*params) assert [l[i] for i in range(len(expectation))] == expectation def sweep_based_schedule_fails(): with pytest.raises(Exception): learning_rate_schedule([1], unit=UnitType.sample, epoch_size=0)
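# A hedged sketch, not part of the original snippets, of how one of the learner
# factories listed above could be exercised against a throwaway parameter; the
# parameter shape and rates below are invented for illustration.
import cntk as C

w = C.parameter(shape=(2,), init=1)

factory = lambda params: C.momentum_sgd(
    params,
    lr=C.learning_rate_schedule(1, C.UnitType.minibatch),
    momentum=C.momentum_schedule(0.9))

learner = factory([w])
print(learner.learning_rate())   # 1.0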
def train_and_evaluate(reader_train, reader_test, max_epochs, model_func): # Input variables denoting the features and label data input_var = input_variable((num_channels, image_height, image_width)) label_var = input_variable((num_classes)) # Normalize the input feature_scale = 1.0 / 256.0 input_var_norm = element_times(feature_scale, input_var) # apply model to input z = model_func(input_var_norm, out_dims=num_classes) # # Training action # # loss and metric ce = cross_entropy_with_softmax(z, label_var) pe = classification_error(z, label_var) # training config epoch_size = 20000 minibatch_size = 64 # Set training parameters lr_per_minibatch = learning_rate_schedule([0.01]*10 + [0.003]*10 + [0.001], UnitType.minibatch, epoch_size) momentum_time_constant = momentum_as_time_constant_schedule(-minibatch_size/np.log(0.9)) l2_reg_weight = 0.001 # trainer object progress_printer = ProgressPrinter(0) learner = momentum_sgd(z.parameters, lr = lr_per_minibatch, momentum = momentum_time_constant, l2_regularization_weight=l2_reg_weight) trainer = Trainer(z, (ce, pe), [learner], [progress_printer]) # define mapping from reader streams to network inputs input_map = { input_var: reader_train.streams.features, label_var: reader_train.streams.labels } log_number_of_parameters(z) ; print() #progress_printer = ProgressPrinter(tag='Training') # perform model training stop_run=False batch_index = 0 plot_data = {'batchindex':[], 'loss':[], 'error':[]} for epoch in range(max_epochs): # loop over epochs sample_count = 0 while sample_count < epoch_size: # loop over minibatches in the epoch data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map) # fetch minibatch. trainer.train_minibatch(data) # update model with it sample_count += data[label_var].num_samples # count samples processed so far # For visualization... plot_data['batchindex'].append(batch_index) plot_data['loss'].append(trainer.previous_minibatch_loss_average) plot_data['error'].append(trainer.previous_minibatch_evaluation_average) progress_printer.update_with_trainer(trainer, with_metric=True) # log progress batch_index += 1 if trainer.previous_minibatch_evaluation_average < 0.025: stop_run=True break if stop_run: break progress_printer.epoch_summary(with_metric=True) #trainer.save_checkpoint(model_temp_file) # # Evaluation action # epoch_size = 6600 minibatch_size = 32 # process minibatches and evaluate the model metric_numer = 0 metric_denom = 0 sample_count = 0 minibatch_index = 0 input_map = { input_var: reader_test.streams.features, label_var: reader_test.streams.labels } while sample_count < epoch_size: current_minibatch = min(minibatch_size, epoch_size - sample_count) # Fetch next test min batch. data = reader_test.next_minibatch(current_minibatch, input_map=input_map) # minibatch data to be trained with metric_numer += trainer.test_minibatch(data) * current_minibatch metric_denom += current_minibatch # Keep track of the number of samples processed so far. sample_count += data[label_var].num_samples minibatch_index += 1 print("") print("Final Results: Minibatch[1-{}]: errs = {:0.1f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom)) print("") # Visualize training result: window_width = 32 loss_cumsum = np.cumsum(np.insert(plot_data['loss'], 0, 0)) error_cumsum = np.cumsum(np.insert(plot_data['error'], 0, 0)) # Moving average. 
plot_data['batchindex'] = np.insert(plot_data['batchindex'], 0, 0)[window_width:] plot_data['avg_loss'] = (loss_cumsum[window_width:] - loss_cumsum[:-window_width]) / window_width plot_data['avg_error'] = (error_cumsum[window_width:] - error_cumsum[:-window_width]) / window_width plt.figure(1) plt.subplot(211) plt.plot(plot_data["batchindex"], plot_data["avg_loss"], 'b--') plt.xlabel('Minibatch number') plt.ylabel('Loss') plt.title('Minibatch run vs. Training loss ') plt.show() plt.subplot(212) plt.plot(plot_data["batchindex"], plot_data["avg_error"], 'r--') plt.xlabel('Minibatch number') plt.ylabel('Label Prediction Error') plt.title('Minibatch run vs. Label Prediction Error ') plt.show() return softmax(z)
print(str_out) assert False if __name__=='__main__': in1 = C.input_variable(shape=1) labels = C.input_variable(shape=1) p1 = parameter(shape=1) p2 = parameter(shape=1) n = plus(in1, p1, name='n') z = plus(n, p2, name='z') ce = squared_error(z, labels) momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589) lr_per_sample = C.learning_parameter_schedule_per_sample(0.007) dist_learners = [ C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p1], lr_per_sample, momentum_schedule, True)), C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p2], lr_per_sample, momentum_schedule, True)) ] trainer = C.Trainer(z, ce, dist_learners) in1_value = [[1]] label_value = [[0]] arguments = {in1: in1_value, labels: label_value} z_output = z.output def check_samples(learners, expected_number_of_samples): for learner in learners: if learner.total_number_of_samples_seen != expected_number_of_samples: print("Completed with exception.") raise ValueError("%d samples expected, got %d" % (expected_number_of_samples, learner.total_number_of_samples_seen))
def __train_cntk(self, path_to_folder: str, model_definition, epochs: int, output_model_path: str, classes, minibatch_size: int): import cntk from cntk.learners import learning_parameter_schedule from cntk.ops import input_variable from cntk.io import MinibatchSource, ImageDeserializer, StreamDefs, StreamDef, MinibatchData, UserDeserializer import cntk.io.transforms as xforms from cntk.layers import default_options, Dense, Sequential, Activation, Embedding, Convolution2D, MaxPooling, Stabilizer, Convolution, Dropout, BatchNormalization from cntk.ops.functions import CloneMethod from cntk.logging import ProgressPrinter from cntk.losses import cross_entropy_with_softmax from cntk import classification_error, softmax, relu, ModelFormat, element_times, momentum_schedule, momentum_sgd import pandas as pd path_to_folder = path_to_folder.rstrip('/') map_file_train = path_to_folder + "/train_map.txt" map_file_test = path_to_folder + "/test_map.txt" classes_set = set() num_train = 0 num_test = 0 num_channels = 3 class TrackDataset(UserDeserializer): def __init__(self, map_file, streams, chunksize=100): super(TrackDataset, self).__init__() self._batch_size = chunksize self.dataframes = pd.read_csv(map_file, sep='\t', dtype=str, header=None, names=["features", "labels"]) self._streams = [ cntk.io.StreamInformation(s['name'], i, 'dense', np.float32, s['shape']) for i, s in enumerate(streams) ] self._num_chunks = int( math.ceil(len(self.dataframes) / chunksize)) def _scale_image(self, image, width=224, height=168): try: return image.resize((width, height), Image.LINEAR) except: raise Exception('scale_image error') def stream_infos(self): return self._streams def num_chunks(self): return self._num_chunks def get_chunk(self, chunk_id): images = [] labels = [] maximum = (chunk_id + 1) * self._batch_size if (maximum > len(self.dataframes)): maximum = len(self.dataframes) for i in range(chunk_id * self._batch_size, maximum): img_name = self.dataframes.iloc[i, 0] image = Image.open(img_name) cl = self.dataframes.iloc[i, 1:].values[0] image = self._scale_image(image) image = np.moveaxis((np.array(image).astype('float32')), -1, 0) image -= np.mean(image, keepdims=True) image /= (np.std(image, keepdims=True) + 1e-6) images.append(image) yv = np.zeros(num_classes) yv[classes.index(cl)] = 1 labels.append(yv) result = {} features = np.array(images) lab = np.array(labels).astype('float32') result[self._streams[0].m_name] = features result[self._streams[1].m_name] = lab return result try: with open(map_file_train) as f: csv_reader = csv.reader(f, delimiter='\t') for row in csv_reader: cmd = row[1] classes_set.add(cmd) num_train = num_train + 1 except Exception as e: raise Exception( "No train_map.txt file found in path " + path_to_folder + ". 
Did you create a dataset using create_balanced_dataset()?") num_classes = len(classes) with open(map_file_test) as f: for num_test, l in enumerate(f): pass # transforms = [ # xforms.scale(width=self.__image_width, height=self.__image_height, channels=num_channels, interpolations='linear'), # xforms.mean(mean_file) # ] dataset_train = TrackDataset(map_file=map_file_train, streams=[ dict(name='features', shape=(num_channels, self.__image_height, self.__image_width)), dict(name='labels', shape=(num_classes, )) ]) reader_train = MinibatchSource([dataset_train], randomize=True) # a = dataset_train.num_chunks() dataset_test = TrackDataset(map_file=map_file_test, streams=[ dict(name='features', shape=(num_channels, self.__image_height, self.__image_width)), dict(name='labels', shape=(num_classes, )) ]) reader_test = MinibatchSource([dataset_test], randomize=True) # ImageDeserializer loads images in the BGR format, not RGB # reader_train = MinibatchSource(ImageDeserializer(map_file_train, StreamDefs( # features = StreamDef(field='image', transforms=transforms), # labels = StreamDef(field='label', shape=num_classes) # ))) # reader_test = MinibatchSource(ImageDeserializer(map_file_test, StreamDefs( # features = StreamDef(field='image', transforms=transforms), # labels = StreamDef(field='label', shape=num_classes) # ))) # mb = reader_train.next_minibatch(10) input_var = input_variable( (num_channels, self.__image_height, self.__image_width)) label_var = input_variable((num_classes)) model = model_definition(input_var) ce = cross_entropy_with_softmax(model, label_var) pe = classification_error(model, label_var) epoch_size = num_train lr_per_minibatch = learning_parameter_schedule([0.01] * 10 + [0.003] * 10 + [0.001], epoch_size=epoch_size) momentums = momentum_schedule(0.9, minibatch_size=minibatch_size) l2_reg_weight = 0.001 learner = momentum_sgd(model.parameters, lr=lr_per_minibatch, momentum=momentums, l2_regularization_weight=l2_reg_weight) progress_printer = ProgressPrinter(tag='Training', num_epochs=epochs) trainer = cntk.train.Trainer(model, (ce, pe), [learner], [progress_printer]) input_map = { input_var: reader_train.streams.features, label_var: reader_train.streams.labels } print("Training started") batch_index = 0 plot_data = {'batchindex': [], 'loss': [], 'error': []} for epoch in range(epochs): sample_count = 0 while sample_count < epoch_size: data: MinibatchSource = reader_train.next_minibatch( min(minibatch_size, epoch_size - sample_count), input_map=input_map) trainer.train_minibatch(data) sample_count += data[label_var].num_samples batch_index += 1 plot_data['batchindex'].append(batch_index) plot_data['loss'].append( trainer.previous_minibatch_loss_average) plot_data['error'].append( trainer.previous_minibatch_evaluation_average) trainer.summarize_training_progress() metric_numer = 0 metric_denom = 0 sample_count = 0 minibatch_index = 0 epoch_size = num_test while sample_count < epoch_size: current_minibatch = min(minibatch_size, epoch_size - sample_count) data = reader_test.next_minibatch(current_minibatch, input_map=input_map) metric_numer += trainer.test_minibatch(data) * current_minibatch metric_denom += current_minibatch sample_count += data[label_var].num_samples minibatch_index += 1 print("") print("Final Results: Minibatch[1-{}]: errs = {:0.1f}% * {}".format( minibatch_index + 1, (metric_numer * 100.0) / metric_denom, metric_denom)) print("") model.save(output_model_path, format=ModelFormat.ONNX)
def test_learner_init():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))
    res = i * w

    # Test the new API: learning_parameter_schedule.

    # Reference minibatch size given explicitly; the learning rate is a plain number:
    learner = sgd(res.parameters, lr=0.1, minibatch_size=25)
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == 25  # the learner's reference minibatch size
    # With a plain-number learning rate, the schedule picks up the reference minibatch size from the learner:
    assert learner._learning_rate_schedule.minibatch_size == 25
    assert learner.learning_rate() == 0.1

    # No explicit reference minibatch size; the learning rate is a schedule:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1))
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    # The schedule carries its own reference minibatch size, independent of the learner's:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20), minibatch_size=25)
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == 25  # the learner's reference minibatch size
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20))
    assert learner.is_compatible_mode() == False
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    # No explicit reference minibatch size; the learning rate is a schedule:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1))
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    # minibatch_size=C.learners.IGNORE puts the learner into compatible mode:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    # Every learner factory accepts a plain-number learning rate plus a reference minibatch size:
    mysgd = C.sgd(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert mysgd.minibatch_size == 32
    assert mysgd._learning_rate_schedule.minibatch_size == 32
    assert mysgd.learning_rate() == 0.4

    mymomentum = C.momentum_sgd(parameters=res.parameters, lr=0.4, momentum=0.9, minibatch_size=32)
    assert mymomentum.minibatch_size == 32
    assert mymomentum._learning_rate_schedule.minibatch_size == 32
    assert mymomentum.learning_rate() == 0.4

    myadadelta = C.adadelta(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert myadadelta.minibatch_size == 32
    assert myadadelta._learning_rate_schedule.minibatch_size == 32
    assert myadadelta.learning_rate() == 0.4

    myadam = C.adam(parameters=res.parameters, lr=0.4, momentum=0.9, variance_momentum=0.9, minibatch_size=32)
    assert myadam.minibatch_size == 32
    assert myadam._learning_rate_schedule.minibatch_size == 32
    assert myadam.learning_rate() == 0.4

    myadagrad = C.adagrad(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert myadagrad.minibatch_size == 32
    assert myadagrad._learning_rate_schedule.minibatch_size == 32
    assert myadagrad.learning_rate() == 0.4

    myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=0.4, momentum=0.9, variance_momentum=0.9, minibatch_size=32)
    assert myfsadagrad.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule.minibatch_size == 32
    assert myfsadagrad.learning_rate() == 0.4

    mynesterov = C.nesterov(parameters=res.parameters, lr=0.4, momentum=0.9, minibatch_size=32)
    assert mynesterov.minibatch_size == 32
    assert mynesterov._learning_rate_schedule.minibatch_size == 32
    assert mynesterov.learning_rate() == 0.4

    myrmsprop = C.rmsprop(parameters=res.parameters, lr=0.4, gamma=0.5, inc=1.2, dec=0.7, max=10, min=1e-8, minibatch_size=32)
    assert myrmsprop.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule.minibatch_size == 32
    assert myrmsprop.learning_rate() == 0.4

    # A list learning rate plus epoch_size defines a piecewise-constant schedule:
    mysgd = C.sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512)
    assert mysgd.minibatch_size == 32
    assert mysgd._learning_rate_schedule.minibatch_size == 32
    assert mysgd._learning_rate_schedule[0] == 0.4
    assert mysgd._learning_rate_schedule[512] == 0.1
    assert mysgd._learning_rate_schedule[512 * 2] == 0.001

    mymomentum = C.momentum_sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], minibatch_size=32, epoch_size=512)
    assert mymomentum.minibatch_size == 32
    assert mymomentum._learning_rate_schedule.minibatch_size == 32
    assert mymomentum._learning_rate_schedule[0] == 0.4
    assert mymomentum._learning_rate_schedule[512] == 0.1
    assert mymomentum._learning_rate_schedule[512 * 2] == 0.001

    myadadelta = C.adadelta(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512)
    assert myadadelta.minibatch_size == 32
    assert myadadelta._learning_rate_schedule.minibatch_size == 32
    assert myadadelta._learning_rate_schedule[0] == 0.4
    assert myadadelta._learning_rate_schedule[512] == 0.1
    assert myadadelta._learning_rate_schedule[512 * 2] == 0.001

    myadam = C.adam(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9, 0.1, 0.001], variance_momentum=[0.9], minibatch_size=32, epoch_size=512)
    assert myadam.minibatch_size == 32
    assert myadam._learning_rate_schedule.minibatch_size == 32
    assert myadam._learning_rate_schedule[0] == 0.4
    assert myadam._learning_rate_schedule[512] == 0.1
    assert myadam._learning_rate_schedule[512 * 2] == 0.001

    myadagrad = C.adagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512)
    assert myadagrad.minibatch_size == 32
    assert myadagrad._learning_rate_schedule.minibatch_size == 32
    assert myadagrad._learning_rate_schedule[0] == 0.4
    assert myadagrad._learning_rate_schedule[512] == 0.1
    assert myadagrad._learning_rate_schedule[512 * 2] == 0.001

    myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], variance_momentum=[0.9], minibatch_size=32, epoch_size=512)
    assert myfsadagrad.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule[0] == 0.4
    assert myfsadagrad._learning_rate_schedule[512] == 0.1
    assert myfsadagrad._learning_rate_schedule[512 * 2] == 0.001

    mynesterov = C.nesterov(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], minibatch_size=32, epoch_size=512)
    assert mynesterov.minibatch_size == 32
    assert mynesterov._learning_rate_schedule.minibatch_size == 32
    assert mynesterov._learning_rate_schedule[0] == 0.4
    assert mynesterov._learning_rate_schedule[512] == 0.1
    assert mynesterov._learning_rate_schedule[512 * 2] == 0.001

    myrmsprop = C.rmsprop(parameters=res.parameters, lr=[0.4, 0.1, 0.001], gamma=0.5, inc=1.2, dec=0.7, max=10, min=1e-8, minibatch_size=32, epoch_size=512)
    assert myrmsprop.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule[0] == 0.4
    assert myrmsprop._learning_rate_schedule[512] == 0.1
    assert myrmsprop._learning_rate_schedule[512 * 2] == 0.001

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    momentum = C.momentum_schedule(0.999, minibatch_size=1)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size=1)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum)
    C.nesterov(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum, unit_gain=unit_gain_value)

    lr_per_sample = learning_parameter_schedule([0.1] * 3 + [0.2] * 2 + [0.3], minibatch_size=1)
    C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True)

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_parameter_schedule([(3, 0.1), (2, 0.2), (1, 0.3)], minibatch_size=1)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum)
    C.fsadagrad(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum, unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size=1, epoch_size=100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)
    C.adadelta(res.parameters, lr_per_sample)
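# A standalone sketch of what the learners above do with a gradient, using only calls that
# appear in these tests; the parameter shape and gradient values are illustrative assumptions.
import numpy as np
import cntk as C

w = C.parameter(shape=(2,), init=0.0)
learner = C.sgd([w], lr=0.1, minibatch_size=1)   # reference minibatch size of one sample
grad = np.asarray([1.0, -1.0], dtype=np.float32)
learner.update({w: grad}, 1)                     # apply one single-sample minibatch
print(w.value)                                   # w moved against the gradient direction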
print(str_out)
assert False

if __name__ == '__main__':
    in1 = C.input_variable(shape=1)
    labels = C.input_variable(shape=1)
    p1 = parameter(shape=1)
    p2 = parameter(shape=1)
    n = plus(in1, p1, name='n')
    z = plus(n, p2, name='z')
    ce = squared_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample)
    dist_learners = [
        C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p1], lr_per_sample, momentum_time_constant, True)),
        C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p2], lr_per_sample, momentum_time_constant, True))
    ]

    trainer = C.Trainer(z, ce, dist_learners)
    in1_value = [[1]]
    label_value = [[0]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output

    def check_samples(learners, expected_number_of_samples):
        for learner in learners:
            if learner.total_number_of_samples_seen != expected_number_of_samples:
                print("Completed with exception.")
                raise ValueError("%d samples expected, got %d" %
                                 (expected_number_of_samples, learner.total_number_of_samples_seen))
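# The check above relies on the learner's own sample bookkeeping. A self-contained sketch of that
# bookkeeping (the parameter, gradient, and loop count below are illustrative, not from the snippet):
import numpy as np
import cntk as C

w = C.parameter(shape=(1,), init=0.0)
learner = C.sgd([w], lr=C.learning_rate_schedule(0.007, C.UnitType.sample))
for _ in range(3):
    learner.update({w: np.asarray([0.1], dtype=np.float32)}, 1)  # one sample per call
assert learner.total_number_of_samples_seen == 3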
def train(reader_train, reader_test, samples_per_epoch, max_amount_of_epochs, samples_per_minibatch,
          dimensions, classes, learning_rate, output_directory, with_tf):
    features = input_variable(shape=(dimensions['depth'], dimensions['height'], dimensions['width']))
    label = input_variable(shape=len(classes))

    # Scaling pixel values to [0, 1) speeds up training.
    normalized_features = element_times(1.0 / 256.0, features)

    if with_tf:
        base_model = {
            'model_file': os.path.join("..", "..", "Pretrained Models/ResNet_18.model"),
            'feature_node_name': 'features',
            'last_hidden_node_name': 'z.x',
            'image_dims': (3, 224, 224)
        }
        model = create_tf_model(base_model, num_classes=len(classes), input_features=normalized_features, freeze=True)
    else:
        model = create_model(feature_dimensions=normalized_features, classes=classes)

    loss = cross_entropy_with_softmax(model, label)
    metric = classification_error(model, label)

    learner = momentum_sgd(parameters=model.parameters,
                           lr=learning_rate_schedule(learning_rate, UnitType.minibatch),
                           momentum=0.9,
                           l2_regularization_weight=0.0005)
    reporter = ProgressPrinter(tag='training', num_epochs=max_amount_of_epochs)
    trainer = Trainer(model=model, criterion=(loss, metric), parameter_learners=[learner], progress_writers=[reporter])
    log_number_of_parameters(model)

    map_input_to_streams_train = {
        features: reader_train.streams.features,
        label: reader_train.streams.labels
    }
    map_input_to_streams_test = {
        features: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    training_session(
        trainer=trainer,
        mb_source=reader_train,
        model_inputs_to_streams=map_input_to_streams_train,
        mb_size=samples_per_minibatch,
        progress_frequency=samples_per_epoch,
        checkpoint_config=CheckpointConfig(frequency=samples_per_epoch,
                                           filename=os.path.join(output_directory, "ConvNet_Lego_VisiOn"),
                                           restore=False),
        test_config=TestConfig(reader_test,
                               minibatch_size=samples_per_minibatch,
                               model_inputs_to_streams=map_input_to_streams_test)).train()

    network = {'features': features, 'label': label, 'model': softmax(model)}
    return network
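# A quick, self-contained check of the input scaling used in train() above; the sample pixel
# values are illustrative.
import numpy as np
import cntk as C

x = C.input_variable(shape=(3,))
scaled = C.element_times(1.0 / 256.0, x)
print(scaled.eval({x: np.asarray([[0.0, 128.0, 255.0]], dtype=np.float32)}))  # ~[0.0, 0.5, 0.996]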
def test_learner_init():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))
    res = i * w

    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample))
    assert learner.learning_rate() == 0.1

    learner.reset_learning_rate(learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    lr_per_sample = learning_rate_schedule([0.1, 0.2], UnitType.sample)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant)
    C.nesterov(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant, unit_gain=unit_gain_value)

    lr_per_sample = learning_rate_schedule([0.1] * 3 + [0.2] * 2 + [0.3], UnitType.sample)
    C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True)

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_rate_schedule([(3, 0.1), (2, 0.2), (1, 0.3)], UnitType.sample)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant)
    C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant, unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.1] * 5
    lr_per_sample = learning_rate_schedule([0.1, 0.2], UnitType.sample, 100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.set_default_use_mean_gradient_value(False)
    use_mean_gradient_value = C.default_use_mean_gradient_value()
    assert not use_mean_gradient_value
    C.adadelta(res.parameters, lr_per_sample)

    C.set_default_use_mean_gradient_value(True)
    use_mean_gradient_value = C.default_use_mean_gradient_value()
    assert use_mean_gradient_value
    C.adadelta(res.parameters, lr_per_sample)
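# The two test_learner_init variants above exercise the legacy UnitType-based schedules and the
# newer learning_parameter_schedule. A minimal side-by-side sketch, assuming the same module-level
# imports as the tests and that the legacy per-sample schedule reports minibatch_size == 1 (as the
# legacy test later in this file asserts):
legacy = learning_rate_schedule(0.1, UnitType.sample)
current = learning_parameter_schedule(0.1, minibatch_size=1)
assert legacy[0] == current[0] == 0.1
assert legacy.minibatch_size == current.minibatch_size == 1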
def train_model(model_details, num_classes, train_map_file, learning_params, max_images=-1):
    num_epochs = learning_params["max_epochs"]
    epoch_size = sum(1 for _ in open(train_map_file))
    if max_images > 0:
        epoch_size = min(epoch_size, max_images)
    mini_batch_size = learning_params["mb_size"]

    # Create the minibatch source and input variables
    mini_batch_source = create_mb_source(train_map_file, model_details["image_dims"], num_classes)
    image_input = cntk.input_variable(model_details["image_dims"])
    label_input = cntk.input_variable(num_classes)

    # Define mapping from reader streams to network inputs
    input_map = {
        image_input: mini_batch_source["features"],
        label_input: mini_batch_source["labels"],
    }

    # Instantiate the transfer learning model and loss function
    tl_model = create_model(model_details, num_classes, image_input, freeze=learning_params["freeze_weights"])
    ce = cntk.cross_entropy_with_softmax(tl_model, label_input)
    pe = cntk.classification_error(tl_model, label_input)

    # Instantiate the trainer object
    lr_schedule = cntk.learning_parameter_schedule(learning_params["lr_per_mb"])
    mm_schedule = cntk.momentum_schedule(learning_params["momentum_per_mb"])
    learner = cntk.momentum_sgd(tl_model.parameters, lr_schedule, mm_schedule,
                                l2_regularization_weight=learning_params["l2_reg_weight"])
    trainer = cntk.Trainer(tl_model, (ce, pe), [learner])

    # Get minibatches of images and perform model training
    print("Training transfer learning model for {0} epochs (epoch_size = {1}).".format(num_epochs, epoch_size))
    cntk.logging.log_number_of_parameters(tl_model)
    progress_printer = cntk.logging.ProgressPrinter(tag="Training", num_epochs=num_epochs)

    # Loop over epochs
    for epoch in range(num_epochs):
        sample_count = 0
        # Loop over minibatches in the epoch
        while sample_count < epoch_size:
            data = mini_batch_source.next_minibatch(min(mini_batch_size, epoch_size - sample_count), input_map=input_map)
            # Update the model with the current minibatch
            trainer.train_minibatch(data)
            # Count samples processed so far
            sample_count += trainer.previous_minibatch_sample_count
            progress_printer.update_with_trainer(trainer, with_metric=True)
            if sample_count % (100 * mini_batch_size) == 0:
                print("Processed {0} samples".format(sample_count))
        progress_printer.epoch_summary(with_metric=True)

    return tl_model
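# For reference, a hypothetical learning_params dictionary matching the keys train_model() reads
# above; the concrete values are illustrative only.
learning_params = {
    "max_epochs": 5,
    "mb_size": 64,
    "lr_per_mb": [0.01] * 3 + [0.001],
    "momentum_per_mb": 0.9,
    "l2_reg_weight": 0.0005,
    "freeze_weights": True,
}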
MOMENTUM_SCHEDULE_PARAMS = [
    ((0.2,), [0.2]),
    ((0.2,), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4), (1, 0.8)], 5), [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params),
    lambda params: C.adagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8),
    lambda params: C.sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.momentum_sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9))]


@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS_LEGACY)
def test_learning_rate_schedule(params, expectation, minibatch_size):
    l = learning_rate_schedule(*params)
    assert l.minibatch_size == minibatch_size
    assert [l[i] for i in range(len(expectation))] == expectation


@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS)
def test_learning_parameter_schedule(params, expectation, minibatch_size):
    l = learning_parameter_schedule(*params)
    assert l.minibatch_size == minibatch_size
    assert [l[i] for i in range(len(expectation))] == expectation


def sweep_based_schedule_fails():
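# A hedged sketch of how the LEARNER_LAMBDAS factories above are typically consumed in a
# parametrized test; the test name and body below are assumptions, not part of the suite.
@pytest.mark.parametrize("learner_factory", LEARNER_LAMBDAS)
def test_learner_factory_builds_learner(learner_factory):
    w = C.parameter(shape=(1,), init=1.0)
    learner = learner_factory([w])
    assert isinstance(learner, C.Learner)
    learner.update({w: np.asarray([0.5], dtype=np.float32)}, 1)  # one-sample update succeeds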
# training config
epoch_size = 6600  # 12000 # 15000
minibatch_size = 64

# Set training parameters
lr_per_minibatch = learning_rate_schedule([0.01] * 10 + [0.003] * 10 + [0.001], UnitType.minibatch, epoch_size)
momentum_time_constant = momentum_as_time_constant_schedule(-minibatch_size / np.log(0.9))
l2_reg_weight = 0.001

# trainer object
progress_printer = ProgressPrinter(0)
learner = momentum_sgd(z.parameters,
                       lr=lr_per_minibatch,
                       momentum=momentum_time_constant,
                       l2_regularization_weight=l2_reg_weight)

# =============================================================================
# Create or RESTORE trainer
# =============================================================================
trainer = Trainer(z, (ce, pe), [learner], [progress_printer])
# trainer.restore_from_checkpoint(model_temp_file)

# define mapping from reader streams to network inputs
input_map = {
    input_var: reader_train.streams.features,
    label_var: reader_train.streams.labels
}
# progress_printer = ProgressPrinter(tag='Training')
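# Quick check of the momentum time-constant conversion used above: a per-minibatch momentum of
# 0.9 corresponds to a time constant of -minibatch_size / ln(0.9), since the per-sample momentum
# exp(-1/tau) compounds over minibatch_size samples. Standalone sketch:
import numpy as np

minibatch_size = 64
tau = -minibatch_size / np.log(0.9)
per_sample_momentum = np.exp(-1.0 / tau)
assert np.isclose(per_sample_momentum ** minibatch_size, 0.9)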
def test_learner_init_legacy():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))
    res = i * w

    # Back-compatibility test; the UnitType-based schedules will be deprecated in a future version.
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample))
    assert learner._learning_rate_schedule.minibatch_size == 1  # the deprecated per-sample schedule should not use compatible mode
    assert learner.learning_rate() == 0.1

    # Back-compatibility test: UnitType.minibatch instructs the learner to apply the rate per minibatch.
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.minibatch))
    assert learner.is_compatible_mode() == False
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == 0

    # Back-compatibility test: on reset_learning_rate the learner does not pick up the reference
    # minibatch size from the schedule; the user has to set it explicitly.
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    # Back-compatibility test
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size is still 0

    # This is a logically invalid combination, but it was the only way to use the mean gradient
    # and set a learning rate in the past; it will be deprecated in a future version.
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample), use_mean_gradient=True)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    # test the override in the new version
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size is still 0

    # Back-compatibility test: UnitType.minibatch plus an explicit IGNORE reference minibatch size.
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.minibatch), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE

    # Back-compatibility test: same reset_learning_rate behaviour as above.
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    # Backward-compatible API test
    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_rate_schedule([(3, 0.1), (2, 0.2), (1, 0.3)], unit=UnitType.sample)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant)
    C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant, unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_rate_schedule([0.1, 0.2], unit=UnitType.sample, epoch_size=100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)
    C.adadelta(res.parameters, lr_per_sample, use_mean_gradient=True)
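# A minimal sketch of the two routes into "compatible mode" exercised above: passing
# use_mean_gradient=True with a per-sample schedule, or passing minibatch_size=C.learners.IGNORE.
# Assumes the same module-level imports as the tests above.
w = C.parameter(shape=(1,))
a = C.sgd([w], lr=learning_rate_schedule(0.1, UnitType.sample), use_mean_gradient=True)
b = C.sgd([w], lr=learning_rate_schedule(0.1, UnitType.minibatch), minibatch_size=C.learners.IGNORE)
assert a.is_compatible_mode() and b.is_compatible_mode()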
MOMENTUM_SCHEDULE_PARAMS = [
    ((0.2,), [0.2]),
    ((0.2,), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4), (1, 0.8)], 5), [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params),
    lambda params: C.adagrad(params, lr=learning_parameter_schedule(1)),
    lambda params: C.adam(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params, lr=learning_parameter_schedule(1), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8),
    lambda params: C.sgd(params, lr=learning_parameter_schedule(1)),
    lambda params: C.momentum_sgd(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9))]


@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS_LEGACY)
def test_learning_rate_schedule(params, expectation, minibatch_size):
    l = learning_rate_schedule(*params)
    assert l.minibatch_size == minibatch_size
    assert [l[i] for i in range(len(expectation))] == expectation


@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS)
def test_learning_parameter_schedule(params, expectation, minibatch_size):
    l = learning_parameter_schedule(*params)
    assert l.minibatch_size == minibatch_size
    assert [l[i] for i in range(len(expectation))] == expectation


def sweep_based_schedule_fails():
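# A hedged sketch of how MOMENTUM_SCHEDULE_PARAMS would be consumed, mirroring the schedule tests
# above; the test name and body are assumptions (each params tuple is unpacked into
# C.momentum_schedule, whose second element is taken to be the epoch size).
@pytest.mark.parametrize("params, expectation", MOMENTUM_SCHEDULE_PARAMS)
def test_momentum_schedule_values(params, expectation):
    m = C.momentum_schedule(*params)
    assert [m[i] for i in range(len(expectation))] == expectation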