def create_trainer(network, epoch_size, num_quantization_bits, warm_up, progress_writers):
    """Create a Trainer for the network, distributed across workers when available.

    Args:
        network: dict with 'output', 'ce' (loss) and 'pe' (metric) nodes.
        epoch_size: unused here; kept for a uniform create_trainer signature.
        num_quantization_bits: gradient quantization for data-parallel SGD.
        warm_up: samples trained locally before parallel training starts.
        progress_writers: progress writers passed to the Trainer.

    Returns:
        A cntk.Trainer wired to an Adam learner (distributed when >1 worker).
    """
    print('Creating the trainer.')
    # Differential Learning rate scheduler
    lr_schedule = C.learning_rate_schedule([2.5], unit=C.UnitType.minibatch)
    mm_schedule = C.momentum_schedule(0.9)
    l2_reg_weight = 0.001
    # Create the Adam learner; unit_gain=False keeps classic momentum semantics.
    learner = C.adam(network['output'].parameters, lr_schedule, mm_schedule,
                     l2_regularization_weight=l2_reg_weight, unit_gain=False)
    # Compute the number of workers
    num_workers = C.distributed.Communicator.num_workers()
    print('Number of workers: {}'.format(num_workers))
    if num_workers > 1:
        # Fix: forward warm_up as distributed_after; the original accepted the
        # parameter but silently ignored it (cf. the other create_trainer
        # variants in this file, which pass it through).
        parameter_learner = C.train.distributed.data_parallel_distributed_learner(
            learner,
            num_quantization_bits=num_quantization_bits,
            distributed_after=warm_up)
        trainer = C.Trainer(network['output'], (network['ce'], network['pe']),
                            parameter_learner, progress_writers)
    else:
        trainer = C.Trainer(network['output'], (network['ce'], network['pe']),
                            learner, progress_writers)
    return trainer
def train(reader, model_func, max_epochs=10):
    """Train the model produced by model_func on minibatches from reader.

    Relies on module-level input/label variables `x` and `y`.

    Args:
        reader: minibatch source exposing `query` and `slot_labels` streams.
        model_func: callable mapping the input variable `x` to the model output.
        max_epochs: number of epochs (epoch = epoch_size samples) to run.
    """
    # Instantiate the model function; x is the input (feature) variable
    model = model_func(x)
    # Instantiate the loss and error function
    loss, label_error = create_criterion_function_preferred(model, y)
    # training config
    epoch_size = 18000  # 18000 samples is half the dataset size
    minibatch_size = 70
    # LR schedule over epochs
    # In CNTK, an epoch is how often we get out of the minibatch loop to
    # do other stuff (e.g. checkpointing, adjust learning rate, etc.)
    # (we don't run this many epochs, but if we did, these are good values)
    lr_per_sample = [0.003] * 4 + [0.0015] * 24 + [0.0003]
    # Convert per-sample rates to equivalent per-minibatch rates.
    lr_per_minibatch = [lr * minibatch_size for lr in lr_per_sample]
    lr_schedule = C.learning_rate_schedule(lr_per_minibatch, C.UnitType.minibatch, epoch_size)
    # Momentum schedule
    momentum_as_time_constant = C.momentum_as_time_constant_schedule(700)
    # We use a the Adam optimizer which is known to work well on this dataset
    # Feel free to try other optimizers from
    # https://www.cntk.ai/pythondocs/cntk.learner.html#module-cntk.learner
    learner = C.sgd(
        parameters=model.parameters,
        lr=lr_schedule,
        #momentum=momentum_as_time_constant,
        gradient_clipping_threshold_per_sample=15,
        gradient_clipping_with_truncation=True)
    # Setup the progress updater
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    # Uncomment below for more detailed logging
    #progress_printer = ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs)
    # Instantiate the trainer
    trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)
    # process minibatches and perform model training
    C.logging.log_number_of_parameters(model)
    t = 0  # running total of samples processed, carried across epochs
    for epoch in range(max_epochs):  # loop over epochs
        epoch_end = (epoch + 1) * epoch_size
        while t < epoch_end:  # loop over minibatches on the epoch
            data = reader.next_minibatch(
                minibatch_size,
                input_map={  # fetch minibatch
                    x: reader.streams.query,
                    y: reader.streams.slot_labels
                })
            trainer.train_minibatch(data)  # update model with it
            t += data[y].num_samples  # samples so far
        trainer.summarize_training_progress()
def create_distributed_learner(self, mode, config):
    """Create a distributed learner wrapping a local SGD learner over self.z.

    Args:
        mode: 'data_parallel', 'block_momentum', or anything else for the
            plain local (non-distributed) learner.
        config: optional DataParallelConfig / BlockMomentumConfig; a default
            is substituted when None.

    Returns:
        The learner, or None if CNTK raises RuntimeError (e.g. the build does
        not support the requested distributed mode).
    """
    local_learner = C.sgd(
        self.z.parameters,
        C.learning_rate_schedule(0.01, unit=C.learners.UnitType.sample))
    try:
        if mode == 'data_parallel':
            if config is None:
                config = DataParallelConfig(num_quantization_bits=32, distributed_after=0)
            learner = C.data_parallel_distributed_learner(
                local_learner,
                num_quantization_bits=config.num_quantization_bits,
                distributed_after=config.distributed_after)
        elif mode == 'block_momentum':
            if config is None:
                # the default config to match data parallel SGD
                config = BlockMomentumConfig(
                    block_momentum_as_time_constant=0,
                    block_learning_rate=1,
                    block_size=NUM_WORKERS,
                    distributed_after=0)
            learner = C.block_momentum_distributed_learner(
                local_learner,
                block_momentum_as_time_constant=config.block_momentum_as_time_constant,
                block_learning_rate=config.block_learning_rate,
                block_size=config.block_size,
                distributed_after=config.distributed_after)
        else:
            learner = local_learner
    except RuntimeError:
        # Unsupported distributed mode in this CNTK build.
        learner = None
    return learner
def test_usermbsource_training(tmpdir):
    """End-to-end training from a user-defined minibatch source (MyDataSource)."""
    input_dim = 1000
    num_output_classes = 5
    mbs = MyDataSource(input_dim, num_output_classes)
    from cntk import sequence, parameter, plus, cross_entropy_with_softmax, \
        classification_error, learning_rate_schedule, sgd, Trainer, \
        training_session, times, UnitType
    # Feature is a sequence; label is a single (non-sequence) one-hot vector.
    feature = sequence.input_variable(shape=(input_dim, ))
    label = C.input_variable(shape=(num_output_classes, ))
    p = parameter(shape=(input_dim, num_output_classes), init=10)
    z = times(sequence.reduce_sum(feature), p, name='z')
    ce = cross_entropy_with_softmax(z, label)
    errs = classification_error(z, label)
    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])
    input_map = {feature: mbs.fsi, label: mbs.lsi}
    session = training_session(trainer=trainer,
                               mb_source=mbs,
                               model_inputs_to_streams=input_map,
                               mb_size=4,
                               max_samples=20)
    session.train()
    # The session must stop exactly at max_samples.
    assert trainer.total_number_of_samples_seen == 20
def __call__(self, parameters, opt_learning_rate=0.001, **kwargs):
    """Build an Adam learner for *parameters* at the given per-minibatch rate."""
    # Per-minibatch learning-rate schedule plus a fixed momentum of 0.99.
    lr_sched = cntk.learning_rate_schedule(
        lr=opt_learning_rate, unit=cntk.UnitType.minibatch)
    mom_sched = cntk.momentum_schedule(momentum=0.99)
    return cntk.adam_sgd(parameters=parameters, lr=lr_sched, momentum=mom_sched)
def main():
    """Entry point: fetch data, build the model, then train and test per epoch."""
    cntk_info()
    args = arguments()
    get_data_tmp(folder='./data')
    test_data_available(folder='./data')
    model = Net()
    ## CNTK code without formatting
    train_reader = create_reader(ctf_train_file, True, model.input_dim, model.num_output_classes)
    test_reader = create_reader(ctf_test_file, False, model.input_dim, model.num_output_classes)
    # Print the output shapes / parameters of different components
    print("Output Shape of the first convolution layer:", model.z.first_conv.shape)
    print("Bias value of the last dense layer:", model.z.classify.b.value)
    # Number of parameters in the network
    cntk.logging.log_number_of_parameters(model.z)
    # Instantiate the trainer object to drive the model training
    learning_rate = 0.2
    lr_schedule = cntk.learning_rate_schedule(learning_rate, cntk.UnitType.minibatch)
    learner = cntk.sgd(model.z.parameters, lr_schedule)
    trainer = cntk.Trainer(model.z, (model.loss, model.errs), [learner])
    for epoch in range(args.epochs):
        train(epoch, model, train_reader, trainer, args)
        test(epoch, model, test_reader, trainer, args)
def create_trainer(network, epoch_size, num_quantization_bits):
    """Create a data-parallel distributed Trainer with momentum SGD.

    Args:
        network: dict with 'output', 'ce' and 'pe' nodes.
        epoch_size: samples per epoch; steps the LR/momentum schedules.
        num_quantization_bits: gradient quantization for data-parallel SGD.
    """
    # Set learning parameters (per-sample LR schedule stepped by epoch_size)
    lr_per_sample = [0.0015625] * 20 + [0.00046875] * 20 + [
        0.00015625
    ] * 20 + [0.000046875] * 10 + [0.000015625]
    lr_schedule = cntk.learning_rate_schedule(
        lr_per_sample,
        unit=cntk.learner.UnitType.sample,
        epoch_size=epoch_size)
    mm_time_constant = [0] * 20 + [600] * 20 + [1200]
    mm_schedule = cntk.learner.momentum_as_time_constant_schedule(
        mm_time_constant, epoch_size=epoch_size)
    l2_reg_weight = 0.002
    # Create learner
    learner = data_parallel_distributed_learner(
        cntk.learner.momentum_sgd(network['output'].parameters,
                                  lr_schedule,
                                  mm_schedule,
                                  unit_gain=True,
                                  l2_regularization_weight=l2_reg_weight),
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)
    # Create trainer
    # NOTE(review): loss/metric are passed as separate positional args here,
    # unlike the (ce, pe) tuple used elsewhere in this file — confirm this
    # matches the cntk.Trainer signature in use.
    return cntk.Trainer(network['output'], network['ce'], network['pe'], learner)
def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers):
    """Create a distributed trainer (block-momentum or data-parallel SGD).

    Args:
        network: dict with 'output', 'ce' and 'pe' nodes.
        epoch_size: samples per epoch; steps the LR/momentum schedules.
        num_quantization_bits: gradient quantization (32 = none).
        block_size: block-momentum block size; None selects data-parallel SGD.
        warm_up: samples trained locally before parallel training starts.
        progress_writers: progress writers for the Trainer.

    Raises:
        RuntimeError: if block momentum is combined with quantization.
    """
    # Set learning parameters
    lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
    lr_schedule = C.learning_rate_schedule(lr_per_sample, unit=C.learners.UnitType.sample, epoch_size=epoch_size)
    mm_time_constant = [0]*20 + [600]*20 + [1200]
    mm_schedule = C.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # Block momentum and quantization are mutually exclusive.
    # Fix: compare against None with `is not None` instead of `!=` (PEP 8).
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    local_learner = C.learners.momentum_sgd(network['output'].parameters,
                                            lr_schedule, mm_schedule,
                                            l2_regularization_weight=l2_reg_weight)

    if block_size is not None:
        parameter_learner = C.train.distributed.block_momentum_distributed_learner(
            local_learner, block_size=block_size)
    else:
        parameter_learner = C.train.distributed.data_parallel_distributed_learner(
            local_learner,
            num_quantization_bits=num_quantization_bits,
            distributed_after=warm_up)

    # Create trainer
    return C.Trainer(network['output'], (network['ce'], network['pe']),
                     parameter_learner, progress_writers)
def train(streamf):
    """Train the global regression net on `streamf` for 500 epochs; returns the Trainer."""
    global net
    net = nn(input_var)
    # Squared error serves as both the training loss and the evaluation metric.
    sq_loss = cntk.losses.squared_error(net, label_var)
    sq_metric = cntk.squared_error(net, label_var)
    schedule = cntk.learning_rate_schedule(0.01, cntk.UnitType.minibatch)
    time_const = cntk.momentum_as_time_constant_schedule(140 / -np.math.log(0.9))
    learner = cntk.fsadagrad(net.parameters, lr=schedule,
                             momentum=time_const, unit_gain=True)
    printer = cntk.logging.ProgressPrinter(0)
    trainer = cntk.Trainer(net, (sq_loss, sq_metric), [learner],
                           progress_writers=printer)
    stream_map = {
        input_var: streamf.streams.features,
        label_var: streamf.streams.labels,
    }
    minibatch_size = 1024
    epoch_size = 48985
    samples_seen = 0
    for epoch in range(500):  # max epochs
        target = (epoch + 1) * epoch_size
        while samples_seen < target:
            batch = streamf.next_minibatch(minibatch_size, input_map=stream_map)
            trainer.train_minibatch(batch)
            samples_seen += batch[label_var].num_samples
        trainer.summarize_training_progress()
    return trainer
def __init__(self, n_in, n_out, init_lr, momentum):
    """Build the DNN stack, loss, learner and trainer.

    Args:
        n_in: input feature dimension (coerced to int).
        n_out: output dimension (coerced to int).
        init_lr: per-sample learning rate.
        momentum: momentum value for momentum SGD.
    """
    self.param1 = 512   # width of the three-layer tanh DNN
    self.param2 = 256   # base width for the bias-free dense blocks
    self.n_in = int(n_in)
    self.n_out = int(n_out)
    self.input = C.sequence.input_variable(shape=(self.n_in,))
    self.label = C.sequence.input_variable(shape=(self.n_out,))
    self.three_dnn = C.layers.Sequential([
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_1'),
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_2'),
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_3')])
    self.final_dnn = C.layers.Dense(self.n_out, name='dnn_final')
    self.dnn_1 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_1')
    self.dnn_2 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_2')
    self.dnn_3 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_3')
    self.dnn_4 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_4')
    self.list_bias = []
    # Fix: `xrange` is Python 2 only; use `range` (the rest of the file is Python 3).
    for i in range(16):
        self.list_bias.append(C.parameter(shape=(self.param2, ), name='bias_' + str(i)))
    self.output = self.model(self.input)
    self.loss = loss_fun(self.output, self.label)
    # NOTE(review): the eval metric reuses loss_fun, so loss and eval are
    # identical — confirm this is intended.
    self.eval_err = loss_fun(self.output, self.label)
    self.lr_s = C.learning_rate_schedule(init_lr, C.UnitType.sample)
    self.mom_s = C.momentum_schedule(momentum)
    self.learner = C.momentum_sgd(self.output.parameters, lr=self.lr_s, momentum=self.mom_s)
    self.trainer = C.Trainer(self.output, (self.loss, self.eval_err), [self.learner])
def test_model_not_criterion_subset():
    """A Trainer whose model is only a subset of the criterion graph still trains."""
    input_dim = 2
    proj_dim = 11
    model1_dim = 3
    model2_dim = 4
    x = sequence.input_variable((input_dim,))
    core = C.layers.Embedding(proj_dim)
    # model1: dense over the last element of the embedded sequence.
    model1 = C.layers.Dense(model1_dim)(sequence.last(core(x)))
    model1_label = C.input_variable((model1_dim,))
    ce_model1 = cross_entropy_with_softmax(model1, model1_label)
    pe_model1 = classification_error(model1, model1_label)
    # model2: per-element dense over the whole embedded sequence.
    model2 = C.layers.Dense(model2_dim)(core(x))
    model2_label = sequence.input_variable((model2_dim,))
    ce_model2 = cross_entropy_with_softmax(model2, model2_label)
    pe_model2 = classification_error(model2, model2_label)
    # Combined multitask criterion; the Trainer's model is only model1.
    ce = 0.5 * sequence.reduce_sum(ce_model2) + 0.5 * ce_model1
    lr_schedule = C.learning_rate_schedule(0.003, C.UnitType.sample)
    trainer_multitask = C.Trainer(model1, (ce, pe_model1), C.sgd(ce.parameters, lr=lr_schedule))
    x_data = np.asarray([[2., 1.], [1., 2.]], np.float32)
    model1_label_data = np.asarray([1., 0., 0.], np.float32)
    model2_label_data = np.asarray([[0., 1., 0., 0.], [0., 0., 0., 1.]], np.float32)
    trainer_multitask.train_minibatch({x: [x_data],
                                       model1_label: [model1_label_data],
                                       model2_label: [model2_label_data]})
def test_trainer_with_some_params_not_learned():
    """Only parameters handed to the learner (W) are updated; B stays frozen."""
    input_dim = 2
    proj_dim = 2
    x = C.input_variable(shape=(input_dim,))
    W = parameter(shape=(input_dim, proj_dim), init=C.glorot_uniform())
    B = parameter(shape=(proj_dim,), init=C.glorot_uniform())
    t = times(x, W)
    z = t + B
    W_orig_value = W.value
    B_orig_value = B.value
    labels = C.input_variable(shape=(proj_dim,))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)
    lr_per_sample = C.learning_rate_schedule(0.1, C.UnitType.sample)
    # The learner only receives W; B must therefore never change.
    trainer = C.Trainer(z, (ce, pe), C.sgd([W], lr_per_sample))
    x_value = [[1, 1], [2, 2]]
    label_value = [[0, 1], [1, 0]]
    arguments = {x: x_value, labels: label_value}
    num_iters = 3
    for i in range(num_iters):
        trainer.train_minibatch(arguments)
        assert np.array_equal(B.value, B_orig_value)
        assert not np.array_equal(W.value, W_orig_value)
        W_orig_value = W.value
    trainer.test_minibatch(arguments)
def create_network(para, verbose=False):
    """Build and train a small CNN; return the Trainer after 20000 minibatches.

    Args:
        para: sequence of 4 hyper-parameters: filter counts of conv1, conv2,
            conv3 and the width of the dense layer.
        verbose: when True, log progress every 100 minibatches.

    Relies on module-level `network_input`, `network_label`, `train_reader`
    and `mapping`.
    """
    with cntk.layers.default_options(init=cntk.glorot_uniform(), activation=cntk.ops.relu):
        # In order to accelerate the debugging step, we choose a simple structure with only 2 parameters
        h = cntk.layers.Convolution2D(filter_shape=(5, 5), num_filters=para[0],
                                      strides=(1, 1), pad=True,
                                      name='C1')(network_input / 255.0)
        h = cntk.layers.layers.MaxPooling(filter_shape=(5, 5), strides=(2, 2), )(h)
        h = cntk.layers.Convolution2D(filter_shape=(5, 5), num_filters=para[1],
                                      strides=(1, 1), pad=True, name='C2')(h)
        h = cntk.layers.layers.MaxPooling(filter_shape=(5, 5), strides=(2, 2))(h)
        # Fix: this layer was also named 'C2' (copy-paste bug); renamed to 'C3'
        # so each layer has a unique, addressable name.
        h = cntk.layers.Convolution2D(filter_shape=(3, 3), num_filters=para[2],
                                      strides=(1, 1), pad=True, name='C3')(h)
        h = cntk.layers.Dense(para[3])(h)
        h = cntk.layers.Dropout(0.25)(h)
        z = cntk.layers.Dense(10, activation=None, name='R')(h)
    loss = cntk.cross_entropy_with_softmax(z, network_label)
    label_error = cntk.classification_error(z, network_label)
    lr_schedule = cntk.learning_rate_schedule(0.1, cntk.UnitType.minibatch)
    learner = cntk.momentum_sgd(z.parameters, lr_schedule, cntk.momentum_schedule(0.9))
    trainer = cntk.Trainer(z, (loss, label_error), [learner])
    if verbose:
        log = cntk.logging.ProgressPrinter(100)
    # Fix: `xrange` is Python 2 only; use `range` (the rest of the file is Python 3).
    for _ in range(20000):
        data = train_reader.next_minibatch(100, input_map=mapping(train_reader))
        trainer.train_minibatch(data)
        if verbose:
            log.update_with_trainer(trainer)
    return trainer
def test_trainer(tmpdir, no_eval_function):
    """Trainer roundtrip: train one minibatch, checkpoint with external state, restore."""
    in1 = C.input_variable(shape=(1,))
    labels = C.input_variable(shape=(1,))
    p = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    if no_eval_function:
        errs = None  # Trainer must also work without an evaluation function
    else:
        errs = classification_error(z, labels)
    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample)
    trainer = C.Trainer(z, (ce, errs),
                        [C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)])
    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])
    # Checkpoint round-trip must preserve arbitrary JSON-like external state.
    p = str(tmpdir / 'checkpoint.dat')
    external_state = {"additional external state": math.pi, "nested dict": {"a": "b"}, "list": [1, 2, 3]}
    trainer.save_checkpoint(p, external_state)
    restored_state = trainer.restore_from_checkpoint(p)
    assert external_state == restored_state
    assert trainer.model.name == 'z'
    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], C.Learner)
def create_trainer(network, epoch_size, num_quantization_bits, progress_printer):
    """Create a data-parallel distributed Trainer mirroring a Caffe training recipe.

    Args:
        network: dict with 'output', 'ce' and 'pe' nodes.
        epoch_size: samples per epoch; steps the LR schedule.
        num_quantization_bits: gradient quantization for data-parallel SGD.
        progress_printer: progress writer(s) for the Trainer.
    """
    # Set learning parameters (per-minibatch LR schedule, 10x decay steps)
    lr_per_mb = [0.01] * 20 + [0.001] * 20 + [0.0001] * 20 + [0.00001] * 10 + [
        0.000001
    ]
    lr_schedule = cntk.learning_rate_schedule(
        lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005  # CNTK L2 regularization is per sample, thus same as Caffe
    # Create learner
    local_learner = cntk.learner.momentum_sgd(
        network['output'].parameters,
        lr_schedule,
        mm_schedule,
        unit_gain=False,
        l2_regularization_weight=l2_reg_weight)  # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)
    # Create trainer
    return cntk.Trainer(network['output'], (network['ce'], network['pe']),
                        parameter_learner, progress_printer)
def init_model(m): progress_writers = [ cntk.logging.ProgressPrinter( freq=int(BATCHSIZE / 2), rank=cntk.train.distributed.Communicator.rank(), num_epochs=EPOCHS) ] # Loss (dense labels); check if support for sparse labels loss = cntk.cross_entropy_with_softmax(m, labels) # Momentum SGD # https://github.com/Microsoft/CNTK/blob/master/Manual/Manual_How_to_use_learners.ipynb # unit_gain=False: momentum_direction = momentum*old_momentum_direction + gradient # if unit_gain=True then ...(1-momentum)*gradient local_learner = cntk.momentum_sgd( m.parameters, lr=cntk.learning_rate_schedule(LR, cntk.UnitType.minibatch), momentum=cntk.momentum_schedule(MOMENTUM), unit_gain=False) distributed_learner = cntk.train.distributed.data_parallel_distributed_learner( local_learner) trainer = cntk.Trainer(m, (loss, cntk.classification_error(m, labels)), [distributed_learner], progress_writers) return trainer, distributed_learner
def test_ext_backpropstate(payload):
    """A UserFunction's forward-returned state must be handed back to backward()."""
    class TestBackPropState(UserFunction):
        def __init__(self, arg, payload, name='f1'):
            self.payload = payload
            super(TestBackPropState, self).__init__([arg])

        def infer_outputs(self):
            # Output mirrors the input's shape/dtype/dynamic axes.
            return [
                C.output_variable(self.inputs[0].shape, self.inputs[0].dtype,
                                  self.inputs[0].dynamic_axes)
            ]

        def forward(self, argument, device=None, outputs_to_retain=None):
            # Return (state, output): the payload acts as the backprop state.
            return self.payload, argument

        def backward(self, state, root_gradients):
            # CNTK must pass back exactly the state returned by forward().
            assert state == self.payload
            return root_gradients

    dim = 4
    p = C.parameter(shape=(dim, ), init=10)
    in1 = C.input_variable(dim, needs_gradient=True, name='i_var')
    m = C.user_function(TestBackPropState(in1, payload))
    z = m + p
    lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample)
    trainer = C.Trainer(None, (z), [C.sgd(z.parameters, lr_per_sample)])
    for i in range(100):
        input_data = np.random.rand(dim)
        trainer.train_minibatch({in1: [input_data]})
def train(streamf):
    """Train the global binary classifier on `streamf`; returns the Trainer."""
    global net
    mb_size = 1024
    epochs = 2000
    samples_per_epoch = 50000
    net = nn(input_var)
    bce = cntk.losses.binary_cross_entropy(net, label_var)
    cls_err = cntk.classification_error(net, label_var)
    # Per-sample LR steps, scaled up to per-minibatch values.
    per_sample = [3e-4] * 4 + [1.5e-4]
    schedule = cntk.learning_rate_schedule([r * mb_size for r in per_sample],
                                           cntk.UnitType.minibatch)
    momentum = cntk.momentum_as_time_constant_schedule(700)
    learner = cntk.adam(net.parameters,
                        schedule,
                        momentum=momentum,
                        gradient_clipping_threshold_per_sample=15,
                        gradient_clipping_with_truncation=True)
    printer = cntk.logging.ProgressPrinter(0)
    trainer = cntk.Trainer(net, (bce, cls_err), [learner], progress_writers=printer)
    stream_map = {
        input_var: streamf.streams.features,
        label_var: streamf.streams.labels,
    }
    seen = 0
    for epoch in range(epochs):
        target = (epoch + 1) * samples_per_epoch
        while seen < target:
            batch = streamf.next_minibatch(mb_size, input_map=stream_map)
            trainer.train_minibatch(batch)
            seen += batch[label_var].num_samples
        trainer.summarize_training_progress()
    return trainer
def test_empty_minibatch():
    """Training on an empty minibatch must be a no-op and must not raise."""
    tscalar = C.input_variable((1,), dtype=np.float32, name='tscalar')
    model = tscalar + parameter(init=np.asarray([1]), dtype=np.float32)
    schedule = C.learning_rate_schedule(0.1, C.UnitType.sample)
    learner = C.sgd(model.parameters, schedule)
    C.Trainer(model, (model, None), learner).train_minibatch({})
def test_ext_backpropstate(payload):
    """Duplicate of the backprop-state test: forward's state must reach backward()."""
    class TestBackPropState(UserFunction):
        def __init__(self, arg, payload, name='f1'):
            self.payload = payload
            super(TestBackPropState, self).__init__([arg])

        def infer_outputs(self):
            # Output mirrors the input's shape/dtype/dynamic axes.
            return [C.output_variable(self.inputs[0].shape,
                                      self.inputs[0].dtype,
                                      self.inputs[0].dynamic_axes)]

        def forward(self, argument, device=None, outputs_to_retain=None):
            # Return (state, output): the payload acts as the backprop state.
            return self.payload, argument

        def backward(self, state, root_gradients):
            # CNTK must pass back exactly the state returned by forward().
            assert state == self.payload
            return root_gradients

    dim = 4
    p = C.parameter(shape=(dim,), init=10)
    in1 = C.input_variable(dim, needs_gradient=True, name='i_var')
    m = C.user_function(TestBackPropState(in1, payload))
    z = m + p
    lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample)
    trainer = C.Trainer(None, (z), [C.sgd(z.parameters, lr_per_sample)])
    for i in range(100):
        input_data = np.random.rand(dim)
        trainer.train_minibatch({in1: [input_data]})
def __init__(self, feature_vector, target_vector, action_vector, velocity,
             load_model=True, testing=False, max_velocity=0.31,
             learning_rate=0.5, name='action_predicter'):
    """Set up sequence input variables, a decaying LR schedule and the model/trainer.

    Args:
        feature_vector: shape of the input sequence variable.
        target_vector: shape of the target sequence variable.
        action_vector: shape of the output (action) sequence variable.
        velocity: shape of the velocity sequence variable.
        load_model: stored flag; whether a saved model should be loaded.
        testing: when True only load a model; otherwise build model + trainer.
        max_velocity: stored cap; used elsewhere in the class.
        learning_rate: base LR; decays geometrically (factor 0.999 per step).
        name: model name.
    """
    self._load_model = load_model
    self._input_size = feature_vector
    self._output_size = action_vector
    self._target_size = target_vector
    self._velocity_size = velocity
    self._input = C.sequence.input_variable(self._input_size)
    self._target = C.sequence.input_variable(self._target_size)
    self._output = C.sequence.input_variable(self._output_size)
    self._output_velocity = C.sequence.input_variable(self._velocity_size)
    self.name = name
    self._max_velocity = max_velocity
    self._batch_size = 8
    self._max_iter = 1000000
    # Geometrically decaying per-sample LR: learning_rate * 0.999**i.
    self._lr_schedule = C.learning_rate_schedule(
        [learning_rate * (0.999**i) for i in range(1000)],
        C.UnitType.sample,
        epoch_size=self._max_iter * self._batch_size)
    #self._model,self._loss, self._learner, self._trainer = self.create_model()
    if testing:
        self._model = self.load_models()
    else:
        self._model, self._loss, self._learner, self._trainer = self.create_model(
        )
    self._predicted = {}
def create_trainer(network, epoch_size, num_epochs, minibatch_size, num_quantization_bits, progress_printer):
    """Create a distributed Trainer with an exponentially decaying LR schedule.

    The schedule mirrors Caffe: the rate is multiplied by
    learn_rate_decrease_factor every learn_rate_adjust_interval epochs.
    """
    # CNTK weights new gradient by (1-momentum) for unit gain,
    # thus we divide Caffe's learning rate by (1-momentum)
    initial_learning_rate = 2.0  # equal to 0.2 in caffe
    initial_learning_rate *= minibatch_size / 128
    learn_rate_adjust_interval = 2
    learn_rate_decrease_factor = 0.94
    # Set learning parameters
    lr_per_mb = []
    learning_rate = initial_learning_rate
    for i in range(0, num_epochs, learn_rate_adjust_interval):
        lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval)
        learning_rate *= learn_rate_decrease_factor
    lr_schedule = cntk.learning_rate_schedule(
        lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0001  # CNTK L2 regularization is per sample, thus same as Caffe
    # Create learner
    local_learner = cntk.learner.momentum_sgd(
        network['output'].parameters, lr_schedule, mm_schedule,
        l2_regularization_weight=l2_reg_weight)
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)
    # Create trainer
    return cntk.Trainer(network['output'], (network['ce'], network['pe']),
                        parameter_learner, progress_printer)
def train(self, report_freq = 500, as_policy=True):
    """Train self.model with Adam on squared error for self.hp.epochs minibatches.

    Args:
        report_freq: print a progress summary every report_freq epochs.
        as_policy: forwarded to get_next_data to select the sampling mode.
    """
    #loss = C.ops.minus(0, C.ops.argmin(self.model) - C.ops.argmin(self.model) + C.ops.minus(self.label_var, 0))
    loss = C.squared_error(self.model, self.label_var)
    evaluation = C.squared_error(self.model, self.label_var)
    schedule = C.momentum_schedule(self.hp.learning_rate)
    progress_printer = C.logging.ProgressPrinter(num_epochs=self.hp.epochs/self.hp.minibatch_size)
    learner = C.adam(self.model.parameters,
                     C.learning_rate_schedule(self.hp.learning_rate, C.UnitType.minibatch),
                     momentum=schedule,
                     l1_regularization_weight=self.hp.l1reg,
                     l2_regularization_weight=self.hp.l2reg
                     )
    trainer = C.Trainer(self.model, (loss, evaluation), learner, progress_printer)
    self.plotdata = {"loss": []}
    for epoch in range(self.hp.epochs):
        indata, label, total_reward = self.get_next_data(self.hp.minibatch_size, as_policy)
        data = {self.input_var: indata, self.label_var: label}
        trainer.train_minibatch(data)
        loss = trainer.previous_minibatch_loss_average
        # NOTE(review): previous_minibatch_loss_average is numeric; comparing
        # against the string "NA" looks vestigial and is effectively always
        # True — confirm before relying on this guard.
        if not (loss == "NA"):
            self.plotdata["loss"].append(loss)
        if epoch % report_freq == 0:
            print()
            print("last epoch total reward: {}".format(total_reward))
            trainer.summarize_training_progress()
            print()
        # if self.hp.stop_loss > loss:
        #     break
    print()
    trainer.summarize_training_progress()
def train(train_reader, test_reader, model_func, num_sweeps_to_train_with=10):
    """Train model_func's model for a fixed number of minibatches.

    Relies on module-level input/label variables `x` and `y` and the `util`
    configuration module.

    Args:
        train_reader: minibatch source with `features`/`labels` streams.
        test_reader: unused here; kept for signature compatibility.
        model_func: callable mapping the (scaled) input variable to the model.
        num_sweeps_to_train_with: unused here; kept for signature compatibility.

    Returns:
        (train_loss, train_acc): per-minibatch loss and accuracy histories.
    """
    model = model_func(x/255)
    loss, label_error = create_criterion_function(model, y)
    learning_rate = 0.2
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    # Fix: the learner and Trainer referenced the stale global `z` while the
    # criterion was built from the local `model`; train `model` consistently.
    learner = C.sgd(model.parameters, lr_schedule)
    trainer = C.Trainer(model, (loss, label_error), [learner])
    minibatch_size = util.BATCH_SIZE
    num_minibatches_to_train = util.EPOCHS  # (dropped unused num_sumples_per_sweep local)
    input_map = {
        y: train_reader.streams.labels,
        x: train_reader.streams.features
    }
    training_progress_output_freq = 500
    train_loss = []
    train_acc = []
    for i in range(0, int(num_minibatches_to_train)):
        data = train_reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(data)
        print_training_progress(trainer, i, training_progress_output_freq, verbose=1)
        train_loss.append(trainer.previous_minibatch_loss_average)
        train_acc.append(1 - trainer.previous_minibatch_evaluation_average)
    return train_loss, train_acc
def __init__(self, feature_vector, target_vector, action_vector, velocity,
             max_velocity, learning_rate, name='action_predicter'):
    """Set up sequence inputs (feature dim grows by one row) and build the model.

    Args:
        feature_vector: (rows, cols); one extra row is added to the input size.
        target_vector: shape of the target sequence variable.
        action_vector: shape of the output (action) sequence variable.
        velocity: shape of the velocity sequence variable.
        max_velocity: stored cap; used elsewhere in the class.
        learning_rate: base LR; decays geometrically (factor 0.995 per step).
        name: model name.
    """
    self._load_model = False
    # Input carries one extra feature row compared to the raw feature vector.
    self._input_size = (feature_vector[0] + 1, feature_vector[1])
    self._output_size = action_vector
    self._target_size = target_vector
    self._velocity_size = velocity
    self._input = C.sequence.input_variable(self._input_size)
    self._target = C.sequence.input_variable(self._target_size)
    self._output = C.sequence.input_variable(self._output_size)
    self._output_velocity = C.sequence.input_variable(self._velocity_size)
    self.name = name
    self._max_velocity = max_velocity
    self._batch_size = 8
    self._max_iter = 1000000
    # Geometrically decaying per-sample LR: learning_rate * 0.995**i.
    self._lr_schedule = C.learning_rate_schedule(
        [learning_rate * (0.995**i) for i in range(10000)],
        C.UnitType.sample,
        epoch_size=round(self._max_iter * self._batch_size / 100))
    self._model, self._loss, self._learner, self._trainer = self.create_model(
    )
    self._predicted = {}
def test_session_progress_print_on_sweep_unit(tmpdir, device_id):
    """training_session must emit one progress summary per 2 sweeps (4 sweeps -> 2)."""
    device = cntk_device(device_id)
    writer = MockProgressWriter()
    #set to a higher learning rate as we don't need to have converge but just to go through all the samples
    t, feature, label = create_sample_model(
        device, writer,
        lr_per_sample=C.learning_rate_schedule(0.3, C.UnitType.sample))
    mbs = mb_source(
        tmpdir, "training",
        #max_samples=INFINITELY_REPEAT,
        max_sweeps=4)
    input_map = {feature: mbs.streams.features, label: mbs.streams.labels}
    test_dir = str(tmpdir)
    C.training_session(
        trainer=t,
        mb_source=mbs,
        mb_size=C.minibatch_size_schedule(5),
        model_inputs_to_streams=input_map,
        max_samples=FULL_DATA_SWEEP,
        progress_frequency=(2, C.train.DataUnit.sweep)).train(device)
    #4 sweeps of 25 samples = 100 samples
    assert (t.total_number_of_samples_seen == 100)
    #output every 2 epoch sweeps; 4 sweeps in total, at the end 2 outputs are written:
    assert (writer.training_summary_counter == 2)
def trainDNN(trainX, trainY):
    """Train the binary DNN classifier on (trainX, trainY).

    trainY is expanded to a two-column one-hot layout before training.

    Args:
        trainX: pandas DataFrame of features.
        trainY: pandas Series/DataFrame of 0/1 labels.

    Returns:
        [classifier, trainer, input_var, label_var]
    """
    numOutputClasses = 2
    # Complement column so labels become one-hot: class-1 indicator is 1 iff y == 0.
    newCol = np.where(trainY == 0, 1, 0)
    newCol = pd.DataFrame(newCol)
    trainY = trainY.reset_index(drop=True)
    trainY = pd.concat([trainY, newCol], axis=1, ignore_index=True)
    inputDim = trainX.shape[1]
    # Fix: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # long-supported equivalent.
    trainX = np.ascontiguousarray(trainX.values.astype(np.float32))
    trainY = np.ascontiguousarray(trainY.values.astype(np.float32))
    # Renamed from `input`/`label` to avoid shadowing the `input` builtin.
    input_var = C.input_variable(inputDim)
    label_var = C.input_variable(numOutputClasses)
    classifier = create_model(input_var)
    loss = C.cross_entropy_with_softmax(classifier, label_var)
    evalError = C.classification_error(classifier, label_var)
    learning_rate = 0.5
    lrSchedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = C.sgd(classifier.parameters, lrSchedule)
    trainer = C.Trainer(classifier, (loss, evalError), [learner])
    minibatchSize = 25
    numSamples = trainX.shape[0] - (trainX.shape[0] % 25)
    # Fix: integer division; `/` yields a float in Python 3.
    numMinibatchesToTrain = numSamples // minibatchSize
    #train the model
    for i in range(0, int(numMinibatchesToTrain)):
        trainX, trainY, features, labels = getMinibatch(
            trainX, trainY, minibatchSize)
        trainer.train_minibatch({input_var: features, label_var: labels})
    return [classifier, trainer, input_var, label_var]
def create_trainer(network, epoch_size, num_quantization_bits, warm_up, progress_writers):
    """Create an Adam-based Trainer; data-parallel when more than one worker exists."""
    print('Creating the trainer.')
    # Train only the last layers: stepped per-minibatch LR schedule.
    lr_values = [0.01] * 10 + [0.001] * 20 + [0.0001] * 30
    lr_schedule = C.learning_rate_schedule(lr_values, unit=C.UnitType.minibatch)
    mm_schedule = C.momentum_schedule(0.9)
    l2_reg_weight = 0.0001
    learner = C.adam(network['output'].parameters,
                     lr_schedule,
                     mm_schedule,
                     l2_regularization_weight=l2_reg_weight,
                     unit_gain=False)
    worker_count = C.distributed.Communicator.num_workers()
    print('Number of workers: {}'.format(worker_count))
    # Wrap in a data-parallel learner only for multi-worker runs.
    if worker_count > 1:
        chosen_learner = C.train.distributed.data_parallel_distributed_learner(
            learner, num_quantization_bits=num_quantization_bits)
    else:
        chosen_learner = learner
    trainer = C.Trainer(network['output'], (network['ce'], network['pe']),
                        chosen_learner, progress_writers)
    return trainer
def create_trainer():
    """Build an Adam Trainer for the global decoder `dec`, optionally restoring
    from 'model-5.cntk'.

    Relies on module-level `dec`, `y`, `print_freq`, `minibatch_size`,
    `epoch_size`, `num_epochs` and `restore`.
    """
    loss, label_error = create_criterion_function_preferred(dec, y)
    schedule_step = print_freq
    # Per-sample LR steps, scaled up to per-minibatch values.
    lr_per_sample = [2e-3] * 2 * schedule_step + [1e-3] * 2 * schedule_step + [
        5e-4
    ]
    lr_per_minibatch = [lr * minibatch_size for lr in lr_per_sample]
    lr_schedule = C.learning_rate_schedule(lr_per_minibatch, C.UnitType.minibatch, epoch_size)
    momentum_as_time_constant = C.momentum_as_time_constant_schedule(1000)
    learner = C.adam(parameters=dec.parameters,
                     lr=lr_schedule,
                     momentum=momentum_as_time_constant,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=num_epochs)
    trainer = C.Trainer(dec, (loss, label_error), learner, progress_printer)
    if restore:
        trainer.restore_from_checkpoint("model-5.cntk")
    C.logging.log_number_of_parameters(dec)
    return trainer
def train(streamf):
    """Train the global softmax classifier on `streamf`; returns the Trainer."""
    global net
    mb_size = 512
    epochs = 2000
    samples_per_epoch = 48985
    net = nn(input_s, input_h, input_l, input_v)
    ce = cntk.losses.cross_entropy_with_softmax(net, label_var)
    err = cntk.classification_error(net, label_var)
    # Per-sample LR steps, scaled up to per-minibatch values.
    per_sample = [3e-4] * 4 + [1.5e-4]
    schedule = cntk.learning_rate_schedule([r * mb_size for r in per_sample],
                                           cntk.UnitType.minibatch)
    momentum = cntk.momentum_as_time_constant_schedule(700)
    learner = cntk.fsadagrad(net.parameters, schedule, momentum)
    printer = cntk.logging.ProgressPrinter(0)
    trainer = cntk.Trainer(net, (ce, err), [learner], progress_writers=printer)
    stream_map = {
        input_s: streamf.streams.spread,
        input_h: streamf.streams.high,
        input_l: streamf.streams.low,
        input_v: streamf.streams.volume,
        label_var: streamf.streams.labels,
    }
    seen = 0
    for epoch in range(epochs):
        target = (epoch + 1) * samples_per_epoch
        while seen < target:
            batch = streamf.next_minibatch(mb_size, input_map=stream_map)
            trainer.train_minibatch(batch)
            seen += batch[label_var].num_samples
        trainer.summarize_training_progress()
    return trainer
def train(streamf):
    """Train a fresh regression net on `streamf` until loss < 0.002 (or 2000 iterations).

    Returns:
        The Trainer.
    """
    # 45-dim features with default (per-sample) dynamic axes; 3-dim targets.
    input_var = cntk.input_variable(45, np.float32, name='features',
                                    dynamic_axes=cntk.axis.Axis.default_input_variable_dynamic_axes())
    label_var = cntk.input_variable(3, np.float32, name='labels')
    net = nn(input_var)
    # Squared error is used as both loss and metric.
    loss = cntk.squared_error(net, label_var)
    error = cntk.squared_error(net, label_var)
    learning_rate = 0.02
    lr_schedule = cntk.learning_rate_schedule(learning_rate, cntk.UnitType.minibatch)
    momentum_time_constant = cntk.momentum_as_time_constant_schedule(5000 / -np.math.log(0.9))
    learner = cntk.fsadagrad(net.parameters, lr=lr_schedule,
                             momentum=momentum_time_constant, unit_gain=True)
    progres = cntk.logging.ProgressPrinter(0)
    trainer = cntk.Trainer(net, (loss, error), [learner], progress_writers=progres)
    input_map = {
        input_var: streamf.streams.features,
        label_var: streamf.streams.labels
    }
    minibatch_size = 5000
    num_samples_per_sweep = 2000  # NOTE(review): actually the iteration count, not samples
    for i in range(0, num_samples_per_sweep):
        dat1 = streamf.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(dat1)
        training_loss = trainer.previous_minibatch_loss_average
        eval_error = trainer.previous_minibatch_evaluation_average
        # Early stop once the minibatch loss is small enough.
        if training_loss < 0.002:
            break
    return trainer
def set_optimizer(self, opt_type, opt_conf):
    """Configure learning-rate and momentum schedules for the given optimizer type.

    Only 'SGD' is supported; any other `opt_type` raises NotImplementedError.
    `opt_conf` must carry 'lr' and 'momentum' entries.
    """
    if opt_type != 'SGD':
        raise NotImplementedError
    self.lr_schedule = C.learning_rate_schedule(opt_conf['lr'], C.UnitType.minibatch)
    self.m_schedule = C.momentum_schedule(opt_conf['momentum'], C.UnitType.minibatch)
def test_htk_deserializers():
    """Smoke test: build an HTK feature/label reader plus an LSTM model and make sure
    a few training minibatches run without crashing."""
    mb_size = 640
    epoch_size = 1000 * mb_size
    lr = [0.001]
    feature_dim = 33
    num_classes = 132
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    feature_deserializer = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim,
                                   context=(context, context),
                                   scp=features_file)))
    label_deserializer = HTKMLFDeserializer(label_mapping_file, StreamDefs(
        awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))
    reader = MinibatchSource([feature_deserializer, label_deserializer])

    features = C.input_variable(((2 * context + 1) * feature_dim))
    labels = C.input_variable((num_classes))

    model = Sequential([For(range(3), lambda: Recurrence(LSTM(256))),
                        Dense(num_classes)])
    z = model(features)
    ce = C.cross_entropy_with_softmax(z, labels)
    errs = C.classification_error(z, labels)

    learner = C.adam_sgd(z.parameters,
                         lr=C.learning_rate_schedule(lr, C.UnitType.sample, epoch_size),
                         momentum=C.momentum_as_time_constant_schedule(1000),
                         low_memory=True,
                         gradient_clipping_threshold_per_sample=15,
                         gradient_clipping_with_truncation=True)
    trainer = C.Trainer(z, (ce, errs), learner)

    input_map = {features: reader.streams.amazing_features,
                 labels: reader.streams.awesome_labels}

    pp = C.ProgressPrinter(freq=0)
    # just run and verify it doesn't crash
    for _ in range(3):
        mb_data = reader.next_minibatch(mb_size, input_map=input_map)
        trainer.train_minibatch(mb_data)
        pp.update_with_trainer(trainer, with_metric=True)
    assert True
    os.chdir(abs_path)
def create_trainer(use_sparse, device):
    """Assemble a one-layer softmax model plus SGD trainer.

    Returns (input variable, label variable, weight parameter, trainer).
    NOTE(review): the weight is placed on the module-level `dev`, not the
    `device` argument — confirm this is intentional.
    """
    feat = C.input_variable(shape=input_shape, is_sparse=use_sparse, name='input')
    weight = C.parameter(init=w_init, device=dev)
    z = times(feat, weight)

    lab = C.input_variable(shape=label_shape, is_sparse=use_sparse, name='label')
    loss = cross_entropy_with_softmax(z, lab, axis=-1)

    trainer = C.Trainer(z, (loss, None),
                        C.sgd(z.parameters, lr=C.learning_rate_schedule(0.007, C.UnitType.sample)))
    return (feat, lab, weight, trainer)
def adjust_lr_callback(index, average_error, cv_num_samples, cv_num_minibatches):
    """Cross-validation callback: halve the learning rate when improvement stalls.

    Returns False (stop training) after the 6th LR cut, True to continue.
    NOTE(review): divides by the global `prev_metric` — would fail if it is 0.
    """
    global prev_metric
    # relative gain must reduce metric by at least 5% rel
    if (prev_metric - average_error) / prev_metric < 0.05:
        learner.reset_learning_rate(
            C.learning_rate_schedule(learner.learning_rate() / 2, C.learners.UnitType.sample))
        # we are done after the 6-th LR cut
        if learner.learning_rate() < lr_per_sample / (2**7 - 0.1):
            print("Learning rate {} too small. Training complete.".format(learner.learning_rate()))
            return False  # means we are done
        print("Improvement of metric from {:.3f} to {:.3f} insufficient. Halving learning rate to {}.".format(prev_metric, average_error, learner.learning_rate()))
    prev_metric = average_error
    return True  # means continue
def create_learner(model):
    '''Create the optimized method'''
    # Schedule is expressed in minibatch units (despite the original's variable name).
    lr_schedule = C.learning_rate_schedule(opt.lr, C.UnitType.minibatch)
    momentum = C.momentum_as_time_constant_schedule(1100)
    if opt.optim == 'sgd':
        return C.sgd(model.parameters, lr=lr_schedule)
    if opt.optim == 'adam':
        return C.adam(model.parameters, lr=lr_schedule, momentum=momentum)
    if opt.optim == 'adagrad':
        return C.adagrad(model.parameters, lr=lr_schedule)
    raise RuntimeError("Invalid optim method: " + opt.optim)
def create_sample_model(device, writer=None):
    """Build a tiny sequence model and its trainer for testing.

    Returns (trainer, input variable, label variable).
    """
    seq_in = sequence.input_variable(shape=(input_dim,))
    seq_labels = sequence.input_variable(shape=(input_dim,))
    p = parameter(shape=(input_dim,), init=10, device=device)
    z = plus(seq_in, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, seq_labels)
    errs = classification_error(z, seq_labels)
    # Decaying per-sample rate schedule for SGD.
    lr = C.learning_rate_schedule([0.3, 0.2, 0.1, 0.0], C.UnitType.sample)
    trainer = C.Trainer(z, (ce, errs), [C.sgd(z.parameters, lr)], writer)
    return (trainer, seq_in, seq_labels)
def test_factor_dense_for_prediction():
    """Train a small dense model, factor it with `nc.factor_dense`, and verify the
    factored model keeps at least 50% of the original's prediction match."""
    input_dim = 2
    num_output_classes = 2
    hidden_layer_dim = 50
    num_minibatches_to_train = 2000
    minibatch_size = 25
    learning_rate = 0.5

    feat_var = C.input_variable(input_dim)
    label_var = C.input_variable(num_output_classes)

    z = _create_model_dense(feat_var, input_dim, hidden_layer_dim, num_output_classes)

    loss = C.cross_entropy_with_softmax(z, label_var)
    eval_error = C.classification_error(z, label_var)

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = C.sgd(z.parameters, lr_schedule)
    trainer = C.Trainer(z, (loss, eval_error), [learner])

    # Run the trainer and perform model training
    training_progress_output_freq = 20
    plotdata = {"batchsize": [], "loss": [], "error": []}

    for _ in range(int(num_minibatches_to_train)):
        features, labels = _generate_random_data_sample(minibatch_size, input_dim, num_output_classes)
        # Specify the input variables mapping in the model to actual minibatch data for training
        trainer.train_minibatch({feat_var: features, label_var: labels})

    # generate some data to predict
    features, labels = _generate_random_data_sample(10, 2, 2)

    # factor the model.
    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function=_filter)
    original_out = C.softmax(z)
    factored_out = C.softmax(newz)

    original_labels_probs = original_out.eval({feat_var: features})
    predicted_label_probs = factored_out.eval({feat_var: features})

    original_prediction_percentage = _percentage_match(labels, original_labels_probs)

    # reduced model should have at leat 50% match compared to the original
    # For the test, we reduced the training minibatches, thus the match is lower.
    assert(original_prediction_percentage * 0.5 <= _percentage_match(labels, predicted_label_probs))
def test_htk_deserializers():
    """End-to-end smoke test for the HTK feature/MLF deserializers:
    three training minibatches must run without raising."""
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]
    feature_dim = 33
    num_classes = 132
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim,
                                   context=(context, context),
                                   scp=features_file)))
    ld = HTKMLFDeserializer(label_mapping_file, StreamDefs(
        awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))
    reader = MinibatchSource([fd, ld])

    features = C.input_variable(((2 * context + 1) * feature_dim))
    labels = C.input_variable((num_classes))

    model = Sequential([For(range(3), lambda: Recurrence(LSTM(256))),
                        Dense(num_classes)])
    z = model(features)
    ce = C.cross_entropy_with_softmax(z, labels)
    errs = C.classification_error(z, labels)

    learner = C.adam_sgd(z.parameters,
                         lr=C.learning_rate_schedule(lr, C.UnitType.sample, epoch_size),
                         momentum=C.momentum_as_time_constant_schedule(1000),
                         low_memory=True,
                         gradient_clipping_threshold_per_sample=15,
                         gradient_clipping_with_truncation=True)
    trainer = C.Trainer(z, (ce, errs), learner)

    input_map = {
        features: reader.streams.amazing_features,
        labels: reader.streams.awesome_labels
    }

    # just run and verify it doesn't crash
    pp = C.ProgressPrinter(freq=0)
    for _ in range(3):
        mb_data = reader.next_minibatch(mbsize, input_map=input_map)
        trainer.train_minibatch(mb_data)
        pp.update_with_trainer(trainer, with_metric=True)
    assert True
    os.chdir(abs_path)
def test_clone_freeze():
    """A 'freeze' clone must snapshot parameters/constants as constants that stay
    unchanged when the source graph is later updated."""
    inputs = 3
    outputs = 5

    features = C.input_variable((inputs), np.float32)
    label = C.input_variable((outputs), np.float32)
    weights = C.parameter((inputs, outputs))
    const_weights = C.constant(weights.value)
    z = C.times(features, weights)
    c = C.times(features, const_weights)
    z_clone = z.clone('freeze')
    c_clone = c.clone('freeze')

    # check that z and z_clone are the same
    for src, frozen in zip(z.parameters, z_clone.constants):
        assert np.array_equal(src.value, frozen.value)
    # check that c and c_clone are the same
    for src, frozen in zip(c.constants, c_clone.constants):
        assert np.array_equal(src.value, frozen.value)

    # keep copies of the old values
    z_copies = [q.value for q in z_clone.constants]
    c_copies = [q.value for q in c_clone.constants]

    # update z
    trainer = C.Trainer(z, C.squared_error(z, label),
                        C.sgd(z.parameters, C.learning_rate_schedule(1.0, C.UnitType.minibatch)))
    x = np.random.randn(16, 3).astype('f')
    y = np.random.randn(16, 5).astype('f')
    trainer.train_minibatch({features: x, label: y})

    # update c
    for cc in c.constants:
        cc.value = np.random.randn(*cc.value.shape).astype('f')

    # check that z changed
    for src, frozen in zip(z.parameters, z_clone.constants):
        assert not np.array_equal(src.value, frozen.value)
    # check that z_clone did not change
    for saved, frozen in zip(z_copies, z_clone.constants):
        assert np.array_equal(saved, frozen.value)
    # check that c changed
    for src, frozen in zip(c.constants, c_clone.constants):
        assert not np.array_equal(src.value, frozen.value)
    # check that c_clone did not change
    for saved, frozen in zip(c_copies, c_clone.constants):
        assert np.array_equal(saved, frozen.value)
def train(nonlinearity, num_hidden_layers, device_id, minibatch_size=10, num_samples=1000):
    """Train a fully-connected classifier on random data on the requested device.

    Returns (losses, errors): the values logged by `print_training_progress`
    at its reporting frequency ("NA" entries are skipped).
    """
    from cntk.cntk_py import always_allow_setting_default_device
    always_allow_setting_default_device()
    C.try_set_default_device(cntk_device(device_id))
    np.random.seed(0)

    lr_schedule = C.learning_rate_schedule(0.5, C.UnitType.minibatch)
    hidden_layers_dim = 50

    inp = C.input_variable((input_dim), np.float32)
    label = C.input_variable((num_output_classes), np.float32)

    z = fully_connected_classifier_net(inp, num_output_classes, hidden_layers_dim,
                                       num_hidden_layers, nonlinearity)
    loss = C.cross_entropy_with_softmax(z, label)
    eval_error = C.classification_error(z, label)

    learner = C.sgd(z.parameters, lr_schedule)
    trainer = C.Trainer(z, (loss, eval_error), [learner])

    num_minibatches_to_train = int(num_samples / minibatch_size)
    training_progress_output_freq = 20

    losses = []
    errors = []
    for i in range(num_minibatches_to_train):
        features, labels = generate_random_data_sample(minibatch_size, input_dim,
                                                       num_output_classes)
        # Specify the input variables mapping in the model to actual minibatch
        # data for training.
        trainer.train_minibatch({inp: features, label: labels},
                                device=cntk_device(device_id))
        batchsize, loss_val, error_val = print_training_progress(trainer, i,
                                                                 training_progress_output_freq)
        # "NA" marks minibatches where nothing was logged; skip those.
        if loss_val != "NA" and error_val != "NA":
            losses.append(loss_val)
            errors.append(error_val)
    return losses, errors
def test_data_resize():
    """Exercise unpack_batch + reshape on a softmax output and train one minibatch."""
    batch_size = 8
    w = C.parameter(shape=(3, 2), name='w1')
    x = C.input_variable(shape=[3], name='x')
    y = C.softmax(C.times(x, w))
    # Flatten the batch axis into the static shape before taking the mean NLL.
    y = C.unpack_batch(y)
    y = C.reshape(y, [batch_size * 2])
    loss = C.reduce_mean(-C.log(y))

    lr_schedule = C.learning_rate_schedule(0.01, C.UnitType.minibatch)
    learner = C.sgd(y.parameters, lr_schedule,
                    gradient_clipping_threshold_per_sample=1.0)
    trainer = C.Trainer(y, (loss), [learner])

    features = np.random.randn(batch_size, 3)
    trainer.train_minibatch({x: features})
def create_distributed_learner(self, mode, config):
    """Wrap a local SGD learner in the requested distributed learner.

    `mode` is 'data_parallel', 'block_momentum', or anything else (plain local
    learner). A None `config` gets sensible defaults. Returns None when the
    distributed learner cannot be constructed (RuntimeError).
    """
    local_learner = C.sgd(self.z.parameters,
                          C.learning_rate_schedule(0.01, unit=C.learners.UnitType.sample))
    try:
        if mode == 'data_parallel':
            if config is None:
                config = DataParallelConfig(num_quantization_bits=32, distributed_after=0)
            return C.data_parallel_distributed_learner(
                local_learner,
                num_quantization_bits=config.num_quantization_bits,
                distributed_after=config.distributed_after)
        if mode == 'block_momentum':
            if config is None:
                # the default config to match data parallel SGD
                config = BlockMomentumConfig(block_momentum_as_time_constant=0,
                                             block_learning_rate=1,
                                             block_size=NUM_WORKERS,
                                             distributed_after=0)
            return C.block_momentum_distributed_learner(
                local_learner,
                block_momentum_as_time_constant=config.block_momentum_as_time_constant,
                block_learning_rate=config.block_learning_rate,
                block_size=config.block_size,
                distributed_after=config.distributed_after)
        return local_learner
    except RuntimeError:
        return None
def create_trainer(use_sparse, device):
    """Build a simple recurrent softmax model plus SGD trainer.

    Returns (input variable, label variable, w_i, w_h, trainer).
    NOTE(review): the parameters are placed on the module-level `dev`,
    not the `device` argument — confirm intentional.
    """
    feat = C.input_variable(shape=input_shape, is_sparse=use_sparse, name='input')
    w_i = C.parameter(init=w_init_i, device=dev)
    a_projection = times(feat, w_i)

    p_o = C.placeholder_variable()
    h = C.past_value(p_o)
    w_h = C.parameter(init=w_init_h, device=dev)
    h_projection = times(h, w_h)

    z = a_projection + h_projection
    # Close the recurrent loop: the placeholder is replaced by z itself.
    z = z.replace_placeholder(z)
    z = reshape(z, label_shape)

    lab = C.input_variable(shape=label_shape, is_sparse=use_sparse, name='label')
    loss = cross_entropy_with_softmax(z, lab, axis=-1)

    trainer = C.Trainer(z, (loss, None),
                        C.sgd(z.parameters, lr=C.learning_rate_schedule(0.007, C.UnitType.sample)))
    return (feat, lab, w_i, w_h, trainer)
def create_trainer(network, epoch_size, num_quantization_bits):
    """Create a data-parallel distributed trainer using Caffe-style LR/momentum settings."""
    # Set learning parameters
    lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
    lr_schedule = cntk.learning_rate_schedule(lr_per_mb,
                                              unit=cntk.learner.UnitType.minibatch,
                                              epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005  # CNTK L2 regularization is per sample, thus same as Caffe

    # Since we reuse parameter settings (learning rate, momentum) from Caffe,
    # we set unit_gain to False to ensure consistency
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters,
                                              lr_schedule, mm_schedule,
                                              unit_gain=False,
                                              l2_regularization_weight=l2_reg_weight)
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
def test_usermbsource_training(tmpdir, with_checkpoint_impl):
    """Run a training session over a UserMinibatchSource, optionally using a
    checkpoint-capable CV source, and verify sample counts / restore calls."""
    input_dim = 1000
    num_output_classes = 5

    mbs = MyDataSource(input_dim, num_output_classes)
    # Using this for testing the UserMinibatchSource checkpointing
    cv_source_cls = MyDataSourceWithCheckpoint if with_checkpoint_impl else MyDataSource
    mbs_cv = cv_source_cls(input_dim, num_output_classes)

    from cntk import sequence, parameter, plus, cross_entropy_with_softmax, \
        classification_error, learning_rate_schedule, sgd, Trainer, \
        training_session, times, UnitType

    feature = sequence.input_variable(shape=(input_dim,))
    label = C.input_variable(shape=(num_output_classes,))
    p = parameter(shape=(input_dim, num_output_classes), init=10)
    z = times(sequence.reduce_sum(feature), p, name='z')
    ce = cross_entropy_with_softmax(z, label)
    errs = classification_error(z, label)

    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])
    input_map = {feature: mbs.fsi, label: mbs.lsi}

    session = training_session(
        trainer=trainer,
        mb_source=mbs,
        model_inputs_to_streams=input_map,
        mb_size=4,
        max_samples=20,
        cv_config=C.CrossValidationConfig(minibatch_source=mbs_cv,
                                          max_samples=10,
                                          minibatch_size=2))
    session.train()

    assert trainer.total_number_of_samples_seen == 20
    if with_checkpoint_impl:
        assert mbs_cv._restore_from_checkpoint_calls == 1
def train_sequence_classifier():
    """Train an LSTM sequence classifier on the bundled CTF dataset.

    Returns (evaluation_average, loss_average) from the last minibatch.
    """
    input_dim = 2000
    hidden_dim = 25
    embedding_dim = 50
    num_classes = 5

    # Input variables denoting the features and label data
    features = C.sequence.input_variable(shape=input_dim, is_sparse=True)
    label = C.input_variable(num_classes)

    # Instantiate the sequence classification model
    classifier_output = lstm_sequence_classifier(features, num_classes,
                                                 embedding_dim, hidden_dim)

    ce = C.cross_entropy_with_softmax(classifier_output, label)
    pe = C.classification_error(classifier_output, label)

    rel_path = r"../../../../Tests/EndToEndTests/Text/SequenceClassification/Data/Train.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    reader = create_reader(path, True, input_dim, num_classes)

    input_map = {features: reader.streams.features,
                 label: reader.streams.labels}

    lr_per_sample = C.learning_rate_schedule(0.1, C.UnitType.sample)

    # Instantiate the trainer object to drive the model training
    progress_printer = C.logging.ProgressPrinter(0)
    trainer = C.Trainer(classifier_output, (ce, pe),
                        C.sgd(classifier_output.parameters, lr=lr_per_sample),
                        progress_printer)

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 200
    for _ in range(251):
        mb = reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)

    evaluation_average = copy.copy(trainer.previous_minibatch_evaluation_average)
    loss_average = copy.copy(trainer.previous_minibatch_loss_average)
    return evaluation_average, loss_average
def test_ext_lambdafunc(tmpdir):
    """A LambdaFunc user-function must survive save/load, and its callback must
    fire only when the `when` predicate matches the minibatch input."""
    dim = 4

    class CallbackCounter(object):
        def __init__(self):
            self.count = 0

        def inc(self, arg):
            self.count += 1

    cb = CallbackCounter()

    p = C.parameter(shape=(dim,), init=1)
    i = C.input_variable(dim, needs_gradient=True, name='i_var')
    k = i * p
    m = LambdaFunc(k, when=lambda arg: np.sum(arg) > 1, execute=cb.inc)
    m = C.user_function(m)
    z0 = m + 0

    filepath = str(tmpdir / 'test_ext_lambdafunc.dat')
    z0.save(filepath)

    Function.register_udf_deserialize_callback(
        'conditional_exec_lambda',
        lambda x, *unused: LambdaFunc(x, when=lambda arg: np.sum(arg) > 1, execute=cb.inc))

    z = Function.load(filepath)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample)
    trainer = C.Trainer(z, (z + 0, z + 0),
                        [C.momentum_sgd(z.parameters, lr_per_sample,
                                        momentum_time_constant, True)])

    # Sum 0.4 < 1: the callback must not fire.
    trainer.train_minibatch([0.1 * np.ones(dim)])
    assert cb.count == 0

    # Sum 1.2 > 1: the callback must fire exactly once.
    trainer.train_minibatch([0.3 * np.ones(dim)])
    assert cb.count == 1
def test_ext_train(tmpdir):
    """Train with a user-function in the graph, then save/load and verify the
    UDF's forward/backward call counts round-trip through serialization."""
    dim = 4

    p = C.parameter(shape=(dim,), init=10)
    i = C.sequence.input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(i, C.constant(3), 'my_plus')
    # keeping m unwrapped since we need to access its member variables
    z = C.user_function(m) + p

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample)
    trainer = C.Trainer(z, (z + 0, z + 0),
                        [C.momentum_sgd(z.parameters, lr_per_sample,
                                        momentum_time_constant, True)])

    for _ in range(100):
        trainer.train_minibatch([np.random.rand(dim)])

    assert m.forward_calls == m.backward_calls == 100

    filepath = str(tmpdir / 'test_ext_train.dat')
    z.save(filepath)

    buf = open(filepath, 'rb').read()
    # this is only need for Python 2.7
    # (which does not distinguish between bytes and strings)
    if isinstance(buf, str):
        buf = bytearray(buf)

    z1 = Function.load(buf)
    m1 = z1.find_by_name('my_plus')
    # m1 is an instance of UserFunction, cannot directly downcast it to MyPlus,
    # using serialize as workaround:
    state = m1.serialize()['state']
    assert state['forward_calls'] == state['backward_calls'] == 100
def test_restore_constants(tmpdir):
    """Constants and parameters must round-trip through both trainer checkpoints
    and saved model files."""
    C.device.try_set_default_device(C.device.cpu())

    def _setvalue(x, v):
        # Scalars need an explicit float32 array; arrays are filled in place.
        x.value = 0 * x.value + v if len(x.shape) > 0 else np.array(v, dtype=np.float32)

    def _setall(f, v):
        for x in f.constants + f.parameters:
            _setvalue(x, v)

    def _checkall(f, v):
        for x in f.constants + f.parameters:
            assert (x.value == v).all()

    x = C.input_variable(10)
    f = C.layers.BatchNormalization()(x)
    trainer = C.Trainer(f, C.reduce_sum(f),
                        C.sgd(f.parameters, C.learning_rate_schedule(0.1, 'sample')))

    model_filename = str(tmpdir / 'function.out')
    checkpoint_filename = str(tmpdir / 'checkpoint.out')

    # Save a model with all values at 1, then a checkpoint with all values at 2.
    _setall(f, 1)
    f.save(model_filename)
    _checkall(f, 1)

    _setall(f, 2)
    trainer.save_checkpoint(checkpoint_filename)
    _checkall(f, 2)

    # Overwrite with 3, then restoring the checkpoint must bring back 2.
    _setall(f, 3)
    _checkall(f, 3)
    trainer.restore_from_checkpoint(checkpoint_filename)
    _checkall(f, 2)

    # A model loaded from file must carry the values it was saved with.
    f2 = C.Function.load(model_filename)
    _checkall(f2, 1)

    # f.restore must likewise bring back the saved values.
    _setall(f, 4)
    _checkall(f, 4)
    f.restore(model_filename)
    _checkall(f, 1)

    # The loaded copy is independent of the original.
    _setall(f2, 5)
    _checkall(f2, 5)
def _train_backcompatible_test(z, loss, eval_error, f_input, l_input,
                               num_output_classes, steps):
    """Drive `steps` SGD minibatches over randomly generated samples
    (back-compatibility smoke test helper)."""
    np.random.seed(0)
    input_dim = 2

    trainer = Trainer(z, (loss, eval_error),
                      [sgd(z.parameters, learning_rate_schedule(0.5, UnitType.minibatch))])

    minibatch_size = 10
    for _ in range(steps):
        features, labels = _generate_random_data_sample(minibatch_size, input_dim,
                                                        num_output_classes)
        trainer.train_minibatch({f_input: features, l_input: labels})
def test_udf_checkpointing(tmpdir):
    """Saving a checkpoint for a trainer whose graph contains a UDF must produce
    a non-empty, loadable dictionary (including external state)."""
    dev, w_value, c1_value, c2_value, op = build_test_function()

    label = C.constant(np.asarray([[1, 2], [3, 4]]).astype(np.float32))
    loss = C.cross_entropy_with_softmax(op, label)
    eval_error = C.classification_error(op, label)

    learner = C.sgd(op.parameters, C.learning_rate_schedule(0.5, C.UnitType.minibatch))
    trainer = C.Trainer(op, (loss, eval_error), [learner])

    trainer.train_minibatch({op.arguments[0]: np.random.random((2, 2)).astype(np.float32)},
                            device=dev)

    filepath = str(tmpdir / 'test_checkpointing.out')
    trainer.save_checkpoint(filepath, external_state={'test': 'test'})

    d = C.cntk_py.Dictionary.load(filepath)
    assert len(d.keys()) != 0
def run_distributed_training(tmpdir, create_func):
    """Exercise a distributed learner end to end: train a minibatch, checkpoint,
    restore, and synchronize on the communicator barrier."""
    in1 = sequence.input_variable(shape=1)
    labels = sequence.input_variable(shape=1)
    p = parameter(shape=2, init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample)
    dist_learner = create_func(C.momentum_sgd(z.parameters, lr_per_sample,
                                              momentum_time_constant, True))

    communicator = dist_learner.communicator()
    workers = communicator.workers()
    current_worker = communicator.current_worker()
    # The current worker must appear in the communicator's worker list.
    assert any(wk.global_rank == current_worker.global_rank for wk in workers)

    trainer = C.Trainer(z, (ce, errs), [dist_learner])
    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    p = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(p)
    trainer.restore_from_checkpoint(p)

    communicator.barrier()

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
def create_trainer(network, epoch_size, num_quantization_bits, printer, block_size, warm_up):
    """Create a distributed trainer: block-momentum when `block_size` is given,
    otherwise data-parallel SGD with the requested quantization/warm-up.

    Raises RuntimeError if block momentum is combined with quantization.
    """
    # Set learning parameters
    lr_per_mb = [0.01]*25 + [0.001]*25 + [0.0001]*25 + [0.00001]*25 + [0.000001]
    lr_schedule = C.learning_rate_schedule(lr_per_mb,
                                           unit=C.learners.UnitType.minibatch,
                                           epoch_size=epoch_size)
    mm_schedule = C.learners.momentum_schedule(0.9)
    l2_reg_weight = 0.0005  # CNTK L2 regularization is per sample, thus same as Caffe

    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    # Since we reuse parameter settings (learning rate, momentum) from Caffe,
    # we set unit_gain to False to ensure consistency
    local_learner = C.learners.momentum_sgd(network['output'].parameters,
                                            lr_schedule, mm_schedule,
                                            unit_gain=False,
                                            l2_regularization_weight=l2_reg_weight)

    # Create trainer
    if block_size is not None:
        parameter_learner = block_momentum_distributed_learner(local_learner,
                                                               block_size=block_size)
    else:
        parameter_learner = data_parallel_distributed_learner(
            local_learner,
            num_quantization_bits=num_quantization_bits,
            distributed_after=warm_up)

    return C.Trainer(network['output'], (network['ce'], network['pe']),
                     parameter_learner, printer)
def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_epochs=80):
    """Train a conv net with local response normalization on CIFAR-10 (with data
    augmentation readers) and return the final test error rate."""
    _cntk_py.set_computation_network_trace_level(1)

    # Input variables denoting the features and label data
    input_var = cntk.ops.input_variable((num_channels, image_height, image_width))
    label_var = cntk.ops.input_variable((num_classes))

    # apply model to input
    scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)

    with cntk.layers.default_options(activation=cntk.ops.relu, pad=True):
        z = cntk.models.Sequential([
            cntk.models.For(range(2), lambda: [
                cntk.layers.Convolution2D((3, 3), 64),
                cntk.layers.Convolution2D((3, 3), 64),
                LocalResponseNormalization(1.0, 4, 0.001, 0.75),
                cntk.layers.MaxPooling((3, 3), (2, 2))
            ]),
            cntk.models.For(range(2), lambda i: [
                cntk.layers.Dense([256, 128][i]),
                cntk.layers.Dropout(0.5)
            ]),
            cntk.layers.Dense(num_classes, activation=None)
        ])(scaled_input)

    # loss and metric
    ce = cntk.ops.cross_entropy_with_softmax(z, label_var)
    pe = cntk.ops.classification_error(z, label_var)

    # training config
    minibatch_size = 64

    # Set learning parameters
    lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
    lr_schedule = cntk.learning_rate_schedule(lr_per_sample,
                                              unit=cntk.learner.UnitType.sample,
                                              epoch_size=epoch_size)
    mm_time_constant = [0]*20 + [600]*20 + [1200]
    mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant,
                                                                  epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # trainer object
    learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
                                        unit_gain=True,
                                        l2_regularization_weight=l2_reg_weight)
    trainer = cntk.Trainer(z, (ce, pe), learner)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    cntk.utils.log_number_of_parameters(z); print()
    progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)

    # perform model training
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count),
                                               input_map=input_map)  # fetch minibatch.
            trainer.train_minibatch(data)  # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far
            progress_printer.update_with_trainer(trainer, with_metric=True)  # log progress
        progress_printer.epoch_summary(with_metric=True)
        z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))

    ### Evaluation action
    epoch_size = 10000
    minibatch_size = 16

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)
        # Fetch next test min batch.
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
        # minibatch data to be trained with
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far.
        sample_count += data[label_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom
def convnet_mnist(debug_output=False, epoch_size=60000, minibatch_size=64, max_epochs=40):
    """Train a small convolutional network on MNIST, saving a model per epoch,
    then evaluate on the test set and return the error rate."""
    image_height = 28
    image_width = 28
    num_channels = 1
    input_dim = image_height * image_width * num_channels
    num_output_classes = 10

    # Input variables denoting the features and label data
    input_var = C.ops.input_variable((num_channels, image_height, image_width), np.float32)
    label_var = C.ops.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    scaled_input = C.ops.element_times(C.ops.constant(0.00390625), input_var)

    with C.layers.default_options(activation=C.ops.relu, pad=False):
        conv1 = C.layers.Convolution2D((5, 5), 32, pad=True)(scaled_input)
        pool1 = C.layers.MaxPooling((3, 3), (2, 2))(conv1)
        conv2 = C.layers.Convolution2D((3, 3), 48)(pool1)
        pool2 = C.layers.MaxPooling((3, 3), (2, 2))(conv2)
        conv3 = C.layers.Convolution2D((3, 3), 64)(pool2)
        f4 = C.layers.Dense(96)(conv3)
        drop4 = C.layers.Dropout(0.5)(f4)
        z = C.layers.Dense(num_output_classes, activation=None)(drop4)

    ce = C.losses.cross_entropy_with_softmax(z, label_var)
    pe = C.metrics.classification_error(z, label_var)

    reader_train = create_reader(os.path.join(data_path, 'Train-28x28_cntk_text.txt'),
                                 True, input_dim, num_output_classes)

    # Set learning parameters
    lr_per_sample = [0.001]*10 + [0.0005]*10 + [0.0001]
    lr_schedule = C.learning_rate_schedule(lr_per_sample, C.learners.UnitType.sample, epoch_size)
    mm_time_constant = [0]*5 + [1024]
    mm_schedule = C.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size)

    # Instantiate the trainer object to drive the model training
    learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z); print()

    # Get minibatches of images to train with and perform model training
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count),
                                               input_map=input_map)  # fetch minibatch.
            trainer.train_minibatch(data)  # update model with it
            sample_count += data[label_var].num_samples  # count samples processed so far
        trainer.summarize_training_progress()
        z.save(os.path.join(model_path, "ConvNet_MNIST_{}.dnn".format(epoch)))

    # Load test data
    reader_test = create_reader(os.path.join(data_path, 'Test-28x28_cntk_text.txt'),
                                False, input_dim, num_output_classes)

    input_map = {
        input_var: reader_test.streams.features,
        label_var: reader_test.streams.labels
    }

    # Test data for trained model
    epoch_size = 10000
    minibatch_size = 1024

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)
        # Fetch next test min batch.
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
        # minibatch data to be trained with
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far.
        sample_count += data[label_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom
def deconv_mnist(max_epochs=3):
    """Train a tiny conv/deconv autoencoder on MNIST and return its average test RMSE.

    The network is conv -> max-pool -> max-unpool -> transposed conv; the loss is the
    RMSE between the reconstruction and the normalized input image.

    Args:
        max_epochs: number of training epochs; a checkpoint is saved after each and the
            last one is renamed to the canonical model file name.

    Returns:
        float: sample-weighted average RMSE over the test set.
    """
    image_height = 28
    image_width = 28
    num_channels = 1
    input_dim = image_height * image_width * num_channels
    num_output_classes = 10

    # Input variable and normalization (0.00390625 == 1/256 scales pixels into [0, 1))
    input_var = cntk.ops.input_variable((num_channels, image_height, image_width), np.float32)
    scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)

    # Define the auto encoder model
    cMap = 1
    conv1   = cntk.layers.Convolution2D ((5,5), cMap, pad=True, activation=cntk.ops.relu)(scaled_input)
    pool1   = cntk.layers.MaxPooling   ((4,4), (4,4))(conv1)
    unpool1 = cntk.layers.MaxUnpooling ((4,4), (4,4))(pool1, conv1)
    z       = cntk.layers.ConvolutionTranspose2D((5,5), num_channels, pad=True, bias=False, init=cntk.glorot_uniform(0.001))(unpool1)

    # RMSE loss between the reconstruction and the normalized input.
    # FIX: reuse scaled_input directly instead of rebuilding the identical
    # element_times(constant(0.00390625), input_var) node (as the original
    # in-code comment already noted it should).
    err       = cntk.ops.reshape(cntk.ops.minus(z, scaled_input), (784))
    sq_err    = cntk.ops.element_times(err, err)
    mse       = cntk.ops.reduce_mean(sq_err)
    rmse_loss = cntk.ops.sqrt(mse)
    rmse_eval = cntk.ops.sqrt(mse)

    reader_train = create_reader(os.path.join(data_path, 'Train-28x28_cntk_text.txt'), True, input_dim, num_output_classes)

    # training config
    epoch_size = 60000
    minibatch_size = 64

    # Set learning parameters
    lr_schedule = cntk.learning_rate_schedule([0.00015], cntk.learner.UnitType.sample, epoch_size)
    mm_schedule = cntk.learner.momentum_as_time_constant_schedule([600], epoch_size)

    # Instantiate the trainer object to drive the model training
    learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule, unit_gain=True)
    progress_printer = cntk.utils.ProgressPrinter(tag='Training')
    trainer = cntk.Trainer(z, (rmse_loss, rmse_eval), learner, progress_printer)

    # define mapping from reader streams to network inputs (labels unused: autoencoder)
    input_map = {
        input_var : reader_train.streams.features
    }

    cntk.utils.log_number_of_parameters(z) ; print()

    # Get minibatches of images to train with and perform model training
    for epoch in range(max_epochs):                         # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:                    # loop over minibatches in the epoch
            data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map)
            trainer.train_minibatch(data)                   # update model with it
            sample_count += data[input_var].num_samples     # count samples processed so far
        trainer.summarize_training_progress()
        z.save(os.path.join(model_path, "07_Deconvolution_PY_{}.model".format(epoch)))

    # Rename the last checkpoint to the canonical model name.
    # FIX: os.replace atomically overwrites an existing target (Python 3.3+),
    # replacing the non-atomic remove-then-rename dance.
    last_model_name  = os.path.join(model_path, "07_Deconvolution_PY_{}.model".format(max_epochs - 1))
    final_model_name = os.path.join(model_path, "07_Deconvolution_PY.model")
    os.replace(last_model_name, final_model_name)

    # Load test data
    reader_test = create_reader(os.path.join(data_path, 'Test-28x28_cntk_text.txt'), False, input_dim, num_output_classes)
    input_map = {
        input_var : reader_test.streams.features
    }

    # Test data for trained model
    epoch_size = 10000
    minibatch_size = 1024

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        requested = min(minibatch_size, epoch_size - sample_count)
        data = reader_test.next_minibatch(requested, input_map=input_map)
        # FIX: weight by the number of samples actually delivered, not the
        # requested size, so the weighted average stays exact on a short final batch.
        actual = data[input_var].num_samples
        metric_numer += trainer.test_minibatch(data) * actual
        metric_denom += actual
        sample_count += actual
        minibatch_index += 1

    print("")
    # FIX: minibatch_index already equals the number of minibatches processed;
    # the original printed minibatch_index + 1 (off by one).
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom
def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
    """Train a VGG-style 3D convolution network on UCF11 video clips and return its
    average test error rate.

    Args:
        train_reader: project video reader supplying training minibatches; must expose
            height/width/channel_count/sequence_length/label_count/size()/reset()/
            has_more()/next_minibatch().
        test_reader: same interface, supplying the held-out test clips.
        max_epochs: number of passes over the training data.

    Returns:
        float: minibatch-size-weighted average classification error over the test set.
    """
    # Replace 0 with 1 to get detailed log.
    set_computation_network_trace_level(0)

    # These values must match for both train and test reader.
    image_height       = train_reader.height
    image_width        = train_reader.width
    num_channels       = train_reader.channel_count
    sequence_length    = train_reader.sequence_length
    num_output_classes = train_reader.label_count

    # Input variables denoting the features and label data
    input_var = C.input_variable((num_channels, sequence_length, image_height, image_width), np.float32)
    label_var = C.input_variable(num_output_classes, np.float32)

    # Simple 3D Convolution network inspired by VGG and
    # http://vlg.cs.dartmouth.edu/c3d/c3d_video.pdf
    with C.default_options (activation=C.relu):
        z = C.layers.Sequential([
            C.layers.Convolution3D((3,3,3), 64, pad=True),
            C.layers.MaxPooling((1,2,2), (1,2,2)),
            C.layers.For(range(3), lambda i: [
                C.layers.Convolution3D((3,3,3), [96, 128, 128][i], pad=True),
                C.layers.Convolution3D((3,3,3), [96, 128, 128][i], pad=True),
                C.layers.MaxPooling((2,2,2), (2,2,2))
            ]),
            C.layers.For(range(2), lambda : [
                C.layers.Dense(1024),
                C.layers.Dropout(0.5)
            ]),
            C.layers.Dense(num_output_classes, activation=None)
        ])(input_var)

    # loss and classification error.
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    # training config
    train_epoch_size     = train_reader.size()
    train_minibatch_size = 2

    # Set learning parameters: stepped per-sample LR decay, fixed momentum time constant
    lr_per_sample          = [0.01]*10+[0.001]*10+[0.0001]
    lr_schedule            = C.learning_rate_schedule(lr_per_sample, epoch_size=train_epoch_size, unit=C.UnitType.sample)
    momentum_time_constant = 4096
    mm_schedule            = C.momentum_as_time_constant_schedule([momentum_time_constant])

    # Instantiate the trainer object to drive the model training
    learner = C.momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    log_number_of_parameters(z) ; print()

    # Get minibatches of images to train with and perform model training
    for epoch in range(max_epochs):                         # loop over epochs
        train_reader.reset()
        while train_reader.has_more():
            videos, labels, current_minibatch = train_reader.next_minibatch(train_minibatch_size)
            trainer.train_minibatch({input_var : videos, label_var : labels})
        trainer.summarize_training_progress()

    # Test data for trained model
    epoch_size          = test_reader.size()
    test_minibatch_size = 2

    # process minibatches and evaluate the model; the reader returns the actual
    # minibatch size, so weighting by current_minibatch is exact here.
    metric_numer    = 0
    metric_denom    = 0
    minibatch_index = 0

    test_reader.reset()
    while test_reader.has_more():
        videos, labels, current_minibatch = test_reader.next_minibatch(test_minibatch_size)
        metric_numer += trainer.test_minibatch({input_var : videos, label_var : labels}) * current_minibatch
        metric_denom += current_minibatch
        minibatch_index += 1

    print("")
    # FIX: minibatch_index already equals the number of minibatches processed;
    # the original printed minibatch_index + 1 (off by one).
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom