def test_epochsize_wrn_for_parameter_schedule():
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        C.learning_parameter_schedule(0.01, minibatch_size=1, epoch_size=1000)
        assert len(w) == 1
        assert issubclass(w[-1].category, RuntimeWarning)
        assert "epoch_size" in str(w[-1].message)
def test_ext_backpropstate(payload):

    class TestBackPropState(UserFunction):
        def __init__(self, arg, payload, name='f1'):
            self.payload = payload
            super(TestBackPropState, self).__init__([arg])

        def infer_outputs(self):
            return [C.output_variable(self.inputs[0].shape, self.inputs[0].dtype,
                                      self.inputs[0].dynamic_axes)]

        def forward(self, argument, device=None, outputs_to_retain=None):
            return self.payload, argument

        def backward(self, state, root_gradients):
            assert state == self.payload
            return root_gradients

    dim = 4

    p = C.parameter(shape=(dim,), init=10)
    in1 = C.input_variable(dim, needs_gradient=True, name='i_var')
    m = C.user_function(TestBackPropState(in1, payload))
    z = m + p

    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(None, (z), [C.sgd(z.parameters, lr_per_sample)])

    for i in range(100):
        input_data = np.random.rand(dim)
        trainer.train_minibatch({in1: [input_data]})
def test_model_not_criterion_subset():
    input_dim = 2
    proj_dim = 11
    model1_dim = 3
    model2_dim = 4
    x = sequence.input_variable((input_dim,))

    core = C.layers.Embedding(proj_dim)

    model1 = C.layers.Dense(model1_dim)(sequence.last(core(x)))
    model1_label = C.input_variable((model1_dim,))
    ce_model1 = cross_entropy_with_softmax(model1, model1_label)
    pe_model1 = classification_error(model1, model1_label)

    model2 = C.layers.Dense(model2_dim)(core(x))
    model2_label = sequence.input_variable((model2_dim,))
    ce_model2 = cross_entropy_with_softmax(model2, model2_label)
    pe_model2 = classification_error(model2, model2_label)

    ce = 0.5 * sequence.reduce_sum(ce_model2) + 0.5 * ce_model1

    lr_schedule = C.learning_parameter_schedule(0.003, minibatch_size=1)
    trainer_multitask = C.Trainer(model1, (ce, pe_model1), C.sgd(ce.parameters, lr=lr_schedule))

    x_data = np.asarray([[2., 1.], [1., 2.]], np.float32)
    model1_label_data = np.asarray([1., 0., 0.], np.float32)
    model2_label_data = np.asarray([[0., 1., 0., 0.], [0., 0., 0., 1.]], np.float32)
    trainer_multitask.train_minibatch({x: [x_data],
                                       model1_label: [model1_label_data],
                                       model2_label: [model2_label_data]})
def test_trainer_with_some_params_not_learned():
    input_dim = 2
    proj_dim = 2
    x = C.input_variable(shape=(input_dim,))
    W = parameter(shape=(input_dim, proj_dim), init=C.glorot_uniform())
    B = parameter(shape=(proj_dim,), init=C.glorot_uniform())
    t = times(x, W)
    z = t + B

    W_orig_value = W.value
    B_orig_value = B.value

    labels = C.input_variable(shape=(proj_dim,))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    lr_per_sample = C.learning_parameter_schedule(0.1, minibatch_size=1)
    trainer = C.Trainer(z, (ce, pe), C.sgd([W], lr_per_sample))

    x_value = [[1, 1], [2, 2]]
    label_value = [[0, 1], [1, 0]]
    arguments = {x: x_value, labels: label_value}

    num_iters = 3
    for i in range(num_iters):
        trainer.train_minibatch(arguments)

        assert np.array_equal(B.value, B_orig_value)
        assert not np.array_equal(W.value, W_orig_value)
        W_orig_value = W.value

    trainer.test_minibatch(arguments)
def test_empty_minibatch():
    scalar = C.input_variable((1,), dtype=np.float32, name='tscalar')
    op = scalar + parameter(init=np.asarray([1]), dtype=np.float32)

    lr_per_sample = C.learning_parameter_schedule(0.1, minibatch_size=1)
    trainer = C.Trainer(op, (op, None), C.sgd(op.parameters, lr_per_sample))
    trainer.train_minibatch({})
def create_trainer(network, epoch_size, num_quantization_bits, progress_printer):
    # Set learning parameters
    lr_per_mb = [0.01] * 20 + [0.001] * 20 + [0.0001] * 20 + [0.00001] * 10 + [0.000001]
    lr_schedule = C.learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size)
    mm_schedule = C.learners.momentum_schedule(0.9)
    l2_reg_weight = 0.0005  # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = C.learners.momentum_sgd(network['output'].parameters,
                                            lr_schedule, mm_schedule,
                                            unit_gain=False,
                                            l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe,
    # we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return C.Trainer(network['output'], (network['ce'], network['pe']),
                     parameter_learner, progress_printer)
def train(reader, model, loss_function, error_function, input_map,
          num_sweeps_to_train_with=10, num_samples_per_sweep=6000,
          minibatch_size=64, learning_rate=0.2):
    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    learner = C.sgd(model.parameters, lr_schedule)

    # Print progress
    progress_printer_stdout = ProgressPrinter(freq=minibatch_size)

    # Instantiate trainer
    trainer = C.Trainer(model, (loss_function, error_function), [learner],
                        progress_writers=progress_printer_stdout)

    # Start a timer
    start = time.time()
    aggregate_metric = 0
    total_samples = 0

    num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size
    for i in range(0, int(num_minibatches_to_train)):
        # Read a mini batch from the training data file
        data = reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(data)
        samples = trainer.previous_minibatch_sample_count
        aggregate_metric += trainer.previous_minibatch_evaluation_average * samples
        total_samples += samples

    # Print training time
    print("Training took {:.1f} sec".format(time.time() - start))
    print("Average error: {0:.2f}%".format((aggregate_metric * 100.0) / total_samples))
    return trainer
def create_trainer(self):
    try:
        p = self.output.parameters
        # Three of four parameters are learned by block_momentum_distributed_learner.
        bmd_learner = cntk.block_momentum_distributed_learner(
            cntk.momentum_sgd([p[0], p[1], p[2]],
                              cntk.learning_parameter_schedule(0.0001),
                              cntk.momentum_as_time_constant_schedule(1000)),
            block_size=1000,
            block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)

        # New API to mark which learner is to use for metric aggregation.
        bmd_learner.set_as_metric_aggregator()

        # The last parameter is learned by the data_parallel_distributed_learner.
        momentum_schedule = cntk.momentum_schedule_per_sample(0.9990913221888589)
        lr_per_sample = cntk.learning_parameter_schedule_per_sample(0.007)
        dpd_learner = cntk.data_parallel_distributed_learner(
            cntk.momentum_sgd([p[3]], lr_per_sample, momentum_schedule, True))

        comm_rank = cntk.distributed.Communicator.rank()
        self.trainer = cntk.Trainer(
            self.output, (self.ce, self.err), [bmd_learner, dpd_learner],
            [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
    except RuntimeError:
        self.trainer = None
    return
def test_trainer(tmpdir, no_eval_function):
    in1 = C.input_variable(shape=(1,))
    labels = C.input_variable(shape=(1,))
    p = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    if no_eval_function:
        errs = None
    else:
        errs = classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(z, (ce, errs),
                        [C.momentum_sgd(z.parameters, lr_per_sample,
                                        momentum_time_constant, True)])
    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    p = str(tmpdir / 'checkpoint.dat')
    external_state = {"additional external state": math.pi,
                      "nested dict": {"a": "b"},
                      "list": [1, 2, 3]}
    trainer.save_checkpoint(p, external_state)
    restored_state = trainer.restore_from_checkpoint(p)

    assert external_state == restored_state

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], C.Learner)
def main(params):
    # Create output and log directories if they don't exist
    if not os.path.isdir(params['output_folder']):
        os.makedirs(params['output_folder'])

    if not os.path.isdir(params['log_folder']):
        os.makedirs(params['log_folder'])

    # Create the network
    network = create_network()

    # Create readers
    train_reader = cbf_reader(os.path.join(params['input_folder'], 'train{}.cbf'.format(params['prefix'])),
                              is_training=True, max_samples=cntk.io.INFINITELY_REPEAT)
    cv_reader = cbf_reader(os.path.join(params['input_folder'], 'test{}.cbf'.format(params['prefix'])),
                           is_training=False, max_samples=cntk.io.FULL_DATA_SWEEP)
    test_reader = cbf_reader(os.path.join(params['input_folder'], 'test{}.cbf'.format(params['prefix'])),
                             is_training=False, max_samples=cntk.io.FULL_DATA_SWEEP)

    input_map = {
        network['input']: train_reader.streams.front,
        network['target']: train_reader.streams.label
    }

    # Create learner
    mm_schedule = momentum_schedule(0.90)
    lr_schedule = learning_parameter_schedule([(40, 0.1), (40, 0.01)],
                                              minibatch_size=params['minibatch_size'])
    learner = cntk.adam(network['model'].parameters, lr_schedule, mm_schedule,
                        l2_regularization_weight=0.0005,
                        epoch_size=params['epoch_size'],
                        minibatch_size=params['minibatch_size'])

    # Use TensorBoard for visual logging
    log_file = os.path.join(params['log_folder'], 'log.txt')
    pp_writer = cntk.logging.ProgressPrinter(freq=10, tag='Training',
                                             num_epochs=params['max_epochs'],
                                             log_to_file=log_file)
    tb_writer = cntk.logging.TensorBoardProgressWriter(freq=10,
                                                       log_dir=params['log_folder'],
                                                       model=network['model'])

    # Create trainer and training session
    trainer = Trainer(network['model'], (network['loss'], network['metric']),
                      [learner], [pp_writer, tb_writer])
    test_config = TestConfig(minibatch_source=test_reader,
                             minibatch_size=params['minibatch_size'],
                             model_inputs_to_streams=input_map)
    cv_config = CrossValidationConfig(minibatch_source=cv_reader,
                                      frequency=(1, DataUnit.sweep),
                                      minibatch_size=params['minibatch_size'],
                                      model_inputs_to_streams=input_map)
    checkpoint_config = CheckpointConfig(os.path.join(params['output_folder'], model_name),
                                         frequency=(10, DataUnit.sweep),
                                         restore=params['restore'])
    session = training_session(trainer=trainer,
                               mb_source=train_reader,
                               mb_size=params['minibatch_size'],
                               model_inputs_to_streams=input_map,
                               max_samples=params['epoch_size'] * params['max_epochs'],
                               progress_frequency=(1, DataUnit.sweep),
                               checkpoint_config=checkpoint_config,
                               cv_config=cv_config,
                               test_config=test_config)

    cntk.logging.log_number_of_parameters(network['model'])
    session.train()

    # Save the trained model
    path = os.path.join(params['output_folder'], 'final_model.dnn')
    network['model'].save(path)
    print('Saved final model to', path)
def train(reader, model_func, max_epochs=10, task='slot_tagging'):

    # Create the containers for input feature (x) and the label (y)
    x = C.sequence.input_variable(vocab_size)
    y = C.sequence.input_variable(num_labels)

    # Instantiate the model function; x is the input (feature) variable
    model = model_func(x)

    # Instantiate the loss and error function
    loss, label_error = create_criterion_function_preferred(model, y)

    # training config
    epoch_size = 18000  # 18000 samples is half the dataset size
    minibatch_size = 70

    # LR schedule over epochs
    # In CNTK, an epoch is how often we get out of the minibatch loop to
    # do other stuff (e.g. checkpointing, adjust learning rate, etc.)
    lr_per_sample = [3e-4] * 4 + [1.5e-4]
    lr_per_minibatch = [lr * minibatch_size for lr in lr_per_sample]
    lr_schedule = C.learning_parameter_schedule(lr_per_minibatch, epoch_size=epoch_size)

    # Momentum schedule
    momentums = C.momentum_schedule(0.9048374180359595, minibatch_size=minibatch_size)

    # We use the Adam optimizer which is known to work well on this dataset.
    # Feel free to try other optimizers from
    # https://www.cntk.ai/pythondocs/cntk.learner.html#module-cntk.learner
    learner = C.adam(parameters=model.parameters,
                     lr=lr_schedule,
                     momentum=momentums,
                     gradient_clipping_threshold_per_sample=15,
                     gradient_clipping_with_truncation=True)

    # Setup the progress updater
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)

    # Uncomment below for more detailed logging
    # progress_printer = ProgressPrinter(freq=100, first=10, tag='Training', num_epochs=max_epochs)

    # Instantiate the trainer
    trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)

    # process minibatches and perform model training
    C.logging.log_number_of_parameters(model)

    # Assign the data fields to be read from the input
    if task == 'slot_tagging':
        data_map = {x: reader.streams.query, y: reader.streams.slot_labels}
    else:
        data_map = {x: reader.streams.query, y: reader.streams.intent}

    t = 0
    for epoch in range(max_epochs):          # loop over epochs
        epoch_end = (epoch + 1) * epoch_size
        while t < epoch_end:                 # loop over minibatches on the epoch
            data = reader.next_minibatch(minibatch_size, input_map=data_map)  # fetch minibatch
            trainer.train_minibatch(data)    # update model with it
            t += data[y].num_samples         # samples so far
        trainer.summarize_training_progress()
def main():
    show_image = False
    sigma_r = 8
    grid_sz = 64

    if show_image:
        sz = 256
        n_chans = 3
        bs = 1
        data = skio.imread("/data/rgb.png").mean(2)[:sz, :sz].astype(np.float32)
        data = np.expand_dims(data / 255.0, 0)
        n_epochs = 1000
        lr = 0.001
    else:
        sz = 1024
        n_chans = 3
        bs = 4
        N = 4
        data = np.random.uniform(size=[N, sz, sz]).astype(np.float32)
        n_epochs = 50
        lr = 0.000000001

    imdata = np.tile(np.expand_dims(data, 1), [1, n_chans, 1, 1])

    im = C.input_variable([n_chans, sz, sz], needs_gradient=True)
    guide = C.input_variable([sz, sz], needs_gradient=True)
    guide_no_grad = C.input_variable([sz, sz], needs_gradient=False)
    model = BilateralSlice(sz, n_chans, n_chans, sigma_r=sigma_r, grid_sz=grid_sz)
    out = model(im, guide, guide_no_grad)

    svg = C.logging.graph.plot(out, "/output/graph.svg")

    if show_image:
        # --- Show output -----------------------------------------------------------
        inputs = {im: imdata[0], guide: data[0], guide_no_grad: data[0]}
        out_ = out.eval(inputs)
        out_ = np.clip(np.transpose(np.squeeze(out_), [1, 2, 0]), 0, 1)
        skio.imsave("/output/imout.png", out_)
    else:
        # --- Train -----------------------------------------------------------------
        loss = C.squared_error(out, im)

        C.debugging.profiler.start_profiler("/output/pyprof")
        C.debugging.profiler.enable_profiler()

        learner = C.sgd(model.parameters, C.learning_parameter_schedule(lr))
        progress_writer = C.logging.ProgressPrinter(0)

        begin = time.time()
        summary = loss.train((imdata, data, data),
                             parameter_learners=[learner],
                             callbacks=[progress_writer],
                             max_epochs=n_epochs,
                             minibatch_size=bs)
        end = time.time()
        runtime = (end - begin) * 1000.0 / n_epochs
        print('Runtime:', runtime)

        C.debugging.profiler.stop_profiler()
def test_sgd_with_noise():
    # Runs a network where the number of parameters is odd
    # in some layers. This tests that cuRand library will not
    # complain about generating an odd number of random values
    np.random.seed(98052)
    learner = lambda params: sgd(params,
                                 lr=C.learning_parameter_schedule(0.125),
                                 gaussian_noise_injection_std_dev=0.01)
    ffnet(learner)
    # We just verify that we did not crash
    assert(True)
def train(nonlinearity, num_hidden_layers, device_id, minibatch_size=10, num_samples=1000):
    from cntk.cntk_py import always_allow_setting_default_device
    always_allow_setting_default_device()
    C.try_set_default_device(cntk_device(device_id))

    np.random.seed(0)

    learning_rate = 0.5
    lr_schedule = C.learning_parameter_schedule(learning_rate)

    hidden_layers_dim = 50

    inp = C.input_variable((input_dim), np.float32)
    label = C.input_variable((num_output_classes), np.float32)

    z = fully_connected_classifier_net(inp, num_output_classes, hidden_layers_dim,
                                       num_hidden_layers, nonlinearity)

    loss = C.cross_entropy_with_softmax(z, label)
    eval_error = C.classification_error(z, label)

    learner = C.sgd(z.parameters, lr_schedule, minibatch_size=0)
    trainer = C.Trainer(z, (loss, eval_error), [learner])

    num_minibatches_to_train = int(num_samples / minibatch_size)

    training_progress_output_freq = 20

    losses = []
    errors = []

    for i in range(num_minibatches_to_train):
        features, labels = generate_random_data_sample(minibatch_size, input_dim,
                                                       num_output_classes)

        # Specify the input variables mapping in the model to actual minibatch
        # data for training.
        trainer.train_minibatch({inp: features, label: labels},
                                device=cntk_device(device_id))

        batchsize, loss, error = print_training_progress(trainer, i,
                                                         training_progress_output_freq)

        if not (loss == "NA" or error == "NA"):
            losses.append(loss)
            errors.append(error)

    return losses, errors
def test_universal():
    np.random.seed(98052)
    builtin_sgd = lambda params: sgd(params, lr=C.learning_parameter_schedule(0.125))
    builtin_last_avg_error, builtin_avg_error, _ = ffnet(builtin_sgd)

    np.random.seed(98052)
    my_sgd = lambda ps, gs: C.combine([C.assign(p, p - 0.125/25 * g) for p, g in zip(ps, gs)])
    universal_sgd = lambda params: universal(my_sgd, params)
    my_last_avg_error, my_avg_error, _ = ffnet(universal_sgd)

    assert np.all(np.less_equal(my_last_avg_error, builtin_last_avg_error))
    assert np.all(np.less_equal(my_avg_error, builtin_avg_error))
def train():
    model = Model()
    z, loss, acc = model.model()

    progress_writers = [
        C.logging.ProgressPrinter(num_epochs=max_epochs, freq=log_freq,
                                  tag='Training', log_to_file='log/log_' + version)
    ]
    lr = C.learning_parameter_schedule(learning_rate, minibatch_size=None, epoch_size=None)
    learner = C.adadelta(z.parameters, lr)
    trainer = C.Trainer(z, (loss, acc), learner, progress_writers)

    mb_source, input_map = deserialize(loss, train_data, model)
    mb_valid, valid_map = deserialize(loss, valid_data, model)

    try:
        trainer.restore_from_checkpoint('../model/' + version)
    except Exception:
        print('No checkpoint.')

    for epoch in range(max_epochs):
        # train
        num_seq = 0
        with tqdm(total=epoch_size, ncols=79) as progress_bar:
            while True:
                data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                progress_bar.update(trainer.previous_minibatch_sample_count)
                if num_seq >= epoch_size:
                    break
        trainer.summarize_training_progress()
        trainer.save_checkpoint('../model/' + version + '/' + str(epoch))

        # validation
        num_seq = 0
        with tqdm(total=num_validation, ncols=79) as valid_progress_bar:
            while True:
                data = mb_valid.next_minibatch(minibatch_size, input_map=valid_map)
                if not data:
                    break
                trainer.test_minibatch(data)
                num_seq += len(data)
                valid_progress_bar.update(len(data))
                if num_seq >= num_validation:
                    break
        trainer.summarize_test_progress()
def create_learner(model):
    '''Create the optimized method'''
    lr_per_minibatch = C.learning_parameter_schedule(opt.lr)
    momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589)
    if opt.optim == 'sgd':
        return C.sgd(model.parameters, lr=lr_per_minibatch)
    elif opt.optim == 'adam':
        return C.adam(model.parameters, lr=lr_per_minibatch, momentum=momentum_schedule)
    elif opt.optim == 'adagrad':
        return C.adagrad(model.parameters, lr=lr_per_minibatch)
    else:
        raise RuntimeError("Invalid optim method: " + opt.optim)
def test_scalar_loss_function():
    import cntk as C

    x = C.input_variable((1,))
    l = C.input_variable((2,))
    proj = C.layers.Dense(2)(x)
    loss = C.reduce_sum(C.cross_entropy_with_softmax(proj, l), axis=C.Axis.all_axes()) * 1.0

    lr_per_sample = C.learning_parameter_schedule(0.1, minibatch_size=1)
    trainer = C.Trainer(None, (loss, None), C.sgd(loss.parameters, lr_per_sample))

    result = trainer.train_minibatch({x: np.asarray([[.1], [-.1]], dtype=np.float32),
                                      l: np.asarray([[0, 1], [1, 0]], dtype=np.float32)})
    assert result
    assert trainer.total_number_of_samples_seen == 2
def train(self): tmp_d = {"x": [], "y": []} num_list = [] count = 0 for idx, value in enumerate(self.series): if idx % self.h_dims == 0: num_list = [] count += 1 if (self.h_dims * count) > len(self.series): break num_list.append(np.float32(value)) increment_list = [] for num in num_list: increment_list.append(num) tmp_d["x"].append(np.array(increment_list)) tmp_d["y"].append( np.array([np.float32(self.series[self.h_dims * count])])) x = {"train": tmp_d["x"]} y = {"train": np.array(tmp_d["y"])} z = self.create_model(self.input_node, self.h_dims) var_l = cntk.input_variable(1, dynamic_axes=z.dynamic_axes, name="y") learning_rate = 0.005 lr_schedule = cntk.learning_parameter_schedule(learning_rate) loss = cntk.squared_error(z, var_l) error = cntk.squared_error(z, var_l) momentum_schedule = cntk.momentum_schedule( 0.9, minibatch_size=self.batch_size) learner = cntk.fsadagrad(z.parameters, lr=lr_schedule, momentum=momentum_schedule) trainer = cntk.Trainer(z, (loss, error), [learner]) # training loss_summary = [] start = time.time() for epoch in range(0, self.epochs): for x_batch, l_batch in self.next_batch(x, y, "train", self.batch_size): trainer.train_minibatch({ self.input_node: x_batch, var_l: l_batch }) if epoch % (self.epochs / 10) == 0: training_loss = trainer.previous_minibatch_loss_average loss_summary.append(training_loss) print("epoch: {}, loss: {:.4f} [time: {:.1f}s]".format( epoch, training_loss, time.time() - start)) return z
def test_clone_freeze():
    inputs = 3
    outputs = 5

    features = C.input_variable((inputs), np.float32)
    label = C.input_variable((outputs), np.float32)
    weights = C.parameter((inputs, outputs))
    const_weights = C.constant(weights.value)
    z = C.times(features, weights)
    c = C.times(features, const_weights)
    z_clone = z.clone('freeze')
    c_clone = c.clone('freeze')

    # check that z and z_clone are the same
    for p, q in zip(z.parameters, z_clone.constants):
        assert np.array_equal(p.value, q.value)

    # check that c and c_clone are the same
    for p, q in zip(c.constants, c_clone.constants):
        assert np.array_equal(p.value, q.value)

    # keep copies of the old values
    z_copies = [q.value for q in z_clone.constants]
    c_copies = [q.value for q in c_clone.constants]

    # update z
    trainer = C.Trainer(z, C.squared_error(z, label),
                        C.sgd(z.parameters, C.learning_parameter_schedule(1.0)))
    x = np.random.randn(16, 3).astype('f')
    y = np.random.randn(16, 5).astype('f')
    trainer.train_minibatch({features: x, label: y})

    # update c
    for cc in c.constants:
        cc.value = np.random.randn(*cc.value.shape).astype('f')

    # check that z changed
    for p, q in zip(z.parameters, z_clone.constants):
        assert not np.array_equal(p.value, q.value)

    # check that z_clone did not change
    for p, q in zip(z_copies, z_clone.constants):
        assert np.array_equal(p, q.value)

    # check that c changed
    for p, q in zip(c.constants, c_clone.constants):
        assert not np.array_equal(p.value, q.value)

    # check that c_clone did not change
    for p, q in zip(c_copies, c_clone.constants):
        assert np.array_equal(p, q.value)
def lstm_basic(x, y, epochs=1000, batch_size=100, input_dim=5):

    x_axes = [C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()]
    C.input_variable(1, dynamic_axes=x_axes)

    # input sequences
    input_seq = C.sequence.input_variable(1)

    # create the model
    z = create_model(input_seq, input_dim)

    # expected output (label), also the dynamic axes of the model output
    # is specified as the model of the label input
    lb = C.input_variable(1, dynamic_axes=z.dynamic_axes, name="y")

    # the learning rate
    learning_rate = 0.02
    lr_schedule = C.learning_parameter_schedule(learning_rate)

    # loss function
    loss = C.squared_error(z, lb)

    # use squared error to determine error for now
    error = C.squared_error(z, lb)

    # use fsadagrad optimizer
    momentum_schedule = C.momentum_schedule(0.9, minibatch_size=batch_size)
    learner = C.fsadagrad(z.parameters,
                          lr=lr_schedule,
                          momentum=momentum_schedule,
                          unit_gain=True)
    trainer = C.Trainer(z, (loss, error), [learner])

    # train
    loss_summary = []
    start = time.time()
    for epoch in range(0, epochs):
        for x1, y1 in next_batch(x, y, "train", batch_size):
            trainer.train_minibatch({input_seq: x1, lb: y1})
        if epoch % (epochs / 10) == 0:
            training_loss = trainer.previous_minibatch_loss_average
            loss_summary.append(training_loss)
            print("epoch: {}, loss: {:.4f} [time: {:.1f}s]".format(
                epoch, training_loss, time.time() - start))

    print("training took {0:.1f} sec".format(time.time() - start))

    return z, trainer, input_seq
def trainAndTestOneFold(model, modelLabel, features, labels, features_test, labels_test):
    input = model.arguments[0]

    # Training
    loss = C.cross_entropy_with_softmax(model, modelLabel)
    eval_error = C.classification_error(model, modelLabel)

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    learner = C.sgd(model.parameters, lr_schedule)
    trainer = C.Trainer(model, (loss, eval_error), [learner])

    # Run the trainer and perform model training
    training_progress_output_freq = 20
    plotdata = {"batchsize": [], "loss": [], "error": []}

    for i in range(0, int(num_minibatches_to_train)):
        # Specify the input variables mapping in the model to actual minibatch data for training
        trainer.train_minibatch({input: features, modelLabel: labels})
        batchsize, loss, error = print_training_progress(trainer, i,
                                                         training_progress_output_freq,
                                                         verbose=0)
        if not (loss == "NA" or error == "NA"):
            plotdata["batchsize"].append(batchsize)
            plotdata["loss"].append(loss)
            plotdata["error"].append(error)

    # Compute the moving average loss to smooth out the noise in SGD
    plotdata["avgloss"] = moving_average(plotdata["loss"])
    plotdata["avgerror"] = moving_average(plotdata["error"])

    # Graph data
    #showGraphs(plotdata)

    trainer.test_minibatch({input: features_test, modelLabel: labels_test})

    out = C.softmax(model)
    predicted_label_probs = out.eval({input: features_test})

    true_labels = [np.argmax(label) for label in labels_test]
    predicted_labels = [np.argmax(row) for row in predicted_label_probs]

    classificationRate, confusionMatrix = computeMetrics(true_labels, predicted_labels)

    print("Label    :", true_labels)
    print("Predicted:", predicted_labels)
    print("Precision: ", classificationRate)
    print("Confusion Matrix:\n", confusionMatrix)

    return (classificationRate, confusionMatrix)
def create_trainer(self):
    learner = cntk.block_momentum_distributed_learner(
        cntk.momentum_sgd(self.output.parameters,
                          cntk.learning_parameter_schedule(0.0001),
                          cntk.momentum_as_time_constant_schedule(1000)),
        block_size=1000,
        block_learning_rate=0.01,
        block_momentum_as_time_constant=1000)

    comm_rank = cntk.distributed.Communicator.rank()
    self.trainer = cntk.Trainer(
        self.output, (self.ce, self.err), [learner],
        [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
def ffnet():
    inputs = 2
    outputs = 2
    layers = 2
    hidden_dimension = 50

    # input variables denoting the features and label data
    features = C.input_variable((inputs), np.float32)
    label = C.input_variable((outputs), np.float32)

    # Instantiate the feedforward classification model
    my_model = Sequential([
        Dense(hidden_dimension, activation=C.sigmoid),
        Dense(outputs)])
    z = my_model(features)

    ce = C.cross_entropy_with_softmax(z, label)
    pe = C.classification_error(z, label)

    # Instantiate the trainer object to drive the model training
    lr_per_minibatch = C.learning_parameter_schedule(0.125)
    progress_printer = ProgressPrinter(0)
    trainer = C.Trainer(z, (ce, pe),
                        [sgd(z.parameters, lr=lr_per_minibatch)],
                        [progress_printer])

    # Get minibatches of training data and perform model training
    minibatch_size = 25
    num_minibatches_to_train = 1024

    aggregate_loss = 0.0
    for i in range(num_minibatches_to_train):
        train_features, labels = generate_random_data(minibatch_size, inputs, outputs)
        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
        trainer.train_minibatch({features: train_features, label: labels})
        sample_count = trainer.previous_minibatch_sample_count
        aggregate_loss += trainer.previous_minibatch_loss_average * sample_count

    last_avg_error = aggregate_loss / trainer.total_number_of_samples_seen

    test_features, test_labels = generate_random_data(minibatch_size, inputs, outputs)
    avg_error = trainer.test_minibatch({features: test_features, label: test_labels})
    print(' error rate on an unseen minibatch: {}'.format(avg_error))
    return last_avg_error, avg_error
def test_learner_update():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w_init = 1
    w = parameter(shape=(1,), init=w_init)
    res = i * w

    learner = sgd(res.parameters,
                  lr=C.learning_parameter_schedule([0.1]*50 + [0.2]*50,
                                                   minibatch_size=1, epoch_size=1))
    assert learner.learning_rate() == 0.1
    x = learner.update({w: np.asarray([[2.]], dtype=np.float32)}, 100)
    assert learner.learning_rate() == 0.2
    assert w.value < w_init

    learner.reset_learning_rate(
        learning_parameter_schedule([0.3]*50 + [0.4]*50, minibatch_size=1, epoch_size=1))
    assert learner.learning_rate() == 0.3
    x = learner.update({w: np.asarray([[2.]], dtype=np.float32)}, 100)
    assert learner.learning_rate() == 0.4
def train(self, train_file, output_resources_pickle_file,
          network_type='unidirectional',
          num_epochs=1, batch_size=50,
          dropout=0.2, reg_alpha=0.0,
          num_hidden_units=150, num_layers=1):

    train_X, train_Y = self.reader.read_and_parse_training_data(train_file,
                                                                output_resources_pickle_file)

    print("Data Shape: ")
    print(train_X.shape)  # (15380, 613)
    print(train_Y.shape)  # (15380, 613, 8)
    # self.wordvecs.shape (66962, 50)

    print("Hyper parameters:")
    print("output_resources_pickle_file = {}".format(output_resources_pickle_file))
    print("network_type = {}".format(network_type))
    print("num_epochs = {}".format(num_epochs))
    print("batch_size = {}".format(batch_size))
    print("dropout = {}".format(dropout))
    print("reg_alpha = {}".format(reg_alpha))
    print("num_hidden_units = {}".format(num_hidden_units))
    print("num_layers = {}".format(num_layers))

    # Instantiate the model function
    features = C.sequence.input_variable(self.wordvecs.shape[0])
    labels = C.input_variable(train_Y.shape[2], dynamic_axes=[C.Axis.default_batch_axis()])

    self.model = self.__create_model(features, train_Y.shape[2], num_hidden_units, dropout)
    plot_path = "./lstm_model.png"
    plot(self.model, plot_path)

    # Instantiate the loss and error function
    loss = C.cross_entropy_with_softmax(self.model, labels)
    error = C.classification_error(self.model, labels)

    # LR schedule
    learning_rate = 0.02
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    momentum_schedule = C.momentum_schedule(0.9, minibatch_size=batch_size)
    learner = C.fsadagrad(self.model.parameters,
                          lr=lr_schedule,
                          momentum=momentum_schedule,
                          unit_gain=True)

    # Setup the progress updater
    progress_printer = C.logging.ProgressPrinter(freq=100, first=10, tag='Training',
                                                 num_epochs=num_epochs)

    # Instantiate the trainer. We have all data in memory.
    # https://github.com/Microsoft/CNTK/blob/master/Manual/Manual_How_to_feed_data.ipynb
    print('Start training')
    train_summary = loss.train((train_X.astype('float32'), train_Y.astype('float32')),
                               parameter_learners=[learner],
                               callbacks=[progress_printer])
def _create(self, hidden):
    observation = C.input_variable(STATE_COUNT, name="s")
    q_target = C.input_variable(ACTION_COUNT, name="q")

    model = C.layers.Dense(hidden, activation=C.relu)(observation)
    model = C.layers.Dense(ACTION_COUNT)(model)

    # loss='mse'
    loss = C.reduce_mean(C.square(model - q_target))  # , axis=0)

    # optimizer
    lr = 0.00025
    lr_schedule = C.learning_parameter_schedule(lr)
    learner = C.sgd(model.parameters, lr_schedule, gradient_clipping_threshold_per_sample=10)
    trainer = C.Trainer(model, (loss, None), learner)

    return model, trainer, loss
def _train(z, loss, eval_error, f_input, l_input, num_output_classes, steps):
    np.random.seed(0)

    input_dim = 2

    lr_schedule = C.learning_parameter_schedule(0.5)
    # now we want the learning rate to be compatible with the way it is used in the
    # literature, without the per-sample benefit:
    learner = sgd(z.parameters, lr_schedule, minibatch_size=C.learners.IGNORE)
    trainer = Trainer(z, (loss, eval_error), [learner])

    minibatch_size = 10

    for i in range(steps):
        features, labels = _generate_random_data_sample(minibatch_size, input_dim,
                                                        num_output_classes)
        trainer.train_minibatch({f_input: features, l_input: labels})
def batch_step(self, previous_minibatch_loss=None):
    """
    Updates the learner with a new learning rate after one training iteration is complete.
    Must be called once for every training iteration/update.
    """
    self.last_batch_iteration += 1
    lr = self.get_lr()
    self.current_lr = lr

    # loss and learning rate get recorded in pre-training mode
    if self.record_history and previous_minibatch_loss:
        self.loss.append(previous_minibatch_loss)
        self.lrs.append(lr)

    self.parameter_learner.reset_learning_rate(
        C.learning_parameter_schedule(lr, minibatch_size=self.minibatch_size))
    return None
def test_output_to_retain():
    in1 = C.input_variable(shape=(1,))
    labels = C.input_variable(shape=(1,))
    p = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(z, (ce, errs),
                        [C.momentum_sgd(z.parameters, lr_per_sample,
                                        momentum_time_constant, True)])
    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])
    assert np.allclose(var_map[z_output], np.asarray(in1_value) + 20)
def test_model_one_output_of_multi_output_function():
    input_dim = 2
    proj_dim = 11
    x = C.input_variable((input_dim,))

    x_placeholder = C.placeholder()
    w = parameter((input_dim, proj_dim))
    b = parameter((proj_dim,))
    proj = times(x_placeholder, w)
    proj_plus_bias = proj + b
    combined_model = as_block(C.combine([proj, proj_plus_bias]),
                              [(x_placeholder, x)], 'dense_op')

    labels = C.input_variable((proj_dim,))
    lr_schedule = C.learning_parameter_schedule(0.003, minibatch_size=1)
    ce = cross_entropy_with_softmax(combined_model.outputs[0], labels)
    pe = classification_error(combined_model.outputs[0], labels)
    trainer_multitask = C.Trainer(combined_model.outputs[0], (ce, pe),
                                  C.sgd(ce.parameters, lr=lr_schedule))
def test_ext_lambdafunc(tmpdir):
    dim = 4

    class CallbackCounter(object):
        def __init__(self):
            self.count = 0

        def inc(self, arg):
            self.count += 1

    cb = CallbackCounter()

    p = C.parameter(shape=(dim,), init=1)
    i = C.input_variable(dim, needs_gradient=True, name='i_var')
    k = i * p
    m = LambdaFunc(k,
                   when=lambda arg: np.sum(arg) > 1,
                   execute=cb.inc)
    m = C.user_function(m)
    z0 = m + 0

    filepath = str(tmpdir / 'test_ext_lambdafunc.dat')
    z0.save(filepath)

    Function.register_udf_deserialize_callback(
        'conditional_exec_lambda',
        lambda x, *unused: LambdaFunc(x,
                                      when=lambda arg: np.sum(arg) > 1,
                                      execute=cb.inc))

    z = Function.load(filepath)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(z, (z + 0, z + 0),
                        [C.momentum_sgd(z.parameters, lr_per_sample,
                                        momentum_time_constant, True)])

    i = 0
    input_data = 0.1 * np.ones(dim)
    trainer.train_minibatch([input_data])
    assert cb.count == 0

    input_data = 0.3 * np.ones(dim)
    trainer.train_minibatch([input_data])
    assert cb.count == 1
def test_ext_train(tmpdir):
    dim = 4

    p = C.parameter(shape=(dim,), init=10)
    i = C.sequence.input_variable(dim, needs_gradient=True, name='i_var')
    m = MyPlus(i, C.constant(3), 'my_plus')
    # keeping m unwrapped since we need to access its member variables
    z = C.user_function(m) + p

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(z, (z + 0, z + 0),
                        [C.momentum_sgd(z.parameters, lr_per_sample,
                                        momentum_time_constant, True,
                                        minibatch_size=0)])

    i = 0
    while i < 100:
        i += 1
        input_data = np.random.rand(dim)
        trainer.train_minibatch([input_data])

    assert m.forward_calls == m.backward_calls == 100

    filepath = str(tmpdir / 'test_ext_train.dat')

    z.save(filepath)

    buf = open(filepath, 'rb').read()

    # this is only needed for Python 2.7
    # (which does not distinguish between bytes and strings)
    if isinstance(buf, str):
        buf = bytearray(buf)

    z1 = Function.load(buf)

    m1 = z1.find_by_name('my_plus')
    # m1 is an instance of UserFunction, cannot directly downcast it to MyPlus,
    # using serialize as workaround:
    state = m1.serialize()['state']
    assert state['forward_calls'] == state['backward_calls'] == 100
def _train_backcompatible_test(z, loss, eval_error, f_input, l_input,
                               num_output_classes, steps):
    np.random.seed(0)

    input_dim = 2

    lr_schedule = learning_parameter_schedule(0.5)
    learner = sgd(z.parameters, lr_schedule)
    trainer = Trainer(z, (loss, eval_error), [learner])

    minibatch_size = 10

    for i in range(steps):
        features, labels = _generate_random_data_sample(minibatch_size, input_dim,
                                                        num_output_classes)
        trainer.train_minibatch({f_input: features, l_input: labels})
def test_udf_checkpointing(tmpdir):
    dev, w_value, c1_value, c2_value, op = build_test_function()

    label = C.constant(np.asarray([[1, 2], [3, 4]]).astype(np.float32))

    loss = C.cross_entropy_with_softmax(op, label)
    eval_error = C.classification_error(op, label)

    lr_schedule = C.learning_parameter_schedule(0.5)
    learner = C.sgd(op.parameters, lr_schedule, minibatch_size=0)
    trainer = C.Trainer(op, (loss, eval_error), [learner])

    trainer.train_minibatch({op.arguments[0]: np.random.random((2, 2)).astype(np.float32)},
                            device=dev)

    filepath = str(tmpdir / 'test_checkpointing.out')

    trainer.save_checkpoint(filepath, external_state={'test': 'test'})

    d = C.cntk_py.Dictionary.load(filepath)
    assert len(d.keys()) != 0
def run_distributed_training(tmpdir, create_func):

    in1 = sequence.input_variable(shape=1)
    labels = sequence.input_variable(shape=1)
    p = parameter(shape=2, init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, 1)
    dist_learner = create_func(C.momentum_sgd(z.parameters, lr_per_sample,
                                              momentum_time_constant, True))

    communicator = dist_learner.communicator()
    workers = communicator.workers()
    current_worker = communicator.current_worker()
    found_rank = False
    for wk in workers:
        if current_worker.global_rank == wk.global_rank:
            found_rank = True

    assert found_rank

    trainer = C.Trainer(z, (ce, errs), [dist_learner])
    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    p = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(p)
    trainer.restore_from_checkpoint(p)

    communicator.barrier()

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
def create_trainer(network, epoch_size, num_quantization_bits, printer,
                   block_size, warm_up, minibatch_size):
    # Set learning parameters
    lr_per_mb = [0.01]*25 + [0.001]*25 + [0.0001]*25 + [0.00001]*25 + [0.000001]
    lr_schedule = C.learning_parameter_schedule(lr_per_mb, minibatch_size=minibatch_size,
                                                epoch_size=epoch_size)
    mm_schedule = C.learners.momentum_schedule(0.9, minibatch_size=minibatch_size)
    l2_reg_weight = 0.0005  # CNTK L2 regularization is per sample, thus same as Caffe

    if block_size != None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    # Create learner
    local_learner = C.learners.momentum_sgd(network['output'].parameters,
                                            lr_schedule, mm_schedule,
                                            minibatch_size=minibatch_size,
                                            unit_gain=False,
                                            l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe,
    # we set unit_gain to False to ensure consistency

    # Create trainer
    if block_size != None:
        parameter_learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        parameter_learner = data_parallel_distributed_learner(local_learner,
                                                              num_quantization_bits=num_quantization_bits,
                                                              distributed_after=warm_up)

    return C.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, printer)
W = cntk.Parameter((input_dim, num_classes), init=cntk.glorot_uniform(), name='W')
b = cntk.Parameter((num_classes,), init=0, name='b')
model = cntk.times(data, W) + b

# Define the CNTK criterion function. A criterion function maps
# (input vectors, labels) to a loss function and an optional additional
# metric. The loss function is used to train the model parameters.
# We use cross entropy as a loss function.
label_one_hot = cntk.input_variable(num_classes, is_sparse=True)
loss = cntk.cross_entropy_with_softmax(model, label_one_hot)  # this applies softmax to model's output under the hood
metric = cntk.classification_error(model, label_one_hot)
criterion = cntk.combine([loss, metric])  # criterion is a tuple-valued function (loss, metric)

# Learner object. The learner implements the update algorithm, in this case plain SGD.
learning_rate = 0.1
learner = cntk.sgd(model.parameters, cntk.learning_parameter_schedule(learning_rate))

# Trainer.
minibatch_size = 32
progress_writer = cntk.logging.ProgressPrinter(50)  # helper for logging progress; log every 50 minibatches
trainer = cntk.Trainer(None, criterion, [learner], [progress_writer])

# Train!
for i in range(0, len(X_train), minibatch_size):       # loop over minibatches
    x = X_train[i:i+minibatch_size]                    # get one minibatch worth of data
    y = Y_train[i:i+minibatch_size]
    trainer.train_minibatch({data: x, label_one_hot: y})  # update model from one minibatch
trainer.summarize_training_progress()

# Test error rate on the test set.
evaluator = cntk.Evaluator(metric, [progress_writer])
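# A minimal evaluation-loop sketch (not part of the original snippet): it assumes
# X_test / Y_test arrays exist analogously to X_train / Y_train above, and drives
# the evaluator created on the previous line over the held-out data.
for i in range(0, len(X_test), minibatch_size):              # loop over test minibatches
    x = X_test[i:i+minibatch_size]
    y = Y_test[i:i+minibatch_size]
    evaluator.test_minibatch({data: x, label_one_hot: y})    # accumulate the metric
evaluator.summarize_test_progress()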
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    progress_writers = [C.logging.ProgressPrinter(
                            num_epochs=max_epochs,
                            freq=log_freq,
                            tag='Training',
                            log_to_file=log_file,
                            rank=C.Communicator.rank(),
                            gen_heartbeat=gen_heartbeat)]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir='log', model=z)
    trainer = C.Trainer(z, (loss, None), learner, tensorboard_writer)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {
        'best_val_err': 100,
        'best_since':   0,
        'val_since':    0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        # after restore always re-evaluate
        epoch_stat['best_val_err'] = validate_model(os.path.join(data_path, training_config['val_data']),
                                                    model, polymath)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(os.path.join(data_path, training_config['val_data']),
                                     model, polymath)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                trainer.save_checkpoint(model_file)
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)

        minibatch_size = training_config['minibatch_size']  # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(minibatch_size * C.Communicator.num_workers(),
                                                    input_map=input_map,
                                                    num_data_partitions=C.Communicator.num_workers(),
                                                    partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                dummy.eval()
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config['minibatch_seqs']  # number of sequences

        for epoch in range(max_epochs):       # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
def mem_leak_check(nonlinearity, num_hidden_layers, device_id, minibatch_size=1, num_samples=10000):
    from cntk.cntk_py import always_allow_setting_default_device
    always_allow_setting_default_device()
    C.try_set_default_device(cntk_device(device_id))
    np.random.seed(0)

    learning_rate = 0.5
    lr_schedule = C.learning_parameter_schedule(learning_rate)

    hidden_layers_dim = 50

    inp = C.input_variable((input_dim), np.float32)
    label = C.input_variable((num_output_classes), np.float32)

    z = fully_connected_classifier_net(inp, num_output_classes, hidden_layers_dim,
                                       num_hidden_layers, nonlinearity)

    loss = C.cross_entropy_with_softmax(z, label)
    eval_error = C.classification_error(z, label)

    learner = C.sgd(z.parameters, lr_schedule, minibatch_size=0)
    trainer = C.Trainer(z, (loss, eval_error), [learner])

    num_minibatches_to_train = int(num_samples / minibatch_size)

    mem = np.zeros(num_minibatches_to_train)

    features, labels = generate_random_data_sample(minibatch_size, input_dim,
                                                   num_output_classes)

    # Set a maximum fraction of iterations, in which the memory is allowed to
    # increase. Most likely these will be the first training runs.
    # Long-term this test needs to be run in a separate process over a longer
    # period of time.
    MEM_INCREASE_FRACTION_TOLERANCE = 0.01
    # Set a maximum allowed memory increase. This tolerance should not be
    # exceeded when run as a standalone process (simply run this file with the
    # Python executable).
    MEM_INCREASE_TOLERANCE = 10*1024

    dev = cntk_device(device_id)
    i = 0
    proc = os_process()
    while i < num_minibatches_to_train:
        mem[i] = mem_used(proc)

        # Specify the input variables mapping in the model to actual minibatch
        # data for training.
        trainer.train_minibatch({inp: features, label: labels}, device=dev)
        i += 1

    mem_deltas = np.diff(mem)
    iterations_with_mem_increase = (mem_deltas > 0).sum()
    mem_inc_fraction = iterations_with_mem_increase / num_minibatches_to_train
    mem_diff = mem[-1] - mem[10]

    if mem_inc_fraction > MEM_INCREASE_FRACTION_TOLERANCE and \
            mem_diff > MEM_INCREASE_TOLERANCE:
        # For the rough leak estimation we take the memory footprint after the
        # dust of the first train_minibatch runs has settled.
        mem_changes = mem_deltas[mem_deltas != 0]
        raise ValueError('Potential memory leak of ~ %i KB (%i%% of MBs '
                         'increased memory usage) detected with %s:\n%s' %
                         (int(mem_diff/1024), int(mem_inc_fraction*100),
                          nonlinearity, mem_changes))
def create_trainer(self):
    try:
        learner = cntk.block_momentum_distributed_learner(
            cntk.momentum_sgd(self.output.parameters,
                              cntk.learning_parameter_schedule(0.0001),
                              cntk.momentum_as_time_constant_schedule(1000)),
            block_size=1000,
            block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)

        comm_rank = cntk.distributed.Communicator.rank()
        self.trainer = cntk.Trainer(
            self.output, (self.ce, self.err), [learner],
            [cntk.logging.ProgressPrinter(freq=progress_freq, tag="Training", rank=comm_rank)])
    except RuntimeError:
        self.trainer = None
    return
def one_step_sgd(loss, data, lr=0.1):
    learner = C.sgd(loss.parameters, C.learning_parameter_schedule(lr))
    trainer = C.train.Trainer(loss, (loss, loss), learner,
                              C.logging.ProgressPrinter(freq=0))
    trainer.train_minibatch(data)
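# Hypothetical usage sketch (not from the original source): drives one_step_sgd on a
# tiny scalar regression loss. The names x, y, and w are illustrative only.
def _one_step_sgd_example():
    x = C.input_variable(1)
    y = C.input_variable(1)
    w = C.parameter(shape=(1,))
    loss = C.squared_error(x * w, y)
    one_step_sgd(loss, {x: np.asarray([[1.0]], dtype=np.float32),
                        y: np.asarray([[2.0]], dtype=np.float32)})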