def build_graph(noise_shape, image_shape, G_progress_printer, D_progress_printer):
    """Build the GAN graph with one data-parallel trainer per network.

    Args:
        noise_shape: shape of the noise input fed to ``generator``.
        image_shape: shape of the real-image input.
        G_progress_printer: progress writer passed to the generator trainer.
        D_progress_printer: progress writer passed to the discriminator trainer.

    Returns:
        The tuple ``(X_real, X_fake, Z, G_trainer, D_trainer)``.
    """
    batch_axes = [C.Axis.default_batch_axis()]
    Z = C.input_variable(noise_shape, dynamic_axes=batch_axes)
    X_real = C.input_variable(image_shape, dynamic_axes=batch_axes)
    # Affine rescale x -> 2 * (x / 255) - 1, i.e. 0..255 is mapped onto -1..1.
    X_real_scaled = 2*(X_real / 255.0) - 1.0

    # Generator output, discriminator applied to the real input, and a
    # parameter-sharing clone of the discriminator rewired to the generator.
    X_fake = generator(Z)
    D_real = discriminator(X_real_scaled)
    D_fake = D_real.clone(
        method='share',
        substitutions={X_real_scaled.output: X_fake.output},
    )

    # Loss expressions for the two networks.
    G_loss = 1.0 - C.log(D_fake)
    D_loss = -(C.log(D_real) + C.log(1.0 - D_fake))

    def _fsadagrad_for(network):
        # Both networks use identical optimizer settings; `lr` is read from
        # the enclosing module.
        return C.fsadagrad(
            parameters=network.parameters,
            lr=C.learning_parameter_schedule_per_sample(lr),
            momentum=C.momentum_schedule_per_sample(0.9985724484938566),
        )

    G_learner = _fsadagrad_for(X_fake)
    D_learner = _fsadagrad_for(D_real)

    # A learner only has to be marked as the metric aggregator (via
    # set_as_metric_aggregator()) when several learners are handed to a
    # *single* trainer. Here each trainer owns exactly one learner, so the
    # call is not needed; add it before creating the trainer if both learners
    # are ever combined under one trainer.
    make_distributed = C.train.distributed.data_parallel_distributed_learner
    DistG_learner = make_distributed(G_learner)
    DistD_learner = make_distributed(D_learner)

    # One trainer per network, each optimizing its own loss.
    G_trainer = C.Trainer(X_fake, (G_loss, None), DistG_learner, G_progress_printer)
    D_trainer = C.Trainer(D_real, (D_loss, None), DistD_learner, D_progress_printer)

    return X_real, X_fake, Z, G_trainer, D_trainer
def train_lm(testing=False):
    """Train the language model and return the last measured cross entropy.

    Args:
        testing: when true, the per-epoch model checkpoint is skipped.

    Returns:
        The most recent value produced by ``average_cross_entropy`` (0 if no
        progress report was ever triggered).
    """
    data = DataReader(token_to_id_path, segment_sepparator)

    # Input nodes for the source and target token sequences.
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # The model exposes three outputs:
    #   z             - the input to the softmax
    #   cross_entropy - the training criterion
    #   error         - indicator of whether the predicted token is correct
    z, cross_entropy, error = create_model(
        input_sequence, label_sequence, data.vocab_dim, hidden_dim)

    # Measurements are taken with the built-in full softmax.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    log_number_of_parameters(z)
    print()

    # Learner and trainer setup.
    learner = momentum_sgd(
        z.parameters,
        C.learning_parameter_schedule_per_sample(learning_rate),
        C.momentum_schedule_per_sample(momentum_per_sample),
        gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
        gradient_clipping_with_truncation=True)
    trainer = Trainer(z, (cross_entropy, error), learner)

    total_seen = 0
    seen_since_report = 0
    last_avg_ce = 0

    for epoch_count in range(num_epochs):
        batches = data.minibatch_generator(
            train_file_path, sequence_length, sequences_per_batch)
        for features, labels, token_count in batches:
            feed = {input_sequence: features, label_sequence: labels}

            t_start = timeit.default_timer()
            trainer.train_minibatch(feed)
            elapsed = timeit.default_timer() - t_start
            samples_per_second = token_count / elapsed

            # Report on the very first minibatch and then whenever enough
            # samples have accumulated since the previous report.
            report_due = (seen_since_report >= num_samples_between_progress_report
                          or total_seen == 0)
            if report_due:
                last_avg_ce = average_cross_entropy(
                    full_ce, input_sequence, label_sequence, data)
                print_progress(samples_per_second, last_avg_ce, total_seen, t_start)
                seen_since_report = 0

            total_seen += token_count
            seen_since_report += token_count

        if testing:
            continue

        # Checkpoint the model at the end of every epoch.
        model_filename = "models/lm_epoch%d.dnn" % epoch_count
        z.save(model_filename)
        print("Saved model to '%s'" % model_filename)

    return last_avg_ce
def create_trainer(self):
    """Create ``self.trainer`` from two distributed learners.

    The first three parameters of ``self.output`` are handled by a
    block-momentum learner (which is also marked as the metric aggregator);
    the fourth is handled by a data-parallel learner. If construction raises
    ``RuntimeError``, ``self.trainer`` is set to ``None`` instead.
    """
    try:
        params = self.output.parameters

        # Block-momentum learner for parameters 0..2.
        block_inner = cntk.momentum_sgd(
            [params[i] for i in range(3)],
            cntk.learning_parameter_schedule(0.0001),
            cntk.momentum_as_time_constant_schedule(1000))
        bmd_learner = cntk.block_momentum_distributed_learner(
            block_inner,
            block_size=1000,
            block_learning_rate=0.01,
            block_momentum_as_time_constant=1000)
        # With more than one learner, one of them has to be designated for
        # metric aggregation.
        bmd_learner.set_as_metric_aggregator()

        # Data-parallel learner for the remaining parameter.
        momentum_schedule = cntk.momentum_schedule_per_sample(
            0.9990913221888589)
        lr_per_sample = cntk.learning_parameter_schedule_per_sample(0.007)
        parallel_inner = cntk.momentum_sgd(
            [params[3]], lr_per_sample, momentum_schedule, True)
        dpd_learner = cntk.data_parallel_distributed_learner(parallel_inner)

        printer = cntk.logging.ProgressPrinter(
            freq=progress_freq,
            tag="Training",
            rank=cntk.distributed.Communicator.rank())

        self.trainer = cntk.Trainer(
            self.output,
            (self.ce, self.err),
            [bmd_learner, dpd_learner],
            [printer])
    except RuntimeError:
        self.trainer = None
def test_htk_deserializers():
    """Smoke test: train three minibatches read through the HTK deserializers.

    The test passes as long as nothing raises. The working directory is
    switched to ``data_path`` because the input files are referenced by
    relative name, and it is always switched back to ``abs_path`` afterwards,
    even when the test body fails, so later tests are not affected.
    """
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]

    feature_dim = 33
    num_classes = 132
    context = 2

    os.chdir(data_path)
    try:
        features_file = "glob_0000.scp"
        labels_file = "glob_0000.mlf"
        label_mapping_file = "state.list"

        fd = HTKFeatureDeserializer(
            StreamDefs(amazing_features=StreamDef(
                shape=feature_dim,
                context=(context, context),
                scp=features_file)))

        ld = HTKMLFDeserializer(
            label_mapping_file,
            StreamDefs(
                awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))

        reader = MinibatchSource([fd, ld])

        # Each frame is concatenated with `context` frames on either side.
        features = C.sequence.input_variable(((2 * context + 1) * feature_dim))
        labels = C.sequence.input_variable((num_classes))

        model = Sequential(
            [For(range(3), lambda: Recurrence(LSTM(256))), Dense(num_classes)])
        z = model(features)
        ce = C.cross_entropy_with_softmax(z, labels)
        errs = C.classification_error(z, labels)

        learner = C.fsadagrad(
            z.parameters,
            lr=C.learning_parameter_schedule_per_sample(lr, epoch_size=epoch_size),
            momentum=C.momentum_schedule_per_sample(0.9990913221888589),
            gradient_clipping_threshold_per_sample=15,
            gradient_clipping_with_truncation=True)
        progress_printer = C.logging.ProgressPrinter(freq=0)
        trainer = C.Trainer(z, (ce, errs), learner, progress_printer)

        input_map = {
            features: reader.streams.amazing_features,
            labels: reader.streams.awesome_labels
        }

        # Just run and verify it doesn't crash.
        for _ in range(3):
            mb_data = reader.next_minibatch(mbsize, input_map=input_map)
            trainer.train_minibatch(mb_data)
    finally:
        # Restore the working directory whether or not the body succeeded.
        os.chdir(abs_path)
def create_learner(model):
    '''Return the learner selected by ``opt.optim`` for ``model``.

    Supported values are 'sgd', 'adam' and 'adagrad'; any other value raises
    RuntimeError. The learning rate is taken from ``opt.lr``.
    '''
    lr_schedule = C.learning_parameter_schedule(opt.lr)
    momentum = C.momentum_schedule_per_sample(0.9990913221888589)

    # Map each supported optimizer name to a deferred constructor so only the
    # selected learner is actually built.
    builders = {
        'sgd': lambda: C.sgd(model.parameters, lr=lr_schedule),
        'adam': lambda: C.adam(model.parameters, lr=lr_schedule,
                               momentum=momentum),
        'adagrad': lambda: C.adagrad(model.parameters, lr=lr_schedule),
    }

    if opt.optim not in builders:
        raise RuntimeError("Invalid optim method: " + opt.optim)
    return builders[opt.optim]()
def create_learner(model):
    '''Return a momentum-SGD learner with gradient clipping for ``model``.

    The optimizer name, learning rate, momentum and clipping settings are
    fixed inside the function.
    '''
    optim = "momentum_sgd"
    lr = 0.001

    lr_per_sample = C.learning_parameter_schedule_per_sample(lr)
    momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589)

    # Only one optimizer is wired up; anything else yields None.
    if optim != 'momentum_sgd':
        return None

    return C.momentum_sgd(
        model.parameters,
        lr_per_sample,
        momentum_schedule,
        gradient_clipping_threshold_per_sample=5.0,
        gradient_clipping_with_truncation=True)
def Loss(self):
    """Build the combined actor/critic loss and an Adam trainer for it.

    Returns:
        ``(loss, chunk, trainer)`` where ``loss`` is the sum of the actor and
        critic losses, ``chunk`` maps names to the intermediate loss nodes,
        and ``trainer`` optimizes ``loss`` using ``self.lr`` and
        ``self.betas``.
    """
    # Outputs of the policy for the stored actions and states.
    logprobs, state_value, dist_entropy = self.policy.evaluate()

    # Probability ratio pi_theta / pi_theta_old (importance sampling); the
    # old log-probabilities are supplied as an input and receive no gradient.
    old_logprobs_in = C.input_variable(logprobs.shape, name='old_log_probs')
    ratios = C.exp(logprobs - C.stop_gradient(old_logprobs_in))

    rewards_in = C.input_variable(1, name='rewards')
    advantages = rewards_in - C.stop_gradient(state_value)

    # Surrogate objective: element-wise minimum of the unclipped and the
    # clipped ratio, each scaled by the advantage.
    unclipped = ratios * advantages
    clipped_ratios = C.clip(ratios, 1 - self.eps_clip, 1 + self.eps_clip)
    clipped = clipped_ratios * advantages
    neglog_loss = -C.element_min(unclipped, clipped)
    entropy_loss = -0.01 * dist_entropy

    actor_loss = C.reduce_mean(neglog_loss + entropy_loss)
    critic_loss = 0.5 * C.reduce_mean(C.square(state_value - rewards_in))
    loss = actor_loss + critic_loss

    chunk = dict(
        neglog_loss=neglog_loss,
        entropy_loss=entropy_loss,
        actor_loss=actor_loss,
        critic_loss=critic_loss,
    )

    optimizer = C.adam(
        loss.parameters,
        C.learning_parameter_schedule_per_sample(self.lr),
        C.momentum_schedule_per_sample(self.betas[0]),
        variance_momentum=C.momentum_schedule_per_sample(self.betas[1]))
    trainer = C.Trainer(loss, (loss, None), optimizer)

    return loss, chunk, trainer
def test_htk_deserializers():
    """Smoke test for the HTK feature and MLF deserializers.

    Builds a reader over the HTK files in ``data_path``, trains a small
    recurrent model for three minibatches, and passes if nothing raises.
    The working directory change is undone in a ``finally`` clause so a
    failing run does not leave the process inside ``data_path``.
    """
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]

    feature_dim = 33
    num_classes = 132
    context = 2

    # The file names below are relative to the data directory.
    os.chdir(data_path)
    try:
        features_file = "glob_0000.scp"
        labels_file = "glob_0000.mlf"
        label_mapping_file = "state.list"

        fd = HTKFeatureDeserializer(StreamDefs(
            amazing_features=StreamDef(shape=feature_dim,
                                       context=(context, context),
                                       scp=features_file)))

        ld = HTKMLFDeserializer(label_mapping_file, StreamDefs(
            awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))

        reader = MinibatchSource([fd, ld])

        # The feature width covers the frame plus `context` frames per side.
        features = C.sequence.input_variable(((2*context+1)*feature_dim))
        labels = C.sequence.input_variable((num_classes))

        model = Sequential([For(range(3), lambda: Recurrence(LSTM(256))),
                            Dense(num_classes)])
        z = model(features)
        ce = C.cross_entropy_with_softmax(z, labels)
        errs = C.classification_error(z, labels)

        learner = C.fsadagrad(
            z.parameters,
            lr=C.learning_parameter_schedule_per_sample(lr, epoch_size=epoch_size),
            momentum=C.momentum_schedule_per_sample(0.9990913221888589),
            gradient_clipping_threshold_per_sample=15,
            gradient_clipping_with_truncation=True)
        progress_printer = C.logging.ProgressPrinter(freq=0)
        trainer = C.Trainer(z, (ce, errs), learner, progress_printer)

        input_map = {
            features: reader.streams.amazing_features,
            labels: reader.streams.awesome_labels
        }

        # Just run and verify it doesn't crash.
        for _ in range(3):
            mb_data = reader.next_minibatch(mbsize, input_map=input_map)
            trainer.train_minibatch(mb_data)
    finally:
        # Always return to the original directory, even after a failure.
        os.chdir(abs_path)
def train_lm(testing=False):
    """Run language-model training; return the last reported cross entropy.

    Args:
        testing: if true, no model file is written at the end of each epoch.

    Returns:
        The latest average cross entropy computed for a progress report, or
        0 when no report was produced.
    """
    data = DataReader(token_to_id_path, segment_sepparator)

    # Source and target input nodes.
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # create_model yields three nodes: the softmax input (z), the training
    # criterion (cross_entropy) and the correct-token indicator (error).
    z, cross_entropy, error = create_model(input_sequence, label_sequence,
                                           data.vocab_dim, hidden_dim)

    # The built-in full softmax is used for measurement.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    # Print some useful training information.
    log_number_of_parameters(z)
    print()

    # Trainer that drives the optimization.
    lr_sched = C.learning_parameter_schedule_per_sample(learning_rate)
    mom_sched = C.momentum_schedule_per_sample(momentum_per_sample)
    learner = momentum_sgd(
        z.parameters,
        lr_sched,
        mom_sched,
        gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
        gradient_clipping_with_truncation=True)
    trainer = Trainer(z, (cross_entropy, error), learner)

    # Running sample counters for the training loop.
    samples_total = 0
    samples_since_report = 0
    last_avg_ce = 0

    for epoch_count in range(num_epochs):
        for features, labels, token_count in data.minibatch_generator(
                train_file_path, sequence_length, sequences_per_batch):
            minibatch = {input_sequence: features, label_sequence: labels}

            t_start = timeit.default_timer()
            trainer.train_minibatch(minibatch)
            t_end = timeit.default_timer()
            samples_per_second = token_count / (t_end - t_start)

            # A report is printed for the first minibatch and again each time
            # num_samples_between_progress_report samples have been trained.
            if (samples_since_report >= num_samples_between_progress_report
                    or samples_total == 0):
                av_ce = average_cross_entropy(full_ce, input_sequence,
                                              label_sequence, data)
                print_progress(samples_per_second, av_ce, samples_total, t_start)
                samples_since_report = 0
                last_avg_ce = av_ce

            samples_total += token_count
            samples_since_report += token_count

        if not testing:
            # Save the model after each epoch.
            model_filename = "models/lm_epoch%d.dnn" % epoch_count
            z.save(model_filename)
            print("Saved model to '%s'" % model_filename)

    return last_avg_ce
results = re.findall("Completed successfully.", str_out) if len(results) != 2: print(str_out) assert False if __name__=='__main__': in1 = C.input_variable(shape=1) labels = C.input_variable(shape=1) p1 = parameter(shape=1) p2 = parameter(shape=1) n = plus(in1, p1, name='n') z = plus(n, p2, name='z') ce = squared_error(z, labels) momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589) lr_per_sample = C.learning_parameter_schedule_per_sample(0.007) dist_learners = [ C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p1], lr_per_sample, momentum_schedule, True)), C.distributed.data_parallel_distributed_learner(C.momentum_sgd([p2], lr_per_sample, momentum_schedule, True)) ] trainer = C.Trainer(z, ce, dist_learners) in1_value = [[1]] label_value = [[0]] arguments = {in1: in1_value, labels: label_value} z_output = z.output def check_samples(learners, expected_number_of_samples): for learner in learners: if learner.total_number_of_samples_seen != expected_number_of_samples:
def build_graph(noise_shape, image_shape, G_progress_printer, D_progress_printer):
    '''
    Wire up the GAN training graph and return its inputs and trainers.

    Two aspects of GAN training shape this graph.

    First, the discriminator has to score both real images and images
    produced by the generator. This is expressed by cloning the
    discriminator output with its input substituted by the generator output;
    method='share' makes both paths use the same set of parameters.

    Second, the generator and the discriminator are updated separately, each
    from its own loss. Every learner therefore receives only the parameters
    of its own network (via the `parameters` attribute), so updating the
    generator leaves the discriminator's parameters unchanged and vice versa.

    Returns the tuple (X_real, X_fake, Z, G_trainer, D_trainer).
    '''
    dyn_axes = [C.Axis.default_batch_axis()]
    Z = C.input_variable(noise_shape, dynamic_axes=dyn_axes)
    X_real = C.input_variable(image_shape, dynamic_axes=dyn_axes)
    # Rescale x -> 2 * (x / 255) - 1, so 0..255 lands on -1..1.
    X_real_scaled = 2*(X_real / 255.0) - 1.0

    # Model functions for the generator and the discriminator.
    X_fake = generator(Z)
    D_real = discriminator(X_real_scaled)
    shared_inputs = {X_real_scaled.output: X_fake.output}
    D_fake = D_real.clone(method='share', substitutions=shared_inputs)

    # Loss functions.
    G_loss = 1.0 - C.log(D_fake)
    D_loss = -(C.log(D_real) + C.log(1.0 - D_fake))

    # Optimizers: same settings for both networks, different parameter sets.
    momentum_value = 0.9985724484938566
    G_learner = C.fsadagrad(
        parameters=X_fake.parameters,
        lr=C.learning_parameter_schedule_per_sample(lr),
        momentum=C.momentum_schedule_per_sample(momentum_value),
    )
    D_learner = C.fsadagrad(
        parameters=D_real.parameters,
        lr=C.learning_parameter_schedule_per_sample(lr),
        momentum=C.momentum_schedule_per_sample(momentum_value),
    )

    # Trainers.
    G_trainer = C.Trainer(X_fake, (G_loss, None), G_learner, G_progress_printer)
    D_trainer = C.Trainer(D_real, (D_loss, None), D_learner, D_progress_printer)

    return X_real, X_fake, Z, G_trainer, D_trainer
def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
    """Train a 3D-convolution classifier and return its test error rate.

    Args:
        train_reader: source of training minibatches; also supplies the input
            geometry (height, width, channel_count, sequence_length) and the
            number of labels. These must match ``test_reader``.
        test_reader: source of evaluation minibatches.
        max_epochs: number of passes over the training data.

    Returns:
        The sample-weighted average of ``trainer.test_minibatch`` over the
        test set.

    Raises:
        ZeroDivisionError: if the test reader yields no samples.
    """
    # Replace 0 with 1 to get detailed log.
    set_computation_network_trace_level(0)

    # These values must match for both train and test reader.
    image_height = train_reader.height
    image_width = train_reader.width
    num_channels = train_reader.channel_count
    sequence_length = train_reader.sequence_length
    num_output_classes = train_reader.label_count

    # Input variables denoting the features and label data.
    input_var = C.input_variable(
        (num_channels, sequence_length, image_height, image_width), np.float32)
    label_var = C.input_variable(num_output_classes, np.float32)

    # Simple 3D convolution network inspired by the VGG network
    # and http://vlg.cs.dartmouth.edu/c3d/c3d_video.pdf
    with C.default_options(activation=C.relu):
        z = C.layers.Sequential([
            C.layers.Convolution3D((3, 3, 3), 64, pad=True),
            C.layers.MaxPooling((1, 2, 2), (1, 2, 2)),
            C.layers.For(
                range(3), lambda i: [
                    C.layers.Convolution3D(
                        (3, 3, 3), [96, 128, 128][i], pad=True),
                    C.layers.Convolution3D(
                        (3, 3, 3), [96, 128, 128][i], pad=True),
                    C.layers.MaxPooling((2, 2, 2), (2, 2, 2))
                ]),
            C.layers.For(range(2), lambda:
                         [C.layers.Dense(1024), C.layers.Dropout(0.5)]),
            C.layers.Dense(num_output_classes, activation=None)
        ])(input_var)

    # Loss and classification error.
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    # Training config.
    train_epoch_size = train_reader.size()
    train_minibatch_size = 2

    # Learning parameters: the rate drops after 10 and after 20 epochs.
    lr_per_sample = [0.01] * 10 + [0.001] * 10 + [0.0001]
    lr_schedule = C.learning_parameter_schedule_per_sample(
        lr_per_sample, epoch_size=train_epoch_size)
    momentum_per_sample = 0.9997558891748972
    mm_schedule = C.momentum_schedule_per_sample([momentum_per_sample])

    # Instantiate the trainer object to drive the model training.
    learner = C.momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    log_number_of_parameters(z)
    print()

    # Training: one full pass over the reader per epoch.
    for _ in range(max_epochs):
        train_reader.reset()

        while train_reader.has_more():
            videos, labels, current_minibatch = train_reader.next_minibatch(
                train_minibatch_size)
            trainer.train_minibatch({input_var: videos, label_var: labels})

        trainer.summarize_training_progress()

    # Evaluation of the trained model.
    test_minibatch_size = 2

    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    test_reader.reset()
    while test_reader.has_more():
        videos, labels, current_minibatch = test_reader.next_minibatch(
            test_minibatch_size)
        # Weight each minibatch result by the number of samples it contained.
        metric_numer += trainer.test_minibatch({
            input_var: videos,
            label_var: labels
        }) * current_minibatch
        metric_denom += current_minibatch
        # Count the minibatches evaluated so far.
        minibatch_index += 1

    # minibatch_index already equals the number of evaluated minibatches, so
    # it is reported as-is (adding 1 would overstate the range by one).
    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index, (metric_numer * 100.0) / metric_denom,
        metric_denom))
    print("")

    return metric_numer / metric_denom
def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
    """Train a VGG-inspired 3D-convolution network and evaluate it.

    Args:
        train_reader: provides training minibatches plus the input geometry
            (height, width, channel_count, sequence_length) and label count;
            the test reader must use the same values.
        test_reader: provides the minibatches used for the final evaluation.
        max_epochs: how many times the training data is iterated.

    Returns:
        Sum of per-minibatch test metrics weighted by minibatch sample count,
        divided by the total number of test samples.

    Raises:
        ZeroDivisionError: if no test samples were read.
    """
    # Replace 0 with 1 to get detailed log.
    set_computation_network_trace_level(0)

    # These values must match for both train and test reader.
    image_height = train_reader.height
    image_width = train_reader.width
    num_channels = train_reader.channel_count
    sequence_length = train_reader.sequence_length
    num_output_classes = train_reader.label_count

    # Input variables denoting the features and label data.
    input_var = C.input_variable((num_channels, sequence_length, image_height, image_width), np.float32)
    label_var = C.input_variable(num_output_classes, np.float32)

    # Simple 3D convolution network inspired by the VGG network
    # and http://vlg.cs.dartmouth.edu/c3d/c3d_video.pdf
    with C.default_options(activation=C.relu):
        z = C.layers.Sequential([
            C.layers.Convolution3D((3,3,3), 64, pad=True),
            C.layers.MaxPooling((1,2,2), (1,2,2)),
            C.layers.For(range(3), lambda i: [
                C.layers.Convolution3D((3,3,3), [96, 128, 128][i], pad=True),
                C.layers.Convolution3D((3,3,3), [96, 128, 128][i], pad=True),
                C.layers.MaxPooling((2,2,2), (2,2,2))
            ]),
            C.layers.For(range(2), lambda : [
                C.layers.Dense(1024),
                C.layers.Dropout(0.5)
            ]),
            C.layers.Dense(num_output_classes, activation=None)
        ])(input_var)

    # Loss and classification error.
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    # Training config.
    train_epoch_size = train_reader.size()
    train_minibatch_size = 2

    # Per-sample learning rate, stepped down after epochs 10 and 20.
    lr_per_sample = [0.01]*10 + [0.001]*10 + [0.0001]
    lr_schedule = C.learning_parameter_schedule_per_sample(lr_per_sample, epoch_size=train_epoch_size)
    momentum_per_sample = 0.9997558891748972
    mm_schedule = C.momentum_schedule_per_sample([momentum_per_sample])

    # Instantiate the trainer object to drive the model training.
    learner = C.momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    log_number_of_parameters(z)
    print()

    # Get minibatches of videos and perform model training.
    for _ in range(max_epochs):
        train_reader.reset()

        while train_reader.has_more():
            videos, labels, current_minibatch = train_reader.next_minibatch(train_minibatch_size)
            trainer.train_minibatch({input_var : videos, label_var : labels})

        trainer.summarize_training_progress()

    # Test data for the trained model.
    test_minibatch_size = 2

    # Accumulators for the sample-weighted test metric.
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    test_reader.reset()
    while test_reader.has_more():
        videos, labels, current_minibatch = test_reader.next_minibatch(test_minibatch_size)
        # Scale the minibatch metric by the number of samples it covers.
        metric_numer += trainer.test_minibatch({input_var : videos, label_var : labels}) * current_minibatch
        metric_denom += current_minibatch
        # Number of test minibatches processed so far.
        minibatch_index += 1

    # Report the true count of evaluated minibatches; the counter was already
    # advanced once per minibatch, so no further +1 is applied.
    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom
results = re.findall("Completed successfully.", str_out) if len(results) != 2: print(str_out) assert False if __name__ == '__main__': in1 = C.input_variable(shape=1) labels = C.input_variable(shape=1) p1 = parameter(shape=1) p2 = parameter(shape=1) n = plus(in1, p1, name='n') z = plus(n, p2, name='z') ce = squared_error(z, labels) momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589) lr_per_sample = C.learning_parameter_schedule_per_sample(0.007) learner1 = C.distributed.data_parallel_distributed_learner( C.momentum_sgd([p1], lr_per_sample, momentum_schedule, True)) learner1.set_as_metric_aggregator() dist_learners = [ learner1, C.distributed.data_parallel_distributed_learner( C.momentum_sgd([p2], lr_per_sample, momentum_schedule, True)) ] trainer = C.Trainer(z, ce, dist_learners) in1_value = [[1]] label_value = [[0]] arguments = {in1: in1_value, labels: label_value} z_output = z.output
def trainNetwork():
    """Load the text data, build the network, and train it for maxEpochs.

    Sample text from ``generateText`` is printed every 100 minibatches, and
    a training summary is printed per epoch. All configuration (paths, batch
    size, time steps, layers, epoch count) is read from module-level names.
    """
    mapper, gens = loadData(dir + fileName, './data/Shakespeare', batchSize,
                            timeSteps, timeShift, load=False,
                            lineShape=(0, 40000))

    # Input with a dynamic sequence axis, consisting of a matrix of
    # [steps-in-time X number-of-possible-characters].
    seqAxis = cntk.Axis('inputAxis')
    inputVar = cntk.sequence.input_variable((timeSteps, mapper.numClasses),
                                            sequence_axis=seqAxis,
                                            name='input')

    model = createNetwork(inputVar, layers, mapper.numClasses)

    label = cntk.sequence.input_variable(mapper.numClasses,
                                         sequence_axis=seqAxis,
                                         name='label')

    z = model(inputVar)
    loss = cntk.cross_entropy_with_softmax(z, label)
    error = cntk.classification_error(z, label)

    printer = cntk.logging.ProgressPrinter(tag='Training', freq=100,
                                           num_epochs=maxEpochs)

    # Momentum SGD with per-sample schedules and gradient clipping.
    learner = cntk.momentum_sgd(
        z.parameters,
        cntk.learning_parameter_schedule_per_sample(0.001),
        cntk.momentum_schedule_per_sample(0.9990913221888589),
        gradient_clipping_threshold_per_sample=5.0,
        gradient_clipping_with_truncation=True)
    trainer = cntk.Trainer(z, (loss, error), learner, [printer])

    numMinibatch = mapper.samples // batchSize

    print("Input sequence length: {}; unique characters {};".format(
        timeSteps, mapper.numClasses))
    cntk.logging.log_number_of_parameters(z)
    print("Datset size {}; {} Epochs; {} minibatches per epoch".format(
        mapper.samples, maxEpochs, numMinibatch))

    for epoch in range(maxEpochs):
        for mb in range(numMinibatch):
            X, Y = next(gens['train'])
            # The mask is [True] only for the first minibatch of an epoch
            # and [False] for every later one.
            mask = [True] if mb == 0 else [False]
            trainer.train_minibatch(({inputVar: X, label: Y}, mask))

            if mb % 100 == 0:
                print(generateText(z, mapper, 200) + '\n')

        trainer.summarize_training_progress()
        # NOTE(review): the flattened source does not show whether this
        # sample is printed once per epoch or once after all epochs; it is
        # kept per epoch here. Confirm against the original layout.
        print(generateText(z, mapper, 100))
bn_update = [] for f in flows: q, log_det_J = f.forward(q, log_det_J) base_dist = MultivariateNormalDiag(loc=[0.]*c_dim, scale_diag=[1.]*c_dim) prior_logprob = base_dist.log_prob(q) # or C.log(base_dist.pdf(q)) loss = -C.reduce_mean(prior_logprob + log_det_J) v = np.r_[np.random.randn(512 // 2, 2) + np.array([5, 3]), np.random.randn(512 // 2, 2) + np.array([-5, 3])] v = (v - v.mean(axis=0)) / v.std(axis=0) lr_rate = 5e-3 learner = C.adam(loss.parameters, C.learning_parameter_schedule_per_sample(lr_rate), C.momentum_schedule_per_sample(0.99)) # lr_rate = 1e-2 # learner = C.adam(loss.parameters, C.learning_parameter_schedule(lr_rate), C.momentum_schedule(0.99)) trainer = C.Trainer(loss, (loss, None), [learner]) for i in tqdm(range(500)): # v = np.random.uniform(size=(1000,c_dim)) # v = datasets.make_moons(n_samples=1000, noise=.05)[0].astype(np.float32) # v = np.r_[np.random.randn(512 // 2, 2) + np.array([5, 3]), # np.random.randn(512 // 2, 2) + np.array([-5, 3])] out = trainer.train_minibatch({loss.arguments[0]:v}, outputs=[prior_logprob, log_det_J]) if i%100 == 0: logprob = out[1][prior_logprob].mean() + out[1][log_det_J].mean()