def create_learner(model): '''Create the optimized method''' lr_per_minibatch = C.learning_rate_schedule(opt.lr, C.UnitType.minibatch) momentum_time_constant = C.momentum_as_time_constant_schedule(1100) if opt.optim == 'sgd': return C.sgd(model.parameters, lr=lr_per_minibatch) elif opt.optim == 'adam': return C.adam(model.parameters, lr=lr_per_minibatch, momentum=momentum_time_constant) elif opt.optim == 'adagrad': return C.adagrad(model.parameters, lr=lr_per_minibatch) else: raise RuntimeError("Invalid optim method: " + opt.optim)
def create_learner(model): '''Create the optimized method''' lr_per_sample = C.learning_rate_schedule(opt.lr, C.UnitType.minibatch) momentum_time_constant = C.momentum_as_time_constant_schedule(1100) if opt.optim == 'sgd': return C.sgd(model.parameters, lr=lr_per_sample) elif opt.optim == 'adam': return C.adam(model.parameters, lr=lr_per_sample, momentum=momentum_time_constant) elif opt.optim == 'adagrad': return C.adagrad(model.parameters, lr=lr_per_sample) else: raise RuntimeError("Invalid optim method: " + opt.optim)
def create_learner(model): '''Create the optimized method''' lr_per_minibatch = C.learning_parameter_schedule(opt.lr) momentum_schedule = C.momentum_schedule_per_sample(0.9990913221888589) if opt.optim == 'sgd': return C.sgd(model.parameters, lr=lr_per_minibatch) elif opt.optim == 'adam': return C.adam(model.parameters, lr=lr_per_minibatch, momentum=momentum_schedule) elif opt.optim == 'adagrad': return C.adagrad(model.parameters, lr=lr_per_minibatch) else: raise RuntimeError("Invalid optim method: " + opt.optim)
def test_learner_init(): i = C.input_variable(shape=(1,), needs_gradient=True, name='a') w = parameter(shape=(1,)) res = i * w learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample)) assert learner.learning_rate() == 0.1 learner.reset_learning_rate(learning_rate_schedule([1,2,3], UnitType.minibatch)); assert learner.learning_rate() == 1.0 learner_parameter = learner.parameters from cntk.variables import Parameter param = learner_parameter[0] assert isinstance(param, Parameter) unit_gain_value = C.default_unit_gain_value() assert unit_gain_value momentum_time_constant = C.momentum_as_time_constant_schedule(1100) lr_per_sample = learning_rate_schedule(0.1, UnitType.sample) C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant) C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value) C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain=unit_gain_value) C.set_default_unit_gain_value(False) unit_gain_value = C.default_unit_gain_value() assert not unit_gain_value lr_per_sample = learning_rate_schedule([0.1, 0.2], UnitType.sample) C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant) C.nesterov(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value) C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant, unit_gain=unit_gain_value) lr_per_sample = learning_rate_schedule([0.1]*3 +[0.2]*2 +[0.3], UnitType.sample) C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True) C.set_default_unit_gain_value(True) unit_gain_value = C.default_unit_gain_value() assert unit_gain_value lr_per_sample = learning_rate_schedule([(3,0.1), (2, 0.2), (1, 0.3)], UnitType.sample) C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant) C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value) C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant, unit_gain=unit_gain_value) gamma, inc, dec, max, min = [0.1]*5 lr_per_sample = learning_rate_schedule([0.1, 0.2], UnitType.sample, 100) C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True) C.set_default_use_mean_gradient_value(False) use_mean_gradient_value = C.default_use_mean_gradient_value() assert not use_mean_gradient_value C.adadelta(res.parameters, lr_per_sample) C.set_default_use_mean_gradient_value(True) use_mean_gradient_value = C.default_use_mean_gradient_value() assert use_mean_gradient_value C.adadelta(res.parameters, lr_per_sample)
def TrainAndValidate(trainfile): #*****Hyper-Parameters****** q_max_words= 12 p_max_words = 50 emb_dim = 50 num_classes = 2 minibatch_size = 32 epoch_size = 500000 #No.of samples in training set total_epochs = 5 #Total number of epochs to run query_total_dim = q_max_words*emb_dim label_total_dim = num_classes passage_total_dim = p_max_words*emb_dim #****** Create placeholders for reading Training Data *********** query_input_var = C.ops.input_variable((1,q_max_words,emb_dim),np.float32,is_sparse=False) passage_input_var = C.ops.input_variable((1,p_max_words,emb_dim),np.float32,is_sparse=False) output_var = C.input_variable(num_classes,np.float32,is_sparse = False) train_reader = create_reader(trainfile, True, query_total_dim, passage_total_dim, label_total_dim) input_map = { query_input_var : train_reader.streams.queryfeatures, passage_input_var:train_reader.streams.passagefeatures, output_var : train_reader.streams.labels} # ********* Model configuration ******* model_output = cnn_network(query_input_var, passage_input_var, num_classes) loss = C.binary_cross_entropy(model_output, output_var) pe = C.classification_error(model_output, output_var) lr_per_minibatch = C.learning_rate_schedule(0.03, C.UnitType.minibatch) learner = C.adagrad(model_output.parameters, lr=lr_per_minibatch) progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=total_epochs) #************Create Trainer with model_output object, learner and loss parameters************* trainer = C.Trainer(model_output, (loss, pe), learner, progress_printer) C.logging.log_number_of_parameters(model_output) ; print() # **** Train the model in batchwise mode ***** for epoch in range(total_epochs): # loop over epochs print("Epoch : ",epoch) sample_count = 0 while sample_count < epoch_size: # loop over minibatches in the epoch data = train_reader.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map) # fetch minibatch. trainer.train_minibatch(data) # training step sample_count += data[output_var].num_samples # count samples processed so far trainer.summarize_training_progress() model_output.save("data/models/CNN_{}.dnn".format(epoch)) # Save the model for every epoch #*** Find metrics on validation set after every epoch ******# (Note : you can skip doing this for every epoch instead to optimize the time, do it after every k epochs) predicted_labels=[] for i in range(len(validation_query_vectors)): queryVec = np.array(validation_query_vectors[i],dtype="float32").reshape(1,q_max_words,emb_dim) passageVec = np.array(validation_passage_vectors[i],dtype="float32").reshape(1,p_max_words,emb_dim) scores = model_output(queryVec,passageVec)[0] # do forward-prop on model to get score predictLabel = 1 if scores[1]>=scores[0] else 0 predicted_labels.append(predictLabel) metrics = precision_recall_fscore_support(np.array(validation_labels), np.array(predicted_labels), average='binary') #print("precision : "+str(metrics[0])+" recall : "+str(metrics[1])+" f1 : "+str(metrics[2])+"\n") return model_output
((0.2, UnitType.sample), [0.2, 0.2, 0.2, 0.2]), (([0.2, 0.4], UnitType.sample, 5), [0.2] * 5 + [0.4] * 20), (([(3, 0.2), (2, 0.4), (1, 0.8)], UnitType.sample, 5), [0.2] * 15 + [0.4] * 10 + [0.8] * 20), ] MOMENTUM_SCHEDULE_PARAMS = [ ((0.2, ), [0.2]), ((0.2, ), [0.2, 0.2, 0.2, 0.2]), (([0.2, 0.4], 5), [0.2] * 5 + [0.4] * 20), (([(3, 0.2), (2, 0.4), (1, 0.8)], 5), [0.2] * 15 + [0.4] * 10 + [0.8] * 20), ] LEARNER_LAMBDAS = [ lambda params: C.adadelta(params), lambda params: C.adagrad( params, lr=learning_rate_schedule(1, UnitType.minibatch)), lambda params: C.adam(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), lambda params: C.fsadagrad(params, lr=learning_rate_schedule( 1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), lambda params: C.nesterov(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1,
def test_learner_init(): i = C.input_variable(shape=(1, ), needs_gradient=True, name='a') w = parameter(shape=(1, )) res = i * w #test new API: learning_parameter_schedule #explictly specify reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=0.1, minibatch_size=25) assert learner.is_compatible_mode() == False assert learner.minibatch_size == 25 #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == 25 assert learner.learning_rate() == 0.1 #no explictly specification of reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1)) assert learner.is_compatible_mode() == False assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE assert learner.learning_rate() == 0.1 learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20), minibatch_size=25) assert learner.is_compatible_mode() == False assert learner.minibatch_size == 25 #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == 20 assert learner.learning_rate() == 0.1 learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20)) assert learner.is_compatible_mode() == False #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == 20 assert learner.learning_rate() == 0.1 #no explictly specification of reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1)) assert learner.is_compatible_mode() == False assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE assert learner.learning_rate() == 0.1 #no explictly specification of reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1), minibatch_size=C.learners.IGNORE) assert learner.is_compatible_mode() == True assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE assert learner.learning_rate() == 0.1 learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20), minibatch_size=C.learners.IGNORE) assert learner.is_compatible_mode() == True assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == 20 assert learner.learning_rate() == 0.1 #no explictly specification of reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1), minibatch_size=C.learners.IGNORE) assert learner.is_compatible_mode() == True assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE assert learner.learning_rate() == 0.1 mysgd = C.sgd(parameters=res.parameters, lr=0.4, minibatch_size=32) assert mysgd.minibatch_size == 32 assert mysgd._learning_rate_schedule.minibatch_size == 32 assert mysgd.learning_rate() == 0.4 mymomentum = C.momentum_sgd(parameters=res.parameters, lr=0.4, momentum=0.9, minibatch_size=32) assert mymomentum.minibatch_size == 32 assert mymomentum._learning_rate_schedule.minibatch_size == 32 assert mymomentum.learning_rate() == 0.4 myadadelta = C.adadelta(parameters=res.parameters, lr=0.4, minibatch_size=32) assert myadadelta.minibatch_size == 32 assert myadadelta._learning_rate_schedule.minibatch_size == 32 assert myadadelta.learning_rate() == 0.4 myadam = C.adam(parameters=res.parameters, lr=0.4, momentum=0.9, variance_momentum=0.9, minibatch_size=32) assert myadam.minibatch_size == 32 assert myadam._learning_rate_schedule.minibatch_size == 32 assert myadam.learning_rate() == 0.4 myadagrad = C.adagrad(parameters=res.parameters, lr=0.4, minibatch_size=32) assert myadagrad.minibatch_size == 32 assert myadagrad._learning_rate_schedule.minibatch_size == 32 assert myadagrad.learning_rate() == 0.4 myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=0.4, momentum=0.9, variance_momentum=0.9, minibatch_size=32) assert myfsadagrad.minibatch_size == 32 assert myfsadagrad._learning_rate_schedule.minibatch_size == 32 assert myfsadagrad.learning_rate() == 0.4 mynesterov = C.nesterov(parameters=res.parameters, lr=0.4, momentum=0.9, minibatch_size=32) assert mynesterov.minibatch_size == 32 assert mynesterov._learning_rate_schedule.minibatch_size == 32 assert mynesterov.learning_rate() == 0.4 myrmsrop = C.rmsprop(parameters=res.parameters, lr=0.4, gamma=0.5, inc=1.2, dec=0.7, max=10, min=1e-8, minibatch_size=32) assert myrmsrop.minibatch_size == 32 assert myrmsrop._learning_rate_schedule.minibatch_size == 32 assert myrmsrop.learning_rate() == 0.4 mysgd = C.sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512) assert mysgd.minibatch_size == 32 assert mysgd._learning_rate_schedule.minibatch_size == 32 assert mysgd._learning_rate_schedule[0] == 0.4 assert mysgd._learning_rate_schedule[512] == 0.1 assert mysgd._learning_rate_schedule[512 * 2] == 0.001 mymomentum = C.momentum_sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], minibatch_size=32, epoch_size=512) assert mymomentum.minibatch_size == 32 assert mymomentum._learning_rate_schedule.minibatch_size == 32 assert mymomentum._learning_rate_schedule[0] == 0.4 assert mymomentum._learning_rate_schedule[512] == 0.1 assert mymomentum._learning_rate_schedule[512 * 2] == 0.001 myadadelta = C.adadelta(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512) assert myadadelta.minibatch_size == 32 assert myadadelta._learning_rate_schedule.minibatch_size == 32 assert myadadelta._learning_rate_schedule[0] == 0.4 assert myadadelta._learning_rate_schedule[512] == 0.1 assert myadadelta._learning_rate_schedule[512 * 2] == 0.001 myadam = C.adam(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9, 0.1, 0.001], variance_momentum=[0.9], minibatch_size=32, epoch_size=512) assert myadam.minibatch_size == 32 assert myadam._learning_rate_schedule.minibatch_size == 32 assert myadam._learning_rate_schedule[0] == 0.4 assert myadam._learning_rate_schedule[512] == 0.1 assert myadam._learning_rate_schedule[512 * 2] == 0.001 myadagrad = C.adagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512) assert myadagrad.minibatch_size == 32 assert myadagrad._learning_rate_schedule.minibatch_size == 32 assert myadagrad._learning_rate_schedule[0] == 0.4 assert myadagrad._learning_rate_schedule[512] == 0.1 assert myadagrad._learning_rate_schedule[512 * 2] == 0.001 myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], variance_momentum=[0.9], minibatch_size=32, epoch_size=512) assert myadagrad.minibatch_size == 32 assert myadagrad._learning_rate_schedule.minibatch_size == 32 assert myadagrad._learning_rate_schedule[0] == 0.4 assert myadagrad._learning_rate_schedule[512] == 0.1 assert myadagrad._learning_rate_schedule[512 * 2] == 0.001 mynesterov = C.nesterov(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], minibatch_size=32, epoch_size=512) assert mynesterov.minibatch_size == 32 assert mynesterov._learning_rate_schedule.minibatch_size == 32 assert mynesterov._learning_rate_schedule[0] == 0.4 assert mynesterov._learning_rate_schedule[512] == 0.1 assert mynesterov._learning_rate_schedule[512 * 2] == 0.001 myrmsrop = C.rmsprop(parameters=res.parameters, lr=[0.4, 0.1, 0.001], gamma=0.5, inc=1.2, dec=0.7, max=10, min=1e-8, minibatch_size=32, epoch_size=512) assert myrmsrop.minibatch_size == 32 assert myrmsrop._learning_rate_schedule.minibatch_size == 32 assert myrmsrop._learning_rate_schedule[0] == 0.4 assert myrmsrop._learning_rate_schedule[512] == 0.1 assert myrmsrop._learning_rate_schedule[512 * 2] == 0.001 learner_parameter = learner.parameters from cntk.variables import Parameter param = learner_parameter[0] assert isinstance(param, Parameter) unit_gain_value = C.default_unit_gain_value() assert unit_gain_value momentum = C.momentum_schedule(0.999, minibatch_size=1) lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1) C.momentum_sgd(res.parameters, lr_per_sample, momentum) C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain_value) C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain=unit_gain_value) C.set_default_unit_gain_value(False) unit_gain_value = C.default_unit_gain_value() assert not unit_gain_value lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size=1) C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum) C.nesterov(res.parameters, lr_per_sample, momentum, unit_gain_value) C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum, unit_gain=unit_gain_value) lr_per_sample = learning_parameter_schedule([0.1] * 3 + [0.2] * 2 + [0.3], minibatch_size=1) C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True) C.set_default_unit_gain_value(True) unit_gain_value = C.default_unit_gain_value() assert unit_gain_value lr_per_sample = learning_parameter_schedule([(3, 0.1), (2, 0.2), (1, 0.3)], minibatch_size=1) C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum) C.fsadagrad(res.parameters, lr_per_sample, momentum, unit_gain_value) C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum, unit_gain=unit_gain_value) gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8] lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size=1, epoch_size=100) C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True) C.adadelta(res.parameters, lr_per_sample)
def test_learner_init(): i = C.input_variable(shape=(1,), needs_gradient=True, name='a') w = parameter(shape=(1,)) res = i * w #test new API: learning_parameter_schedule #explicitly specify reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=0.1, minibatch_size = 25) assert learner.is_compatible_mode() == False assert learner.minibatch_size == 25 #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == 25 assert learner.learning_rate() == 0.1 #no explicitly specification of reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1)) assert learner.is_compatible_mode() == False assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE assert learner.learning_rate() == 0.1 learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20), minibatch_size = 25) assert learner.is_compatible_mode() == False assert learner.minibatch_size == 25 #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == 20 assert learner.learning_rate() == 0.1 learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20)) assert learner.is_compatible_mode() == False #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == 20 assert learner.learning_rate() == 0.1 #no explicitly specification of reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1)) assert learner.is_compatible_mode() == False assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE assert learner.learning_rate() == 0.1 #no explicitly specification of reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1), minibatch_size=C.learners.IGNORE) assert learner.is_compatible_mode() == True assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE assert learner.learning_rate() == 0.1 learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20), minibatch_size=C.learners.IGNORE) assert learner.is_compatible_mode() == True assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == 20 assert learner.learning_rate() == 0.1 #no explicitly specification of reference minibatch size and learning rate is in number: learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1), minibatch_size=C.learners.IGNORE) assert learner.is_compatible_mode() == True assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters: assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE assert learner.learning_rate() == 0.1 mysgd = C.sgd(parameters=res.parameters, lr=0.4, minibatch_size=32) assert mysgd.minibatch_size == 32 assert mysgd._learning_rate_schedule.minibatch_size == 32 assert mysgd.learning_rate() == 0.4 mymomentum = C.momentum_sgd(parameters=res.parameters, lr=0.4, momentum=0.9, minibatch_size=32) assert mymomentum.minibatch_size == 32 assert mymomentum._learning_rate_schedule.minibatch_size == 32 assert mymomentum.learning_rate() == 0.4 myadadelta = C.adadelta(parameters=res.parameters, lr=0.4, minibatch_size=32) assert myadadelta.minibatch_size == 32 assert myadadelta._learning_rate_schedule.minibatch_size == 32 assert myadadelta.learning_rate() == 0.4 myadam = C.adam(parameters=res.parameters, lr=0.4, momentum=0.9, variance_momentum=0.9, minibatch_size=32) assert myadam.minibatch_size == 32 assert myadam._learning_rate_schedule.minibatch_size == 32 assert myadam.learning_rate() == 0.4 myadagrad = C.adagrad(parameters=res.parameters, lr=0.4, minibatch_size=32) assert myadagrad.minibatch_size == 32 assert myadagrad._learning_rate_schedule.minibatch_size == 32 assert myadagrad.learning_rate() == 0.4 myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=0.4, momentum=0.9, variance_momentum=0.9, minibatch_size=32) assert myfsadagrad.minibatch_size == 32 assert myfsadagrad._learning_rate_schedule.minibatch_size == 32 assert myfsadagrad.learning_rate() == 0.4 mynesterov = C.nesterov(parameters=res.parameters, lr=0.4, momentum=0.9, minibatch_size=32) assert mynesterov.minibatch_size == 32 assert mynesterov._learning_rate_schedule.minibatch_size == 32 assert mynesterov.learning_rate() == 0.4 myrmsrop = C.rmsprop(parameters=res.parameters, lr=0.4, gamma=0.5, inc=1.2, dec=0.7, max=10, min=1e-8, minibatch_size=32) assert myrmsrop.minibatch_size == 32 assert myrmsrop._learning_rate_schedule.minibatch_size == 32 assert myrmsrop.learning_rate() == 0.4 mysgd = C.sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512) assert mysgd.minibatch_size == 32 assert mysgd._learning_rate_schedule.minibatch_size == 32 assert mysgd._learning_rate_schedule[0] == 0.4 assert mysgd._learning_rate_schedule[512] == 0.1 assert mysgd._learning_rate_schedule[512 * 2] == 0.001 mymomentum = C.momentum_sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], minibatch_size=32, epoch_size=512) assert mymomentum.minibatch_size == 32 assert mymomentum._learning_rate_schedule.minibatch_size == 32 assert mymomentum._learning_rate_schedule[0] == 0.4 assert mymomentum._learning_rate_schedule[512] == 0.1 assert mymomentum._learning_rate_schedule[512 * 2] == 0.001 myadadelta = C.adadelta(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512) assert myadadelta.minibatch_size == 32 assert myadadelta._learning_rate_schedule.minibatch_size == 32 assert myadadelta._learning_rate_schedule[0] == 0.4 assert myadadelta._learning_rate_schedule[512] == 0.1 assert myadadelta._learning_rate_schedule[512 * 2] == 0.001 myadam = C.adam(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9, 0.1, 0.001], variance_momentum=[0.9], minibatch_size=32, epoch_size=512) assert myadam.minibatch_size == 32 assert myadam._learning_rate_schedule.minibatch_size == 32 assert myadam._learning_rate_schedule[0] == 0.4 assert myadam._learning_rate_schedule[512] == 0.1 assert myadam._learning_rate_schedule[512 * 2] == 0.001 myadagrad = C.adagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512) assert myadagrad.minibatch_size == 32 assert myadagrad._learning_rate_schedule.minibatch_size == 32 assert myadagrad._learning_rate_schedule[0] == 0.4 assert myadagrad._learning_rate_schedule[512] == 0.1 assert myadagrad._learning_rate_schedule[512 * 2] == 0.001 myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], variance_momentum=[0.9], minibatch_size=32, epoch_size=512) assert myadagrad.minibatch_size == 32 assert myadagrad._learning_rate_schedule.minibatch_size == 32 assert myadagrad._learning_rate_schedule[0] == 0.4 assert myadagrad._learning_rate_schedule[512] == 0.1 assert myadagrad._learning_rate_schedule[512 * 2] == 0.001 mynesterov = C.nesterov(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9], minibatch_size=32, epoch_size=512) assert mynesterov.minibatch_size == 32 assert mynesterov._learning_rate_schedule.minibatch_size == 32 assert mynesterov._learning_rate_schedule[0] == 0.4 assert mynesterov._learning_rate_schedule[512] == 0.1 assert mynesterov._learning_rate_schedule[512 * 2] == 0.001 myrmsrop = C.rmsprop(parameters=res.parameters, lr=[0.4, 0.1, 0.001], gamma=0.5, inc=1.2, dec=0.7, max=10, min=1e-8, minibatch_size=32, epoch_size=512) assert myrmsrop.minibatch_size == 32 assert myrmsrop._learning_rate_schedule.minibatch_size == 32 assert myrmsrop._learning_rate_schedule[0] == 0.4 assert myrmsrop._learning_rate_schedule[512] == 0.1 assert myrmsrop._learning_rate_schedule[512 * 2] == 0.001 learner_parameter = learner.parameters from cntk.variables import Parameter param = learner_parameter[0] assert isinstance(param, Parameter) unit_gain_value = C.default_unit_gain_value() assert unit_gain_value momentum = C.momentum_schedule(0.999, minibatch_size=1) lr_per_sample = learning_parameter_schedule(0.1, minibatch_size = 1) C.momentum_sgd(res.parameters, lr_per_sample, momentum) C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain_value) C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain=unit_gain_value) C.set_default_unit_gain_value(False) unit_gain_value = C.default_unit_gain_value() assert not unit_gain_value lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size = 1) C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum) C.nesterov(res.parameters, lr_per_sample, momentum, unit_gain_value) C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum, unit_gain=unit_gain_value) lr_per_sample = learning_parameter_schedule([0.1]*3 +[0.2]*2 +[0.3], minibatch_size=1) C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True) C.set_default_unit_gain_value(True) unit_gain_value = C.default_unit_gain_value() assert unit_gain_value lr_per_sample = learning_parameter_schedule([(3,0.1), (2, 0.2), (1, 0.3)], minibatch_size=1) C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum) C.fsadagrad(res.parameters, lr_per_sample, momentum, unit_gain_value) C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum, unit_gain=unit_gain_value) gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8] lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size = 1, epoch_size = 100) C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True) C.adadelta(res.parameters, lr_per_sample)
((0.2, 0), [0.2], 0), ((0.2, 0), [0.2, 0.2, 0.2, 0.2], 0), (([0.2,0.4], 0, 5), [0.2]*5+[0.4]*20, 0), (([(3,0.2),(2,0.4),(1,0.8)], 0, 5), [0.2]*15+[0.4]*10+[0.8]*20, 0), ] MOMENTUM_SCHEDULE_PARAMS = [ ((0.2,), [0.2]), ((0.2,), [0.2, 0.2, 0.2, 0.2]), (([0.2,0.4], 5), [0.2]*5+[0.4]*20), (([(3,0.2),(2,0.4),(1,0.8)], 5), [0.2]*15+[0.4]*10+[0.8]*20), ] LEARNER_LAMBDAS = [ lambda params: C.adadelta(params), lambda params: C.adagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch)), lambda params: C.adam(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), lambda params: C.fsadagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), lambda params: C.nesterov(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)), lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8), lambda params: C.sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch)), lambda params: C.momentum_sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9))] @pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS_LEGACY) def test_learning_rate_schedule(params, expectation, minibatch_size): l = learning_rate_schedule(*params) assert l.minibatch_size == minibatch_size assert [l[i] for i in range(len(expectation))] == expectation @pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS) def test_learning_parameter_schedule(params, expectation, minibatch_size):
import cntk import numpy as np x_input = np.array([[1, 2, 3, 4, 5]], np.float32) y_input = np.array([[10]], np.float32) X = cntk.input_variables(5, np.float32) y = cntk.input_variables(1, np.float32) from cntk.layers import Dense, Sequential model = Sequential([Dense(32), Dense(1)])(X) loss = cntk.squared_error(model, y) learning_rate = 0.001 trainer = cntk.Trainer(model, (loss), cntk.adagrad(model.parameters, learning_rate)) for epoch in range(10): trainer.train_minibatch({X: x_input, y: y_input})
((0.2, 0), [0.2], 0), ((0.2, 0), [0.2, 0.2, 0.2, 0.2], 0), (([0.2,0.4], 0, 5), [0.2]*5+[0.4]*20, 0), (([(3,0.2),(2,0.4),(1,0.8)], 0, 5), [0.2]*15+[0.4]*10+[0.8]*20, 0), ] MOMENTUM_SCHEDULE_PARAMS = [ ((0.2,), [0.2]), ((0.2,), [0.2, 0.2, 0.2, 0.2]), (([0.2,0.4], 5), [0.2]*5+[0.4]*20), (([(3,0.2),(2,0.4),(1,0.8)], 5), [0.2]*15+[0.4]*10+[0.8]*20), ] LEARNER_LAMBDAS = [ lambda params: C.adadelta(params), lambda params: C.adagrad(params, lr=learning_parameter_schedule(1)), lambda params: C.adam(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)), lambda params: C.fsadagrad(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)), lambda params: C.nesterov(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9)), lambda params: C.rmsprop(params, lr=learning_parameter_schedule(1), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8), lambda params: C.sgd(params, lr=learning_parameter_schedule(1)), lambda params: C.momentum_sgd(params, lr=learning_parameter_schedule(1), momentum=C.momentum_schedule(0.9))] @pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS_LEGACY) def test_learning_rate_schedule(params, expectation, minibatch_size): l = learning_rate_schedule(*params) assert l.minibatch_size == minibatch_size assert [l[i] for i in range(len(expectation))] == expectation @pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS) def test_learning_parameter_schedule(params, expectation, minibatch_size):