import os
import numpy as np
from cntk import Trainer, cross_entropy_with_softmax, classification_error, input_variable
from cntk.learners import sgd, learning_rate_schedule, UnitType
from cntk.logging import ProgressPrinter


def entrenar(checkpoint, entrRuedas, entrOperaciones, input_dim, num_output_classes, testRuedas, testOperaciones):
    minibatch_size = 100
    epocs = 900
    minibatchIteraciones = int(len(entrOperaciones) / minibatch_size)

    # Input variables denoting the features and label data
    feature = input_variable((input_dim), np.float32)
    label = input_variable((num_output_classes), np.float32)

    netout = crearRed(input_dim, num_output_classes, feature)

    ce = cross_entropy_with_softmax(netout, label)
    pe = classification_error(netout, label)

    lr_per_minibatch = learning_rate_schedule(0.25, UnitType.minibatch)

    # Instantiate the trainer object to drive the model training
    learner = sgd(netout.parameters, lr=lr_per_minibatch)
    progress_printer = ProgressPrinter(log_to_file=checkpoint + ".log", num_epochs=epocs)
    trainer = Trainer(netout, (ce, pe), learner, progress_printer)

    if os.path.isfile(checkpoint):
        trainer.restore_from_checkpoint(checkpoint)

    npentrRuedas = np.array(entrRuedas).astype(np.float32)
    npentrOperaciones = np.array(entrOperaciones).astype(np.float32)

    # iterate once per epoch
    for i in range(0, epocs):
        # shuffle features and labels with the same permutation
        p = np.random.permutation(len(entrRuedas))
        npentrOperaciones = npentrOperaciones[p]
        npentrRuedas = npentrRuedas[p]

        # now split the data into minibatches and train
        for j in range(0, minibatchIteraciones):
            features = npentrRuedas[j * minibatch_size:(j + 1) * minibatch_size]
            labels = npentrOperaciones[j * minibatch_size:(j + 1) * minibatch_size]
            trainer.train_minibatch({feature: features, label: labels})

        trainer.summarize_training_progress()
        trainer.save_checkpoint(checkpoint)

    # evaluate the trained model on the held-out test set
    minibatchIteraciones = int(len(testOperaciones) / minibatch_size)
    avg_error = 0
    for j in range(0, minibatchIteraciones):
        test_features = np.array(testRuedas[j * minibatch_size:(j + 1) * minibatch_size]).astype(np.float32)
        test_labels = np.array(testOperaciones[j * minibatch_size:(j + 1) * minibatch_size]).astype(np.float32)
        avg_error = avg_error + (trainer.test_minibatch({feature: test_features, label: test_labels}) / minibatchIteraciones)

    return avg_error
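# A minimal sketch of how entrenar() could be called, assuming crearRed()
# builds a dense classifier over input_dim features. The synthetic data,
# shapes, and the "modelo.dnn" checkpoint name are illustrative, not from
# the original code; note entrenar() trains for 900 epochs, so this is a
# contract/shape illustration rather than a quick smoke test.
def _demo_entrenar():
    input_dim, num_classes, n = 10, 3, 1000
    X = np.random.rand(n, input_dim).astype(np.float32)
    y = np.eye(num_classes, dtype=np.float32)[np.random.randint(0, num_classes, n)]
    # reuse the first 200 rows as a stand-in test split
    err = entrenar("modelo.dnn", X, y, input_dim, num_classes, X[:200], y[:200])
    print("average test error: {:.4f}".format(err))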
def main(base_folder, training_mode='majority', model_name='VGG13', max_epochs=100):
    # create the needed folders.
    output_model_path = os.path.join(base_folder, R'models')
    output_model_folder = os.path.join(output_model_path, model_name + '_' + training_mode)
    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)

    # create the logging file
    logging.basicConfig(filename=os.path.join(output_model_folder, "train.log"), filemode='w', level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler())

    logging.info("Starting with training mode {} using {} model and max epochs {}.".format(training_mode, model_name, max_epochs))

    # create the model
    num_classes = len(emotion_table)
    model = build_model(num_classes, model_name)

    # set the input variables.
    input_var = ct.input_variable((1, model.input_height, model.input_width), np.float32)
    label_var = ct.input_variable((num_classes), np.float32)

    # read the FER+ dataset.
    logging.info("Loading data...")
    train_params = FERPlusParameters(num_classes, model.input_height, model.input_width, training_mode, False)
    test_and_val_params = FERPlusParameters(num_classes, model.input_height, model.input_width, "majority", True)

    train_data_reader = FERPlusReader.create(base_folder, train_folders, "label.csv", train_params)
    val_data_reader = FERPlusReader.create(base_folder, valid_folders, "label.csv", test_and_val_params)
    test_data_reader = FERPlusReader.create(base_folder, test_folders, "label.csv", test_and_val_params)

    # print a summary of the data.
    display_summary(train_data_reader, val_data_reader, test_data_reader)

    # get the probabilistic output of the model.
    z = model.model(input_var)
    pred = ct.softmax(z)

    epoch_size = train_data_reader.size()
    minibatch_size = 32

    # training config
    lr_per_minibatch = [model.learning_rate] * 20 + [model.learning_rate / 2.0] * 20 + [model.learning_rate / 10.0]
    mm_time_constant = -minibatch_size / np.log(0.9)
    lr_schedule = learning_rate_schedule(lr_per_minibatch, unit=UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = momentum_as_time_constant_schedule(mm_time_constant)

    # loss and error cost
    train_loss = cost_func(training_mode, pred, label_var)
    pe = classification_error(z, label_var)

    # construct the trainer
    learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    trainer = Trainer(z, (train_loss, pe), learner)

    # get minibatches of images to train with and perform model training
    max_val_accuracy = 0.0
    final_test_accuracy = 0.0
    best_test_accuracy = 0.0

    logging.info("Start training...")
    epoch = 0
    best_epoch = 0
    while epoch < max_epochs:
        train_data_reader.reset()
        val_data_reader.reset()
        test_data_reader.reset()

        # training
        start_time = time.time()
        training_loss = 0
        training_accuracy = 0
        while train_data_reader.has_more():
            images, labels, current_batch_size = train_data_reader.next_minibatch(minibatch_size)

            # specify the mapping of input variables in the model to actual minibatch data to be trained with
            trainer.train_minibatch({input_var: images, label_var: labels})

            # keep track of statistics.
            training_loss += trainer.previous_minibatch_loss_average * current_batch_size
            training_accuracy += trainer.previous_minibatch_evaluation_average * current_batch_size

        training_accuracy /= train_data_reader.size()
        training_accuracy = 1.0 - training_accuracy

        # validation
        val_accuracy = 0
        while val_data_reader.has_more():
            images, labels, current_batch_size = val_data_reader.next_minibatch(minibatch_size)
            val_accuracy += trainer.test_minibatch({input_var: images, label_var: labels}) * current_batch_size

        val_accuracy /= val_data_reader.size()
        val_accuracy = 1.0 - val_accuracy

        # if the validation accuracy improves, compute the test accuracy
        test_run = False
        if val_accuracy > max_val_accuracy:
            best_epoch = epoch
            max_val_accuracy = val_accuracy

            trainer.save_checkpoint(os.path.join(output_model_folder, "model_{}".format(best_epoch)))

            test_run = True
            test_accuracy = 0
            while test_data_reader.has_more():
                images, labels, current_batch_size = test_data_reader.next_minibatch(minibatch_size)
                test_accuracy += trainer.test_minibatch({input_var: images, label_var: labels}) * current_batch_size

            test_accuracy /= test_data_reader.size()
            test_accuracy = 1.0 - test_accuracy
            final_test_accuracy = test_accuracy
            if final_test_accuracy > best_test_accuracy:
                best_test_accuracy = final_test_accuracy

        logging.info("Epoch {}: took {:.3f}s".format(epoch, time.time() - start_time))
        logging.info("  training loss:\t{:e}".format(training_loss))
        logging.info("  training accuracy:\t\t{:.2f} %".format(training_accuracy * 100))
        logging.info("  validation accuracy:\t\t{:.2f} %".format(val_accuracy * 100))
        if test_run:
            logging.info("  test accuracy:\t\t{:.2f} %".format(test_accuracy * 100))

        epoch += 1

    logging.info("")
    logging.info("Best validation accuracy:\t\t{:.2f} %, epoch {}".format(max_val_accuracy * 100, best_epoch))
    logging.info("Test accuracy corresponding to best validation:\t\t{:.2f} %".format(final_test_accuracy * 100))
    logging.info("Best test accuracy:\t\t{:.2f} %".format(best_test_accuracy * 100))
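# Why mm_time_constant = -minibatch_size / np.log(0.9): CNTK's time-constant
# momentum schedule defines a per-sample momentum of exp(-1/tc), so raising it
# to the power of minibatch_size should recover the intended per-minibatch
# momentum of 0.9. A quick standalone check of that identity (plain numpy,
# no CNTK needed; the variable names are illustrative):
import numpy as np

minibatch_size = 32
tc = -minibatch_size / np.log(0.9)            # ~303.9 samples
per_sample_momentum = np.exp(-1.0 / tc)       # ~0.99671
print(per_sample_momentum ** minibatch_size)  # ~0.9, the per-minibatch momentum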
            plot_data['batchindex'].append(batch_index)
            plot_data['loss'].append(trainer.previous_minibatch_loss_average)
            plot_data['error'].append(trainer.previous_minibatch_evaluation_average)

            progress_printer.update_with_trainer(trainer, with_metric=True)  # log progress
            batch_index += 1
            ev_avg += trainer.previous_minibatch_evaluation_average
            i_count += 1

        # early-stopping check, currently disabled:
        # if ev_avg / i_count < 0.02:
        #     break

        progress_printer.epoch_summary(with_metric=True)
        # print(epoch, ' : ', ev_avg / i_count)
        try:
            trainer.save_checkpoint(model_temp_file)
        except Exception:
            print('Problem with saving temp model')

    #
    # Evaluation action
    #
    epoch_size = 2200  # 4000 #5000
    minibatch_size = 32

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    input_map = {
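# The plot_data dict collected in the fragment above lends itself to a quick
# loss/error curve. A minimal matplotlib sketch; the figure layout and labels
# below are illustrative choices, not part of the original script.
import matplotlib.pyplot as plt

def plot_training_curves(plot_data):
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(plot_data['batchindex'], plot_data['loss'], 'b-')
    ax1.set_ylabel('loss')
    ax2.plot(plot_data['batchindex'], plot_data['error'], 'r-')
    ax2.set_ylabel('error')
    ax2.set_xlabel('minibatch index')
    fig.suptitle('Training progress per minibatch')
    plt.show()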
def train_and_evaluate(create_train_reader, test_reader, network_name, max_epochs, create_dist_learner, scale_up=False):
    set_computation_network_trace_level(0)

    # Input variables denoting the features and label data
    input_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # create the model and configure the learning parameters
    if network_name == 'resnet20':
        z = create_cifar10_model(input_var, 3, num_classes)
        lr_per_mb = [1.0] * 80 + [0.1] * 40 + [0.01]
    elif network_name == 'resnet110':
        z = create_cifar10_model(input_var, 18, num_classes)
        lr_per_mb = [0.1] * 1 + [1.0] * 80 + [0.1] * 40 + [0.01]
    else:
        raise RuntimeError("Unknown model name!")

    # loss and metric
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)

    # shared training parameters
    epoch_size = 50000  # for now we manually specify the epoch size

    # NOTE: scaling up minibatch_size increases sample throughput. On an 8-GPU machine,
    # ResNet110 samples-per-second is ~7x that of a single GPU, compared to ~3x without
    # scaling up. However, a bigger minibatch size over the same number of samples means
    # fewer updates, and thus a higher training error. This is a trade-off between speed
    # and accuracy.
    minibatch_size = 128 * (distributed.Communicator.num_workers() if scale_up else 1)
    momentum_time_constant = -minibatch_size / np.log(0.9)
    l2_reg_weight = 0.0001

    # set learning parameters
    lr_per_sample = [lr / minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # trainer object
    learner = create_dist_learner(
        momentum_sgd(z.parameters, lr_schedule, mm_schedule, unit_gain=True,
                     l2_regularization_weight=l2_reg_weight))
    trainer = Trainer(z, (ce, pe), learner)

    total_number_of_samples = max_epochs * epoch_size
    train_reader = create_train_reader(total_number_of_samples)

    # define the mapping from reader streams to network inputs
    input_map = {
        input_var: train_reader.streams.features,
        label_var: train_reader.streams.labels
    }

    log_number_of_parameters(z)
    print()
    progress_printer = ProgressPrinter(tag='Training')

    # perform model training
    current_epoch = 0
    updated = True
    while updated:
        data = train_reader.next_minibatch(minibatch_size, input_map=input_map)  # fetch minibatch
        updated = trainer.train_minibatch(data)                                  # update the model with it
        progress_printer.update_with_trainer(trainer, with_metric=True)          # log progress

        epoch_index = int(trainer.total_number_of_samples_seen / epoch_size)
        if current_epoch != epoch_index:  # new epoch reached
            progress_printer.epoch_summary(with_metric=True)
            current_epoch = epoch_index
            trainer.save_checkpoint(os.path.join(model_path, network_name + "_{}.dnn".format(current_epoch)))

    # evaluation parameters
    epoch_size = 10000
    minibatch_size = 16

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    while True:
        data = test_reader.next_minibatch(minibatch_size, input_map=input_map)
        if not data:
            break
        local_mb_samples = data[label_var].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index, (metric_numer * 100.0) / metric_denom, metric_denom))
    print("")

    return metric_numer / metric_denom
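# train_and_evaluate() receives the distributed learner as a factory. A minimal
# sketch of such a factory using CNTK's data-parallel SGD wrapper; the
# num_quantization_bits value is an illustrative choice (32 = no quantization,
# 1 enables 1-bit SGD), and the commented call below is hypothetical.
def create_data_parallel_learner(local_learner):
    return distributed.data_parallel_distributed_learner(
        learner=local_learner,
        num_quantization_bits=32,  # gradient quantization; 32 bits means none
        distributed_after=0)       # start distributing from the first sample

# error = train_and_evaluate(create_train_reader, test_reader, 'resnet20',
#                            max_epochs=160,
#                            create_dist_learner=create_data_parallel_learner)
# distributed.Communicator.finalize()  # required once per process when done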
# Initialize the parameters for the trainer
minibatch_size = 64
num_samples_per_sweep = 60000
num_sweeps_to_train_with = 10
num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size

# Run the trainer and perform model training
training_progress_output_freq = 500
plotdata = {"batchsize": [], "loss": [], "error": []}

for i in range(0, int(num_minibatches_to_train)):
    mb = mb_source.next_minibatch(minibatch_size)

    # Specify the mapping of input variables in the model to actual minibatch data to be trained
    arguments = {input: mb[features_si], label: mb[labels_si]}
    trainer.train_minibatch(arguments)

    batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq, verbose=1)
    if not (loss == "NA" or error == "NA"):
        plotdata["batchsize"].append(batchsize)
        plotdata["loss"].append(loss)
        plotdata["error"].append(error)

trainer.save_checkpoint("../output/model")
save_metrics(trainer, '../output/metrics.txt')
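# print_training_progress() is referenced above but not defined in this
# snippet. A sketch consistent with how the loop consumes it: it returns
# ("NA", "NA") for loss and error except every `frequency` minibatches, and
# its first return value is the minibatch index (stored as `batchsize` by the
# caller). The body below is an assumption modeled on the CNTK tutorials, not
# the original helper.
def print_training_progress(trainer, mb, frequency, verbose=1):
    training_loss = "NA"
    eval_error = "NA"
    if mb % frequency == 0:
        training_loss = trainer.previous_minibatch_loss_average
        eval_error = trainer.previous_minibatch_evaluation_average
        if verbose:
            print("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%".format(
                mb, training_loss, eval_error * 100))
    return mb, training_loss, eval_error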