Example #1
def test_cifar_convnet_error(device_id):
    if platform.system() == 'Windows':
        pytest.skip('test skipped on Windows')

    set_default_device(cntk_device(device_id))

    try:
        base_path = os.path.join(
            os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
            *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
        #      and CIFAR-10_mean.xml in the base_path.
    except KeyError:
        base_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1)  # BUGBUG: has no effect at present
    # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    reader_train = create_reader(os.path.join(base_path, 'train_map.txt'),
                                 os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                 True, 0)
    reader_test = create_reader(os.path.join(base_path, 'test_map.txt'),
                                os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                False)

    distributed_after_samples = 0
    num_quantization_bits = 32
    distributed_trainer = distributed.data_parallel_distributed_trainer(
        num_quantization_bits=num_quantization_bits,
        distributed_after=distributed_after_samples)

    test_error = convnet_cifar10_dataaug(reader_train,
                                         reader_test,
                                         distributed_trainer,
                                         max_epochs=1)
    expected_test_error = 0.617

    assert np.allclose(test_error,
                       expected_test_error,
                       atol=TOLERANCE_ABSOLUTE)
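
Note: the create_reader helper that this test calls is defined in the accompanying training script and is not part of the listing above. A minimal sketch of what it might look like, assuming the CNTK 2.x image-reader API; the transform parameters and the hard-coded CIFAR-10 shapes are assumptions for illustration, and the distributed_after argument (whose handling varied across CNTK versions) is accepted but ignored here:

from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
import cntk.io.transforms as xforms

def create_reader(map_file, mean_file, train, distributed_after=0):
    # Random cropping augments the training data only.
    transforms = []
    if train:
        transforms += [xforms.crop(crop_type='randomside', side_ratio=0.8)]
    transforms += [
        xforms.scale(width=32, height=32, channels=3, interpolations='linear'),
        xforms.mean(mean_file),
    ]
    # Each map-file line pairs an image path with its class label.
    return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
        features=StreamDef(field='image', transforms=transforms),
        labels=StreamDef(field='label', shape=10))))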
Example #2
def test_cifar_resnet_distributed_error(device_id, is_1bit_sgd):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(cntk_device(device_id))

    if not is_1bit_sgd:
        pytest.skip('test only runs in 1-bit SGD')

    try:
        base_path = os.path.join(
            os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
            *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
    except KeyError:
        base_path = os.path.join(
            *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1)  # BUGBUG: has no effect at present
    # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    distributed_trainer = distributed.data_parallel_distributed_trainer(
        num_quantization_bits=32, distributed_after=0)

    reader_train = create_reader(os.path.join(base_path, 'train_map.txt'),
                                 os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                 True)
    reader_test = create_reader(os.path.join(base_path, 'test_map.txt'),
                                os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                False)

    test_error = train_and_evaluate(reader_train, reader_test, 'resnet20', 5,
                                    distributed_trainer)

    expected_test_error = 0.282

    assert np.allclose(test_error,
                       expected_test_error,
                       atol=TOLERANCE_ABSOLUTE)
Example #3
def test_cifar_resnet_distributed_error(device_id, is_1bit_sgd):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(cntk_device(device_id))

    if not is_1bit_sgd:
        pytest.skip('test only runs in 1-bit SGD')

    try:
        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
    except KeyError:
        base_path = os.path.join(
            *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1)  # BUGBUG: has no effect at present
    # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    distributed_trainer = distributed.data_parallel_distributed_trainer(
        num_quantization_bits=32,
        distributed_after=0)

    reader_train = create_reader(os.path.join(base_path, 'train_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), True)
    reader_test  = create_reader(os.path.join(base_path, 'test_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), False)

    test_error = train_and_evaluate(reader_train, reader_test, 'resnet20', 5, distributed_trainer)

    expected_test_error = 0.282

    assert np.allclose(test_error, expected_test_error,
                       atol=TOLERANCE_ABSOLUTE)
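
The assertion pattern shared by these tests compares the measured error against a hard-coded baseline within an absolute tolerance. TOLERANCE_ABSOLUTE is defined at module level in the real test files; the value below is an assumption for illustration:

import numpy as np

TOLERANCE_ABSOLUTE = 1e-2  # assumed value, not taken from the tests above

# np.allclose(a, b, atol=t) checks |a - b| <= t + rtol * |b| elementwise.
print(np.allclose(0.285, 0.282, atol=TOLERANCE_ABSOLUTE))  # True:  off by 0.003
print(np.allclose(0.300, 0.282, atol=TOLERANCE_ABSOLUTE))  # False: off by 0.018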
Example #4
def cifar_resnet_distributed(data_path,
                             run_test,
                             num_epochs,
                             communicator=None,
                             save_model_filename=None,
                             load_model_filename=None,
                             debug_output=False):
    image_height = 32
    image_width = 32
    num_channels = 3
    num_classes = 10

    feats_stream_name = 'features'
    labels_stream_name = 'labels'

    minibatch_source = create_reader(os.path.join(data_path, 'train_map.txt'),
                                     os.path.join(data_path,
                                                  'CIFAR-10_mean.xml'),
                                     True,
                                     distributed_communicator=communicator)

    features_si = minibatch_source[feats_stream_name]
    labels_si = minibatch_source[labels_stream_name]

    # Instantiate the resnet classification model, or load from file

    if load_model_filename:
        print("Loading model:", load_model_filename)
        classifier_output = persist.load_model(load_model_filename)
        image_input = classifier_output.arguments[0]
    else:
        image_input = input_variable((num_channels, image_height, image_width),
                                     features_si.m_element_type)
        classifier_output = create_resnet_model(image_input, num_classes)

    # Input variables denoting the features and label data
    label_var = input_variable((num_classes), features_si.m_element_type)

    ce = cross_entropy_with_softmax(classifier_output, label_var)
    pe = classification_error(classifier_output, label_var)

    # Instantiate the trainer object to drive the model training

    mb_size = 128
    num_mb_per_epoch = 100

    num_mbs = num_mb_per_epoch * num_epochs

    lr_schedule = [1.0 / mb_size] * 80 + [0.1 / mb_size] * 40 + [0.01 / mb_size]
    lr_per_minibatch = learning_rate_schedule(lr_schedule, UnitType.minibatch,
                                              mb_size * num_mb_per_epoch)
    momentum_time_constant = momentum_as_time_constant_schedule(-mb_size /
                                                                np.log(0.9))

    # create data parallel distributed trainer if needed
    dist_trainer = distributed.data_parallel_distributed_trainer(
        communicator, False) if communicator else None

    # Instantiate the trainer object to drive the model training
    trainer = Trainer(classifier_output,
                      ce,
                      pe, [
                          momentum_sgd(classifier_output.parameters,
                                       lr=lr_per_minibatch,
                                       momentum=momentum_time_constant,
                                       l2_regularization_weight=0.0001)
                      ],
                      distributed_trainer=dist_trainer)

    # Get minibatches of images to train with and perform model training
    training_progress_output_freq = 100 if communicator else 20

    if debug_output:
        training_progress_output_freq = training_progress_output_freq / 4

    for i in range(0, num_mbs):

        # NOTE: depends on network, the mb_size can be changed dynamically here
        mb = minibatch_source.next_minibatch(mb_size)

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {image_input: mb[features_si], label_var: mb[labels_si]}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)

    if save_model_filename:
        print("Saving model:", save_model_filename)
        persist.save_model(classifier_output, save_model_filename)

    if run_test:
        test_minibatch_source = create_reader(
            os.path.join(data_path, 'test_map.txt'),
            os.path.join(data_path, 'CIFAR-10_mean.xml'), False)
        features_si = test_minibatch_source[feats_stream_name]
        labels_si = test_minibatch_source[labels_stream_name]

        mb_size = 128
        num_mbs = 100

        total_error = 0.0
        for i in range(0, num_mbs):
            mb = test_minibatch_source.next_minibatch(mb_size)

            # Specify the mapping of input variables in the model to actual
            # minibatch data to be trained with
            arguments = {
                image_input: mb[features_si],
                label_var: mb[labels_si]
            }
            error = trainer.test_minibatch(arguments)
            total_error += error

        return total_error / num_mbs
    else:
        return 0
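
The expression momentum_as_time_constant_schedule(-mb_size / np.log(0.9)) above converts a per-minibatch momentum of 0.9 into CNTK's per-sample time-constant form. A standalone check of the arithmetic:

import numpy as np

mb_size = 128
momentum_per_mb = 0.9

# Per-sample momentum chosen so that mb_size samples compound to 0.9:
momentum_per_sample = momentum_per_mb ** (1.0 / mb_size)

# The time constant T satisfies momentum_per_sample == exp(-1 / T):
time_constant = -1.0 / np.log(momentum_per_sample)

# This equals the closed form used in the example above.
assert np.isclose(time_constant, -mb_size / np.log(momentum_per_mb))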
Example #5
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far.
        sample_count += trainer.previous_minibatch_sample_count
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom,
        metric_denom))
    print("")

    return metric_numer / metric_denom


if __name__ == '__main__':
    distributed_after_samples = 0
    num_quantization_bits = 32
    distributed_trainer = distributed.data_parallel_distributed_trainer(
        num_quantization_bits=num_quantization_bits,
        distributed_after=distributed_after_samples)

    reader_train = create_reader(os.path.join(data_path, 'train_map.txt'),
                                 os.path.join(data_path, 'CIFAR-10_mean.xml'),
                                 True, distributed_after_samples)
    reader_test = create_reader(os.path.join(data_path, 'test_map.txt'),
                                os.path.join(data_path, 'CIFAR-10_mean.xml'),
                                False)

    convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer)
    distributed.Communicator.finalize()
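
For reference, num_quantization_bits=32 requests plain full-precision data-parallel SGD, while 1 selects 1-bit SGD with gradient quantization (available only in CNTK builds that include the 1-bit SGD components). A sketch of both configurations, using the same keyword form as the example above:

# Full-precision gradient aggregation (no quantization):
trainer_fp32 = distributed.data_parallel_distributed_trainer(
    num_quantization_bits=32, distributed_after=0)

# 1-bit SGD: gradients are quantized to one bit per value and the
# quantization residual is carried over into the next minibatch.
trainer_1bit = distributed.data_parallel_distributed_trainer(
    num_quantization_bits=1, distributed_after=0)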
Example #6
def cifar_resnet_distributed(data_path, run_test, num_epochs, communicator=None, save_model_filename=None, load_model_filename=None, debug_output=False):
    image_height = 32
    image_width = 32
    num_channels = 3
    num_classes = 10

    feats_stream_name = 'features'
    labels_stream_name = 'labels'

    minibatch_source = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True,
                                     distributed_communicator = communicator)

    features_si = minibatch_source[feats_stream_name]
    labels_si = minibatch_source[labels_stream_name]

    # Instantiate the resnet classification model, or load from file
    
    if load_model_filename:
        print("Loading model:", load_model_filename)
        classifier_output = persist.load_model(load_model_filename)
        image_input = classifier_output.arguments[0]
    else:
        image_input = input_variable(
            (num_channels, image_height, image_width), features_si.m_element_type)
        classifier_output = create_resnet_model(image_input, num_classes)

    # Input variables denoting the features and label data
    label_var = input_variable((num_classes), features_si.m_element_type)

    ce = cross_entropy_with_softmax(classifier_output, label_var)
    pe = classification_error(classifier_output, label_var)

    # Instantiate the trainer object to drive the model training

    mb_size = 128
    num_mb_per_epoch = 100
    
    num_mbs = num_mb_per_epoch * num_epochs

    # Use 1.0 so the division yields a float under Python 2 as well.
    lr_per_sample = [1.0/mb_size]*80 + [0.1/mb_size]*40 + [0.01/mb_size]
    lr_schedule = learning_rate_schedule(lr_per_sample, units=mb_size * num_mb_per_epoch)
    momentum_time_constant = -mb_size/np.log(0.9)

    # create data parallel distributed trainer if needed
    dist_trainer = distributed.data_parallel_distributed_trainer(communicator, False) if communicator else None

    # Instantiate the trainer object to drive the model training
    trainer = Trainer(classifier_output, ce, pe,
                      [momentum_sgd(classifier_output.parameters, lr_schedule, momentum_time_constant, l2_regularization_weight=0.0001)],
                      distributed_trainer = dist_trainer)
    
    # Get minibatches of images to train with and perform model training
    training_progress_output_freq = 100 if communicator else 20

    if debug_output:
        training_progress_output_freq = training_progress_output_freq/4
        
    for i in range(0, num_mbs):
    
        # NOTE: depends on network, the mb_size can be changed dynamically here
        mb = minibatch_source.next_minibatch(mb_size)

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {image_input: mb[features_si],
                     label_var: mb[labels_si]}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)
        
    if save_model_filename:
        print("Saving model:", save_model_filename)
        persist.save_model(classifier_output, save_model_filename)

    if run_test:
        test_minibatch_source = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)
        features_si = test_minibatch_source[feats_stream_name]
        labels_si = test_minibatch_source[labels_stream_name]

        mb_size = 128
        num_mbs = 100

        total_error = 0.0
        for i in range(0, num_mbs):
            mb = test_minibatch_source.next_minibatch(mb_size)

            # Specify the mapping of input variables in the model to actual
            # minibatch data to be trained with
            arguments = {image_input: mb[features_si],
                         label_var: mb[labels_si]}
            error = trainer.test_minibatch(arguments)
            total_error += error

        return total_error / num_mbs
    else:
        return 0
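
The print_training_progress helper used in the training loop above comes from the CNTK example utilities. A minimal sketch assuming the standard Trainer accessors; the exact message format is an assumption:

def print_training_progress(trainer, mb_index, frequency):
    # Report loss and error averaged over the most recent minibatch.
    if frequency and mb_index % frequency == 0:
        loss = trainer.previous_minibatch_loss_average
        error = trainer.previous_minibatch_evaluation_average
        print("Minibatch {}: loss = {:.4f}, error = {:.2f}%".format(
            mb_index, loss, error * 100))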
Example #7
    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)
        # Fetch the next test minibatch.
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far.
        sample_count += trainer.previous_minibatch_sample_count
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom

if __name__ == '__main__':
    distributed_after_samples = 0
    num_quantization_bits = 32
    distributed_trainer = distributed.data_parallel_distributed_trainer(
        num_quantization_bits=num_quantization_bits,
        distributed_after=distributed_after_samples)

    reader_train = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True, distributed_after_samples)
    reader_test  = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)

    convnet_cifar10_dataaug(reader_train, reader_test, distributed_trainer)
    distributed.Communicator.finalize()
Example #8
                                          total_number_of_samples=max_epochs *
                                          epoch_size)
    test_source = create_image_mb_source(
        test_data,
        mean_data,
        train=False,
        total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source,
                   progress_printer, epoch_size)


if __name__ == '__main__':
    distributed_after_samples = 0
    num_quantization_bits = 32
    distributed_trainer = distributed.data_parallel_distributed_trainer(
        communicator=cntk_py.mpicommunicator(),
        use_async_buffered_parameter_update=False)

    mean_data = os.path.join(data_path, 'CIFAR-10_mean.xml')
    train_data = os.path.join(data_path, 'train_map.txt')
    test_data = os.path.join(data_path, 'test_map.txt')

    convnet_cifar10_dataaug(train_data,
                            test_data,
                            mean_data,
                            num_quantization_bits=32,
                            max_epochs=80,
                            log_to_file=log_dir,
                            num_mbs_per_log=10)
    Communicator.finalize()
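
These distributed scripts are intended to be launched under an MPI runner, e.g. mpiexec -n 4 python <script>.py, with one CNTK worker per MPI process. A hedged sketch of per-worker bookkeeping around finalize(); the import path and the rank()/num_workers() accessors are taken from later CNTK 2.x releases and may differ in the beta API used above:

from cntk.train.distributed import Communicator

# Let only worker 0 write logs or checkpoints, so workers do not clobber
# each other's output:
if Communicator.rank() == 0:
    print("running on", Communicator.num_workers(), "workers")

# Every worker must call finalize() exactly once before exiting, or the
# MPI job can hang at shutdown.
Communicator.finalize()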