def convnet_cifar10_dataaug(train_data,
                            test_data,
                            mean_data,
                            num_quantization_bits=32,
                            max_epochs=2,
                            log_to_file=None,
                            num_mbs_per_log=None,
                            gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

    epoch_size = 50000

    progress_printer = ProgressPrinter(freq=num_mbs_per_log,
                                       tag='Training',
                                       log_to_file=log_to_file,
                                       rank=Communicator.rank(),
                                       gen_heartbeat=gen_heartbeat,
                                       num_epochs=max_epochs)

    network = create_conv_network()
    trainer = create_trainer(network, epoch_size, num_quantization_bits)
    train_source = create_image_mb_source(train_data,
                                          mean_data,
                                          train=True,
                                          total_number_of_samples=max_epochs *
                                          epoch_size)
    test_source = create_image_mb_source(
        test_data,
        mean_data,
        train=False,
        total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source,
                   progress_printer, epoch_size)
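
# The create_image_mb_source helper referenced above is not part of this
# listing. A minimal sketch of what it could look like for CIFAR-10, assuming
# the standard CNTK map-file/mean-file format, 32x32x3 inputs and 10 classes
# (the transforms, shapes and class count are assumptions, not the original code):
import cntk.io.transforms as xforms
from cntk.io import MinibatchSource, ImageDeserializer, StreamDefs, StreamDef

def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
    transforms = []
    if train:
        # random-side crop augments the training data; skipped for evaluation
        transforms += [xforms.crop(crop_type='randomside', side_ratio=0.8)]
    transforms += [
        xforms.scale(width=32, height=32, channels=3, interpolations='linear'),
        xforms.mean(mean_file)]
    return MinibatchSource(
        ImageDeserializer(map_file, StreamDefs(
            features=StreamDef(field='image', transforms=transforms),
            labels=StreamDef(field='label', shape=10))),
        randomize=train,
        max_samples=total_number_of_samples)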
def vgg19_train_and_eval(train_data,
                         test_data,
                         num_quantization_bits=32,
                         minibatch_size=128,
                         epoch_size=1281167,
                         max_epochs=80,
                         restore=True,
                         log_to_file=None,
                         num_mbs_per_log=None,
                         gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(freq=num_mbs_per_log,
                                       tag='Training',
                                       log_to_file=log_to_file,
                                       rank=Communicator.rank(),
                                       gen_heartbeat=gen_heartbeat,
                                       num_epochs=max_epochs)

    network = create_vgg19()
    trainer = create_trainer(network, epoch_size, num_quantization_bits,
                             progress_printer)
    train_source = create_image_mb_source(train_data,
                                          True,
                                          total_number_of_samples=max_epochs *
                                          epoch_size)
    test_source = create_image_mb_source(
        test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, minibatch_size,
                   epoch_size, restore)
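
# create_trainer is likewise external to this listing. A sketch of a
# distributed variant, assuming network is a dict exposing 'output', 'ce' and
# 'pe' nodes; the momentum-SGD hyperparameters are placeholders, not tuned values:
from cntk import Trainer
from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_schedule, UnitType
from cntk.train.distributed import data_parallel_distributed_learner

def create_trainer(network, epoch_size, num_quantization_bits, progress_printer):
    lr_schedule = learning_rate_schedule(0.01, UnitType.sample)
    mm_schedule = momentum_schedule(0.9)
    local_learner = momentum_sgd(network['output'].parameters,
                                 lr_schedule, mm_schedule)
    # exchange gradients across workers, quantized to num_quantization_bits
    # (32 bits means no quantization)
    learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)
    return Trainer(network['output'], (network['ce'], network['pe']),
                   learner, progress_printer)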
def resnet_cifar10(train_data,
                   test_data,
                   mean_data,
                   network_name,
                   epoch_size,
                   num_quantization_bits=32,
                   block_size=3200,
                   warm_up=0,
                   max_epochs=5,
                   restore=True,
                   log_to_file=None,
                   num_mbs_per_log=None,
                   gen_heartbeat=False,
                   scale_up=False,
                   profiling=False):
    set_computation_network_trace_level(0)

    # NOTE: scaling up minibatch_size increases sample throughput. On an 8-GPU machine,
    # ResNet110 achieves ~7x the samples-per-second of a single GPU, compared to ~3x
    # without scaling up. However, a bigger minibatch size over the same number of samples
    # means fewer updates, which leads to higher training error: a speed/accuracy trade-off.
    minibatch_size = 128 * (Communicator.num_workers() if scale_up else 1)

    progress_printer = ProgressPrinter(freq=num_mbs_per_log,
                                       tag='Training',
                                       log_to_file=log_to_file,
                                       rank=Communicator.rank(),
                                       gen_heartbeat=gen_heartbeat,
                                       num_epochs=max_epochs)

    network = create_resnet_network(network_name)
    trainer = create_trainer(network, minibatch_size, epoch_size,
                             num_quantization_bits, block_size, warm_up,
                             progress_printer)
    train_source = create_image_mb_source(train_data,
                                          mean_data,
                                          train=True,
                                          total_number_of_samples=max_epochs *
                                          epoch_size)
    test_source = create_image_mb_source(
        test_data,
        mean_data,
        train=False,
        total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, minibatch_size,
                   epoch_size, profiling)
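
# resnet_cifar10 passes block_size and warm_up into create_trainer. A sketch of
# how those parameters could select the distributed learner, assuming the CNTK
# distributed-training API (local_learner is any plain CNTK learner):
from cntk.train.distributed import (data_parallel_distributed_learner,
                                    block_momentum_distributed_learner)

def wrap_distributed(local_learner, num_quantization_bits, block_size, warm_up):
    if block_size is not None:
        # BlockMomentum SGD: workers synchronize every block_size samples
        return block_momentum_distributed_learner(local_learner,
                                                  block_size=block_size)
    # quantized data-parallel SGD; run non-distributed for the first
    # warm_up samples before gradient exchange kicks in
    return data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=warm_up)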
def bn_inception_train_and_eval(train_data, test_data, mean_data, num_quantization_bits=32, epoch_size=50000, max_epochs=200,
                                restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False):
    _cntk_py.set_computation_network_trace_level(0)

    # NOTE: scaling up minibatch_size increases sample throughput. On an 8-GPU machine,
    # ResNet110 achieves ~7x the samples-per-second of a single GPU, compared to ~3x
    # without scaling up. However, a bigger minibatch size over the same number of samples
    # means fewer updates, which leads to higher training error: a speed/accuracy trade-off.
    minibatch_size = 128 * (Communicator.num_workers() if scale_up else 1)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_bn_inception()
    trainer = create_trainer(network, epoch_size, max_epochs, minibatch_size, num_quantization_bits, progress_printer)
    train_source = create_image_mb_source(train_data, mean_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, mean_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling)
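
# train_and_test is the last external helper. A sketch built on CNTK's
# training_session API, assuming the network dict exposes 'feature' and
# 'label' inputs; the checkpoint file name is a placeholder:
from cntk.train import training_session, CheckpointConfig, TestConfig

def train_and_test(network, trainer, train_source, test_source,
                   minibatch_size, epoch_size, restore, profiling=False):
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }
    training_session(
        trainer=trainer,
        mb_source=train_source,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig('bn_inception.ckpt', restore=restore),
        test_config=TestConfig(test_source, minibatch_size)
    ).train()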
def vgg19_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size=1281167, max_epochs=80,
                         restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
    _cntk_py.set_computation_network_trace_level(0)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_vgg19()
    trainer = create_trainer(network, epoch_size, num_quantization_bits)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
def finalize(self):
    if self._distributed:
        Communicator.finalize()
    if args['epoch_size'] is not None:
        epoch_size = args['epoch_size']

    mean_data = os.path.join(data_path, 'CIFAR-10_mean.xml')
    train_data = os.path.join(data_path, 'train_map.txt')
    test_data = os.path.join(data_path, 'test_map.txt')

    num_quantization_bits = args['quantized_bits']
    epochs = args['epochs']
    warm_up = args['distributed_after']
    network_name = args['network']
    scale_up = bool(args['scale_up'])

    # Announce the distributed-training configuration
    print("Start training: quantize_bit = {}, epochs = {}, distributed_after = {}".format(num_quantization_bits, epochs, warm_up))

    try:
        resnet_cifar10(train_data, test_data, mean_data,
                       network_name, 
                       epoch_size,
                       num_quantization_bits,
                       block_size=args['block_samples'],
                       warm_up=args['distributed_after'],
                       max_epochs=epochs,
                       scale_up=scale_up,
                       log_to_file=args['logdir'],
                       profiling=args['profile'])
    finally:
        # Must call MPI finalize when the process exits
        Communicator.finalize()
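
# The args dict consumed above comes from a command-line parser that is not
# shown. A hypothetical argparse setup covering exactly the keys referenced
# in this snippet (the flags and defaults are assumptions):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--network', default='resnet20')
parser.add_argument('--epochs', type=int, default=5)
parser.add_argument('--quantized_bits', type=int, default=32)
parser.add_argument('--distributed_after', type=int, default=0)
parser.add_argument('--block_samples', type=int, default=None)
parser.add_argument('--scale_up', type=int, default=0)
parser.add_argument('--epoch_size', type=int, default=None)
parser.add_argument('--logdir', default=None)
parser.add_argument('--profile', action='store_true')
args = vars(parser.parse_args())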