Example #1
def train_and_evaluate(data_path, total_epochs, gpu_count=1):
    # Create a distributed communicator for 1-bit SGD for better scaling to multiple GPUs.
    # To avoid quantization loss, use the plain MPI communicator instead (quantization_bit = 32).
    quantization_bit = 1

    if quantization_bit == 32:
        communicator = distributed.mpi_communicator()
    else:
        communicator = distributed.quantized_mpi_communicator(quantization_bit)

    workers = communicator.workers()
    current_worker = communicator.current_worker()
    print("List all distributed workers")
    for wk in workers:
        if current_worker.global_rank == wk.global_rank:
            print("* {} {}".format(wk.global_rank, wk.host_id))
        else:
            print("  {} {}".format(wk.global_rank, wk.host_id))

    if gpu_count == 1 and len(workers) > 1:
        print("Warning: running distributed training on 1-GPU will be slow")
        device.set_default_device(gpu(0))

    print("Training on device type:{} id:{}".format(
        'gpu' if device.default().type() else 'cpu',
        device.default().id()))

    start_model = "start_model.bin"
    num_start_epochs = 1
    num_parallel_epochs = total_epochs - num_start_epochs

    # train the start model on a single worker (rank 0)
    if communicator.current_worker().global_rank == 0:
        cifar_resnet_distributed(data_path,
                                 save_model_filename=start_model,
                                 communicator=None,
                                 run_test=False,
                                 num_epochs=num_start_epochs)

    communicator.barrier()

    # train in parallel
    error = cifar_resnet_distributed(data_path,
                                     load_model_filename=start_model,
                                     communicator=communicator,
                                     run_test=True,
                                     num_epochs=num_parallel_epochs)

    distributed.Communicator.finalize()
    return error
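For context, a function like this is normally called from a small entry point and launched under MPI with one process per worker, for example: mpiexec -n 4 python train_cifar_distributed.py. The sketch below is a minimal, hypothetical entry point; the script name, data directory, and epoch/GPU counts are illustrative assumptions, not part of the original example.

import sys

if __name__ == '__main__':
    # Hypothetical driver: pass the CIFAR-10 directory on the command line, then
    # launch under MPI, e.g. mpiexec -n 4 python train_cifar_distributed.py <data-dir>
    data_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    test_error = train_and_evaluate(data_dir, total_epochs=10, gpu_count=1)
    print("Test error: %f" % test_error)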
Example #2
File: utils_test.py  Project: hahatt/CNTK
def test_get_data_type():
    assert get_data_type(constant(value=2), constant(value=1)) == np.float32
    assert get_data_type(input_variable(shape=(2,3)), constant(value=1)) == np.float32

    ndav32 = create_NDArrayView_from_NumPy(np.asarray([[1,2]], dtype=np.float32))
    assert get_data_type(input_variable(shape=(2,3), data_type=np.float64),
            ndav32) == np.float64

    ndav64 = create_NDArrayView_from_NumPy(np.asarray([[1,2]],
        dtype=np.float64))
    assert get_data_type(input_variable(shape=(2,3), data_type=np.float64),
            ndav64) == np.float64

    val32 = create_Value_from_NumPy(np.asarray([[1,2]], dtype=np.float32),
            dev = default())
    assert get_data_type(val32, ndav64) == np.float64
Example #3
def test_get_data_type():
    assert get_data_type(constant(value=2), constant(value=1)) == np.float32
    assert get_data_type(input_variable(shape=(2, 3)),
                         constant(value=1)) == np.float32

    ndav32 = create_NDArrayView_from_NumPy(
        np.asarray([[1, 2]], dtype=np.float32))
    assert get_data_type(input_variable(shape=(2, 3), data_type=np.float64),
                         ndav32) == np.float64

    ndav64 = create_NDArrayView_from_NumPy(
        np.asarray([[1, 2]], dtype=np.float64))
    assert get_data_type(input_variable(shape=(2, 3), data_type=np.float64),
                         ndav64) == np.float64

    val32 = create_Value_from_NumPy(np.asarray([[1, 2]], dtype=np.float32),
                                    dev=default())
    assert get_data_type(val32, ndav64) == np.float64
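Examples #2 and #3 are the same test_get_data_type test shown with different formatting. Both exercise the same promotion rule: get_data_type defaults to np.float32 for constants and input variables, and returns np.float64 as soon as any argument carries double-precision data, whether an input variable declared with data_type=np.float64 or an NDArrayView/Value backed by a float64 NumPy array.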
Example #4
def train_and_evaluate(data_path, total_epochs, gpu_count=1):
    # Create a distributed communicator for 1-bit SGD for better scaling to multiple GPUs.
    # To avoid quantization loss, use the plain MPI communicator instead (quantization_bit = 32).
    quantization_bit = 1

    if quantization_bit == 32:
        communicator = distributed.mpi_communicator()
    else:
        communicator = distributed.quantized_mpi_communicator(quantization_bit)

    workers = communicator.workers()
    current_worker = communicator.current_worker()
    print("List all distributed workers")
    for wk in workers:
        if current_worker.global_rank == wk.global_rank:
            print("* {} {}".format(wk.global_rank, wk.host_id))
        else:
            print("  {} {}".format(wk.global_rank, wk.host_id))

    if gpu_count == 1 and len(workers) > 1:
        print("Warning: running distributed training on a single GPU will be slow")
        device.set_default_device(gpu(0))

    print("Training on device type:{} id:{}".format('gpu' if device.default().type() else 'cpu', device.default().id()))

    start_model = "start_model.bin"
    num_start_epochs = 1
    num_parallel_epochs = total_epochs - num_start_epochs

    # train the start model on a single worker (rank 0)
    if communicator.current_worker().global_rank == 0:
        cifar_resnet_distributed(data_path, save_model_filename=start_model, communicator=None, run_test=False, num_epochs=num_start_epochs)
    
    communicator.barrier()
    
    # train in parallel
    error = cifar_resnet_distributed(data_path, load_model_filename=start_model, communicator=communicator, run_test=True, num_epochs=num_parallel_epochs)

    distributed.Communicator.finalize()
    return error
Example #5
        *"../../../../Examples/Image/DataSets/CIFAR-10/".split("/"))))

    os.chdir(data_path)

    # Create a distributed communicator for 1-bit SGD
    communicator = distributed.communicator(distributed.quantized_mpi_communicator(1))
    workers = communicator.workers()
    current_worker = communicator.current_worker()
    print("List all distributed workers")
    for wk in workers:
        if current_worker.global_rank == wk.global_rank:
            print("* {} {}".format(wk.global_rank, wk.host_id))
        else:
            print("  {} {}".format(wk.global_rank, wk.host_id))

    print("Training on device type:{} id:{}".format('gpu' if device.default().type() else 'cpu', device.default().id()))

    start_model = "start_model.bin"
    num_start_epochs = 1
    num_parallel_epochs = 10

    # train the start model on a single worker (rank 0)
    if communicator.current_worker().global_rank == 0:
        cifar_resnet(data_path, save_model_filename=start_model, communicator=None, run_test=False, num_epochs=num_start_epochs)
    
    communicator.barrier()
    
    # train in parallel
    error = cifar_resnet(data_path, load_model_filename=start_model, communicator=communicator, run_test=True, num_epochs=num_parallel_epochs)
    
    print("Error: %f" % error)
Example #6
    os.chdir(data_path)

    # Create a distributed communicator for 1-bit SGD
    communicator = distributed.communicator(
        distributed.quantized_mpi_communicator(1))
    workers = communicator.workers()
    current_worker = communicator.current_worker()
    print("List all distributed workers")
    for wk in workers:
        if current_worker.global_rank == wk.global_rank:
            print("* {} {}".format(wk.global_rank, wk.host_id))
        else:
            print("  {} {}".format(wk.global_rank, wk.host_id))

    print("Training on device type:{} id:{}".format(
        'gpu' if device.default().type() else 'cpu',
        device.default().id()))

    start_model = "start_model.bin"
    num_start_epochs = 1
    num_parallel_epochs = 10

    # train the start model on a single worker (rank 0)
    if communicator.current_worker().global_rank == 0:
        cifar_resnet(data_path,
                     save_model_filename=start_model,
                     communicator=None,
                     run_test=False,
                     num_epochs=num_start_epochs)

    communicator.barrier()