Esempio n. 1
0
def test_cifar_resnet_distributed_1bitsgd(device_id):
    """Run ``script_under_test`` via ``mpiexec_test`` with ``-q 1`` on ``base_path``.

    The trailing positional values (0.86, False, 3) are forwarded unchanged;
    their meaning is defined by ``mpiexec_test``, which is not visible here.
    """
    # One option/value pair per line keeps the command line easy to scan.
    cli_args = [
        "-e", "2",
        "-datadir", base_path,
        "-q", "1",
        "-es", "512",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.86, False, 3
    )
def test_cifar_resnet_distributed_1bitsgd(device_id):
    """Launch the script under test through ``mpiexec_test`` with ``-q 1``.

    The command line is assembled from option/value pairs and handed to
    ``mpiexec_test`` together with the positional values 0.86, False and 3
    (interpreted by ``mpiexec_test``, defined elsewhere).
    """
    # Each tuple is one command-line option followed by its value.
    option_pairs = (
        ("-e", "2"),
        ("-datadir", base_path),
        ("-q", "1"),
        ("-es", "512"),
        ("-device", str(device_id)),
    )
    params = [token for pair in option_pairs for token in pair]
    mpiexec_test(device_id, script_under_test, mpiexec_params, params,
                 0.86, False, 3)
Esempio n. 3
0
def test_cifar_convnet_distributed(device_id):
    """Run the distributed script with a TensorBoard log dir and verify the log.

    The log directory sits next to this file and is removed up front if it
    already exists. After ``mpiexec_test`` returns, the directory must hold
    exactly one entry whose name starts with ``events.out.tfevents``.
    """
    # Start from a clean slate: a stale log directory would break the
    # "exactly one file" check at the end.
    tb_logdir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'ConvNet_CIFAR10_DataAug_Distributed_test_log',
    )
    if os.path.exists(tb_logdir):
        shutil.rmtree(tb_logdir)

    cli_args = [
        "-n", "2",
        "-m", "64",
        "-e", "3200",
        "-datadir", base_path,
        "-tensorboard_logdir", tb_logdir,
        "-q", "32",
        "-r",
        "-device", str(device_id),
    ]
    # The sixth argument is False since different workers may have different #cores.
    mpiexec_test(device_id, script_under_test, mpiexec_params, cli_args,
                 0.75, False, per_minibatch_tolerance=1e-2)

    # The run must have created the log directory with a single, correctly
    # named event file. Names are checked before the count, as before.
    produced = os.listdir(tb_logdir)
    assert all(name.startswith("events.out.tfevents") for name in produced)
    assert len(produced) == 1
def test_cifar_resnet_distributed_block_momentum(device_id):
    """Launch the script under test through ``mpiexec_test`` with ``-b 3200``.

    The positional values 0.89, False and 5 are forwarded as-is; their
    meaning is defined by ``mpiexec_test`` (not visible here).
    """
    # Build the command line incrementally, one option group at a time.
    cli_args = ["-e", "2"]
    cli_args += ["-datadir", base_path]
    cli_args += ["-b", "3200"]
    cli_args += ["-es", "512"]
    cli_args += ["-device", str(device_id)]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.89, False, 5
    )
Esempio n. 5
0
def test_htk_lstm_truncated_distributed_gpu(device_id):
    """Run the script under test via ``mpiexec_test`` on the AN4 data directory.

    The data directory comes from ``an4_dataset_directory()``; the values
    0.76 and True are passed positionally to ``mpiexec_test``.
    """
    data_dir = an4_dataset_directory()
    cli_args = [
        "-n", "3", "-datadir", data_dir, "-q", "1",
        "-m", "640", "-e", "1000", "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.76, True
    )
Esempio n. 6
0
def test_cifar_convnet_distributed_block_momentum(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-b 1600`` and ``-r``.

    The positional values 0.78, False and 10 are handed to ``mpiexec_test``
    unchanged; their interpretation lives in that helper.
    """
    # One option (with its value, if any) per line.
    cli_args = [
        "-n", "2",
        "-m", "64",
        "-e", "3200",
        "-datadir", base_path,
        "-b", "1600",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.78, False, 10
    )
Esempio n. 7
0
def test_cifar_resnet_distributed(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-q 32``.

    The data directory is whatever ``prepare_CIFAR10_data()`` returns; the
    values 0.86, False and 3 are passed positionally to ``mpiexec_test``.
    """
    data_dir = prepare_CIFAR10_data()
    cli_args = [
        "-e", "2",
        "-datadir", data_dir,
        "-q", "32",
        "-es", "512",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.86, False, 3
    )
def test_cifar_resnet_distributed_block_momentum(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-b 3200`` and ``-r``.

    The positional values 0.89, False and 5 are forwarded to ``mpiexec_test``
    unchanged.
    """
    # Options grouped as tuples; a 1-tuple is a flag that takes no value.
    option_groups = (
        ("-e", "2"),
        ("-datadir", base_path),
        ("-b", "3200"),
        ("-es", "512"),
        ("-r",),
        ("-device", str(device_id)),
    )
    cli_args = [token for group in option_groups for token in group]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.89, False, 5
    )
def test_alexnet_imagenet_distributed_block_momentum(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-b 1600`` and ``-r``.

    The data directory is the return value of ``prepare_ImageNet_data()``;
    0.99 and False are passed positionally to ``mpiexec_test``.
    """
    data_dir = prepare_ImageNet_data()
    cli_args = [
        "-n", "2",
        "-m", "8",
        "-e", "16",
        "-datadir", data_dir,
        "-b", "1600",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.99, False
    )
Esempio n. 10
0
def test_sequence_to_sequence_distributed_block_momentum(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-b 3200``.

    The data directory comes from ``cmudict_dataset_directory()``; the values
    0.8612, False, 1 and 2e-2 are passed positionally to ``mpiexec_test``.
    """
    data_dir = cmudict_dataset_directory()
    cli_args = [
        "-e", "4", "-datadir", data_dir, "-ms", "100",
        "-es", "1000", "-b", "3200", "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args,
        0.8612, False, 1, 2e-2,
    )
Esempio n. 11
0
def test_sequence_to_sequence_distributed_gpu(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-q 1``.

    The data directory comes from ``cmudict_dataset_directory()``; the values
    0.8625, False, 0 and 2e-2 are passed positionally to ``mpiexec_test``.
    """
    # Options grouped as (flag, value) tuples, flattened into the argv list.
    option_pairs = (
        ("-e", "2"),
        ("-datadir", cmudict_dataset_directory()),
        ("-q", "1"),
        ("-ms", "100"),
        ("-es", "500"),
        ("-device", str(device_id)),
    )
    cli_args = [token for pair in option_pairs for token in pair]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args,
        0.8625, False, 0, 2e-2,
    )
def test_bn_inception_imagenet_distributed(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-q 32`` and ``-r``.

    The data directory is the return value of ``prepare_ImageNet_data()``;
    0.99 and True are passed positionally to ``mpiexec_test``.
    """
    data_dir = prepare_ImageNet_data()
    cli_args = [
        "-n", "4",
        "-datadir", data_dir,
        "-q", "32",
        "-e", "300",
        "-m", "2",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.99, True
    )
def test_bn_inception_cifar_distributed(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-q 32`` and ``-r``.

    The data directory is the return value of ``prepare_CIFAR10_data()``;
    0.88 and True are passed positionally to ``mpiexec_test``.
    """
    # Assemble the command line step by step.
    cli_args = ["-n", "8"]
    cli_args += ["-datadir", prepare_CIFAR10_data()]
    cli_args += ["-q", "32"]
    cli_args += ["-e", "500"]
    cli_args += ["-m", "16"]
    cli_args += ["-r"]
    cli_args += ["-device", str(device_id)]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.88, True
    )
def test_bn_inception_cifar_distributed(device_id):
    """Launch the script under test through ``mpiexec_test`` on CIFAR-10 data.

    ``prepare_CIFAR10_data()`` supplies the ``-datadir`` value. The positional
    values 0.88 and True are interpreted by ``mpiexec_test`` (defined
    elsewhere).
    """
    data_dir = prepare_CIFAR10_data()
    cli_args = [
        "-n", "8",
        "-datadir", data_dir,
        "-q", "32",
        "-e", "500",
        "-m", "16",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.88, True
    )
Esempio n. 15
0
def test_cifar_convnet_distributed_1bitsgd(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-q 1`` and ``-r``.

    The data directory is the return value of ``prepare_CIFAR10_data()``.
    0.75 and False are passed positionally, and ``per_minibatch_tolerance``
    is set to 1e-2.
    """
    data_dir = prepare_CIFAR10_data()
    cli_args = [
        "-n", "2", "-m", "64", "-e", "3200",
        "-datadir", data_dir,
        "-q", "1", "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        cli_args,
        0.75,
        False,
        per_minibatch_tolerance=1e-2,
    )
def test_alexnet_imagenet_distributed_1bitsgd(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-q 1`` and ``-r``.

    The data directory is the return value of ``prepare_ImageNet_data()``;
    0.99 and True are passed positionally to ``mpiexec_test``.
    """
    # Options grouped as tuples; a 1-tuple is a flag that takes no value.
    option_groups = (
        ("-n", "2"),
        ("-datadir", prepare_ImageNet_data()),
        ("-q", "1"),
        ("-m", "8"),
        ("-e", "16"),
        ("-r",),
        ("-device", str(device_id)),
    )
    cli_args = [token for group in option_groups for token in group]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.99, True
    )
def disabled_test_alexnet_imagenet_distributed_block_momentum(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-b 1600`` and ``-r``.

    NOTE(review): the ``disabled_`` prefix presumably keeps the test runner
    from collecting this function -- confirm against the runner configuration.
    The values 0.99 and False are passed positionally to ``mpiexec_test``.
    """
    data_dir = prepare_ImageNet_data()
    cli_args = [
        "-n", "2", "-m", "8", "-e", "16",
        "-datadir", data_dir,
        "-b", "1600", "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.99, False
    )
def test_htk_lstm_truncated_distributed_block_momentum(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-b 1600``.

    The data directory comes from ``an4_dataset_directory()``; the values
    0.76, False and 4 are passed positionally to ``mpiexec_test``.
    """
    data_dir = an4_dataset_directory()
    cli_args = [
        "-n", "3",
        "-m", "640",
        "-e", "1000",
        "-datadir", data_dir,
        "-b", "1600",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.76, False, 4
    )
def test_cifar_convnet_distributed_block_momentum(device_id):
    """Launch the script under test through ``mpiexec_test`` with ``-b 1600``.

    Data is read from ``base_path``. The positional values 0.78, False and
    10 are interpreted by ``mpiexec_test`` (defined elsewhere).
    """
    # Assemble the command line step by step.
    cli_args = ["-n", "2"]
    cli_args += ["-m", "64"]
    cli_args += ["-e", "3200"]
    cli_args += ["-datadir", base_path]
    cli_args += ["-b", "1600"]
    cli_args += ["-r"]
    cli_args += ["-device", str(device_id)]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.78, False, 10
    )
def test_bn_inception_imagenet_distributed(device_id):
    """Run the script under test via ``mpiexec_test`` with a 400 s timeout.

    The data directory is the return value of ``prepare_ImageNet_data()``.
    0.99 and True are passed positionally, and ``timeout_seconds`` is 400.
    """
    data_dir = prepare_ImageNet_data()
    cli_args = [
        "-n", "4", "-datadir", data_dir, "-q", "32",
        "-e", "300", "-m", "2", "-r", "-device", str(device_id),
    ]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        cli_args,
        0.99,
        True,
        timeout_seconds=400,
    )
def test_alexnet_imagenet_distributed_1bitsgd(device_id):
    """Launch the script under test through ``mpiexec_test`` with ``-q 1``.

    ``prepare_ImageNet_data()`` supplies the ``-datadir`` value. The
    positional values 0.99 and True are interpreted by ``mpiexec_test``.
    """
    data_dir = prepare_ImageNet_data()
    cli_args = [
        "-n", "2",
        "-datadir", data_dir,
        "-q", "1",
        "-m", "8",
        "-e", "16",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.99, True
    )
def test_cifar_convnet_distributed_gpu(device_id):
    """Run the script under test via ``mpiexec_test`` with ``-q 1`` and ``-r``.

    Data is read from ``base_path``. 0.75 and False are passed positionally,
    and ``per_minibatch_tolerance`` is set to 1e-2.
    """
    # Options grouped as tuples; a 1-tuple is a flag that takes no value.
    option_groups = (
        ("-n", "2"),
        ("-m", "64"),
        ("-e", "3200"),
        ("-datadir", base_path),
        ("-q", "1"),
        ("-r",),
        ("-device", str(device_id)),
    )
    cli_args = [token for group in option_groups for token in group]
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        cli_args,
        0.75,
        False,
        per_minibatch_tolerance=1e-2,
    )
def test_htk_lstm_truncated_distributed_block_momentum(device_id):
    """Launch the script under test through ``mpiexec_test`` on the AN4 data.

    ``an4_dataset_directory()`` supplies the ``-datadir`` value. The
    positional values 0.76, False and 4 are interpreted by ``mpiexec_test``.
    """
    # Assemble the command line step by step.
    cli_args = ["-n", "3"]
    cli_args += ["-m", "640"]
    cli_args += ["-e", "1000"]
    cli_args += ["-datadir", an4_dataset_directory()]
    cli_args += ["-b", "1600"]
    cli_args += ["-device", str(device_id)]
    mpiexec_test(
        device_id, script_under_test, mpiexec_params, cli_args, 0.76, False, 4
    )
def test_VGG16_imagenet_distributed(device_id):
    """Run the script under test via ``mpiexec_test`` on CPU only.

    The data directory is the return value of ``prepare_ImageNet_data()``.
    0.99 and True are passed positionally; ``timeout_seconds`` is 500 and
    ``use_only_cpu`` is True.
    """
    data_dir = prepare_ImageNet_data()
    cli_args = [
        "-n", "2", "-m", "2", "-e", "2",
        "-datadir", data_dir,
        "-q", "32",
        "-device", str(device_id),
        "-r", "-testing",
    ]

    # CPU only for now: the GPU run needs a very large amount of memory (~6 GB).
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        cli_args,
        0.99,
        True,
        timeout_seconds=500,
        use_only_cpu=True,
    )
Esempio n. 25
0
def test_cifar_convnet_distributed_gpu(device_id):
    """Launch the script under test through ``mpiexec_test`` with ``-q 1``.

    Data is read from ``base_path``. The positional values 0.75 and False
    are interpreted by ``mpiexec_test``; ``per_minibatch_tolerance`` is 1e-2.
    """
    # One option (with its value, if any) per line.
    cli_args = [
        "-n", "2",
        "-m", "64",
        "-e", "3200",
        "-datadir", base_path,
        "-q", "1",
        "-r",
        "-device", str(device_id),
    ]
    mpiexec_test(device_id, script_under_test, mpiexec_params, cli_args,
                 0.75, False, per_minibatch_tolerance=1e-2)
def test_inception_v3_imagenet_distributed(device_id):
    """Currently skipped unconditionally via ``pytest.skip``.

    The statements after the skip call are kept so the test can be
    re-enabled by removing that call.
    """
    # The Inception-V3 distributed test on ImageNet needs plenty of memory,
    # which the test server may struggle to provide for now.
    pytest.skip('Mute Inception-V3 distributed test temporarily')

    data_dir = prepare_ImageNet_data()
    cli_args = [
        "-n", "2", "-datadir", data_dir, "-q", "32",
        "-e", "200", "-m", "2", "-r", "-device", str(device_id),
    ]

    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        cli_args,
        0.99,
        True,
        timeout_seconds=400,
    )
Esempio n. 27
0
def test_VGG19_imagenet_distributed(device_id):
    """Launch the script under test through ``mpiexec_test`` on CPU only.

    ``prepare_ImageNet_data()`` supplies the ``-datadir`` value. 0.99 and
    True are passed positionally; ``timeout_seconds`` is 500 and
    ``use_only_cpu`` is True.
    """
    # Options grouped as tuples; a 1-tuple is a flag that takes no value.
    option_groups = (
        ("-n", "2"),
        ("-m", "2"),
        ("-e", "2"),
        ("-datadir", prepare_ImageNet_data()),
        ("-q", "32"),
        ("-device", str(device_id)),
        ("-r",),
        ("-testing",),
    )
    cli_args = [token for group in option_groups for token in group]

    # CPU only for now: the GPU run needs a very large amount of memory (~6 GB).
    mpiexec_test(device_id, script_under_test, mpiexec_params, cli_args,
                 0.99, True, timeout_seconds=500, use_only_cpu=True)
Esempio n. 28
0
def test_inception_v3_imagenet_distributed(device_id):
    """Skipped unconditionally for now; the body below the skip is retained.

    Removing the ``pytest.skip`` call re-enables the ``mpiexec_test`` run
    with ``timeout_seconds=400``.
    """
    # The Inception-V3 distributed test on ImageNet needs plenty of memory,
    # which the test server may struggle to provide for now.
    pytest.skip('Mute Inception-V3 distributed test temporarily')

    # One option (with its value, if any) per line.
    cli_args = [
        "-n", "2",
        "-datadir", prepare_ImageNet_data(),
        "-q", "32",
        "-e", "200",
        "-m", "2",
        "-r",
        "-device", str(device_id),
    ]

    mpiexec_test(device_id, script_under_test, mpiexec_params, cli_args,
                 0.99, True, timeout_seconds=400)
def test_cifar_convnet_distributed(device_id):
    """Run the distributed script with TensorBoard logging and check the output.

    A log directory next to this file is wiped beforehand if present. After
    the ``mpiexec_test`` run it must contain exactly one entry, named with
    the ``events.out.tfevents`` prefix.
    """
    # Make sure the TensorBoard log directory does not exist before the run.
    test_dir = os.path.dirname(os.path.abspath(__file__))
    tb_logdir = os.path.join(
        test_dir, 'ConvNet_CIFAR10_DataAug_Distributed_test_log'
    )
    if os.path.exists(tb_logdir):
        shutil.rmtree(tb_logdir)

    # Assemble the command line step by step.
    cli_args = ["-n", "2"]
    cli_args += ["-m", "64"]
    cli_args += ["-e", "3200"]
    cli_args += ["-datadir", base_path]
    cli_args += ["-tensorboard_logdir", tb_logdir]
    cli_args += ["-q", "32"]
    cli_args += ["-r"]
    cli_args += ["-device", str(device_id)]

    # The sixth argument is False since different workers may have different #cores.
    mpiexec_test(
        device_id,
        script_under_test,
        mpiexec_params,
        cli_args,
        0.75,
        False,
        per_minibatch_tolerance=1e-2,
    )

    # Every entry must carry the expected prefix, and there must be exactly
    # one. ``seen`` stays 0 when the directory is empty.
    seen = 0
    for seen, entry in enumerate(os.listdir(tb_logdir), start=1):
        assert entry.startswith("events.out.tfevents")
    assert seen == 1