Example #1
def skeleton_jag_reconstruction_loss(cluster, dir_name, weekly,
                                     data_reader_percent):
    output_file_name = '%s/ci_test/unit_tests/output/jag_reconstruction_loss_output.txt' % (
        dir_name)
    error_file_name = '%s/ci_test/unit_tests/error/jag_reconstruction_loss_error.txt' % (
        dir_name)
    command = tools.get_command(
        cluster=cluster,
        num_nodes=2,
        num_processes=32,
        disable_cuda=1,
        dir_name=dir_name,
        sample_list_train_default=
        '/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K4trainers/100Kindex.txt',
        sample_list_test_default=
        '/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K16trainers/t1_sample_list.txt',
        data_reader_name='jag',
        data_reader_percent='prototext',  # the literal 'prototext' is passed here rather than the data_reader_percent fixture value
        metadata='applications/physics/data/jag_100M_metadata.prototext',
        model_folder='tests',
        model_name='jag_single_layer_ae',
        optimizer_name='adam',
        output_file_name=output_file_name,
        error_file_name=error_file_name,
        weekly=weekly)
    return_code = os.system(command)
    tools.assert_success(return_code, error_file_name)
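Every example in this listing finishes by handing the exit status and the stderr log path to tools.assert_success. That helper is not reproduced here; the following is only a minimal sketch of what it plausibly does (check the return code and surface the tail of the error log), assuming it lives in the shared tools module used by these tests:

import os

def assert_success(return_code, error_file_name):
    """Hypothetical sketch: fail the test if the command exited non-zero."""
    if return_code != 0:
        tail = ''
        if os.path.exists(error_file_name):
            with open(error_file_name) as error_file:
                tail = ''.join(error_file.readlines()[-20:])  # last stderr lines
        raise AssertionError('Command failed (return code %d); see %s\n%s' % (
            return_code, error_file_name, tail))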
Example #2
def build_skeleton(dir_name, compiler, debug):
    compiler_underscored = re.sub(r'[@.]', '_', compiler)
    if debug:
        build_type = 'debug'
    else:
        build_type = 'rel'
    output_file_name = '%s/bamboo/compiler_tests/output/%s_%s_build_output.txt' % (dir_name, compiler_underscored, build_type)
    error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_build_error.txt' % (dir_name, compiler_underscored, build_type)
    compiler = compiler.replace('@', '-')
    #mpi_lib = mpi_lib.replace('@', '-')
    cluster = re.sub(r'[0-9]+', '', subprocess.check_output('hostname'.split()).decode('utf-8').strip())  # strip digits, e.g. 'pascal83' -> 'pascal'
    # For reference:
    # Commenting out for now. These additions to path name will likely return
    # one day, so I am not removing them entirely.
    # x86_64 <=> catalyst, pascal
    # ppc64le <=> ray
    #architecture = subprocess.check_output('uname -m'.split()).decode('utf-8').strip()
    #if cluster == 'ray':
    #    architecture += '_gpu_cuda-9.2.64_cudnn-7.0'
    #elif cluster == 'pascal':
    #    architecture += '_gpu_cuda-9.1.85_cudnn-7.1'
    os.chdir('%s/bamboo/compiler_tests/builds/%s_%s_%s/build' % (dir_name, cluster, compiler, build_type))
    command = 'make -j all > %s 2> %s' % (output_file_name, error_file_name)
    return_code = os.system(command)
    os.chdir('../..')
    tools.assert_success(return_code, error_file_name)
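build_skeleton is a helper rather than a collected test; it would normally be driven by thin per-compiler wrappers. The wrapper names and compiler spec strings below are illustrative only, a sketch of how the helper might be wired into pytest:

def test_compiler_build_gcc7_release(dirname):
    # Release build with an illustrative compiler spec
    build_skeleton(dirname, 'gcc@7.1.0', False)

def test_compiler_build_gcc7_debug(dirname):
    # Debug build with the same illustrative spec
    build_skeleton(dirname, 'gcc@7.1.0', True)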
Example #3
def test_compiler_build_script(cluster, dirname):
    test_base_dir = os.path.join(dirname, 'ci_test', 'compiler_tests')
    output_file_name = os.path.join(test_base_dir, 'output', 'build_script_output.txt')
    error_file_name = os.path.join(test_base_dir, 'error', 'build_script_error.txt')

    # Get environment variables
    ENV_NAME = os.getenv('SPACK_ENV_NAME')

    common_cmd = '%s/scripts/build_lbann.sh -d -l %s --test --clean-build -j $(($(nproc)+2)) -- +deterministic +vision +numpy' % (dirname, ENV_NAME)
    if cluster in ['lassen', 'pascal', 'ray']:
        command = '%s +cuda +half +fft > %s 2> %s' % (common_cmd, output_file_name, error_file_name)
    elif cluster in ['corona']:
        command = '%s +rocm > %s 2> %s' % (common_cmd, output_file_name, error_file_name)
    elif cluster in ['catalyst']:
        command = '%s +onednn +half +fft > %s 2> %s' % (common_cmd, output_file_name, error_file_name)
    else:
        e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster
        print('Skip - ' + e)
        pytest.skip(e)

    return_code = os.system(command)

    artifact_dir = os.path.join(test_base_dir, 'output')
    with os.scandir(dirname) as it:
        for entry in it:
            if entry.is_file() and re.match(r'spack-.*txt', entry.name):
                (base, ext) = os.path.splitext(entry.name)
                new_file_name = base + '_output' + ext
                shutil.copyfile(entry.path, os.path.join(artifact_dir, new_file_name))

    tools.assert_success(return_code, error_file_name)
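These tests run the build through os.system with shell redirection ('> output 2> error'). The same pattern can be expressed without a shell by handing open file objects to subprocess.run; this is only an alternative sketch, not part of the original tests:

import subprocess

def run_logged(args, output_file_name, error_file_name):
    """Run a command, sending stdout and stderr to separate log files."""
    with open(output_file_name, 'w') as out, open(error_file_name, 'w') as err:
        completed = subprocess.run(args, stdout=out, stderr=err)
    return completed.returncode

# e.g. return_code = run_logged(['make', '-j', 'all'], output_file_name, error_file_name)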
Example #4
def test_compiler_build_script(cluster, dirname):
    if cluster not in ['catalyst', 'corona', 'lassen', 'pascal', 'ray']:
        e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster
        print('Skip - ' + e)
        pytest.skip(e)
    output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname)
    error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname)
    command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % (
        dirname, output_file_name, error_file_name)
    return_code = os.system(command)
    tools.assert_success(return_code, error_file_name)
Example #5
def build_script(cluster, dirname, compiler, debug):
    print(('Running build_script for cluster={cluster},'
           ' compiler={compiler}, debug={debug}.').format(
        cluster=cluster, compiler=compiler, debug=debug))
    if debug:
        build = 'debug'
        debug_flag = ' --debug'
    else:
        build = 'release'
        debug_flag = ''
    output_file_name = '%s/bamboo/compiler_tests/output/%s_%s_%s_build_script_output.txt' % (dirname, cluster, compiler, build)
    error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_%s_build_script_error.txt' % (dirname, cluster, compiler, build)
    command = '%s/bamboo/compiler_tests/build_script_specific.sh --compiler %s%s > %s 2> %s' % (dirname, compiler, debug_flag, output_file_name, error_file_name)
    return_code = os.system(command)
    tools.assert_success(return_code, error_file_name)
Example #6
def test_run_parallel_filesystem_catch_tests(cluster, dirname):
    output_dir = os.path.join(dirname, 'ci_test', 'unit_tests')
    build_dir = hack_find_spack_build_dir(dirname)
    mpi_catch_exe = os.path.join(build_dir, 'unit_test', 'mpi-catch-tests')
    if not os.path.exists(mpi_catch_exe):
        print('Skip - executable not found')
        pytest.skip('executable not found')
    # Run the parallel tests
    mpi_launch = get_system_mpi_launch(cluster)
    mpi_output_file_name = 'mpi_filesystem_catch_tests_output-%s-rank=%%r-size=%%s.xml' % (cluster)
    mpi_output_file = os.path.join(output_dir, mpi_output_file_name)
    mpi_error_file = os.path.join(output_dir, "error", "mpi-filesystem-catch-test-error.log")
    mpi_catch_args = [mpi_catch_exe, '"[filesystem]"', '-r', 'junit', '-o', mpi_output_file]
    output = sp.run(mpi_launch + mpi_catch_args)
    tools.assert_success(output.returncode, mpi_error_file)
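get_system_mpi_launch(cluster) is referenced but not shown in this listing. A hypothetical reconstruction, assuming the usual LLNL launchers (jsrun on the LSF machines lassen and ray, srun on the Slurm machines); the real helper very likely uses different node/task counts and flags:

def get_system_mpi_launch(cluster):
    """Return the MPI launch prefix, as an argv list, for the given cluster (illustrative)."""
    if cluster in ('lassen', 'ray'):
        # LSF systems launch through jsrun
        return ['jsrun', '-n', '2', '-r', '1', '-a', '4', '-g', '4']
    # Slurm systems (pascal, catalyst, corona, ...) launch through srun
    return ['srun', '-N', '2', '--ntasks-per-node=4']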
Example #7
def test_run_sequential_catch_tests(cluster, dirname):
    output_dir = os.path.join(dirname, 'ci_test', 'unit_tests')
    build_dir = hack_find_spack_build_dir(dirname)
    seq_catch_exe = os.path.join(build_dir, 'unit_test', 'seq-catch-tests')
    if not os.path.exists(seq_catch_exe):
        print('Skip - executable not found')
        pytest.skip('executable not found')
    # Run the sequential tests
    seq_launch = get_system_seq_launch(cluster)
    seq_output_file_name = 'seq_catch_tests_output-%s.xml' % (cluster)
    seq_output_file = os.path.join(output_dir, seq_output_file_name)
    seq_error_file = os.path.join(output_dir, "error", "seq-catch-test-error.log")
    seq_catch_args = [seq_catch_exe, '-r', 'junit', '-o', seq_output_file]
    output = sp.run(seq_launch + seq_catch_args)
    tools.assert_success(output.returncode, seq_error_file)
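Both catch-test examples also depend on hack_find_spack_build_dir (and this one on get_system_seq_launch), neither of which appears in this listing. The sketch below is a guess at the build-directory lookup, assuming the spack-generated build sits under <dirname>/builds and is named after the cluster; treat it as illustrative only:

import glob
import os
import re
import subprocess

def hack_find_spack_build_dir(dirname):
    """Hypothetical: locate the spack-generated build directory for this cluster."""
    hostname = subprocess.check_output(['hostname']).decode('utf-8').strip()
    cluster = re.sub(r'[0-9]+', '', hostname)  # e.g. 'pascal83' -> 'pascal'
    candidates = glob.glob(os.path.join(dirname, 'builds', '%s*' % cluster, 'build'))
    if not candidates:
        raise RuntimeError('no spack build directory found under %s/builds' % dirname)
    return candidates[0]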
Example #8
def spack_skeleton(dir_name, compiler, mpi_lib, debug):
    compiler_underscored = re.sub(r'[@.]', '_', compiler)
    if debug:
        build_type = 'debug'
    else:
        build_type = 'rel'
    output_file_name = '%s/bamboo/compiler_tests/output/%s_%s_spack_output.txt' % (dir_name, compiler_underscored, build_type)
    error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_spack_error.txt' % (dir_name, compiler_underscored, build_type)
    os.chdir('%s/bamboo/compiler_tests/builds' % dir_name)
    debug_flag = ''
    if debug:
        debug_flag = ' -d'
    command = '%s/scripts/spack_recipes/build_lbann.sh -c %s -m %s%s > %s 2> %s' % (
        dir_name, compiler, mpi_lib, debug_flag, output_file_name, error_file_name)
    return_code = os.system(command)
    os.chdir('..')
    tools.assert_success(return_code, error_file_name)
Example #9
def test_unit_should_work(cluster, dirname, exes):
    if isinstance(exes, dict):
        exe = exes['gcc7']
    else:
        exe = exes
    print('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n')
    (data_reader_path, model_path, optimizer_path) = get_default_parameters(
        dirname)
    (output_file_name, error_file_name) = get_file_names(dirname, 'should_work')
    command = tools.get_command(
        cluster=cluster, executable=exe, data_reader_path=data_reader_path,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        exit_after_setup=True, model_path=model_path,
        optimizer_path=optimizer_path,
        num_processes=1,
        output_file_name=output_file_name,
        error_file_name=error_file_name)
    return_code = os.system(command)
    tools.assert_success(return_code, error_file_name)
Example #10
def skeleton_jag_reconstruction_loss(cluster, executables, dir_name,
                                     compiler_name, weekly,
                                     data_reader_percent):
    if compiler_name not in executables:
        e = 'skeleton_jag_reconstruction_loss: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    if cluster == 'ray':
        e = 'skeleton_jag_reconstruction_loss: dataset does not exist on %s' % cluster
        print('Skip - ' + e)
        pytest.skip(e)
    #if cluster == 'lassen':
    #e = 'skeleton_jag_reconstruction_loss: FIXME dataset consistency issues on Lassen'
    #print('Skip - ' + e)
    #pytest.skip(e)
    output_file_name = '%s/bamboo/unit_tests/output/jag_reconstruction_loss_%s_output.txt' % (
        dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/jag_reconstruction_loss_%s_error.txt' % (
        dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster,
        executable=executables[compiler_name],
        num_nodes=2,
        num_processes=32,
        disable_cuda=1,
        dir_name=dir_name,
        sample_list_train_default=
        '/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/100Kindex.txt',
        sample_list_test_default=
        '/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/t1_sample_list.txt',
        data_reader_name='jag',
        data_reader_percent='prototext',  # the literal 'prototext' is passed here rather than the data_reader_percent fixture value
        metadata='applications/physics/data/jag_100M_metadata.prototext',
        model_folder='tests',
        model_name='jag_single_layer_ae',
        optimizer_name='adam',
        output_file_name=output_file_name,
        error_file_name=error_file_name,
        weekly=weekly)
    return_code = os.system(command)
    tools.assert_success(return_code, error_file_name)
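As with the other skeleton_* helpers, this function is not collected by pytest on its own; it is typically called from one small wrapper per compiler so the same body runs against each toolchain. The wrappers below are a sketch with illustrative compiler names:

def test_unit_jag_reconstruction_loss_gcc7(cluster, exes, dirname, weekly,
                                           data_reader_percent):
    skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'gcc7', weekly,
                                     data_reader_percent)

def test_unit_jag_reconstruction_loss_clang6(cluster, exes, dirname, weekly,
                                             data_reader_percent):
    skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'clang6', weekly,
                                     data_reader_percent)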
Example #11
def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name,
                                     compiler_name, weekly,
                                     data_reader_percent):
    if compiler_name not in executables:
        e = 'skeleton_checkpoint_lenet_shared: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    exe = executables[compiler_name]
    # Handle data
    if data_reader_percent is None:
        data_reader_percent = 0.01
    # No checkpointing, printing weights to files.
    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (
        dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (
        dir_name, compiler_name)
    os.system('rm -rf ckpt_lenet_shared && mkdir ckpt_lenet_shared')
    no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name)
    command = tools.get_command(
        cluster=cluster,
        executable=exe,
        num_nodes=1,
        num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        data_reader_percent=data_reader_percent,
        ckpt_dir=no_ckpt_dir,
        model_folder='tests',
        model_name='lenet_mnist_ckpt',
        num_epochs=2,
        optimizer_name='sgd',
        output_file_name=output_file_name,
        error_file_name=error_file_name,
        weekly=weekly)
    return_code_nockpt = os.system(command)
    tools.assert_success(return_code_nockpt, error_file_name)

    # Run to checkpoint, printing weights to files.
    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (
        dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_checkpoint_%s_error.txt' % (
        dir_name, compiler_name)
    ckpt_dir = 'ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name)
    command = tools.get_command(
        cluster=cluster,
        executable=exe,
        num_nodes=1,
        num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        data_reader_percent=data_reader_percent,
        ckpt_dir=ckpt_dir,
        model_folder='tests',
        model_name='lenet_mnist_ckpt',
        num_epochs=1,
        optimizer_name='sgd',
        output_file_name=output_file_name,
        error_file_name=error_file_name,
        weekly=weekly)
    return_code_ckpt_1 = os.system(command)
    tools.assert_success(return_code_ckpt_1, error_file_name)

    # Pick up from checkpoint, printing weights to files.
    output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_restart_%s_output.txt' % (
        dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_restart_%s_error.txt' % (
        dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster,
        executable=exe,
        num_nodes=1,
        num_processes=2,
        dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        data_reader_percent=data_reader_percent,
        ckpt_dir=ckpt_dir,
        model_folder='tests',
        model_name='lenet_mnist_ckpt',
        num_epochs=2,
        optimizer_name='sgd',
        output_file_name=output_file_name,
        error_file_name=error_file_name,
        weekly=weekly)
    return_code_ckpt_2 = os.system(command)
    tools.assert_success(return_code_ckpt_2, error_file_name)

    dcmp = dircmp(ckpt_dir, no_ckpt_dir)
    fail, diffs, warns = tools.print_diff_files(dcmp)
    for w in warns:
        print(w)

    if fail:
        print()
        for d in diffs:
            print(d)
        path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name)
        raise AssertionError('Compare {ncd} and {cd} in {p}'.format(
            ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix))
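The final comparison relies on tools.print_diff_files(dcmp), which is not shown in this listing. A plausible sketch, assuming it walks the filecmp.dircmp tree and reports files that differ or exist on only one side; the real helper may format its messages differently:

from filecmp import dircmp  # the caller constructs the dircmp object

def print_diff_files(dcmp):
    """Hypothetical: return (fail, diffs, warns) for a dircmp of two checkpoint trees."""
    fail, diffs, warns = False, [], []
    for name in dcmp.diff_files:
        fail = True
        diffs.append('%s differs between %s and %s' % (name, dcmp.left, dcmp.right))
    for name in dcmp.left_only + dcmp.right_only:
        warns.append('%s is present on only one side (%s vs %s)' % (name, dcmp.left, dcmp.right))
    for sub_dcmp in dcmp.subdirs.values():
        sub_fail, sub_diffs, sub_warns = print_diff_files(sub_dcmp)
        fail = fail or sub_fail
        diffs.extend(sub_diffs)
        warns.extend(sub_warns)
    return fail, diffs, warns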