Code Example #1
def skeleton_performance_cache_alexnet(cluster, dir_name, executables, weekly,
                                       compiler_name):
    if not weekly:
        pytest.skip('Not doing weekly testing')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    executable = executables[compiler_name]
    model_name = 'cache_alexnet'
    should_log = False
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (
        dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (
        dir_name, model_name, compiler_name)
    if cluster in ['catalyst', 'surface']:
        command = 'salloc %s/bamboo/integration_tests/%s.sh > %s' % (
            dir_name, model_name, output_file_name)
    elif cluster == 'ray':
        pytest.skip(
            'Ray is unsupported for skeleton_performance_cache_alexnet')
    else:
        raise Exception('Unsupported Cluster %s' % cluster)
    common_code.run_lbann(command, model_name, output_file_name,
                          error_file_name,
                          should_log)  # Don't need return value
    actual_performance = common_code.extract_data(output_file_name,
                                                  DATA_FIELDS, should_log)
    run_tests(actual_performance, model_name, dir_name, should_log,
              compiler_name, cluster)
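
Note that DATA_FIELDS and run_tests are module-level names defined elsewhere in the test file; extract_data returns one entry per named metric. A plausible definition of the metric list, shown for illustration only (the real list lives in the LBANN bamboo sources):

# Hypothetical illustration -- not the verbatim LBANN definition.
DATA_FIELDS = [
    'training_run_time',
    'training_mean',
    'training_max',
    'training_min',
    'training_stdev',
    'test_accuracy',
]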
Code Example #2
def skeleton_mnist_debug(cluster,
                         dir_name,
                         executables,
                         compiler_name,
                         weekly,
                         debug,
                         should_log=False):
    # If weekly or debug are true, then run the test.
    if (not weekly) and (not debug):
        pytest.skip('Not doing weekly or debug testing')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    model_name = 'lenet_mnist'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (
        dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (
        dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster,
        executable=executables[compiler_name],
        num_nodes=1,
        partition='pbatch',
        time_limit=100,
        dir_name=dir_name,
        data_filedir_ray='/p/gscratchr/brainusr/datasets/MNIST',
        data_reader_name='mnist',
        model_folder='models/' + model_name,
        model_name=model_name,
        num_epochs=5,
        optimizer_name='adagrad',
        output_file_name=output_file_name,
        error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name, output_file_name,
                                         error_file_name, should_log)
    assert output_value == 0
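
These skeleton functions are not collected by pytest themselves; the bamboo suite wraps them in thin per-compiler tests. A minimal sketch of such a wrapper, assuming conftest fixtures named cluster, dirname, and exes (the test name, fixture names, and compiler key here are hypothetical):

# Hypothetical wrapper -- fixture and compiler names are illustrative.
def test_integration_mnist_debug_gcc7(cluster, dirname, exes, weekly, debug):
    skeleton_mnist_debug(cluster, dirname, exes, 'gcc7', weekly, debug)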
Code Example #3
def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly,
                         debug_build, should_log=False):
    # If weekly or debug_build are true, then run the test.
    if not (weekly or debug_build):
        e = 'skeleton_cifar_debug: Not doing weekly or debug_build testing'
        print('Skip - ' + e)
        pytest.skip(e)
    if cluster == 'ray':
        e = 'skeleton_cifar_debug: cifar not operational on Ray'
        print('Skip - ' + e)
        pytest.skip(e)
    if compiler_name not in executables:
        e = 'skeleton_cifar_debug: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    model_name = 'autoencoder_cifar10'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        partition='pbatch', time_limit=100, dir_name=dir_name,
        data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin',
        data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin',
        data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name,
        model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad',
        output_file_name=output_file_name, error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log)
    assert output_value == 0
Code Example #4
def skeleton_performance_full_alexnet(cluster, dir_name, executables, compiler_name, weekly):
    if not weekly:
        pytest.skip('Not doing weekly testing')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    executable = executables[compiler_name]
    if not os.path.exists(executable):
        pytest.skip('Executable does not exist: %s' % executable)
    model_name = 'full_alexnet'
    should_log = False
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name)
    if cluster in ['catalyst', 'surface']:
        command = 'salloc %s/bamboo/integration_tests/%s.sh > %s' % (dir_name, model_name, output_file_name)
    elif cluster == 'ray':
        pytest.skip('Ray is unsupported for skeleton_performance_full_alexnet')
    else:
        raise Exception('Unsupported Cluster %s' % cluster)
    common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log)  # Don't need return value
    actual_performance = common_code.extract_data(output_file_name, DATA_FIELDS, should_log)
    run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, cluster)
Code Example #5
def skeleton_performance_full_alexnet(cluster, dir_name, executables,
                                      compiler_name, weekly, run):
    if not run:
        e = 'skeleton_performance_full_alexnet: Ignored'
        print('Skip - ' + e)
        pytest.skip(e)
    if not weekly:
        e = 'skeleton_performance_full_alexnet: Non-local testing'
        print('Skip - ' + e)
        pytest.skip(e)
    if compiler_name not in executables:
        e = 'skeleton_performance_full_alexnet: default_exes[%s] does not exist' % compiler_name
        print('Skip - ' + e)
        pytest.skip(e)
    executable = executables[compiler_name]
    if not os.path.exists(executable):
        pytest.skip('Executable does not exist: %s' % executable)
    model_name = 'full_alexnet'
    should_log = True
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (
        dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (
        dir_name, model_name, compiler_name)
    if cluster in ['catalyst']:
        command = 'salloc --nodes 128 %s/bamboo/integration_tests/%s.sh > %s 2> %s' % (
            dir_name, model_name, output_file_name, error_file_name)
    elif cluster in ['pascal', 'ray']:
        e = 'skeleton_performance_full_alexnet: Pascal, Ray are unsupported for skeleton_performance_full_alexnet'
        print('Skip - ' + e)
        pytest.skip(e)
    else:
        raise Exception('Unsupported Cluster %s' % cluster)
    common_code.run_lbann(command, model_name, output_file_name,
                          error_file_name,
                          should_log)  # Don't need return value
    actual_performance = common_code.extract_data(output_file_name,
                                                  DATA_FIELDS, should_log)
    run_tests(actual_performance, model_name, dir_name, should_log,
              compiler_name, cluster)
Code Example #6
File: test_integration_debug.py Project: LLNL/lbann
def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, debug, should_log=False):
    # If weekly or debug are true, then run the test.
    if (not weekly) and (not debug):
        pytest.skip('Not doing weekly or debug testing')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    model_name = 'lenet_mnist'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1,
        partition='pbatch', time_limit=100, dir_name=dir_name,
        data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
        data_reader_name='mnist', model_folder='models/' + model_name,
        model_name=model_name, num_epochs=5, optimizer_name='adagrad',
        output_file_name=output_file_name, error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log)
    assert output_value == 0
Code Example #7
def skeleton_cifar_debug(cluster,
                         dir_name,
                         executables,
                         compiler_name,
                         weekly,
                         debug,
                         should_log=False):
    # If weekly or debug are true, then run the test.
    if (not weekly) and (not debug):
        pytest.skip('Not doing weekly or debug testing')
    if cluster == 'ray':
        pytest.skip('cifar not operational on Ray')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    model_name = 'autoencoder_cifar10'
    output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (
        dir_name, model_name, compiler_name)
    error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (
        dir_name, model_name, compiler_name)
    command = tools.get_command(cluster=cluster,
                                executable=executables[compiler_name],
                                num_nodes=1,
                                partition='pbatch',
                                time_limit=100,
                                dir_name=dir_name,
                                data_reader_name='cifar10',
                                data_reader_percent=0.01,
                                model_folder='models/' + model_name,
                                model_name='conv_' + model_name,
                                num_epochs=5,
                                optimizer_name='adagrad',
                                output_file_name=output_file_name,
                                error_file_name=error_file_name)
    output_value = common_code.run_lbann(command, model_name, output_file_name,
                                         error_file_name, should_log)
    assert output_value == 0
Code Example #8
def skeleton_io_buffers(cluster, dir_name, executables, compiler_name, weekly):
    if not weekly:
        pytest.skip('Not doing weekly testing')
    if cluster == 'surface':
        pytest.skip('skeleton_io_buffers does not run on surface')
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    max_mb = 300
    # Printing the output from all 6*6*2 = 72 LBANN runs would make the logs
    # unwieldy. run_lbann still prints its output if a run fails.
    should_log = False
    partitioned = 'mnist_partitioned_io'
    distributed = 'mnist_distributed_io'
    model_names = [partitioned, distributed]
    accuracies = {}
    errors = []
    all_values = []
    fatal_errors = []
    overall_min_partitioned_accuracy = float('inf')
    overall_min_distributed_accuracy = float('inf')
    for mini_batch_size in [300, 150, 100, 75, 60, 50]:
        num_models = max_mb // mini_batch_size  # Integer division keeps num_ranks an int.
        for procs_per_model in [1, 2, 3, 4, 5, 6]:
            num_ranks = procs_per_model * num_models
            for model_name in model_names:
                output_file_name = '%s/bamboo/integration_tests/output/%s_%d_%d_output.txt' % (dir_name, model_name, mini_batch_size, procs_per_model)
                error_file_name = '%s/bamboo/integration_tests/error/%s_%d_%d_error.txt' % (dir_name, model_name, mini_batch_size, procs_per_model)
                command = tools.get_command(
                    cluster=cluster, executable=executables[compiler_name], num_nodes=2,
                    num_processes=num_ranks, dir_name=dir_name,
                    data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
                    data_reader_name='mnist', mini_batch_size=mini_batch_size,
                    model_folder='tests', model_name=model_name, num_epochs=5,
                    optimizer_name='adagrad',
                    processes_per_model=procs_per_model,
                    output_file_name=output_file_name, error_file_name=error_file_name)
                try:
                    common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) # Don't need return value
                    accuracy_dict = common_code.extract_data(output_file_name, ['test_accuracy'], should_log)
                    accuracies[model_name] = accuracy_dict['test_accuracy']
                except Exception:
                    # We want to keep running to see if any other mini_batch_size & procs_per_model combination crashes.
                    # However, it is now pointless to compare accuracies.
                    fatal_errors.append('Crashed running %s with mini_batch_size=%d, procs_per_model=%d' % (model_name, mini_batch_size, procs_per_model))
            # End model name loop
            if fatal_errors == []:
                partitioned_num_models = len(accuracies[partitioned].keys())
                distributed_num_models = len(accuracies[distributed].keys())
                assert partitioned_num_models == distributed_num_models

                min_partitioned_accuracy = float('inf')
                min_distributed_accuracy = float('inf')
                for model_num in sorted(accuracies[partitioned].keys()):
                    partitioned_accuracy = accuracies[partitioned][model_num]['overall']
                    distributed_accuracy = accuracies[distributed][model_num]['overall']
                    if partitioned_accuracy < min_partitioned_accuracy:
                        min_partitioned_accuracy = partitioned_accuracy
                    if distributed_accuracy < min_distributed_accuracy:
                        min_distributed_accuracy = distributed_accuracy
                    tolerance = 0.05
                    # Are we within tolerance * expected_value?
                    if abs(partitioned_accuracy - distributed_accuracy) > abs(tolerance * min(partitioned_accuracy, distributed_accuracy)):
                        errors.append('partitioned = %f != %f = distributed; model_num=%s mini_batch_size=%d procs_per_model=%d' % (partitioned_accuracy, distributed_accuracy, model_num, mini_batch_size, procs_per_model))
                        all_values.append('partitioned = %f, %f = distributed; model_num=%s mini_batch_size=%d procs_per_model=%d' % (partitioned_accuracy, distributed_accuracy, model_num, mini_batch_size, procs_per_model))
                # End model_num loop
                if min_partitioned_accuracy < overall_min_partitioned_accuracy:
                    overall_min_partitioned_accuracy = min_partitioned_accuracy
                if min_distributed_accuracy < overall_min_distributed_accuracy:
                    overall_min_distributed_accuracy = min_distributed_accuracy
            # End fatal_errors == [] block
        # End procs_per_model loop
    # End mini_batch_size loop
    for fatal_error in fatal_errors:
        print(fatal_error)
    assert fatal_errors == []
    # If there were no fatal errors, archive the accuracies.
    if os.environ['LOGNAME'] == 'lbannusr':
        key = 'bamboo_planKey'
        if key in os.environ:
            plan = os.environ[key]
            if plan in ['LBANN-NIGHTD', 'LBANN-WD']:
                archive_file = '/usr/workspace/wsb/lbannusr/archives/%s/%s/%s/io_buffers.txt' % (plan, cluster, compiler_name)
                with open(archive_file, 'a') as archive:
                    archive.write('%s, %f, %f\n' % (os.environ['bamboo_buildNumber'], overall_min_partitioned_accuracy, overall_min_distributed_accuracy))
            else:
                print('The plan %s does not have archiving activated' % plan)
        else:
            print('%s is not in os.environ' % key)
    else:
        print('os.environ["LOGNAME"]=%s' % os.environ['LOGNAME'])

    print('Errors for: partitioned_and_distributed %s (%d)' % (compiler_name, len(errors)))
    for error in errors:
        print(error)
    if should_log:
        print('All values for: partitioned_and_distributed %s (%d)' % (compiler_name, len(all_values)))
        for value in all_values:
            print(value)
    assert errors == []
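
The accuracy check in the loop above is a relative-tolerance comparison: the two runs agree when their difference is within 5% of the smaller accuracy. A self-contained sketch of the same test (the helper name is hypothetical):

# Hypothetical helper mirroring the check in skeleton_io_buffers.
def accuracies_agree(partitioned, distributed, tolerance=0.05):
    # Agree when |p - d| <= tolerance * min(p, d).
    return abs(partitioned - distributed) <= abs(
        tolerance * min(partitioned, distributed))

assert accuracies_agree(97.0, 96.5)      # 0.5 <= 0.05 * 96.5
assert not accuracies_agree(97.0, 90.0)  # 7.0 >  0.05 * 90.0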