def run_ortmodule_deepspeed_pipeline_parallel_tests(cwd, log):
    """Launch the ORTModule DeepSpeed pipeline-parallel test via the ``deepspeed`` CLI.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.

    ``check_returncode()`` is called on the value returned by ``run_subprocess``
    (presumably a ``subprocess.CompletedProcess`` -- the helper lives elsewhere).
    """
    log.debug("Running: ORTModule deepspeed pipeline parallel tests")

    test_script = "orttraining_test_ortmodule_deepspeed_pipeline_parallel.py"
    config_file = "orttraining_test_ortmodule_deepspeed_pipeline_parallel_config.json"
    launcher_args = ["deepspeed", test_script, "--deepspeed_config", config_file]

    completed = run_subprocess(launcher_args, cwd=cwd, log=log)
    completed.check_returncode()
Example #2
0
def main():
    """Run each distributed ("dhp_parallel") training test script under ``mpirun``.

    Returns:
        0 once every script has been run, or 1 as soon as a script requires
        more processes than ``torch.cuda.device_count()`` reports GPUs.
    """
    gpu_count = torch.cuda.device_count()

    # (test script, number of processes it is launched with) pairs.
    # New test scripts should be added to the "dhp_parallel" folder.
    parallel_tests = [
        (os.path.join("dhp_parallel", "orttraining_test_parallel_train_simple_model.py"), 4),
        (os.path.join("dhp_parallel", "orttraining_test_parallel_train_simple_model_fp16.py"), 4),
    ]

    log.info("Running parallel training tests.")
    for script_path, required_processes in parallel_tests:
        # One GPU per process is assumed by this check; bail out on the first shortfall.
        if required_processes > gpu_count:
            log.error("Machine Configuration Error. More GPUs are needed to run " + script_path)
            return 1
        log.debug("RUN: " + script_path)

        mpirun_command = ["mpirun", "-n", str(required_processes), sys.executable, script_path]

        # The current working directory is set in
        # onnxruntime/orttraining/orttraining/test/python/orttraining_distributed_tests.py
        completed = run_subprocess(mpirun_command, cwd=os.getcwd())
        completed.check_returncode()

    return 0
def run_ortmodule_fairscale_sharded_optimizer_tests(cwd, log, data_dir):
    """Run the ORTModule fairscale sharded-optimizer test script.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        data_dir: Optional dataset location; when truthy it is passed to the
            script as ``--data-dir <data_dir>``.

    ``check_returncode()`` is called on the value returned by ``run_subprocess``
    (presumably a ``subprocess.CompletedProcess`` -- the helper lives elsewhere).
    """
    # Function-scope import so this runner does not rely on a module-level ``import sys``.
    import sys

    log.debug('Running: ORTModule fairscale sharded optimizer tests')

    # Launch with the interpreter that is running this driver, as the sibling
    # runners do, instead of whatever "python3" happens to be first on PATH
    # (which may be a different environment, or missing entirely on Windows).
    command = [sys.executable, 'orttraining_test_ortmodule_fairscale_sharded_optimizer.py',
               '--use_sharded_optimizer', '--use_ortmodule']
    if data_dir:
        command.extend(['--data-dir', data_dir])

    run_subprocess(command, cwd=cwd, log=log).check_returncode()
def run_ortmodule_fallback_tests(cwd, log, transformers_cache):
    """Run the ORTModule fallback pytest file with a transformers-cache environment.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        transformers_cache: Forwarded to ``get_env_with_transformers_cache`` to
            build the environment of the child process.
    """
    log.debug("Running: ORTModule-API tests")

    pytest_invocation = [sys.executable, "-m", "pytest", "-sv"]
    pytest_invocation.append("orttraining_test_ortmodule_fallback.py")

    child_env = get_env_with_transformers_cache(transformers_cache)
    outcome = run_subprocess(pytest_invocation, cwd=cwd, log=log, env=child_env)
    outcome.check_returncode()
def run_data_sampler_tests(cwd, log):
    """Run ``orttraining_test_sampler.py`` under pytest with the current interpreter.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug("Running: Data sampler tests")

    test_file = "orttraining_test_sampler.py"
    pytest_command = [sys.executable, "-m", "pytest", "-sv", test_file]

    completed = run_subprocess(pytest_command, cwd=cwd, log=log)
    completed.check_returncode()
def run_data_sampler_tests(cwd, log):
    """Execute the data sampler pytest module in a child process.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: Data sampler tests')

    # "<python> -m pytest -sv" prefix followed by the test module to collect.
    pytest_prefix = [sys.executable, '-m', 'pytest', '-sv']
    sampler_cmd = pytest_prefix + ['orttraining_test_sampler.py']

    result = run_subprocess(sampler_cmd, cwd=cwd, log=log)
    result.check_returncode()
Example #7
0
def run_onnxruntime_test_all_ctest(cwd, log, filter):
    """Run the ``onnxruntime_test_all`` gtest binary restricted to ``filter``.

    Args:
        cwd: Directory that contains the executable; also used as the working
            directory of the child process.
        log: Logger forwarded to ``run_subprocess``.
        filter: Value placed after ``--gtest_filter=`` on the command line.
    """
    executable_path = os.path.join(cwd, "onnxruntime_test_all")
    gtest_filter_arg = "--gtest_filter={}".format(filter)

    completed = run_subprocess([executable_path, gtest_filter_arg], cwd=cwd, log=log)
    completed.check_returncode()
def run_ortmodule_experimental_json_config_tests(cwd, log):
    """Run the ORTModule experimental JSON-config pytest module.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: ORTModule Experimental Load Config tests')

    test_module = 'orttraining_test_ortmodule_experimental_json_config.py'
    pytest_cmd = [sys.executable, '-m', 'pytest', '-sv', test_module]

    completed = run_subprocess(pytest_cmd, cwd=cwd, log=log)
    completed.check_returncode()
def run_ortmodule_custom_autograd_tests(cwd, log):
    """Run the ORTModule custom autograd function pytest module.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: ORTModule-Custom AutoGrad Functions tests')

    autograd_cmd = [sys.executable]
    autograd_cmd += ['-m', 'pytest', '-sv']
    autograd_cmd.append('orttraining_test_ortmodule_autograd.py')

    outcome = run_subprocess(autograd_cmd, cwd=cwd, log=log)
    outcome.check_returncode()
def run_ortmodule_hierarchical_ortmodule_tests(cwd, log):
    """Run the hierarchical ORTModule pytest module with the current interpreter.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug("Running: ORTModule-Hierarchical model tests")

    test_file = "orttraining_test_hierarchical_ortmodule.py"
    pytest_command = [sys.executable, "-m", "pytest", "-sv", test_file]

    completed = run_subprocess(pytest_command, cwd=cwd, log=log)
    completed.check_returncode()
def run_experimental_gradient_graph_tests(cwd, log):
    """Run the experimental gradient-graph export pytest module.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: Experimental Gradient Graph Export Tests')

    pytest_prefix = [sys.executable, "-m", "pytest", "-sv"]
    gradient_graph_cmd = pytest_prefix + ["orttraining_test_experimental_gradient_graph.py"]

    result = run_subprocess(gradient_graph_cmd, cwd=cwd, log=log)
    result.check_returncode()
def run_ortmodule_experimental_json_config_tests(cwd, log):
    """Execute the experimental JSON-config loading tests for ORTModule via pytest.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug("Running: ORTModule Experimental Load Config tests")

    json_config_cmd = [sys.executable]
    json_config_cmd.extend(["-m", "pytest", "-sv"])
    json_config_cmd.append("orttraining_test_ortmodule_experimental_json_config.py")

    outcome = run_subprocess(json_config_cmd, cwd=cwd, log=log)
    outcome.check_returncode()
def run_ortmodule_hierarchical_ortmodule_tests(cwd, log):
    """Execute the hierarchical ORTModule tests through ``python -m pytest``.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: ORTModule-Hierarchical model tests')

    # "<python> -m pytest -sv" prefix followed by the test module to collect.
    pytest_prefix = [sys.executable, '-m', 'pytest', '-sv']
    hierarchical_cmd = pytest_prefix + ['orttraining_test_hierarchical_ortmodule.py']

    result = run_subprocess(hierarchical_cmd, cwd=cwd, log=log)
    result.check_returncode()
def run_pytorch_ddp_tests(cwd, log):
    """Run the ORTModule PyTorch DDP test script with ``--use_ort_module``.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug("Running: ORTModule Pytorch DDP tests")

    ddp_script = "orttraining_test_ortmodule_pytorch_ddp.py"
    ddp_command = [sys.executable, ddp_script, "--use_ort_module"]

    completed = run_subprocess(ddp_command, cwd=cwd, log=log)
    completed.check_returncode()
def run_ortmodule_custom_autograd_tests(cwd, log):
    """Execute the custom autograd function tests for ORTModule via pytest.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug("Running: ORTModule-Custom AutoGrad Functions tests")

    test_file = "orttraining_test_ortmodule_autograd.py"
    pytest_command = [sys.executable, "-m", "pytest", "-sv", test_file]

    completed = run_subprocess(pytest_command, cwd=cwd, log=log)
    completed.check_returncode()
def run_ortmodule_deepspeed_zero_stage_1_tests(cwd, log, data_dir):
    """Launch the ORTModule DeepSpeed ZeRO stage 1 test via the ``deepspeed`` CLI.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        data_dir: Optional dataset location; when truthy it is appended as
            ``--data-dir <data_dir>``.
    """
    log.debug("Running: ORTModule deepspeed zero stage 1 tests")

    test_script = "orttraining_test_ortmodule_deepspeed_zero_stage_1.py"
    config_file = "orttraining_test_ortmodule_deepspeed_zero_stage_1_config.json"
    launcher_args = ["deepspeed", test_script, "--deepspeed_config", config_file]

    if data_dir:
        launcher_args += ["--data-dir", data_dir]

    completed = run_subprocess(launcher_args, cwd=cwd, log=log)
    completed.check_returncode()
def run_ortmodule_torch_lightning(cwd, log, data_dir):
    """Run the basic ORTModule PyTorch Lightning sample script.

    The sample is started with fixed ``--train-steps``, ``--epochs`` and
    ``--batch-size`` values.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        data_dir: Optional dataset location; when truthy it is appended as
            ``--data-dir <data_dir>``.
    """
    log.debug("Running: ORTModule PyTorch Lightning sample .")

    sample_script = "orttraining_test_ortmodule_torch_lightning_basic.py"
    fixed_options = ["--train-steps=470", "--epochs=2", "--batch-size=256"]
    lightning_cmd = [sys.executable, sample_script] + fixed_options

    if data_dir:
        lightning_cmd += ["--data-dir", data_dir]

    completed = run_subprocess(lightning_cmd, cwd=cwd, log=log)
    completed.check_returncode()
Example #18
0
def run_onnxblock_tests(cwd, log):
    """Run the onnxblock (offline tooling for on-device training) pytest module.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: onnxblock tests')

    # "<python> -m pytest -sv" prefix followed by the test module to collect.
    pytest_prefix = [sys.executable, '-m', 'pytest', '-sv']
    onnxblock_cmd = pytest_prefix + ['orttraining_test_onnxblock.py']

    result = run_subprocess(onnxblock_cmd, cwd=cwd, log=log)
    result.check_returncode()
Example #19
0
def run_ortmodule_deepspeed_pipeline_parallel_tests(cwd, log):
    """Start the DeepSpeed pipeline-parallel ORTModule test through ``deepspeed``.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: ORTModule deepspeed pipeline parallel tests')

    # The test script and its DeepSpeed config share the same file-name stem.
    stem = 'orttraining_test_ortmodule_deepspeed_pipeline_parallel'
    launch_cmd = ['deepspeed', stem + '.py', '--deepspeed_config', stem + '_config.json']

    result = run_subprocess(launch_cmd, cwd=cwd, log=log)
    result.check_returncode()
def run_ortmodule_fallback_tests(cwd, log, transformers_cache):
    """Execute the ORTModule fallback tests via pytest in a prepared environment.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        transformers_cache: Passed to ``get_env_with_transformers_cache``; the
            value it returns becomes the child process environment.
    """
    log.debug('Running: ORTModule-API tests')

    fallback_env = get_env_with_transformers_cache(transformers_cache)

    test_file = 'orttraining_test_ortmodule_fallback.py'
    pytest_command = [sys.executable, '-m', 'pytest', '-sv', test_file]

    completed = run_subprocess(pytest_command, cwd=cwd, log=log, env=fallback_env)
    completed.check_returncode()
def run_ortmodule_poc_net(cwd, log, no_cuda, data_dir):
    """Run the ORTModule POCNet MNIST sample script.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        no_cuda: When truthy, ``--no-cuda --epochs 3`` is added to the command.
        data_dir: Optional dataset location; when truthy it is appended as
            ``--data-dir <data_dir>``.
    """
    banner = "Running: ORTModule POCNet for MNIST with --no-cuda arg {}.".format(no_cuda)
    log.debug(banner)

    poc_cmd = [sys.executable, "orttraining_test_ortmodule_poc.py"]
    if no_cuda:
        poc_cmd += ["--no-cuda", "--epochs", "3"]
    if data_dir:
        poc_cmd += ["--data-dir", data_dir]

    completed = run_subprocess(poc_cmd, cwd=cwd, log=log)
    completed.check_returncode()
Example #22
0
def run_ortmodule_fairscale_sharded_optimizer_tests(cwd, log, data_dir):
    """Run the ORTModule fairscale sharded-optimizer test script.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        data_dir: Optional dataset location; when truthy it is passed to the
            script as ``--data-dir <data_dir>``.

    ``check_returncode()`` is called on the value returned by ``run_subprocess``
    (presumably a ``subprocess.CompletedProcess`` -- the helper lives elsewhere).
    """
    # Function-scope import so this runner does not rely on a module-level ``import sys``.
    import sys

    log.debug("Running: ORTModule fairscale sharded optimizer tests")

    # Launch with the interpreter that is running this driver, as the sibling
    # runners do, instead of whatever "python3" happens to be first on PATH
    # (which may be a different environment, or missing entirely on Windows).
    command = [
        sys.executable,
        "orttraining_test_ortmodule_fairscale_sharded_optimizer.py",
        "--use_sharded_optimizer",
        "--use_ortmodule",
    ]
    if data_dir:
        command.extend(["--data-dir", data_dir])

    run_subprocess(command, cwd=cwd, log=log).check_returncode()
def run_ortmodule_hf_bert_for_sequence_classification_from_pretrained(cwd, log, no_cuda, data_dir, transformers_cache):
    """Run the ORTModule HuggingFace BERT sequence-classification sample script.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        no_cuda: When truthy, ``--no-cuda --epochs 3`` is added to the command.
        data_dir: Optional dataset location; when truthy it is appended as
            ``--data-dir <data_dir>``.
        transformers_cache: Passed to ``get_env_with_transformers_cache``; the
            value it returns becomes the child process environment.
    """
    banner = "Running: ORTModule HuggingFace BERT for sequence classification with --no-cuda arg {}.".format(no_cuda)
    log.debug(banner)

    child_env = get_env_with_transformers_cache(transformers_cache)

    bert_cmd = [sys.executable, "orttraining_test_ortmodule_bert_classifier.py"]
    if no_cuda:
        bert_cmd += ["--no-cuda", "--epochs", "3"]
    if data_dir:
        bert_cmd += ["--data-dir", data_dir]

    completed = run_subprocess(bert_cmd, cwd=cwd, log=log, env=child_env)
    completed.check_returncode()
def main():
    """Run the BERT pipeline-parallel end-to-end tests under ``mpirun``.

    Two configurations of ``./onnxruntime_training_bert`` are run one after the
    other: 4-way pipeline parallel, then 2-way data parallel combined with 2-way
    pipeline parallel. The tests are skipped unless exactly 4 GPUs are visible.

    Returns:
        0 when the tests were skipped or both runs completed.

    ``check_returncode()`` is called on each ``run_subprocess`` result so that a
    failing run is not silently ignored (presumably raising
    ``subprocess.CalledProcessError`` -- ``run_subprocess`` is defined elsewhere).
    """
    import torch
    ngpus = torch.cuda.device_count()

    # TODO: currently the CI machine only has 4 GPUs for parallel tests.
    # Fill in more pipeline partition options when the machine has different GPUs counts.
    if ngpus != 4:
        return 0

    log.info("Running pipeline e2e tests.")

    args = parse_arguments()
    cwd = args.cwd

    # Arguments shared by every parallel configuration.
    command = [
        './onnxruntime_training_bert', '--ort_log_severity', '1',
        '--optimizer=Lamb', '--learning_rate=3e-3', '--max_seq_length=128',
        '--max_predictions_per_seq=20', '--warmup_ratio=0.2843',
        '--warmup_mode=Poly', '--model_name',
        '/bert_ort/bert_models/nv/bert-large/' +
        'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12',
        '--train_data_dir', '/bert_data/128/books_wiki_en_corpus/train',
        '--test_data_dir', '/bert_data/128/books_wiki_en_corpus/test',
        '--display_loss_steps', '1', '--use_nccl', '--use_mixed_precision',
        '--allreduce_in_fp16', '--gradient_accumulation_steps', '48',
        '--num_train_steps', '96', '--train_batch_size', '50'
    ]
    mpirun_prefix = ['mpirun', '-n', str(ngpus)]

    # Test 4-way pipeline parallel
    pp_command = mpirun_prefix + command + [
        '--pipeline_parallel_size', '4', '--cut_group_info',
        '1149:407-1219/1341/1463/1585/1707/1829,' +
        '1881:407-1951/2073/2195/2317/2439/2561,' +
        '2613:407-2683/2805/2927/3049/3171/3293'
    ]

    # Test 2-way data parallel + 2-way pipeline parallel
    pp_dp_command = mpirun_prefix + command + [
        '--data_parallel_size', '2', '--pipeline_parallel_size', '2',
        '--cut_group_info',
        '1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293'
    ]

    for parallel_command in (pp_command, pp_dp_command):
        log.debug('RUN: ' + ', '.join(parallel_command))
        # Check the exit status like the other test runners do; previously the
        # result was discarded and this function returned 0 regardless.
        run_subprocess(parallel_command, cwd=cwd, log=log).check_returncode()

    return 0
Example #25
0
def run_ortmodule_deepspeed_zero_stage_1_tests(cwd, log, data_dir):
    """Start the DeepSpeed ZeRO stage 1 ORTModule test through ``deepspeed``.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
        data_dir: Optional dataset location; when truthy it is appended as
            ``--data-dir <data_dir>``.
    """
    log.debug('Running: ORTModule deepspeed zero stage 1 tests')

    # The test script and its DeepSpeed config share the same file-name stem.
    stem = 'orttraining_test_ortmodule_deepspeed_zero_stage_1'
    launch_cmd = ['deepspeed', stem + '.py', '--deepspeed_config', stem + '_config.json']

    if data_dir:
        launch_cmd += ['--data-dir', data_dir]

    result = run_subprocess(launch_cmd, cwd=cwd, log=log)
    result.check_returncode()
def run_pipeline_parallel_tests(cwd, log):
    """Run ``orttraining_test_dhp_parallel_tests.py`` with the current interpreter.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: pipeline parallel tests')

    driver_script = 'orttraining_test_dhp_parallel_tests.py'
    completed = run_subprocess([sys.executable, driver_script], cwd=cwd, log=log)
    completed.check_returncode()
def run_distributed_allreduce_tests(cwd, log):
    """Run ``orttraining_test_allreduce.py`` with the current interpreter.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: distributed allreduce tests')

    allreduce_script = 'orttraining_test_allreduce.py'
    completed = run_subprocess([sys.executable, allreduce_script], cwd=cwd, log=log)
    completed.check_returncode()
def run_checkpoint_tests(cwd, log):
    """Run ``orttraining_test_checkpoint.py`` with the current interpreter.

    Args:
        cwd: Working directory handed to ``run_subprocess``.
        log: Logger used for the debug banner and forwarded to ``run_subprocess``.
    """
    log.debug('Running: Checkpoint tests')

    checkpoint_script = 'orttraining_test_checkpoint.py'
    completed = run_subprocess([sys.executable, checkpoint_script], cwd=cwd, log=log)
    completed.check_returncode()
Example #29
0
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cmd_line_with_args",
        required=True,
        help="command line with arguments to be executed in a subprocess. \
        it expects a single string containing arguments separated by spaces.",
    )
    parser.add_argument("--cwd", help="working directory")
    # parser.add_argument("--env", help="env variables.")
    parser.add_argument("--env",
                        help="env variables",
                        nargs=2,
                        action="append",
                        default=[])

    return parser.parse_args()


# Read the launcher's own command-line options.
launch_args = parse_arguments()

print("sys.executable: ", sys.executable)

# Split the requested command on whitespace and replace every bare "python"
# token with the interpreter that is running this launcher.
cmd_line_with_args = [
    sys.executable if token == "python" else token
    for token in launch_args.cmd_line_with_args.split()
]

# --env is collected as a list of two-item entries; dict() turns it into a mapping.
run_subprocess(
    cmd_line_with_args,
    cwd=launch_args.cwd,
    env=dict(launch_args.env),
    log=log,
)