def run_ortmodule_deepspeed_pipeline_parallel_tests(cwd, log):
    """Launch the ORTModule DeepSpeed pipeline-parallel test script.

    The script is started through the ``deepspeed`` launcher with ``cwd`` as
    the working directory, and ``check_returncode()`` is called on whatever
    ``run_subprocess`` returns.
    """
    log.debug('Running: ORTModule deepspeed pipeline parallel tests')

    script = 'orttraining_test_ortmodule_deepspeed_pipeline_parallel.py'
    config = 'orttraining_test_ortmodule_deepspeed_pipeline_parallel_config.json'
    launch_args = ['deepspeed', script, '--deepspeed_config', config]

    result = run_subprocess(launch_args, cwd=cwd, log=log)
    result.check_returncode()
def main():
    """Run each distributed training test script under ``mpirun``.

    Returns 1 (after logging an error) as soon as a test needs more
    processes than there are CUDA devices; returns 0 once every test has
    been launched and its return code checked.
    """
    available_gpus = torch.cuda.device_count()

    # (script, process count) pairs for the parallel tests.
    # New test scripts should be added to the "dhp_parallel" folder.
    parallel_tests = (
        (os.path.join("dhp_parallel", "orttraining_test_parallel_train_simple_model.py"), 4),
        (os.path.join("dhp_parallel", "orttraining_test_parallel_train_simple_model_fp16.py"), 4),
    )

    log.info("Running parallel training tests.")

    for script, nprocs in parallel_tests:
        if nprocs > available_gpus:
            log.error("Machine Configuration Error. More GPUs are needed to run " + script)
            return 1

        log.debug("RUN: " + script)

        # The current working directory is set in
        # onnxruntime/orttraining/orttraining/test/python/orttraining_distributed_tests.py
        mpi_command = ["mpirun", "-n", str(nprocs), sys.executable, script]
        completed = run_subprocess(mpi_command, cwd=os.getcwd())
        completed.check_returncode()

    return 0
def run_ortmodule_fairscale_sharded_optimizer_tests(cwd, log, data_dir):
    """Run the ORTModule fairscale sharded-optimizer test script.

    Args:
        cwd: Working directory the script is launched from.
        log: Logger used for the progress message and passed to
            ``run_subprocess``.
        data_dir: Optional dataset location; when truthy it is forwarded to
            the script as ``--data-dir``.

    ``check_returncode()`` is called on the value returned by
    ``run_subprocess``.
    """
    # Imported locally so this function does not depend on a file-level import.
    import sys

    log.debug('Running: ORTModule fairscale sharded optimizer tests')

    # Use the interpreter that is running this test driver (as the sibling
    # runners do) instead of whatever 'python3' resolves to on PATH, so the
    # test sees the same environment and installed packages.
    command = [sys.executable, 'orttraining_test_ortmodule_fairscale_sharded_optimizer.py',
               '--use_sharded_optimizer', '--use_ortmodule']

    if data_dir:
        command.extend(['--data-dir', data_dir])

    run_subprocess(command, cwd=cwd, log=log).check_returncode()
def run_ortmodule_fallback_tests(cwd, log, transformers_cache):
    """Run ``orttraining_test_ortmodule_fallback.py`` under pytest.

    Args:
        cwd: Working directory pytest is launched from.
        log: Logger used for the progress message and passed to
            ``run_subprocess``.
        transformers_cache: Passed to ``get_env_with_transformers_cache`` to
            build the environment for the subprocess.

    ``check_returncode()`` is called on the value returned by
    ``run_subprocess``.
    """
    # The message previously read "ORTModule-API tests", which mislabels this
    # suite in the logs; name the fallback tests that are actually run.
    log.debug('Running: ORTModule fallback tests')

    env = get_env_with_transformers_cache(transformers_cache)

    command = [sys.executable, '-m', 'pytest', '-sv', 'orttraining_test_ortmodule_fallback.py']

    run_subprocess(command, cwd=cwd, log=log, env=env).check_returncode()
def run_data_sampler_tests(cwd, log):
    """Run ``orttraining_test_sampler.py`` under pytest from ``cwd``.

    ``check_returncode()`` is called on the value returned by
    ``run_subprocess``.
    """
    log.debug('Running: Data sampler tests')

    pytest_cmd = [sys.executable, '-m', 'pytest', '-sv']
    pytest_cmd.append('orttraining_test_sampler.py')

    outcome = run_subprocess(pytest_cmd, cwd=cwd, log=log)
    outcome.check_returncode()
def run_data_sampler_tests(cwd, log):
    """Execute the data sampler test module with pytest.

    The current interpreter is used to start ``pytest -sv`` in ``cwd``; the
    object returned by ``run_subprocess`` then has ``check_returncode()``
    called on it.
    """
    log.debug("Running: Data sampler tests")

    test_script = "orttraining_test_sampler.py"
    argv = [sys.executable, "-m", "pytest", "-sv", test_script]

    proc = run_subprocess(argv, cwd=cwd, log=log)
    proc.check_returncode()
def run_onnxruntime_test_all_ctest(cwd, log, filter):
    """Invoke the ``onnxruntime_test_all`` gtest executable located in ``cwd``.

    ``filter`` is forwarded as the value of ``--gtest_filter``.
    ``check_returncode()`` is called on the value returned by
    ``run_subprocess``.
    """
    binary = os.path.join(cwd, "onnxruntime_test_all")
    gtest_args = [binary, f"--gtest_filter={filter}"]

    finished = run_subprocess(gtest_args, cwd=cwd, log=log)
    finished.check_returncode()
def run_ortmodule_experimental_json_config_tests(cwd, log):
    """Run the ORTModule experimental JSON-config test module with pytest.

    pytest is started with the current interpreter from ``cwd``, and
    ``check_returncode()`` is called on the result of ``run_subprocess``.
    """
    log.debug("Running: ORTModule Experimental Load Config tests")

    test_script = "orttraining_test_ortmodule_experimental_json_config.py"
    pytest_argv = [sys.executable, "-m", "pytest", "-sv", test_script]

    completed = run_subprocess(pytest_argv, cwd=cwd, log=log)
    completed.check_returncode()
def run_ortmodule_custom_autograd_tests(cwd, log):
    """Run ``orttraining_test_ortmodule_autograd.py`` under pytest.

    The subprocess is started from ``cwd``; ``check_returncode()`` is called
    on the value returned by ``run_subprocess``.
    """
    log.debug("Running: ORTModule-Custom AutoGrad Functions tests")

    pytest_prefix = [sys.executable, "-m", "pytest", "-sv"]
    argv = pytest_prefix + ["orttraining_test_ortmodule_autograd.py"]

    status = run_subprocess(argv, cwd=cwd, log=log)
    status.check_returncode()
def run_ortmodule_hierarchical_ortmodule_tests(cwd, log):
    """Run the hierarchical ORTModule test module with pytest from ``cwd``.

    ``check_returncode()`` is called on the value returned by
    ``run_subprocess``.
    """
    log.debug('Running: ORTModule-Hierarchical model tests')

    target = 'orttraining_test_hierarchical_ortmodule.py'
    pytest_cmd = [sys.executable, '-m', 'pytest', '-sv', target]

    outcome = run_subprocess(pytest_cmd, cwd=cwd, log=log)
    outcome.check_returncode()
def run_experimental_gradient_graph_tests(cwd, log):
    """Run ``orttraining_test_experimental_gradient_graph.py`` under pytest.

    pytest is started with the current interpreter in ``cwd``;
    ``check_returncode()`` is called on the result of ``run_subprocess``.
    """
    log.debug("Running: Experimental Gradient Graph Export Tests")

    argv = [sys.executable, '-m', 'pytest', '-sv']
    argv.append('orttraining_test_experimental_gradient_graph.py')

    proc = run_subprocess(argv, cwd=cwd, log=log)
    proc.check_returncode()
def run_ortmodule_experimental_json_config_tests(cwd, log):
    """Execute the ORTModule experimental load-config tests via pytest.

    The test module is run from ``cwd`` and ``check_returncode()`` is called
    on whatever ``run_subprocess`` returns.
    """
    log.debug('Running: ORTModule Experimental Load Config tests')

    pytest_cmd = [sys.executable, '-m', 'pytest', '-sv']
    pytest_cmd += ['orttraining_test_ortmodule_experimental_json_config.py']

    result = run_subprocess(pytest_cmd, cwd=cwd, log=log)
    result.check_returncode()
def run_ortmodule_hierarchical_ortmodule_tests(cwd, log):
    """Execute ``orttraining_test_hierarchical_ortmodule.py`` with pytest.

    The subprocess runs in ``cwd``; ``check_returncode()`` is called on the
    value returned by ``run_subprocess``.
    """
    log.debug("Running: ORTModule-Hierarchical model tests")

    pytest_prefix = [sys.executable, "-m", "pytest", "-sv"]
    argv = pytest_prefix + ["orttraining_test_hierarchical_ortmodule.py"]

    finished = run_subprocess(argv, cwd=cwd, log=log)
    finished.check_returncode()
def run_pytorch_ddp_tests(cwd, log):
    """Run the ORTModule PyTorch DDP test script with ``--use_ort_module``.

    The script is started with the current interpreter from ``cwd``;
    ``check_returncode()`` is called on the result of ``run_subprocess``.
    """
    log.debug('Running: ORTModule Pytorch DDP tests')

    script = 'orttraining_test_ortmodule_pytorch_ddp.py'
    argv = [sys.executable, script, '--use_ort_module']

    completed = run_subprocess(argv, cwd=cwd, log=log)
    completed.check_returncode()
def run_ortmodule_custom_autograd_tests(cwd, log):
    """Execute the ORTModule custom autograd function tests via pytest.

    pytest runs in ``cwd`` under the current interpreter, and
    ``check_returncode()`` is called on whatever ``run_subprocess`` returns.
    """
    log.debug('Running: ORTModule-Custom AutoGrad Functions tests')

    test_module = 'orttraining_test_ortmodule_autograd.py'
    pytest_cmd = [sys.executable, '-m', 'pytest', '-sv', test_module]

    outcome = run_subprocess(pytest_cmd, cwd=cwd, log=log)
    outcome.check_returncode()
def run_ortmodule_deepspeed_zero_stage_1_tests(cwd, log, data_dir):
    """Launch the ORTModule DeepSpeed ZeRO stage 1 test script.

    The script is started through the ``deepspeed`` launcher from ``cwd``.
    A truthy ``data_dir`` is forwarded as ``--data-dir``.
    ``check_returncode()`` is called on the result of ``run_subprocess``.
    """
    log.debug('Running: ORTModule deepspeed zero stage 1 tests')

    script = 'orttraining_test_ortmodule_deepspeed_zero_stage_1.py'
    config = 'orttraining_test_ortmodule_deepspeed_zero_stage_1_config.json'
    launch_args = ['deepspeed', script, '--deepspeed_config', config]

    if data_dir:
        launch_args += ['--data-dir', data_dir]

    result = run_subprocess(launch_args, cwd=cwd, log=log)
    result.check_returncode()
def run_ortmodule_torch_lightning(cwd, log, data_dir):
    """Run the basic ORTModule PyTorch Lightning sample script.

    The script is given fixed ``--train-steps``, ``--epochs`` and
    ``--batch-size`` values; a truthy ``data_dir`` is forwarded as
    ``--data-dir``. ``check_returncode()`` is called on the result of
    ``run_subprocess``.
    """
    log.debug('Running: ORTModule PyTorch Lightning sample .')

    fixed_options = ['--train-steps=470', '--epochs=2', '--batch-size=256']
    argv = [sys.executable, 'orttraining_test_ortmodule_torch_lightning_basic.py'] + fixed_options

    if data_dir:
        argv += ['--data-dir', data_dir]

    proc = run_subprocess(argv, cwd=cwd, log=log)
    proc.check_returncode()
def run_onnxblock_tests(cwd, log):
    """Run the onnxblock (offline tooling for on-device training) tests.

    ``orttraining_test_onnxblock.py`` is executed with pytest from ``cwd``;
    ``check_returncode()`` is called on the result of ``run_subprocess``.
    """
    log.debug("Running: onnxblock tests")

    pytest_argv = [sys.executable, "-m", "pytest", "-sv"]
    pytest_argv.append("orttraining_test_onnxblock.py")

    status = run_subprocess(pytest_argv, cwd=cwd, log=log)
    status.check_returncode()
def run_ortmodule_deepspeed_pipeline_parallel_tests(cwd, log):
    """Start the DeepSpeed pipeline-parallel ORTModule test via ``deepspeed``.

    The launcher is run from ``cwd`` with the matching JSON config, and
    ``check_returncode()`` is called on whatever ``run_subprocess`` returns.
    """
    log.debug("Running: ORTModule deepspeed pipeline parallel tests")

    test_script = "orttraining_test_ortmodule_deepspeed_pipeline_parallel.py"
    config_file = "orttraining_test_ortmodule_deepspeed_pipeline_parallel_config.json"
    argv = ["deepspeed", test_script, "--deepspeed_config", config_file]

    completed = run_subprocess(argv, cwd=cwd, log=log)
    completed.check_returncode()
def run_ortmodule_fallback_tests(cwd, log, transformers_cache):
    """Run ``orttraining_test_ortmodule_fallback.py`` under pytest.

    Args:
        cwd: Working directory pytest is launched from.
        log: Logger used for the progress message and passed to
            ``run_subprocess``.
        transformers_cache: Passed to ``get_env_with_transformers_cache`` to
            build the environment for the subprocess.

    ``check_returncode()`` is called on the value returned by
    ``run_subprocess``.
    """
    # The message previously read "ORTModule-API tests", which mislabels this
    # suite in the logs; name the fallback tests that are actually run.
    log.debug("Running: ORTModule fallback tests")

    env = get_env_with_transformers_cache(transformers_cache)

    command = [
        sys.executable,
        "-m",
        "pytest",
        "-sv",
        "orttraining_test_ortmodule_fallback.py",
    ]

    run_subprocess(command, cwd=cwd, log=log, env=env).check_returncode()
def run_ortmodule_poc_net(cwd, log, no_cuda, data_dir):
    """Run the ORTModule POCNet MNIST sample script.

    When ``no_cuda`` is truthy the script gets ``--no-cuda --epochs 3``;
    a truthy ``data_dir`` is forwarded as ``--data-dir``.
    ``check_returncode()`` is called on the result of ``run_subprocess``.
    """
    message = 'Running: ORTModule POCNet for MNIST with --no-cuda arg {}.'.format(no_cuda)
    log.debug(message)

    argv = [sys.executable, 'orttraining_test_ortmodule_poc.py']

    if no_cuda:
        argv += ['--no-cuda', '--epochs', str(3)]

    if data_dir:
        argv += ['--data-dir', data_dir]

    finished = run_subprocess(argv, cwd=cwd, log=log)
    finished.check_returncode()
def run_ortmodule_fairscale_sharded_optimizer_tests(cwd, log, data_dir):
    """Run the ORTModule fairscale sharded-optimizer test script.

    Args:
        cwd: Working directory the script is launched from.
        log: Logger used for the progress message and passed to
            ``run_subprocess``.
        data_dir: Optional dataset location; when truthy it is forwarded to
            the script as ``--data-dir``.

    ``check_returncode()`` is called on the value returned by
    ``run_subprocess``.
    """
    # Imported locally so this function does not depend on a file-level import.
    import sys

    log.debug("Running: ORTModule fairscale sharded optimizer tests")

    # Use the interpreter that is running this test driver (as the sibling
    # runners do) instead of whatever "python3" resolves to on PATH, so the
    # test sees the same environment and installed packages.
    command = [
        sys.executable,
        "orttraining_test_ortmodule_fairscale_sharded_optimizer.py",
        "--use_sharded_optimizer",
        "--use_ortmodule",
    ]

    if data_dir:
        command.extend(["--data-dir", data_dir])

    run_subprocess(command, cwd=cwd, log=log).check_returncode()
def run_ortmodule_hf_bert_for_sequence_classification_from_pretrained(cwd, log, no_cuda, data_dir, transformers_cache):
    """Run the ORTModule HuggingFace BERT sequence-classification sample.

    The subprocess environment comes from
    ``get_env_with_transformers_cache(transformers_cache)``. When ``no_cuda``
    is truthy the script gets ``--no-cuda --epochs 3``; a truthy ``data_dir``
    is forwarded as ``--data-dir``. ``check_returncode()`` is called on the
    result of ``run_subprocess``.
    """
    message = 'Running: ORTModule HuggingFace BERT for sequence classification with --no-cuda arg {}.'.format(no_cuda)
    log.debug(message)

    subprocess_env = get_env_with_transformers_cache(transformers_cache)

    argv = [sys.executable, 'orttraining_test_ortmodule_bert_classifier.py']

    if no_cuda:
        argv += ['--no-cuda', '--epochs', str(3)]

    if data_dir:
        argv += ['--data-dir', data_dir]

    outcome = run_subprocess(argv, cwd=cwd, log=log, env=subprocess_env)
    outcome.check_returncode()
def main():
    """Run the BERT pipeline-parallel end-to-end training tests under mpirun.

    Two configurations of ``./onnxruntime_training_bert`` are launched with
    one process per GPU: 4-way pipeline parallel, then 2-way data parallel
    combined with 2-way pipeline parallel. The tests are skipped unless
    exactly 4 CUDA devices are visible.

    Returns:
        0, both when the tests are skipped and when both runs complete.
        ``check_returncode()`` is called on each ``run_subprocess`` result, so
        a failing run is expected to raise instead of being silently ignored
        (assuming the result behaves like ``subprocess.CompletedProcess``).
    """
    import torch

    ngpus = torch.cuda.device_count()

    # TODO: currently the CI machine only has 4 GPUs for parallel tests.
    # Fill in more pipeline partition options when the machine has different GPUs counts.
    if ngpus != 4:
        return 0

    log.info("Running pipeline e2e tests.")
    args = parse_arguments()
    cwd = args.cwd

    # Options shared by both parallel configurations.
    command = [
        './onnxruntime_training_bert',
        '--ort_log_severity', '1',
        '--optimizer=Lamb',
        '--learning_rate=3e-3',
        '--max_seq_length=128',
        '--max_predictions_per_seq=20',
        '--warmup_ratio=0.2843',
        '--warmup_mode=Poly',
        '--model_name', '/bert_ort/bert_models/nv/bert-large/' +
        'bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12',
        '--train_data_dir', '/bert_data/128/books_wiki_en_corpus/train',
        '--test_data_dir', '/bert_data/128/books_wiki_en_corpus/test',
        '--display_loss_steps', '1',
        '--use_nccl',
        '--use_mixed_precision',
        '--allreduce_in_fp16',
        '--gradient_accumulation_steps', '48',
        '--num_train_steps', '96',
        '--train_batch_size', '50',
    ]

    mpirun_prefix = ['mpirun', '-n', str(ngpus)]

    # Test 4-way pipeline parallel
    pp_command = mpirun_prefix + command + [
        '--pipeline_parallel_size', '4',
        '--cut_group_info',
        '1149:407-1219/1341/1463/1585/1707/1829,' +
        '1881:407-1951/2073/2195/2317/2439/2561,' +
        '2613:407-2683/2805/2927/3049/3171/3293',
    ]
    log.debug('RUN: ' + ', '.join(pp_command))
    # Check the return code like the other test runners do; previously the
    # result was discarded and a failed run still produced a 0 return.
    run_subprocess(pp_command, cwd=cwd, log=log).check_returncode()

    # Test 2-way data parallel + 2-way pipeline parallel
    pp_dp_command = mpirun_prefix + command + [
        '--data_parallel_size', '2',
        '--pipeline_parallel_size', '2',
        '--cut_group_info',
        '1881:407-1951/2073/2195/2317/2439/2561/2683/2805/2927/3049/3171/3293',
    ]
    log.debug('RUN: ' + ', '.join(pp_dp_command))
    run_subprocess(pp_dp_command, cwd=cwd, log=log).check_returncode()

    return 0
def run_ortmodule_deepspeed_zero_stage_1_tests(cwd, log, data_dir):
    """Start the DeepSpeed ZeRO stage 1 ORTModule test via ``deepspeed``.

    The launcher runs from ``cwd`` with the matching JSON config. A truthy
    ``data_dir`` is appended as ``--data-dir``. ``check_returncode()`` is
    called on whatever ``run_subprocess`` returns.
    """
    log.debug("Running: ORTModule deepspeed zero stage 1 tests")

    test_script = "orttraining_test_ortmodule_deepspeed_zero_stage_1.py"
    config_file = "orttraining_test_ortmodule_deepspeed_zero_stage_1_config.json"
    argv = ["deepspeed", test_script, "--deepspeed_config", config_file]

    if data_dir:
        argv += ["--data-dir", data_dir]

    completed = run_subprocess(argv, cwd=cwd, log=log)
    completed.check_returncode()
def run_pipeline_parallel_tests(cwd, log):
    """Run ``orttraining_test_dhp_parallel_tests.py`` with the current interpreter.

    The script is started from ``cwd``; ``check_returncode()`` is called on
    the value returned by ``run_subprocess``.
    """
    log.debug("Running: pipeline parallel tests")

    script = "orttraining_test_dhp_parallel_tests.py"
    proc = run_subprocess([sys.executable, script], cwd=cwd, log=log)
    proc.check_returncode()
def run_distributed_allreduce_tests(cwd, log):
    """Run ``orttraining_test_allreduce.py`` with the current interpreter.

    The script is started from ``cwd``; ``check_returncode()`` is called on
    the value returned by ``run_subprocess``.
    """
    log.debug("Running: distributed allreduce tests")

    argv = [sys.executable]
    argv.append("orttraining_test_allreduce.py")

    result = run_subprocess(argv, cwd=cwd, log=log)
    result.check_returncode()
def run_checkpoint_tests(cwd, log):
    """Run ``orttraining_test_checkpoint.py`` with the current interpreter.

    The script is started from ``cwd``; ``check_returncode()`` is called on
    the value returned by ``run_subprocess``.
    """
    log.debug("Running: Checkpoint tests")

    test_script = "orttraining_test_checkpoint.py"
    argv = [sys.executable, test_script]

    status = run_subprocess(argv, cwd=cwd, log=log)
    status.check_returncode()
parser = argparse.ArgumentParser() parser.add_argument( "--cmd_line_with_args", required=True, help="command line with arguments to be executed in a subprocess. \ it expects a single string containing arguments separated by spaces.", ) parser.add_argument("--cwd", help="working directory") # parser.add_argument("--env", help="env variables.") parser.add_argument("--env", help="env variables", nargs=2, action="append", default=[]) return parser.parse_args() launch_args = parse_arguments() print("sys.executable: ", sys.executable) cmd_line_with_args = launch_args.cmd_line_with_args.split() for n, arg in enumerate(cmd_line_with_args): if arg == "python": cmd_line_with_args[n] = sys.executable run_subprocess(cmd_line_with_args, cwd=launch_args.cwd, env=dict(launch_args.env), log=log)