def post_status(state):
    """
    Post the status with a constructed context to the PR.

    :param state: <str> choices are "success", "failure", "error" or "pending"
    """
    # Executor does not send logs to GitHub when using the Scheduler
    if os.getenv("EXECUTOR_MODE", "False").lower() == "true":
        return
    project_name = utils.get_codebuild_project_name()
    trigger_job = os.getenv("TEST_TRIGGER", "UNKNOWN-TEST-TRIGGER")
    target_url = get_target_url(project_name)
    context = f"{trigger_job}_{project_name}"
    description = set_build_description(state, project_name, trigger_job)
    # Example: "https://github.com/aws/deep-learning-containers.git"
    repo_url = os.getenv("CODEBUILD_SOURCE_REPO_URL")
    # Strip the ".git" suffix exactly once. The previous str.rstrip(".git")
    # call removed any trailing run of the characters '.', 'g', 'i', 't',
    # which corrupts repo names ending in those letters.
    if repo_url.endswith(".git"):
        repo_url = repo_url[: -len(".git")]
    _, user, repo_name = repo_url.rsplit("/", 2)
    handler = GitHubHandler(user, repo_name)
    handler.set_status(
        state=state, context=context, description=description, target_url=target_url
    )
def post_status(state):
    """
    Post the status with a constructed context to the PR.

    :param state: <str> choices are "success", "failure", "error" or "pending"
    """
    # Executor does not send logs to GitHub when using the Scheduler
    if os.getenv("EXECUTOR_MODE", "False").lower() == "true":
        return
    project_name = get_codebuild_project_name()
    target_url = get_target_url(project_name)
    test_context = os.getenv("TEST_TYPE")
    # Prefix the context with the trigger job for every test type except
    # quick_checks. The previous comparison `test_context not in "quick_checks"`
    # was a substring test, so TEST_TYPE values such as "quick" or "checks"
    # were wrongly treated as quick_checks and lost the trigger-job prefix.
    if test_context and test_context != "quick_checks":
        trigger_job = os.getenv("TEST_TRIGGER", "UNKNOWN-TEST-TRIGGER")
        context = f"{trigger_job}_{project_name}"
    else:
        context = f"{project_name}"
    description = set_build_description(state, project_name)
    user, repo_name = get_user_and_repo_name()
    handler = GitHubHandler(user, repo_name)
    handler.set_status(
        state=state, context=context, description=description, target_url=target_url
    )
def _run_eks_mxnet_multi_node_training(namespace, app_name, job_name, remote_yaml_file_path, unique_id):
    """Run MXNet distributed training on EKS using MXNet Operator

    Creates the namespace (if missing) and a fresh ksonnet app, installs the
    kubeflow mxnet-job package at a pinned release, applies the mxnet-operator
    component, submits the job manifest with kubectl and waits for completion.

    Args:
        namespace: k8s namespace to run in; created when absent
        app_name: ksonnet app name; any existing app directory is removed first
        job_name: name of the MXNet job to monitor for completion
        remote_yaml_file_path: path to the job manifest passed to kubectl
        unique_id: suffix making the per-run local ksonnet app directory unique

    Returns:
        bool: True if the job reported completion, False otherwise.
    """
    # Pinned kubeflow release used for the registry and the mxnet-job package.
    kubeflow_version = "v0.4.1"
    # Resolve $HOME via the shell; assumes `run` is an invoke/fabric-style
    # runner returning a Result with `.stdout` — TODO confirm.
    home_dir = run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(home_dir, f"mxnet_multi_node_eks_test-{unique_id}")
    env = f"{namespace}-env"
    training_result = False
    ctx = Context()
    # Namespaces will allow parallel runs on the same cluster. Create namespace if it doesnt exist.
    # NOTE(review): relies on the Result object being falsy when grep matches
    # nothing (invoke Result truthiness tracks command success) — verify.
    does_namespace_exist = ctx.run(f"kubectl get namespace | grep {namespace}", warn=True)
    if not does_namespace_exist:
        ctx.run(f"kubectl create namespace {namespace}")
    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")
    with ctx.cd(f"{path_to_ksonnet_app}"):
        # Start from a clean ksonnet app each run.
        ctx.run(f"rm -rf {app_name}")
        github_handler = GitHubHandler("aws", "kubeflow")
        github_handler.set_ksonnet_env()
        ctx.run(f"ks init {app_name} --namespace {namespace}")
        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")
            # Check if the kubeflow registry exists and create. Registry will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if not does_registry_exist:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{kubeflow_version}/kubeflow",
                    hide=True,
                )
            ctx.run(
                f"ks pkg install kubeflow/mxnet-job@{kubeflow_version}",
                hide=True,
            )
            ctx.run("ks generate mxnet-operator mxnet-operator", hide=True)
            try:
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                LOGGER.debug(f"ks apply {env} -c mxnet-operator -n {namespace}")
                ctx.run(f"ks apply {env} -c mxnet-operator -n {namespace}")
                # Delete old job with same name if exists
                ctx.run(f"kubectl delete -f {remote_yaml_file_path}", warn=True)
                ctx.run(f"kubectl create -f {remote_yaml_file_path} -n {namespace}")
                if is_mxnet_eks_multinode_training_complete(job_name, namespace):
                    training_result = True
            finally:
                # Always tear down the job resources, pass or fail.
                eks_utils.eks_multinode_cleanup("", job_name, namespace, env)
    return training_result
def get_pr_modified_files(pr_number):
    """
    Return the files modified by a pull request as one newline-separated string.

    :param pr_number: int
    :return: str with all the modified files
    """
    handler = GitHubHandler("aws", "deep-learning-containers")
    changed_files = handler.get_pr_files_changed(pr_number)
    return "\n".join(changed_files)
def get_pr_modified_files(pr_number):
    """
    Return the files modified by a pull request as one newline-separated string.

    :param pr_number: int
    :return: str with all the modified files
    """
    # Imported lazily: the GitHub dependency is only needed for PR builds,
    # not for local builds or builds outside of Pull Requests.
    from dlc.github_handler import GitHubHandler

    handler = GitHubHandler("aws", "deep-learning-containers")
    return "\n".join(handler.get_pr_files_changed(pr_number))
def get_pr_modified_files(pr_number):
    """
    Fetch all the files modified for a git pull request and return them as a string

    :param pr_number: int
    :return: str with all the modified files, newline-separated
    """
    # This import statement has been placed inside this function because it creates a dependency that is unnecessary
    # for local builds and builds outside of Pull Requests.
    from dlc.github_handler import GitHubHandler

    # Example: "https://github.com/aws/deep-learning-containers.git"
    repo_url = os.getenv("CODEBUILD_SOURCE_REPO_URL")
    # Remove the ".git" suffix exactly once. The previous str.rstrip(".git")
    # stripped any trailing '.', 'g', 'i', 't' characters, corrupting repo
    # names that end in those letters.
    if repo_url.endswith(".git"):
        repo_url = repo_url[: -len(".git")]
    _, user, repo_name = repo_url.rsplit("/", 2)
    github_handler = GitHubHandler(user, repo_name)
    files = github_handler.get_pr_files_changed(pr_number)
    files = "\n".join(files)
    return files
def post_status(state):
    """
    Post the status with a constructed context to the PR.

    :param state: <str> choices are "success", "failure", "error" or "pending"
    """
    build_project = utils.get_codebuild_project_name()
    trigger = os.getenv("TEST_TRIGGER", "UNKNOWN-TEST-TRIGGER")
    url = get_target_url(build_project)
    status_context = f"{trigger}_{build_project}"
    summary = set_build_description(state, build_project, trigger)
    GitHubHandler().set_status(
        state=state,
        context=status_context,
        description=summary,
        target_url=url,
    )
def run_eks_pytorch_multi_node_training(namespace, app_name, job_name, remote_yaml_file_path, unique_id):
    """Run PyTorch distributed training on EKS using PyTorch Operator

    Creates the namespace (if missing) and a fresh ksonnet app, installs the
    kubeflow pytorch-job package at a pinned release, applies the
    pytorch-operator component, submits the job manifest and waits for
    completion, then inspects the master pod's logs.

    Args:
        namespace: k8s namespace to run in; created when absent
        job_name: PyTorch job name; the master pod is "{job_name}-master-0"
        app_name, remote_yaml_file_path, unique_id: as in the MXNet variant

    Raises:
        AssertionError: if the job did not report completion.
    """
    # Pinned kubeflow release used for the registry and the pytorch-job package.
    KUBEFLOW_VERSION = "v0.6.1"
    # Resolve $HOME via the shell; assumes `run` is an invoke/fabric-style
    # runner returning a Result with `.stdout` — TODO confirm.
    home_dir = run("echo $HOME").stdout.strip("\n")
    path_to_ksonnet_app = os.path.join(
        home_dir, f"pytorch_multi_node_eks_test-{unique_id}")
    env = f"{namespace}-env"
    ctx = Context()
    # Namespaces will allow parallel runs on the same cluster. Create namespace if it doesnt exist.
    # NOTE(review): uses the module-level `run` here rather than `ctx.run` as
    # elsewhere, and relies on Result truthiness tracking grep success — verify.
    does_namespace_exist = run(f"kubectl get namespace | grep {namespace}", warn=True)
    if not does_namespace_exist:
        run(f"kubectl create namespace {namespace}")
    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")
    with ctx.cd(path_to_ksonnet_app):
        ctx.run(f"rm -rf {app_name}")
        # Create a new ksonnet app.
        github_handler = GitHubHandler("aws", "kubeflow")
        github_handler.set_ksonnet_env()
        ctx.run(f"ks init {app_name} --namespace {namespace}")
        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")
            # Check if the kubeflow registry exists and create. Registry will be available in each pod.
            does_registry_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if not does_registry_exist:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow",
                )
            ctx.run(
                f"ks pkg install kubeflow/pytorch-job@{KUBEFLOW_VERSION}",
            )
            ctx.run(f"ks generate pytorch-operator pytorch-operator")
            try:
                # use `$ks show default` to see details.
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                LOGGER.debug(
                    f"ks apply {env} -c pytorch-operator -n {namespace}")
                ctx.run(
                    f"ks apply {env} -c pytorch-operator -n {namespace}")
                # Delete old job with same name if exists
                ctx.run(f"kubectl delete -f {remote_yaml_file_path}", warn=True)
                ctx.run(
                    f"kubectl create -f {remote_yaml_file_path} -n {namespace}"
                )
                training_result = is_pytorch_eks_multinode_training_complete(
                    job_name, namespace)
                if training_result:
                    run_out = run(
                        f"kubectl logs {job_name}-master-0 -n {namespace}", warn=True).stdout
                    # NOTE(review): when "accuracy" is absent the output is only
                    # logged; training_result stays truthy, so the assert below
                    # still passes. Confirm whether a missing "accuracy" line
                    # was meant to fail the test.
                    if "accuracy" in run_out:
                        training_result = True
                    else:
                        eks_utils.LOGGER.info("**** training output ****")
                        eks_utils.LOGGER.debug(run_out)
                assert training_result, f"Training for eks pytorch multinode failed"
            finally:
                # Always tear down the job resources, pass or fail.
                eks_utils.eks_multinode_cleanup(ctx, "", job_name, namespace, env)
def _run_eks_tensorflow_multi_node_training_mpijob(
        namespace, app_name, custom_image, job_name, command_to_run, args_to_pass,
        path_to_ksonnet_app, cluster_size, eks_gpus_per_worker):
    """
    Run Tensorflow distributed training on EKS using horovod docker images using MPIJob

    Creates the namespace and a fresh ksonnet app, installs the kubeflow
    common/mpi-job packages at a pinned release, applies the mpi-operator
    (with its images overridden to 0.2.0), generates and applies a custom
    MPIJob, then waits for the launcher pod and streams its logs.

    :param namespace: k8s namespace to run in (created unconditionally)
    :param app_name: ksonnet app name; any existing app dir is removed first
    :param custom_image: docker image the MPIJob workers run
    :param job_name: name of the generated mpi-job-custom component
    :param command_to_run: command passed to the job via `ks param set`
    :param args_to_pass: args passed to the job via `ks param set`
    :param path_to_ksonnet_app: local directory holding the ksonnet app
    :param cluster_size: number of worker replicas
    :param eks_gpus_per_worker: GPUs requested per replica
    :return: None
    :raises RuntimeError: if the mpi-operator or the training job fails to apply
    """
    # Pinned kubeflow release for the registry and the common/mpi-job packages.
    KUBEFLOW_VERSION = "v0.5.1"
    # Stays None until the launcher pod is discovered; the finally-block
    # cleanup therefore receives None when a failure happens before that point.
    pod_name = None
    env = f"{namespace}-env"
    ctx = Context()
    github_handler = GitHubHandler("aws", "kubeflow")
    github_handler.set_ksonnet_env()
    ctx.run(f"kubectl create namespace {namespace}")
    if not os.path.exists(path_to_ksonnet_app):
        ctx.run(f"mkdir -p {path_to_ksonnet_app}")
    with ctx.cd(path_to_ksonnet_app):
        # Start from a clean ksonnet app each run.
        ctx.run(f"rm -rf {app_name}")
        ctx.run(f"ks init {app_name} --namespace {namespace}")
        with ctx.cd(app_name):
            ctx.run(f"ks env add {env} --namespace {namespace}")
            # Check if the kubeflow registry exists and create. Registry will be available in each pod.
            # Unlike the sibling helpers, this one checks return_code explicitly:
            # grep exits non-zero when the registry is missing.
            registry_not_exist = ctx.run("ks registry list | grep kubeflow", warn=True)
            if registry_not_exist.return_code:
                ctx.run(
                    f"ks registry add kubeflow github.com/kubeflow/kubeflow/tree/{KUBEFLOW_VERSION}/kubeflow",
                )
                ctx.run(f"ks pkg install kubeflow/common@{KUBEFLOW_VERSION}")
                ctx.run(f"ks pkg install kubeflow/mpi-job@{KUBEFLOW_VERSION}")
            try:
                ctx.run("ks generate mpi-operator mpi-operator")
                # The latest mpi-operator docker image does not accept the gpus-per-node parameter
                # which is specified by the older spec file from v0.5.1.
                ctx.run(
                    "ks param set mpi-operator image mpioperator/mpi-operator:0.2.0"
                )
                ctx.run(
                    "ks param set mpi-operator kubectlDeliveryImage mpioperator/kubectl-delivery:0.2.0"
                )
                mpi_operator_start = ctx.run(f"ks apply {env} -c mpi-operator", warn=True)
                if mpi_operator_start.return_code:
                    raise RuntimeError(
                        f"Failed to start mpi-operator:\n{mpi_operator_start.stderr}"
                    )
                eks_utils.LOGGER.info(
                    f"The mpi-operator package must be applied to {env} env before we can use mpiJob. "
                    f"Check status before moving on.")
                ctx.run("kubectl get crd")
                # Use Ksonnet to generate manifest files which are then applied to the default context.
                ctx.run(f"ks generate mpi-job-custom {job_name}")
                ctx.run(f"ks param set {job_name} replicas {cluster_size}")
                ctx.run(
                    f"ks param set {job_name} gpusPerReplica {eks_gpus_per_worker}"
                )
                ctx.run(f"ks param set {job_name} image {custom_image}")
                ctx.run(f"ks param set {job_name} command {command_to_run}")
                ctx.run(f"ks param set {job_name} args {args_to_pass}")
                # use `$ks show default` to see details.
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                eks_utils.LOGGER.info(
                    f"Apply the generated manifest to the {env} env.")
                training_job_start = ctx.run(f"ks apply {env} -c {job_name}", warn=True)
                if training_job_start.return_code:
                    raise RuntimeError(
                        f"Failed to start {job_name}:\n{training_job_start.stderr}"
                    )
                eks_utils.LOGGER.info("Check pods")
                ctx.run(f"kubectl get pods -n {namespace} -o wide")
                eks_utils.LOGGER.info(
                    "First the mpi-operator and the n-worker pods will be created and then "
                    "the launcher pod is created in the end. Use retries until launcher "
                    "pod's name is available to read logs.")
                # complete_pod_name is "<kind>/<name>"; split off the bare pod name.
                complete_pod_name = eks_utils.is_mpijob_launcher_pod_ready(
                    ctx, namespace, job_name)
                _, pod_name = complete_pod_name.split("/")
                eks_utils.LOGGER.info(
                    f"The Pods have been created and the name of the launcher pod is {pod_name}"
                )
                eks_utils.LOGGER.info(
                    f"Wait for the {job_name} job to complete")
                if eks_utils.is_eks_multinode_training_complete(
                        ctx, namespace, env, pod_name, job_name):
                    eks_utils.LOGGER.info(
                        f"Wait for the {pod_name} pod to reach completion")
                    # -f follows the log stream until the pod terminates.
                    distributed_out = ctx.run(
                        f"kubectl logs -n {namespace} -f {complete_pod_name}"
                    ).stdout
                    eks_utils.LOGGER.info(distributed_out)
            finally:
                # Always tear down the operator/job resources, pass or fail.
                eks_utils.eks_multinode_cleanup(ctx, pod_name, job_name, namespace, env)