def is_test_job_enabled(test_type):
    # Only EC2 and SageMaker benchmark tests are supported currently
    sm_tests_enabled = parse_dlc_developer_configs("test", "sagemaker_tests")
    ec2_tests_enabled = parse_dlc_developer_configs("test", "ec2_tests")
    ecs_tests_enabled = parse_dlc_developer_configs("test", "ecs_tests")
    eks_tests_enabled = parse_dlc_developer_configs("test", "eks_tests")
    sanity_tests_enabled = parse_dlc_developer_configs("test", "sanity_tests")
    benchmark_mode = is_benchmark_mode_enabled()

    # For each test type, see if we should run the tests
    if test_type == constants.SAGEMAKER_TESTS and sm_tests_enabled:
        return True
    if test_type == constants.EC2_TESTS and ec2_tests_enabled:
        return True

    # We have no ECS/EKS/SANITY benchmark tests
    if not benchmark_mode:
        if test_type == constants.ECS_TESTS and ecs_tests_enabled:
            return True
        if test_type == constants.EKS_TESTS and eks_tests_enabled:
            return True
        if test_type == constants.SANITY_TESTS and sanity_tests_enabled:
            return True

    return False
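
# A minimal usage sketch (not part of the original source): dispatch one test
# job per enabled test type. TEST_TYPES and TEST_TYPE_TO_CODEBUILD_PROJECT are
# hypothetical constants assumed here for illustration; run_test_job is the
# helper defined below.
def trigger_enabled_test_jobs_sketch(commit, images_str):
    for test_type in constants.TEST_TYPES:  # hypothetical list of test-type names
        if is_test_job_enabled(test_type):
            project = constants.TEST_TYPE_TO_CODEBUILD_PROJECT[test_type]  # hypothetical mapping
            run_test_job(commit, project, images_str)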
def main():
    parser = argparse.ArgumentParser(description="Program to build docker images")
    parser.add_argument("--buildspec", type=str)
    parser.add_argument("--framework", type=str)
    parser.add_argument("--device_types", type=str, default=constants.ALL)
    parser.add_argument("--image_types", type=str, default=constants.ALL)
    parser.add_argument("--py_versions", type=str, default=constants.ALL)

    args = parser.parse_args()

    device_types = args.device_types.split(",") if args.device_types != constants.ALL else args.device_types
    image_types = args.image_types.split(",") if args.image_types != constants.ALL else args.image_types
    py_versions = args.py_versions.split(",") if args.py_versions != constants.ALL else args.py_versions

    build_context = os.getenv("BUILD_CONTEXT")
    ei_dedicated = os.getenv("EIA_DEDICATED") == "True"
    neuron_dedicated = os.getenv("NEURON_DEDICATED") == "True"

    # Get config value options
    frameworks_to_skip = parse_dlc_developer_configs("build", "skip_frameworks")
    ei_build_mode = parse_dlc_developer_configs("dev", "ei_mode")
    neuron_build_mode = parse_dlc_developer_configs("dev", "neuron_mode")

    # A general builder will work if we are in non-EI and non-NEURON mode and its framework has not been disabled
    general_builder_enabled = (
        not ei_dedicated
        and not neuron_dedicated
        and not ei_build_mode
        and not neuron_build_mode
        and args.framework not in frameworks_to_skip
    )
    # An EI dedicated builder will work if we are in EI mode and its framework has not been disabled
    ei_builder_enabled = ei_dedicated and ei_build_mode and args.framework not in frameworks_to_skip
    # A NEURON dedicated builder will work if we are in NEURON mode and its framework has not been disabled
    neuron_builder_enabled = neuron_dedicated and neuron_build_mode and args.framework not in frameworks_to_skip

    # Create the empty JSON file for images, so subsequent steps can rely on its existence
    utils.write_to_json_file(constants.TEST_TYPE_IMAGES_PATH, {})

    # A builder will always work if it is in a non-PR context
    if general_builder_enabled or ei_builder_enabled or neuron_builder_enabled or build_context != "PR":
        utils.build_setup(
            args.framework,
            device_types=device_types,
            image_types=image_types,
            py_versions=py_versions,
        )
        image_builder(args.buildspec)
def run_test_job(commit, codebuild_project, images_str=""):
    test_env_file = constants.TEST_ENV_PATH
    if not os.path.exists(test_env_file):
        raise FileNotFoundError(
            f"{test_env_file} not found. This is required to set test environment variables"
            f" for test jobs. Failing the build."
        )

    with open(test_env_file) as test_env_file:
        env_overrides = json.load(test_env_file)

    pr_num = os.getenv("CODEBUILD_SOURCE_VERSION")
    env_overrides.extend(
        [
            {"name": "DLC_IMAGES", "value": images_str, "type": "PLAINTEXT"},
            {"name": "PR_NUMBER", "value": pr_num, "type": "PLAINTEXT"},
            # USE_SCHEDULER is passed as an env variable here because it is more convenient to set this in
            # dlc_developer_config, compared to having another config file under dlc/tests/.
            {
                "name": "USE_SCHEDULER",
                "value": str(parse_dlc_developer_configs("test", "use_scheduler")),
                "type": "PLAINTEXT",
            },
            {
                "name": "DISABLE_EFA_TESTS",
                "value": str(not parse_dlc_developer_configs("test", "efa_tests")),
                "type": "PLAINTEXT",
            },
        ]
    )

    LOGGER.debug(f"env_overrides dict: {env_overrides}")

    client = boto3.client("codebuild")
    return client.start_build(
        projectName=codebuild_project,
        environmentVariablesOverride=env_overrides,
        sourceVersion=commit,
    )
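
# A minimal usage sketch (not part of the original source): seed the test env
# file with CodeBuild-style environment overrides, then kick off a test job.
# The project name and image URI below are hypothetical examples.
def run_sanity_test_job_sketch():
    seed_overrides = [
        {"name": "BUILD_CONTEXT", "value": "PR", "type": "PLAINTEXT"},
    ]
    with open(constants.TEST_ENV_PATH, "w") as fp:
        json.dump(seed_overrides, fp)

    return run_test_job(
        commit=os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION"),
        codebuild_project="dlc-pr-test-sanity",  # hypothetical project name
        images_str="123456789012.dkr.ecr.us-west-2.amazonaws.com/pr-repo:tag",  # hypothetical image URI
    )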
def main():
    args = get_args()
    partner_dev = parse_dlc_developer_configs("dev", "partner_developer", tomlfile=args.partner_toml)
    if partner_dev:
        LOGGER.info(f"PARTNER_DEVELOPER: {partner_dev.upper()}")
        LOGGER.info(f"PR_NUMBER: pr-{os.getenv('PR_NUMBER', '')}")
        LOGGER.info(f"COMMIT_ID: {os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}")
        test_trigger = os.getenv("TEST_TRIGGER")
        if test_trigger:
            LOGGER.info(f"TEST_TRIGGER: {test_trigger}")
def main():
    parser = argparse.ArgumentParser(description="Program to build docker images")
    parser.add_argument("--buildspec", type=str)
    parser.add_argument("--framework", type=str)
    parser.add_argument("--device_types", type=str, default=constants.ALL)
    parser.add_argument("--image_types", type=str, default=constants.ALL)
    parser.add_argument("--py_versions", type=str, default=constants.ALL)

    args = parser.parse_args()

    device_types = args.device_types.split(",") if args.device_types != constants.ALL else args.device_types
    image_types = args.image_types.split(",") if args.image_types != constants.ALL else args.image_types
    py_versions = args.py_versions.split(",") if args.py_versions != constants.ALL else args.py_versions

    build_context = os.getenv("BUILD_CONTEXT")
    ei_dedicated = os.getenv("EIA_DEDICATED", "false").lower() == "true"
    neuron_dedicated = os.getenv("NEURON_DEDICATED", "false").lower() == "true"
    graviton_dedicated = os.getenv("GRAVITON_DEDICATED", "false").lower() == "true"
    habana_dedicated = os.getenv("HABANA_DEDICATED", "false").lower() == "true"

    # Get config value options
    frameworks_to_skip = parse_dlc_developer_configs("build", "skip_frameworks")
    ei_build_mode = parse_dlc_developer_configs("dev", "ei_mode")
    neuron_build_mode = parse_dlc_developer_configs("dev", "neuron_mode")
    graviton_build_mode = parse_dlc_developer_configs("dev", "graviton_mode")
    habana_build_mode = parse_dlc_developer_configs("dev", "habana_mode")

    # Write empty dict to JSON file, so subsequent buildspec steps do not fail in case we skip this build
    utils.write_to_json_file(constants.TEST_TYPE_IMAGES_PATH, {})

    # Skip tensorflow-1 PR jobs, as there are no longer patch releases being added for TF1
    # Purposefully not including this in developer config to make this difficult to enable
    # TODO: Remove when we remove these jobs completely
    build_arn = utils.get_codebuild_build_arn()
    if build_context == "PR":
        tf_1_build_regex = re.compile(r"dlc-pr-tensorflow-1:")
        if tf_1_build_regex.search(build_arn):
            return

    # A general builder will work if we are in non-EI, non-NEURON, non-GRAVITON and non-HABANA mode
    # and its framework has not been disabled
    general_builder_enabled = (
        not ei_dedicated
        and not neuron_dedicated
        and not graviton_dedicated
        and not habana_dedicated
        and not ei_build_mode
        and not neuron_build_mode
        and not graviton_build_mode
        and not habana_build_mode
        and args.framework not in frameworks_to_skip
    )
    # An EI dedicated builder will work if we are in EI mode and its framework has not been disabled
    ei_builder_enabled = ei_dedicated and ei_build_mode and args.framework not in frameworks_to_skip
    # A NEURON dedicated builder will work if we are in NEURON mode and its framework has not been disabled
    neuron_builder_enabled = neuron_dedicated and neuron_build_mode and args.framework not in frameworks_to_skip
    # A GRAVITON dedicated builder will work if we are in GRAVITON mode and its framework has not been disabled
    graviton_builder_enabled = graviton_dedicated and graviton_build_mode and args.framework not in frameworks_to_skip
    # A HABANA dedicated builder will work if we are in HABANA mode and its framework has not been disabled
    habana_builder_enabled = habana_dedicated and habana_build_mode and args.framework not in frameworks_to_skip

    # A builder will always work if it is in a non-PR context
    if (
        general_builder_enabled
        or ei_builder_enabled
        or neuron_builder_enabled
        or graviton_builder_enabled
        or habana_builder_enabled
        or build_context != "PR"
    ):
        utils.build_setup(
            args.framework,
            device_types=device_types,
            image_types=image_types,
            py_versions=py_versions,
        )
        image_builder(args.buildspec)
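
# A minimal sketch (not part of the original source) of how the
# parse_dlc_developer_configs(section, option) lookups used above could be
# implemented, assuming the developer config is a TOML file with [build],
# [test] and [dev] tables. The file name dlc_developer_config.toml and the
# use of the `toml` package are assumptions.
import toml

def parse_dlc_developer_configs_sketch(section, option, tomlfile="dlc_developer_config.toml"):
    # Load the developer config and return config[section][option],
    # e.g. parse_dlc_developer_configs_sketch("dev", "ei_mode") -> False
    config = toml.load(tomlfile)
    return config[section][option]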
def image_builder(buildspec):
    BUILDSPEC = Buildspec()
    BUILDSPEC.load(buildspec)
    PRE_PUSH_STAGE_IMAGES = []
    COMMON_STAGE_IMAGES = []

    if (
        "huggingface" in str(BUILDSPEC["framework"])
        or "autogluon" in str(BUILDSPEC["framework"])
        or "trcomp" in str(BUILDSPEC["framework"])
    ):
        os.system("echo login into public ECR")
        os.system(
            "aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com"
        )

    for image_name, image_config in BUILDSPEC["images"].items():
        ARTIFACTS = deepcopy(BUILDSPEC["context"]) if BUILDSPEC.get("context") else {}

        extra_build_args = {}
        labels = {}
        enable_datetime_tag = parse_dlc_developer_configs("build", "datetime_tag")

        if image_config.get("version") is not None:
            if BUILDSPEC["version"] != image_config.get("version"):
                continue

        if image_config.get("context") is not None:
            ARTIFACTS.update(image_config["context"])

        image_tag = (
            tag_image_with_pr_number(image_config["tag"])
            if build_context == "PR"
            else image_config["tag"]
        )
        if enable_datetime_tag or build_context != "PR":
            image_tag = tag_image_with_datetime(image_tag)

        image_repo_uri = (
            image_config["repository"]
            if build_context == "PR"
            else modify_repository_name_for_context(str(image_config["repository"]), build_context)
        )

        base_image_uri = None
        if image_config.get("base_image_name") is not None:
            base_image_object = _find_image_object(PRE_PUSH_STAGE_IMAGES, image_config["base_image_name"])
            base_image_uri = base_image_object.ecr_url

        if image_config.get("download_artifacts") is not None:
            for artifact_name, artifact in image_config.get("download_artifacts").items():
                type = artifact["type"]
                uri = artifact["URI"]
                var = artifact["VAR_IN_DOCKERFILE"]

                try:
                    file_name = utils.download_file(uri, type).strip()
                except ValueError:
                    FORMATTER.print(f"Artifact download failed: {uri} of type {type}.")

                ARTIFACTS.update(
                    {
                        f"{artifact_name}": {
                            "source": f"{os.path.join(os.sep, os.path.abspath(os.getcwd()), file_name)}",
                            "target": file_name,
                        }
                    }
                )

                extra_build_args[var] = file_name
                labels[var] = file_name
                labels[f"{var}_URI"] = uri

        transformers_version = image_config.get("transformers_version")

        if str(BUILDSPEC["framework"]).startswith("huggingface") or str(BUILDSPEC["framework"]).endswith("trcomp"):
            if transformers_version:
                extra_build_args["TRANSFORMERS_VERSION"] = transformers_version
            else:
                raise KeyError(f"HuggingFace buildspec.yml must contain 'transformers_version' field for each image")
            if "datasets_version" in image_config:
                extra_build_args["DATASETS_VERSION"] = image_config.get("datasets_version")
            elif str(image_config["image_type"]) == "training":
                raise KeyError(f"HuggingFace buildspec.yml must contain 'datasets_version' field for each image")

        ARTIFACTS.update(
            {
                "dockerfile": {
                    "source": image_config["docker_file"],
                    "target": "Dockerfile",
                }
            }
        )

        context = Context(ARTIFACTS, f"build/{image_name}.tar.gz", image_config["root"])

        if "labels" in image_config:
            labels.update(image_config.get("labels"))

        cx_type = utils.get_label_prefix_customer_type(image_tag)

        # Define label variables
        label_framework = str(BUILDSPEC["framework"]).replace("_", "-")
        if image_config.get("framework_version"):
            label_framework_version = str(image_config["framework_version"]).replace(".", "-")
        else:
            label_framework_version = str(BUILDSPEC["version"]).replace(".", "-")
        label_device_type = str(image_config["device_type"])
        if label_device_type == "gpu":
            label_device_type = f"{label_device_type}.{str(image_config['cuda_version'])}"
        label_arch = str(BUILDSPEC["arch_type"])
        label_python_version = str(image_config["tag_python_version"])
        label_os_version = str(image_config.get("os_version")).replace(".", "-")
        label_contributor = str(BUILDSPEC.get("contributor"))
        label_transformers_version = str(transformers_version).replace(".", "-")

        # job_type will be either inference or training, based on the repo URI
        if "training" in image_repo_uri:
            label_job_type = "training"
        elif "inference" in image_repo_uri:
            label_job_type = "inference"
        else:
            raise RuntimeError(
                f"Cannot find inference or training job type in {image_repo_uri}. "
                f"This is required to set job_type label."
            )

        if cx_type == "sagemaker":
            # Adding standard labels to all images
            labels[
                f"com.amazonaws.ml.engines.{cx_type}.dlc.framework.{label_framework}.{label_framework_version}"
            ] = "true"
            labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.device.{label_device_type}"] = "true"
            labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.arch.{label_arch}"] = "true"
            # python version label will look like python.py36, for example
            labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.python.{label_python_version}"] = "true"
            labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.os.{label_os_version}"] = "true"
            labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.job.{label_job_type}"] = "true"

            if label_contributor:
                labels[f"com.amazonaws.ml.engines.{cx_type}.dlc.contributor.{label_contributor}"] = "true"
            if transformers_version:
                labels[
                    f"com.amazonaws.ml.engines.{cx_type}.dlc.lib.transformers.{label_transformers_version}"
                ] = "true"

        """
        Override parameters from parent in child.
        """

        info = {
            "account_id": str(BUILDSPEC["account_id"]),
            "region": str(BUILDSPEC["region"]),
            "framework": str(BUILDSPEC["framework"]),
            "version": str(BUILDSPEC["version"]),
            "root": str(image_config["root"]),
            "name": str(image_name),
            "device_type": str(image_config["device_type"]),
            "python_version": str(image_config["python_version"]),
            "image_type": str(image_config["image_type"]),
            "image_size_baseline": int(image_config["image_size_baseline"]),
            "base_image_uri": base_image_uri,
            "enable_test_promotion": image_config.get("enable_test_promotion", True),
            "labels": labels,
            "extra_build_args": extra_build_args,
        }

        # Create pre_push stage docker object
        pre_push_stage_image_object = DockerImage(
            info=info,
            dockerfile=image_config["docker_file"],
            repository=image_repo_uri,
            tag=append_tag(image_tag, "pre-push"),
            to_build=image_config["build"],
            stage=constants.PRE_PUSH_STAGE,
            context=context,
            additional_tags=[image_tag],
            target=image_config.get("target"),
        )

        ##### Create Common stage docker object #####
        # If for a pre_push stage image we create a common stage image, then we do not push the pre_push stage image
        # to the repository. Instead, we just push its common stage image to the repository. Therefore,
        # inside the function generate_common_stage_image_object we make pre_push_stage_image_object non-pushable.
        common_stage_image_object = generate_common_stage_image_object(pre_push_stage_image_object, image_tag)
        COMMON_STAGE_IMAGES.append(common_stage_image_object)

        PRE_PUSH_STAGE_IMAGES.append(pre_push_stage_image_object)
        FORMATTER.separator()

    FORMATTER.banner("DLC")

    # Parent images do not inherit from any containers built in this job
    # Child images use one of the parent images as their base image
    parent_images = [image for image in PRE_PUSH_STAGE_IMAGES if not image.is_child_image]
    child_images = [image for image in PRE_PUSH_STAGE_IMAGES if image.is_child_image]
    ALL_IMAGES = PRE_PUSH_STAGE_IMAGES + COMMON_STAGE_IMAGES
    IMAGES_TO_PUSH = [image for image in ALL_IMAGES if image.to_push and image.to_build]

    pushed_images = []
    pushed_images += process_images(parent_images, "Parent/Independent")
    pushed_images += process_images(child_images, "Child/Dependent")

    assert all(image in pushed_images for image in IMAGES_TO_PUSH), "One or more images could not be pushed."

    # After the build, display logs/summary for all the images.
    FORMATTER.banner("Summary")
    show_build_info(ALL_IMAGES)

    FORMATTER.banner("Errors")
    is_any_build_failed, is_any_build_failed_size_limit = show_build_errors(ALL_IMAGES)

    # From all images, filter the images that were supposed to be built and upload their metrics
    BUILT_IMAGES = [image for image in ALL_IMAGES if image.to_build]

    FORMATTER.banner("Upload Metrics")
    upload_metrics(BUILT_IMAGES, BUILDSPEC, is_any_build_failed, is_any_build_failed_size_limit)

    FORMATTER.banner("Test Env")
    # Set environment variables to be consumed by test jobs
    test_trigger_job = get_codebuild_project_name()
    # Tests should only run on images that were pushed to the repository
    if not is_build_enabled():
        # Ensure we have images populated even if do_build is false, so that tests can proceed if needed
        images_to_test = [image for image in ALL_IMAGES if image.to_push]
    else:
        images_to_test = IMAGES_TO_PUSH

    utils.set_test_env(
        images_to_test,
        use_latest_additional_tag=True,
        BUILD_CONTEXT=os.getenv("BUILD_CONTEXT"),
        TEST_TRIGGER=test_trigger_job,
    )
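
# A minimal sketch (not part of the original source) of the tagging helpers the
# builder relies on. The exact tag formats are assumptions: a PR tag is assumed
# to embed the PR number from CODEBUILD_SOURCE_VERSION, a datetime tag a UTC
# timestamp, and append_tag a suffix such as "pre-push".
from datetime import datetime

def tag_image_with_pr_number_sketch(image_tag):
    # e.g. "2.9.1-cpu-py39" -> "2.9.1-cpu-py39-pr-1234"
    pr_number = os.getenv("CODEBUILD_SOURCE_VERSION", "").replace("/", "-")
    return f"{image_tag}-{pr_number}"

def tag_image_with_datetime_sketch(image_tag):
    # e.g. "2.9.1-cpu-py39" -> "2.9.1-cpu-py39-2023-01-01-12-00-00"
    datetime_suffix = datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
    return f"{image_tag}-{datetime_suffix}"

def append_tag_sketch(image_tag, suffix):
    # e.g. append_tag_sketch("2.9.1-cpu-py39", "pre-push") -> "2.9.1-cpu-py39-pre-push"
    return f"{image_tag}-{suffix}"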
def image_builder(buildspec):
    FORMATTER = OutputFormatter(constants.PADDING)

    BUILDSPEC = Buildspec()
    BUILDSPEC.load(buildspec)
    IMAGES = []

    if "huggingface" in str(BUILDSPEC["framework"]):
        os.system("echo login into public ECR")
        os.system(
            "aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com"
        )

    for image_name, image_config in BUILDSPEC["images"].items():
        ARTIFACTS = deepcopy(BUILDSPEC["context"]) if BUILDSPEC.get("context") else {}

        extra_build_args = {}
        labels = {}
        enable_datetime_tag = parse_dlc_developer_configs("build", "datetime_tag")

        if image_config.get("version") is not None:
            if BUILDSPEC["version"] != image_config.get("version"):
                continue

        if image_config.get("context") is not None:
            ARTIFACTS.update(image_config["context"])

        build_context = os.getenv("BUILD_CONTEXT")
        image_tag = (
            tag_image_with_pr_number(image_config["tag"])
            if build_context == "PR"
            else image_config["tag"]
        )
        if enable_datetime_tag or build_context != "PR":
            image_tag = tag_image_with_datetime(image_tag)

        image_repo_uri = (
            image_config["repository"]
            if build_context == "PR"
            else modify_repository_name_for_context(str(image_config["repository"]), build_context)
        )

        base_image_uri = None
        if image_config.get("base_image_name") is not None:
            base_image_object = _find_image_object(IMAGES, image_config["base_image_name"])
            base_image_uri = base_image_object.ecr_url

        if image_config.get("download_artifacts") is not None:
            for artifact_name, artifact in image_config.get("download_artifacts").items():
                type = artifact["type"]
                uri = artifact["URI"]
                var = artifact["VAR_IN_DOCKERFILE"]

                try:
                    file_name = utils.download_file(uri, type).strip()
                except ValueError:
                    FORMATTER.print(f"Artifact download failed: {uri} of type {type}.")

                ARTIFACTS.update(
                    {
                        f"{artifact_name}": {
                            "source": f"{os.path.join(os.sep, os.path.abspath(os.getcwd()), file_name)}",
                            "target": file_name,
                        }
                    }
                )

                extra_build_args[var] = file_name
                labels[var] = file_name
                labels[f"{var}_URI"] = uri

        if str(BUILDSPEC["framework"]).startswith("huggingface"):
            if "transformers_version" in image_config:
                extra_build_args["TRANSFORMERS_VERSION"] = image_config.get("transformers_version")
            else:
                raise KeyError(f"HuggingFace buildspec.yml must contain 'transformers_version' field for each image")
            if "datasets_version" in image_config:
                extra_build_args["DATASETS_VERSION"] = image_config.get("datasets_version")
            elif str(image_config["image_type"]) == "training":
                raise KeyError(f"HuggingFace buildspec.yml must contain 'datasets_version' field for each image")

        ARTIFACTS.update(
            {
                "dockerfile": {
                    "source": image_config["docker_file"],
                    "target": "Dockerfile",
                }
            }
        )

        context = Context(ARTIFACTS, f"build/{image_name}.tar.gz", image_config["root"])

        if "labels" in image_config:
            labels.update(image_config.get("labels"))

        """
        Override parameters from parent in child.
        """

        info = {
            "account_id": str(BUILDSPEC["account_id"]),
            "region": str(BUILDSPEC["region"]),
            "framework": str(BUILDSPEC["framework"]),
            "version": str(BUILDSPEC["version"]),
            "root": str(image_config["root"]),
            "name": str(image_name),
            "device_type": str(image_config["device_type"]),
            "python_version": str(image_config["python_version"]),
            "image_type": str(image_config["image_type"]),
            "image_size_baseline": int(image_config["image_size_baseline"]),
            "base_image_uri": base_image_uri,
            "labels": labels,
            "extra_build_args": extra_build_args,
        }

        image_object = DockerImage(
            info=info,
            dockerfile=image_config["docker_file"],
            repository=image_repo_uri,
            tag=image_tag,
            to_build=image_config["build"],
            context=context,
        )

        IMAGES.append(image_object)

    FORMATTER.banner("DLC")
    FORMATTER.title("Status")

    THREADS = {}

    # In the context of the ThreadPoolExecutor each instance of image.build submitted
    # to it is executed concurrently in a separate thread.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Standard images must be built before example images
        # Example images will use standard images as base
        standard_images = [image for image in IMAGES if "example" not in image.name.lower()]
        example_images = [image for image in IMAGES if "example" in image.name.lower()]

        for image in standard_images:
            THREADS[image.name] = executor.submit(image.build)

        # the FORMATTER.progress(THREADS) function call also waits until all threads have completed
        FORMATTER.progress(THREADS)

        for image in example_images:
            THREADS[image.name] = executor.submit(image.build)

        # the FORMATTER.progress(THREADS) function call also waits until all threads have completed
        FORMATTER.progress(THREADS)

        FORMATTER.title("Build Logs")

        if not os.path.isdir("logs"):
            os.makedirs("logs")

        for image in IMAGES:
            FORMATTER.title(image.name)
            FORMATTER.table(image.info.items())
            FORMATTER.separator()
            FORMATTER.print_lines(image.log)
            with open(f"logs/{image.name}", "w") as fp:
                fp.write("\n".join(image.log))
            image.summary["log"] = f"logs/{image.name}"

        FORMATTER.title("Summary")

        for image in IMAGES:
            FORMATTER.title(image.name)
            FORMATTER.table(image.summary.items())

        FORMATTER.title("Errors")
        is_any_build_failed = False
        is_any_build_failed_size_limit = False
        for image in IMAGES:
            if image.build_status == constants.FAIL:
                FORMATTER.title(image.name)
                FORMATTER.print_lines(image.log[-10:])
                is_any_build_failed = True
            else:
                if image.build_status == constants.FAIL_IMAGE_SIZE_LIMIT:
                    is_any_build_failed_size_limit = True
        if is_any_build_failed:
            raise Exception("Build failed")
        else:
            if is_any_build_failed_size_limit:
                FORMATTER.print("Build failed. Image size limit breached.")
            else:
                FORMATTER.print("No errors")

        FORMATTER.title("Uploading Metrics")
        metrics = Metrics(
            context=constants.BUILD_CONTEXT,
            region=BUILDSPEC["region"],
            namespace=constants.METRICS_NAMESPACE,
        )
        for image in IMAGES:
            try:
                metrics.push_image_metrics(image)
            except Exception as e:
                if is_any_build_failed or is_any_build_failed_size_limit:
                    raise Exception(f"Build failed. {e}")
                else:
                    raise Exception(f"Build passed. {e}")

        if is_any_build_failed_size_limit:
            raise Exception("Build failed because the image size limit was breached")

        FORMATTER.separator()

    # Set environment variables to be consumed by test jobs
    test_trigger_job = utils.get_codebuild_project_name()
    utils.set_test_env(
        IMAGES,
        BUILD_CONTEXT=os.getenv("BUILD_CONTEXT"),
        TEST_TRIGGER=test_trigger_job,
    )