def __init__(
    self,
    file_system_id,
    file_system_type,
    directory_path,
    num_records,
    feature_dim,
    file_system_access_mode="ro",
    channel="train",
):
    """Create a ``FileSystemRecordSet`` bound to a file-system data source.

    Args:
        file_system_id (str): Amazon file system ID starting with 'fs-'.
        file_system_type (str): Type of file system used for the input.
            Valid values: 'EFS', 'FSxLustre'.
        directory_path (str): Absolute or normalized path to the root
            directory (mount point) in the file system. Reference:
            https://docs.aws.amazon.com/efs/latest/ug/mounting-fs.html and
            https://docs.aws.amazon.com/efs/latest/ug/wt1-test.html
        num_records (int): Number of records in the set.
        feature_dim (int): Dimensionality of "values" arrays in the Record
            features, and label (if each Record is labeled).
        file_system_access_mode (str): Permissions for read and write.
            Valid values: 'ro' or 'rw'. Defaults to 'ro'.
        channel (str): The SageMaker Training Job channel this RecordSet
            should be bound to.
    """
    # Validation of the file-system fields is delegated to FileSystemInput.
    self.file_system_input = FileSystemInput(
        file_system_id, file_system_type, directory_path, file_system_access_mode
    )
    self.channel = channel
    self.num_records = num_records
    self.feature_dim = feature_dim
def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Train MNIST from an EFS-backed channel and check checkpoint files in S3."""
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        py_version=PY_VERSION,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )
    efs_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_efs_id"],
        file_system_type="EFS",
        directory_path=EFS_DIR_PATH,
        content_type="application/json",
    )
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        tf_estimator.fit(
            inputs=efs_input, job_name=unique_name_from_base("test-mnist-efs")
        )
    assert_s3_files_exist(
        sagemaker_session,
        tf_estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
def test_file_system_input_content_type():
    """``FileSystemInput.config`` carries ContentType when one is supplied."""
    fs_input = FileSystemInput(
        file_system_id="fs-0a48d2a1",
        file_system_type="FSxLustre",
        directory_path="tensorflow",
        file_system_access_mode="rw",
        content_type="application/json",
    )
    assert fs_input.config == {
        "DataSource": {
            "FileSystemDataSource": {
                "FileSystemId": "fs-0a48d2a1",
                "FileSystemType": "FSxLustre",
                "DirectoryPath": "tensorflow",
                "FileSystemAccessMode": "rw",
            }
        },
        "ContentType": "application/json",
    }
def launch_sagemaker_job(
    hyperparameters: Dict[str, Any],
    job_name: str,
    source_dir: str,
    entry_point: str,
    instance_type: str,
    instance_count: int,
    role: str,
    image_name: str,
    fsx_id: str,
    fsx_mount_name: str,
    subnet_ids: List[str],
    security_group_ids: List[str],
) -> None:
    """Create and launch a SageMaker training job connected to FSx and Horovod.

    Args:
        hyperparameters: Hyperparameters forwarded to the training script.
        job_name: Base name for the SageMaker training job.
        source_dir: Directory containing the training code.
        entry_point: Training script within ``source_dir``.
        instance_type: One of 'ml.p3dn.24xlarge', 'ml.p3.16xlarge',
            'ml.g4dn.12xlarge' (determines Horovod processes per host).
        instance_count: Number of training instances.
        role: IAM role assumed by the training job.
        image_name: Training container image URI.
        fsx_id: FSx for Lustre file system ID.
        fsx_mount_name: FSx mount name, without a leading '/'.
        subnet_ids: VPC subnets for the training instances.
        security_group_ids: Security groups granting FSx access.

    Raises:
        ValueError: If ``fsx_mount_name`` starts with '/'.
        KeyError: If ``instance_type`` is not a supported instance type.
    """
    # Fix: the original used `assert`, which is stripped under `python -O`,
    # so the validation could silently vanish; raise explicitly instead.
    # `startswith` also avoids an IndexError on an empty mount name.
    if fsx_mount_name.startswith("/"):
        raise ValueError("fsx_mount_name should not start with a '/'")

    # Horovod process count per host == GPUs on the chosen instance type.
    hvd_processes_per_host = {
        "ml.p3dn.24xlarge": 8,
        "ml.p3.16xlarge": 8,
        "ml.g4dn.12xlarge": 4,
    }[instance_type]
    distributions = {
        "mpi": {
            "enabled": True,
            "processes_per_host": hvd_processes_per_host,
            "custom_mpi_options": "-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none",
        }
    }

    # Mount the FSx file system read-write inside the training containers.
    fsx_input = FileSystemInput(
        file_system_id=fsx_id,
        file_system_type="FSxLustre",
        directory_path=f"/{fsx_mount_name}",
        file_system_access_mode="rw",
    )

    # Create the job template.
    estimator_hvd = TensorFlow(
        base_job_name=job_name,
        entry_point=entry_point,
        source_dir=source_dir,
        role=role,
        framework_version="2.1.0",
        py_version="py3",
        hyperparameters=hyperparameters,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        distributions=distributions,
        image_name=image_name,
        subnets=subnet_ids,
        security_group_ids=security_group_ids,
        enable_sagemaker_metrics=True,
        train_max_run=2419200,  # 28 days — the SageMaker maximum run time
    )

    # Launch the job with the FSx channel as input.
    estimator_hvd.fit(fsx_input)
def test_format_string_uri_file_system_input():
    """``_Job._format_string_uri_input`` passes a FileSystemInput through unchanged."""
    fs_input = FileSystemInput(
        file_system_id="fs-fd85e556",
        file_system_type="EFS",
        directory_path="ipinsights",
    )
    assert _Job._format_string_uri_input(fs_input) == fs_input
def test_file_system_input_type_invalid():
    """An unknown file system type is rejected with a descriptive ValueError."""
    with pytest.raises(ValueError) as excinfo:
        FileSystemInput(
            file_system_id="fs-0a48d2a1",
            file_system_type="ABC",
            directory_path="tensorflow",
        )
    expected = "Unrecognized file system type: ABC. Valid values: FSxLustre, EFS."
    assert str(excinfo.value) == expected
def test_tuning_tf_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Run a small TF hyperparameter tuning job reading data from FSx Lustre."""
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )
    objective_metric_name = "accuracy"
    tuner = HyperparameterTuner(
        tf_estimator,
        objective_metric_name,
        {"epochs": IntegerParameter(1, 2)},
        [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}],
        max_jobs=MAX_JOBS,
        max_parallel_jobs=MAX_PARALLEL_JOBS,
    )
    lustre_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_fsx_id"],
        file_system_type="FSxLustre",
        directory_path=FSX_DIR_PATH,
    )
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner.fit(
            lustre_input,
            job_name=unique_name_from_base(
                "test-tuning-tf-script-mode-lustre", max_length=32
            ),
        )
        # Give the tuning job a moment to register before blocking on it.
        time.sleep(15)
        tuner.wait()
    assert tuner.best_training_job()
def test_file_system_input_mode_invalid():
    """An unknown access mode is rejected with a descriptive ValueError."""
    with pytest.raises(ValueError) as excinfo:
        FileSystemInput(
            file_system_id="fs-0a48d2a1",
            file_system_type="EFS",
            directory_path="tensorflow",
            file_system_access_mode="p",
        )
    expected = "Unrecognized file system access mode: p. Valid values: ro, rw."
    assert str(excinfo.value) == expected
def launch_sagemaker_job(
    job_name: str,
    source_dir: str,
    entry_point: str,
    instance_type: str,
    instance_count: int,
    hyperparameters: Dict[str, Any],
) -> None:
    """Create a SageMaker job connected to FSx and Horovod."""
    # Horovod process count per host mirrors the GPU count per instance type.
    processes_per_host_by_type = {
        "ml.p3dn.24xlarge": 8,
        "ml.p3.16xlarge": 8,
        "ml.g4dn.12xlarge": 4,
    }
    distributions = {
        "mpi": {
            "enabled": True,
            "processes_per_host": processes_per_host_by_type[instance_type],
            "custom_mpi_options": "-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none",
        }
    }

    # Create the read-write FSx input channel.
    fsx_input = FileSystemInput(
        file_system_id=FSX_ID,
        file_system_type="FSxLustre",
        directory_path="/fsx",
        file_system_access_mode="rw",
    )

    # Create the job template.
    estimator_hvd = TensorFlow(
        base_job_name=job_name,
        entry_point=entry_point,
        source_dir=source_dir,
        role=ROLE,
        framework_version="2.1.0",
        py_version="py3",
        hyperparameters=hyperparameters,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        distributions=distributions,
        image_name=IMAGE_NAME,
        subnets=SUBNETS,
        security_group_ids=SECURITY_GROUP_IDS,
        enable_sagemaker_metrics=True,
    )

    # Launch the job.
    estimator_hvd.fit(fsx_input)
def test_file_system_input_default_access_mode():
    """When no access mode is given, FileSystemInput defaults to read-only."""
    fs_input = FileSystemInput(
        file_system_id="fs-0a48d2a1",
        file_system_type="EFS",
        directory_path="tensorflow",
    )
    assert fs_input.config == {
        "DataSource": {
            "FileSystemDataSource": {
                "FileSystemId": "fs-0a48d2a1",
                "FileSystemType": "EFS",
                "DirectoryPath": "tensorflow",
                "FileSystemAccessMode": "ro",
            }
        }
    }
def test_mnist_efs(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Train MNIST from an EFS-backed channel and check checkpoint patterns in S3."""
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )
    efs_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_efs_id"],
        file_system_type="EFS",
        directory_path=EFS_DIR_PATH,
        content_type="application/json",
    )
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        tf_estimator.fit(
            inputs=efs_input, job_name=unique_name_from_base("test-mnist-efs")
        )
    assert_s3_file_patterns_exist(
        sagemaker_session,
        tf_estimator.model_dir,
        [r"model\.ckpt-\d+\.index", r"checkpoint"],
    )
def test_file_system_input_all_arguments():
    """All explicit FileSystemInput arguments are reflected in its config."""
    fs_input = FileSystemInput(
        file_system_id="fs-0a48d2a1",
        file_system_type="FSxLustre",
        directory_path="tensorflow",
        file_system_access_mode="rw",
    )
    assert fs_input.config == {
        "DataSource": {
            "FileSystemDataSource": {
                "FileSystemId": "fs-0a48d2a1",
                "FileSystemType": "FSxLustre",
                "DirectoryPath": "tensorflow",
                "FileSystemAccessMode": "rw",
            }
        }
    }
def test_mnist_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Train MNIST from an FSx-Lustre-backed channel and check artifacts in S3."""
    tf_estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )
    lustre_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_fsx_id"],
        file_system_type="FSxLustre",
        directory_path=FSX_DIR_PATH,
    )
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        tf_estimator.fit(
            inputs=lustre_input, job_name=unique_name_from_base("test-mnist-lustre")
        )
    assert_s3_files_exist(
        sagemaker_session,
        tf_estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
region = get_str("echo $(aws configure get region)") image = str(sys.argv[1]) sess = sage.Session() image_name=f"{account}.dkr.ecr.{region}.amazonaws.com/{image}" sagemaker_iam_role = str(sys.argv[2]) num_gpus = 8 num_nodes = 4 instance_type = 'ml.p3.16xlarge' custom_mpi_cmds = [] job_name = "maskrcnn-{}x{}-{}".format(num_nodes, num_gpus, image) output_path = 's3://mrcnn-sagemaker/sagemaker_training_release' lustre_input = FileSystemInput(file_system_id='fs-03f556d03c3c590a2', file_system_type='FSxLustre', directory_path='/fsx', file_system_access_mode='ro') hyperparams = {"sagemaker_use_mpi": "True", "sagemaker_process_slots_per_host": num_gpus, "num_gpus":num_gpus, "num_nodes": num_nodes, "custom_mpi_cmds": custom_mpi_cmds} estimator = Estimator(image_name, role=sagemaker_iam_role, output_path=output_path, train_instance_count=num_nodes, train_instance_type=instance_type, sagemaker_session=sess, train_volume_size=200, base_job_name=job_name, subnets=['subnet-21ac2f2e'],
def handler(event, context):
    """Lambda entry point: launch a SageMaker PyTorch training job whose
    input data is served from an FSx for Lustre file system.

    Reads 'trainId' and 'useSpot' from ``event``, resolves the training and
    embedding configuration, stages the training script from S3, then starts
    the job asynchronously and records the job name.
    """
    trainId = event['trainId']
    # Spot training is the default; only the literal string 'false' disables it.
    useSpot = event['useSpot'].lower() != 'false'
    uniqueId = su.uuid()

    # Resolve the training record and its embedding configuration.
    trainingConfigurationClient = bioims.client('training-configuration')
    trainInfo = trainingConfigurationClient.getTraining(trainId)
    embeddingInfo = trainingConfigurationClient.getEmbeddingInfo(
        trainInfo['embeddingName']
    )

    # Stage the training script from S3 into Lambda-writable local storage.
    localTrainingScript = '/tmp/bioims-training-script.py'
    getS3TextObjectWriteToPath(
        embeddingInfo['modelTrainingScriptBucket'],
        embeddingInfo['modelTrainingScriptKey'],
        localTrainingScript,
    )
    trainListArtifactKey = bp.getTrainImageListArtifactPath(trainId)

    sagemaker_session = sagemaker.Session()
    sagemaker_bucket = sagemaker_session.default_bucket()
    sagemaker_role = sagemaker.get_execution_role()

    instance_type = embeddingInfo['trainingInstanceType']
    trainingHyperparameters = embeddingInfo['trainingHyperparameters']

    fsxInfo = getFsxInfo()
    print(fsxInfo)
    directory_path = '/' + fsxInfo['mountName']
    sgIds = [fsxInfo['securityGroup']]

    jobName = 'bioims-' + trainId + '-' + uniqueId
    checkpoint_s3_uri = "s3://" + sagemaker_bucket + "/checkpoints/" + jobName

    # Read-only FSx Lustre channel holding the training data.
    file_system_input = FileSystemInput(
        file_system_id=fsxInfo['fsxId'],
        file_system_type='FSxLustre',
        directory_path=directory_path,
        file_system_access_mode='ro',
    )
    trainingHyperparameters['train_list_file'] = trainListArtifactKey

    # Keyword arguments shared by the spot and on-demand estimator variants.
    common_kwargs = dict(
        entry_point=localTrainingScript,
        role=sagemaker_role,
        framework_version='1.6.0',
        instance_count=1,
        instance_type=instance_type,
        py_version='py36',
        image_name='763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04',
        subnets=fsxInfo['subnetIds'],
        security_group_ids=sgIds,
        hyperparameters=trainingHyperparameters,
        checkpoint_s3_uri=checkpoint_s3_uri,
        debugger_hook_config=False,
    )
    if useSpot:
        estimator = PyTorch(
            train_use_spot_instances=True,
            train_max_wait=100000,
            train_max_run=100000,
            **common_kwargs,
        )
    else:
        estimator = PyTorch(train_use_spot_instances=False, **common_kwargs)

    # Record the job name before launching so the training record stays in sync.
    trainingConfigurationClient.updateTraining(trainId, 'sagemakerJobName', jobName)
    estimator.fit(file_system_input, wait=False, job_name=jobName)

    return {'statusCode': 200, 'body': {'trainingJobName': jobName}}