コード例 #1
0
def test_mnist_distributed(
    sagemaker_session,
    instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Fit the MNIST script on two instances and check the model artifacts.

    The estimator is built with ``instance_count=2``,
    ``distribution=PARAMETER_SERVER_DISTRIBUTION`` and the profiler disabled.
    Once ``fit`` returns, ``assert_s3_files_exist`` is called with
    ``estimator.model_dir`` and the expected graph/checkpoint file names.
    """
    estimator_kwargs = {
        "entry_point": SCRIPT,
        "role": ROLE,
        "instance_count": 2,
        "instance_type": instance_type,
        "sagemaker_session": sagemaker_session,
        "framework_version": tensorflow_training_latest_version,
        "py_version": tensorflow_training_latest_py_version,
        "distribution": PARAMETER_SERVER_DISTRIBUTION,
        "disable_profiler": True,
    }
    estimator = TensorFlow(**estimator_kwargs)

    # Upload the local MNIST data directory through the estimator's session.
    data_dir = os.path.join(MNIST_RESOURCE_PATH, "data")
    inputs = estimator.sagemaker_session.upload_data(
        path=data_dir, key_prefix="scriptmode/distributed_mnist"
    )

    # Only the training call itself runs under the timeout guard.
    time_limit = tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES
    with tests.integ.timeout.timeout(minutes=time_limit):
        job_name = unique_name_from_base("test-tf-sm-distributed")
        estimator.fit(inputs=inputs, job_name=job_name)

    expected_files = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, estimator.model_dir, expected_files)
コード例 #2
0
def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Fit the MNIST script using an EFS ``FileSystemInput`` as training input.

    The IAM role name, subnet id, security group ids and EFS file system id
    are all read from the ``efs_fsx_setup`` mapping. After training,
    ``assert_s3_files_exist`` is called with ``estimator.model_dir`` and the
    expected graph/checkpoint file names.
    """
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        py_version=PY_VERSION,
        # The API takes a list of subnets; the setup mapping provides one id.
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )

    efs_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_efs_id"],
        file_system_type="EFS",
        directory_path=EFS_DIR_PATH,
        content_type="application/json",
    )

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        job_name = unique_name_from_base("test-mnist-efs")
        estimator.fit(inputs=efs_input, job_name=job_name)

    expected_files = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, estimator.model_dir, expected_files)
コード例 #3
0
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Fit a ``KMeans`` estimator on an FSx for Lustre record set.

    The whole test body, including estimator construction and the final
    artifact check, runs inside the training timeout guard. The role name,
    subnet id, security group ids and FSx file system id come from the
    ``efs_fsx_setup`` mapping.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator = KMeans(
            role=efs_fsx_setup["role_name"],
            instance_count=INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=K,
            sagemaker_session=sagemaker_session,
            subnets=[efs_fsx_setup["subnet_id"]],
            security_group_ids=efs_fsx_setup["security_group_ids"],
        )

        record_set = FileSystemRecordSet(
            file_system_id=efs_fsx_setup["file_system_fsx_id"],
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        estimator.fit(record_set, job_name=unique_name_from_base("kmeans-fsx"))

        # Drop the last path component of the model URI to get its S3 prefix.
        model_prefix, _artifact = estimator.model_data.rsplit("/", 1)
        assert_s3_files_exist(sagemaker_session, model_prefix, ["model.tar.gz"])
コード例 #4
0
def test_mnist(sagemaker_session, instance_type):
    """Fit the MNIST script on one instance and check artifacts and metrics.

    Besides calling ``assert_s3_files_exist`` for the graph/checkpoint files
    under ``estimator.model_dir``, the test asserts that the training job
    analytics dataframe is non-empty. The estimator registers a single
    ``train:global_steps`` metric definition.
    """
    global_steps_metric = {
        "Name": "train:global_steps",
        "Regex": r"global_step\/sec:\s(.*)",
    }
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        py_version=tests.integ.PYTHON_VERSION,
        metric_definitions=[global_steps_metric],
    )

    # Upload the local MNIST data directory through the estimator's session.
    data_dir = os.path.join(MNIST_RESOURCE_PATH, "data")
    inputs = estimator.sagemaker_session.upload_data(
        path=data_dir, key_prefix="scriptmode/mnist"
    )

    time_limit = tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES
    with tests.integ.timeout.timeout(minutes=time_limit):
        job_name = unique_name_from_base("test-tf-sm-mnist")
        estimator.fit(inputs=inputs, job_name=job_name)

    expected_files = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, estimator.model_dir, expected_files)

    analytics_frame = estimator.training_job_analytics.dataframe()
    assert analytics_frame.size > 0
コード例 #5
0
def test_mnist_with_checkpoint_config(
    sagemaker_session,
    instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Fit the MNIST script with checkpointing and environment variables set.

    The estimator is given a checkpoint S3 URI (built from the session's
    default bucket and ``sagemaker_timestamp()``), a local checkpoint path,
    and ``ENV_INPUT`` as its environment. After training the test:

    * calls ``assert_s3_files_exist`` for the graph/checkpoint files under
      ``estimator.model_dir``;
    * describes the training job and asserts that its ``CheckpointConfig``
      and ``Environment`` entries match what was passed to the estimator.
    """
    checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format(
        sagemaker_session.default_bucket(), sagemaker_timestamp()
    )
    checkpoint_local_path = "/test/checkpoint/path"
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
        environment=ENV_INPUT,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
    )

    # The job name is kept so the job can be described after training.
    training_job_name = unique_name_from_base("test-tf-sm-mnist")
    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=training_job_name)
    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
    # remove dataframe assertion to unblock PR build
    # TODO: add independent integration test for `training_job_analytics`

    expected_training_checkpoint_config = {
        "S3Uri": checkpoint_s3_uri,
        "LocalPath": checkpoint_local_path,
    }
    # Describe the job once and read both fields from the same response
    # instead of issuing a separate API call per field.
    training_job_description = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )
    actual_training_checkpoint_config = training_job_description["CheckpointConfig"]
    actual_training_environment_variable_config = training_job_description["Environment"]
    assert actual_training_checkpoint_config == expected_training_checkpoint_config
    assert actual_training_environment_variable_config == ENV_INPUT
コード例 #6
0
def test_mnist_with_checkpoint_config(sagemaker_session, instance_type,
                                      tf_full_version, py_version):
    """Fit the MNIST script with a checkpoint configuration and verify it.

    After training the test calls ``assert_s3_files_exist`` for the
    graph/checkpoint files under ``estimator.model_dir``, asserts that the
    training job analytics dataframe is non-empty, and asserts that the
    ``CheckpointConfig`` of the described training job matches the S3 URI
    and local path passed to the estimator.
    """
    bucket = sagemaker_session.default_bucket()
    checkpoint_s3_uri = "s3://{}/checkpoints/tf-{}".format(
        bucket, sagemaker_timestamp())
    checkpoint_local_path = "/test/checkpoint/path"

    global_steps_metric = {
        "Name": "train:global_steps",
        "Regex": r"global_step\/sec:\s(.*)",
    }
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=tf_full_version,
        py_version=py_version,
        metric_definitions=[global_steps_metric],
        checkpoint_s3_uri=checkpoint_s3_uri,
        checkpoint_local_path=checkpoint_local_path,
    )

    # Upload the local MNIST data directory through the estimator's session.
    data_dir = os.path.join(MNIST_RESOURCE_PATH, "data")
    inputs = estimator.sagemaker_session.upload_data(
        path=data_dir, key_prefix="scriptmode/mnist")

    # The job name is kept so the job can be described after training.
    training_job_name = unique_name_from_base("test-tf-sm-mnist")
    time_limit = tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES
    with tests.integ.timeout.timeout(minutes=time_limit):
        estimator.fit(inputs=inputs, job_name=training_job_name)

    expected_files = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, estimator.model_dir,
                          expected_files)

    analytics_frame = estimator.training_job_analytics.dataframe()
    assert analytics_frame.size > 0

    expected_checkpoint_config = {
        "S3Uri": checkpoint_s3_uri,
        "LocalPath": checkpoint_local_path,
    }
    job_description = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name)
    actual_checkpoint_config = job_description["CheckpointConfig"]
    assert actual_checkpoint_config == expected_checkpoint_config
コード例 #7
0
def test_mnist_lustre(
    efs_fsx_setup,
    sagemaker_session,
    cpu_instance_type,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Fit the MNIST script using an FSx for Lustre ``FileSystemInput``.

    The IAM role name, subnet id, security group ids and FSx file system id
    are all read from the ``efs_fsx_setup`` mapping. After training,
    ``assert_s3_files_exist`` is called with ``estimator.model_dir`` and the
    expected graph/checkpoint file names.
    """
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=efs_fsx_setup["role_name"],
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        # The API takes a list of subnets; the setup mapping provides one id.
        subnets=[efs_fsx_setup["subnet_id"]],
        security_group_ids=efs_fsx_setup["security_group_ids"],
    )

    lustre_input = FileSystemInput(
        file_system_id=efs_fsx_setup["file_system_fsx_id"],
        file_system_type="FSxLustre",
        directory_path=FSX_DIR_PATH,
    )

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        job_name = unique_name_from_base("test-mnist-lustre")
        estimator.fit(inputs=lustre_input, job_name=job_name)

    expected_files = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
    assert_s3_files_exist(sagemaker_session, estimator.model_dir, expected_files)