Code example #1
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, "pytorch_source_dirs")
    lib = os.path.join(str(tmpdir), "alexa.py")

    with open(lib, "w") as f:
        f.write("def question(to_anything): return 42")

    # TODO: fails on newer versions of pytorch in call to np.load(BytesIO(stream.read()))
    # "ValueError: Cannot load file containing pickled data when allow_pickle=False"
    estimator = PyTorch(
        entry_point="train.py",
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=[lib],
        # hard-code to last known good pytorch for now (see TODO above)
        framework_version="0.4",
        py_version="py3",
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    estimator.fit()

    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
    with lock.lock():
        try:
            predictor = estimator.deploy(initial_instance_count=1,
                                         instance_type="local")
            predict_response = predictor.predict([7])
            assert predict_response == [49]
        finally:
            predictor.delete_endpoint()
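The lock.lock() context manager used above is a test utility whose implementation is not shown; local-mode endpoints all bind the same port, so tests that deploy one must be serialized. A minimal sketch of such a helper, assuming a POSIX fcntl-based file lock (names hypothetical):

import contextlib
import fcntl
import os
import tempfile

DEFAULT_LOCK_PATH = os.path.join(tempfile.gettempdir(), "sagemaker_local_mode.lock")

@contextlib.contextmanager
def lock(path=DEFAULT_LOCK_PATH):
    # Hold an exclusive advisory lock on a shared file so that only one
    # local endpoint test runs at a time.
    with open(path, "w") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)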
Code example #2
def test_github(sagemaker_local_session):
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "pytorch_mnist")
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}
    pytorch = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="pytorch",
        framework_version=PYTORCH_VERSION,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    pytorch.fit({
        "training": "file://" + os.path.join(data_path, "training", MNIST_FOLDER_NAME)
    })

    with lock.lock(LOCK_PATH):
        try:
            predictor = pytorch.deploy(initial_instance_count=1,
                                       instance_type="local")
            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
            result = predictor.predict(data)
            assert result is not None
        finally:
            predictor.delete_endpoint()
Code example #3
def test_github(sagemaker_local_session, pytorch_inference_latest_version,
                pytorch_inference_latest_py_version):
    script_path = "mnist.py"
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}

    pytorch = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="pytorch",
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    data_path = os.path.join(DATA_DIR, "pytorch_mnist")
    pytorch.fit({"training": "file://" + os.path.join(data_path, "training")})

    with lock.lock(LOCK_PATH):
        try:
            predictor = pytorch.deploy(initial_instance_count=1,
                                       instance_type="local")
            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
            result = predictor.predict(data)
            assert len(result[0]) == 10  # check that there is a probability for each label
        finally:
            predictor.delete_endpoint()
Code example #4
def test_pytorch_airflow_config_uploads_data_source_to_s3_when_inputs_not_provided(
    sagemaker_session,
    cpu_instance_type,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        estimator = PyTorch(
            entry_point=PYTORCH_MNIST_SCRIPT,
            role=ROLE,
            framework_version=pytorch_inference_latest_version,
            py_version=pytorch_inference_latest_py_version,
            instance_count=2,
            instance_type=cpu_instance_type,
            hyperparameters={
                "epochs": 6,
                "backend": "gloo"
            },
            sagemaker_session=sagemaker_session,
        )

        training_config = _build_airflow_workflow(
            estimator=estimator, instance_type=cpu_instance_type)

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["HyperParameters"]
            ["sagemaker_submit_directory"].strip('"'),
        )
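_build_airflow_workflow and _assert_that_s3_url_contains_data are test helpers defined elsewhere and not shown. A plausible sketch of the assertion helper, assuming it simply checks that at least one object exists under the given S3 URL:

from urllib.parse import urlparse

def _assert_that_s3_url_contains_data(sagemaker_session, s3_url):
    # Hypothetical implementation: list the prefix and assert it is non-empty.
    parsed = urlparse(s3_url)
    s3_client = sagemaker_session.boto_session.client("s3")
    response = s3_client.list_objects_v2(
        Bucket=parsed.netloc, Prefix=parsed.path.lstrip("/")
    )
    assert response["KeyCount"] > 0, f"no objects found at {s3_url}"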
Code example #5
def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
    training_job_name = ""
    # TODO: add tests against local mode when it's ready to be used
    instance_type = 'ml.p2.xlarge'

    with timeout(minutes=10):
        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)

        pytorch.fit({'training': _upload_training_data(pytorch)}, wait=False)
        training_job_name = pytorch.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    if not _is_local_mode(instance_type):
        endpoint_name = 'test-pytorch-async-fit-attach-deploy-{}'.format(sagemaker_timestamp())

        with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
            print("Re-attaching now to: %s" % training_job_name)
            estimator = PyTorch.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
            predictor = estimator.deploy(1, instance_type, endpoint_name=endpoint_name)

            batch_size = 100
            data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
            output = predictor.predict(data)

            assert output.shape == (batch_size, 10)
Code example #6
def _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type='ml.c4.xlarge',
                           entry_point=MNIST_SCRIPT):
    return PyTorch(entry_point=entry_point, role='SageMakerRole',
                   framework_version=pytorch_full_version,
                   py_version=PYTHON_VERSION, train_instance_count=1,
                   train_instance_type=instance_type,
                   sagemaker_session=sagemaker_session)
Code example #7
def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
    training_job_name = ""
    # TODO: add tests against local mode when it's ready to be used
    instance_type = 'ml.p2.xlarge'

    with timeout(minutes=10):
        pytorch = _get_pytorch_estimator(sagemaker_session,
                                         pytorch_full_version, instance_type)

        pytorch.fit({'training': _upload_training_data(pytorch)}, wait=False)
        training_job_name = pytorch.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" %
              training_job_name)
        time.sleep(20)

    if not _is_local_mode(instance_type):
        endpoint_name = 'test-pytorch-async-fit-attach-deploy-{}'.format(
            sagemaker_timestamp())

        with timeout_and_delete_endpoint_by_name(endpoint_name,
                                                 sagemaker_session):
            print("Re-attaching now to: %s" % training_job_name)
            estimator = PyTorch.attach(training_job_name=training_job_name,
                                       sagemaker_session=sagemaker_session)
            predictor = estimator.deploy(1,
                                         instance_type,
                                         endpoint_name=endpoint_name)

            batch_size = 100
            data = numpy.random.rand(batch_size, 1, 28,
                                     28).astype(numpy.float32)
            output = predictor.predict(data)

            assert output.shape == (batch_size, 10)
Code example #8
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, 'pytorch_source_dirs')
    lib = os.path.join(str(tmpdir), 'alexa.py')

    with open(lib, 'w') as f:
        f.write('def question(to_anything): return 42')

    estimator = PyTorch(entry_point='train.py',
                        role='SageMakerRole',
                        source_dir=source_dir,
                        dependencies=[lib],
                        py_version=PYTHON_VERSION,
                        train_instance_count=1,
                        train_instance_type='local',
                        sagemaker_session=sagemaker_local_session)
    try:
        estimator.fit()

        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type='local')

        predict_response = predictor.predict([7])

        assert predict_response == [49]
    finally:
        estimator.delete_endpoint()
Code example #9
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, "pytorch_source_dirs")
    lib = os.path.join(str(tmpdir), "alexa.py")

    with open(lib, "w") as f:
        f.write("def question(to_anything): return 42")

    estimator = PyTorch(
        entry_point="train.py",
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=[lib],
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    estimator.fit()

    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
    with lock.lock():
        try:
            predictor = estimator.deploy(initial_instance_count=1, instance_type="local")
            predict_response = predictor.predict([7])
            assert predict_response == [49]
        finally:
            estimator.delete_endpoint()
Code example #10
def test_tuning_step_with_single_algo_tuner(pipeline_session, entry_point):
    inputs = TrainingInput(
        s3_data=f"s3://{pipeline_session.default_bucket()}/training-data")

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=sagemaker.get_execution_role(),
        framework_version="1.5.0",
        py_version="py3",
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=pipeline_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
    )

    hyperparameter_ranges = {
        "batch-size": IntegerParameter(64, 128),
    }

    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name="test:acc",
        objective_type="Maximize",
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=[{
            "Name": "test:acc",
            "Regex": "Overall test accuracy: (.*?);"
        }],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with warnings.catch_warnings(record=True) as w:
        step_args = tuner.fit(inputs=inputs)
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Running within a PipelineSession" in str(w[-1].message)

    with warnings.catch_warnings(record=True) as w:
        step = TuningStep(
            name="MyTuningStep",
            step_args=step_args,
        )
        assert len(w) == 0

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )

    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": step_args,
    }
Code example #11
def test_jumpstart_catboost_image_uri(patched_get_model_specs, session):

    patched_get_model_specs.side_effect = get_prototype_model_spec

    model_id, model_version = "catboost-classification-model", "*"
    instance_type = "ml.p2.xlarge"
    region = "us-west-2"

    model_specs = accessors.JumpStartModelsAccessor.get_model_specs(
        region, model_id, model_version)

    # inference
    uri = image_uris.retrieve(
        framework=None,
        region=region,
        image_scope="inference",
        model_id=model_id,
        model_version=model_version,
        instance_type=instance_type,
    )

    framework_class_uri = PyTorchModel(
        role="mock_role",
        model_data="mock_data",
        entry_point="mock_entry_point",
        framework_version=model_specs.hosting_ecr_specs.framework_version,
        py_version=model_specs.hosting_ecr_specs.py_version,
        sagemaker_session=session,
    ).serving_image_uri(region, instance_type)

    assert uri == framework_class_uri
    assert uri == "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:1.9.0-gpu-py38"

    # training
    uri = image_uris.retrieve(
        framework=None,
        region=region,
        image_scope="training",
        model_id=model_id,
        model_version=model_version,
        instance_type=instance_type,
    )

    framework_class_uri = PyTorch(
        role="mock_role",
        entry_point="mock_entry_point",
        framework_version=model_specs.training_ecr_specs.framework_version,
        py_version=model_specs.training_ecr_specs.py_version,
        instance_type=instance_type,
        instance_count=1,
        sagemaker_session=session,
    ).training_image_uri(region=region)

    assert uri == framework_class_uri
    assert uri == "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.9.0-gpu-py38"
Code example #12
def _get_pytorch_estimator(
    sagemaker_session, pytorch_version, py_version, instance_type, entry_point=MNIST_SCRIPT
):
    return PyTorch(
        entry_point=entry_point,
        role="SageMakerRole",
        framework_version=pytorch_version,
        py_version=py_version,
        instance_count=1,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
    )
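A hypothetical invocation of this v2-style helper, borrowing the version fixtures and the _upload_training_data helper that appear in the other examples on this page (none of these names come from this snippet's own source):

# Hypothetical usage; fixture names and _upload_training_data are assumed
# from the surrounding examples.
estimator = _get_pytorch_estimator(
    sagemaker_session,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
    "ml.c4.xlarge",
)
estimator.fit({"training": _upload_training_data(estimator)}, wait=False)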
Code example #13
def test_pytorch_airflow_config_uploads_data_source_to_s3_when_inputs_not_provided(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        estimator = PyTorch(
            entry_point=PYTORCH_MNIST_SCRIPT,
            role=ROLE,
            framework_version="1.1.0",
            train_instance_count=2,
            train_instance_type=cpu_instance_type,
            hyperparameters={
                "epochs": 6,
                "backend": "gloo"
            },
        )

        train_config = sm_airflow.training_config(estimator=estimator)

        uploaded_s3_data = train_config["HyperParameters"][
            "sagemaker_submit_directory"].strip('"')

        transform_config = sm_airflow.transform_config_from_estimator(
            estimator=estimator,
            task_id="transform_config",
            task_type="training",
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            data=uploaded_s3_data,
            content_type="text/csv",
        )

        default_args = {
            "owner": "airflow",
            "start_date": airflow.utils.dates.days_ago(2),
            "provide_context": True,
        }

        dag = DAG("tensorflow_example",
                  default_args=default_args,
                  schedule_interval="@once")

        train_op = SageMakerTrainingOperator(task_id="tf_training",
                                             config=train_config,
                                             wait_for_completion=True,
                                             dag=dag)

        transform_op = SageMakerTransformOperator(task_id="transform_operator",
                                                  config=transform_config,
                                                  wait_for_completion=True,
                                                  dag=dag)

        transform_op.set_upstream(train_op)

        _assert_that_s3_url_contains_data(sagemaker_session, uploaded_s3_data)
Code example #14
def test_fit_deploy(sagemaker_local_session, pytorch_full_version):
    pytorch = PyTorch(
        entry_point=MNIST_SCRIPT,
        role="SageMakerRole",
        framework_version=pytorch_full_version,
        py_version="py3",
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )

    pytorch.fit({"training": "file://" + os.path.join(MNIST_DIR, "training")})

    predictor = pytorch.deploy(1, "local")
    try:
        batch_size = 100
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
    finally:
        predictor.delete_endpoint()
Code example #15
def test_sync_fit_deploy(pytorch_training_job, sagemaker_session):
    # TODO: add tests against local mode when it's ready to be used
    endpoint_name = 'test-pytorch-sync-fit-attach-deploy{}'.format(sagemaker_timestamp())
    with timeout(minutes=20):
        estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32)
        predictor.predict(data)

        batch_size = 100
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
Code example #16
def test_sync_fit_deploy(pytorch_training_job, sagemaker_session, cpu_instance_type):
    # TODO: add tests against local mode when it's ready to be used
    endpoint_name = "test-pytorch-sync-fit-attach-deploy{}".format(sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32)
        predictor.predict(data)

        batch_size = 100
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
Code example #17
from ...sm_utils import get_sm_execution_role

ON_SAGEMAKER_NOTEBOOK = False


# preparation
sm_boto3 = boto3.client('sagemaker')
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()  # this could also be a hard-coded bucket name
print('Using bucket ' + bucket)
sm_role = get_sm_execution_role(ON_SAGEMAKER_NOTEBOOK, region)
trainpath = 's3://sagemaker-ap-southeast-2-454979696062/sagemaker/sklearncontainer/adult.csv'
pytorch_estimator = PyTorch(
    entry_point='train.py',
    source_dir=os.path.abspath(os.path.dirname(__file__)),
    role=sm_role,
    train_instance_count=1,
    train_instance_type='ml.c5.xlarge',
    framework_version='1.5.0',
    base_job_name='fastai-pytorch',
    metric_definitions=[
        {'Name': 'Dice accuracy',
         'Regex': "Dice accuracy: ([0-9.]+).*$"}],
    hyperparameters={'hidden_layer_1': 200,
                     'hidden_layer_2': 100})

# launch training job, with asynchronous call
pytorch_estimator.fit({'train': trainpath}, wait=False)
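Because fit is called with wait=False, the call returns as soon as the job is created. To block on the job or pick it up again later, it can be re-attached, just as the async examples above do (sess is the Session created earlier in this snippet):

# Re-attach to the asynchronous training job; attach blocks and streams
# logs until the job completes.
job_name = pytorch_estimator.latest_training_job.name
attached_estimator = PyTorch.attach(job_name, sagemaker_session=sess)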
Code example #18
def test_training_job_with_debugger_and_profiler(
    sagemaker_session,
    pipeline_name,
    role,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")

    rules = [
        Rule.sagemaker(rule_configs.vanishing_gradient()),
        Rule.sagemaker(base_config=rule_configs.all_zero(),
                       rule_parameters={"tensor_regex": ".*"}),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ]
    debugger_hook_config = DebuggerHookConfig(
        s3_output_path=f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors"
    )

    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    script_path = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    pytorch_estimator = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        rules=rules,
        debugger_hook_config=debugger_hook_config,
    )

    step_train = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[step_train],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]

        execution = pipeline.start()
        response = execution.describe()
        assert response["PipelineArn"] == create_arn

        try:
            execution.wait(delay=10, max_attempts=60)
        except WaiterError:
            pass
        execution_steps = execution.list_steps()

        assert len(execution_steps) == 1
        assert execution_steps[0].get("FailureReason", "") == ""
        assert execution_steps[0]["StepName"] == "pytorch-train"
        assert execution_steps[0]["StepStatus"] == "Succeeded"

        training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"]
        job_description = sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=training_job_arn.split("/")[1])

        for index, rule in enumerate(rules):
            config = job_description["DebugRuleConfigurations"][index]
            assert config["RuleConfigurationName"] == rule.name
            assert config["RuleEvaluatorImage"] == rule.image_uri
            assert config["VolumeSizeInGB"] == 0
            assert (config["RuleParameters"]["rule_to_invoke"] ==
                    rule.rule_parameters["rule_to_invoke"])
        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

        assert job_description["ProfilingStatus"] == "Enabled"
        assert job_description["ProfilerConfig"][
            "ProfilingIntervalInMilliseconds"] == 500
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
Code example #19
def test_model_registration_with_model_repack(
    sagemaker_session,
    role,
    pipeline_name,
    region_name,
):
    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    entry_point = os.path.join(base_dir, "mnist.py")
    input_path = sagemaker_session.upload_data(
        path=os.path.join(base_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    inputs = TrainingInput(s3_data=input_path)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")
    good_enough_input = ParameterInteger(name="GoodEnoughInput",
                                         default_value=1)

    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
    )
    step_train = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=inputs,
    )

    step_register = RegisterModel(
        name="pytorch-register-model",
        estimator=pytorch_estimator,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["*"],
        response_types=["*"],
        inference_instances=["*"],
        transform_instances=["*"],
        description="test-description",
        entry_point=entry_point,
    )

    model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(
        name="pytorch-model",
        model=model,
        inputs=model_inputs,
    )

    step_cond = ConditionStep(
        name="cond-good-enough",
        conditions=[
            ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1)
        ],
        if_steps=[step_train, step_register],
        else_steps=[step_model],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[good_enough_input, instance_count, instance_type],
        steps=[step_cond],
        sagemaker_session=sagemaker_session,
    )

    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn)

        execution = pipeline.start(parameters={})
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )

        execution = pipeline.start(parameters={"GoodEnoughInput": 0})
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
            execution.arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
Code example #20
testpath = sess.upload_data(path='boston_test.csv',
                            bucket=bucket,
                            key_prefix='sagemaker/sklearncontainer')

pytorch_estimator = PyTorch(
    entry_point='train.py',
    source_dir=os.path.abspath(os.path.dirname(__file__)),
    role=sm_role,
    train_instance_count=1,
    train_instance_type='ml.c5.xlarge',
    framework_version='1.0.0',
    base_job_name='dense-pytorch',
    metric_definitions=[{
        'Name': 'median-AE',
        'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"
    }],
    hyperparameters={
        'n-epochs': 1500,
        'features':
        'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
        'target': 'target'
    },
    tags=[{
        "Key": "CostCentre",
        "Value": "SageMaker"
    }])

# launch training job, with asynchronous call
pytorch_estimator.fit({'train': trainpath, 'test': testpath}, wait=False)
Code example #21
def test_tuning_step_with_multi_algo_tuner(pipeline_session, entry_point):
    pytorch_estimator = PyTorch(
        entry_point=entry_point,
        role=sagemaker.get_execution_role(),
        framework_version="1.5.0",
        py_version="py3",
        instance_count=1,
        instance_type="ml.m5.xlarge",
        sagemaker_session=pipeline_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
        hyperparameters={
            "static-hp": "hp1",
            "train_size": "1280"
        },
    )

    tuner = HyperparameterTuner.create(
        estimator_dict={
            "estimator-1": pytorch_estimator,
            "estimator-2": pytorch_estimator,
        },
        objective_metric_name_dict={
            "estimator-1": "test:acc",
            "estimator-2": "test:acc",
        },
        hyperparameter_ranges_dict={
            "estimator-1": {
                "batch-size": IntegerParameter(64, 128)
            },
            "estimator-2": {
                "batch-size": IntegerParameter(256, 512)
            },
        },
        metric_definitions_dict={
            "estimator-1": [{
                "Name": "test:acc",
                "Regex": "Overall test accuracy: (.*?);"
            }],
            "estimator-2": [{
                "Name": "test:acc",
                "Regex": "Overall test accuracy: (.*?);"
            }],
        },
    )
    input_path = f"s3://{pipeline_session.default_bucket()}/training-data"
    inputs = {
        "estimator-1": TrainingInput(s3_data=input_path),
        "estimator-2": TrainingInput(s3_data=input_path),
    }
    step_args = tuner.fit(
        inputs=inputs,
        include_cls_metadata={
            "estimator-1": False,
            "estimator-2": False,
        },
    )

    step = TuningStep(
        name="MyTuningStep",
        step_args=step_args,
    )

    pipeline = Pipeline(
        name="MyPipeline",
        steps=[step],
        sagemaker_session=pipeline_session,
    )

    assert json.loads(pipeline.definition())["Steps"][0] == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": step_args,
    }
Code example #22
def test_debug_hook_disabled_with_checkpointing(
    sagemaker_session,
    mxnet_training_latest_version,
    mxnet_training_latest_py_version,
    cpu_instance_type,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        s3_output_path = os.path.join("s3://",
                                      sagemaker_session.default_bucket(),
                                      str(uuid.uuid4()))
        debugger_hook_config = DebuggerHookConfig(
            s3_output_path=os.path.join(s3_output_path, "tensors"))

        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")

        # Estimator with checkpointing enabled
        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
        )
        mx._prepare_for_training()

        # Debug Hook should be enabled
        assert mx.debugger_hook_config is not None

        # Estimator with checkpointing enabled and Instance Count>1
        mx = MXNet(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=mxnet_training_latest_version,
            py_version=mxnet_training_latest_py_version,
            instance_count=2,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            debugger_hook_config=debugger_hook_config,
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
        )
        mx._prepare_for_training()
        # Debug Hook should be disabled
        assert mx.debugger_hook_config is False

        # Estimator with checkpointing enabled and SMDataParallel Enabled
        pt = PyTorch(
            base_job_name="pytorch-smdataparallel-mnist",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="1.8.0",
            py_version="py36",
            instance_count=1,
            # For training with p3dn instance use - ml.p3dn.24xlarge, with p4dn instance use - ml.p4d.24xlarge
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
            # Training using SMDataParallel Distributed Training Framework
            distribution={
                "smdistributed": {
                    "dataparallel": {
                        "enabled": True
                    }
                }
            },
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
        )
        pt._prepare_for_training()
        # Debug Hook should be disabled
        assert pt.debugger_hook_config is False

        # Estimator with checkpointing enabled and SMModelParallel Enabled
        tf = TensorFlow(
            base_job_name="tf-smdataparallel-mnist",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="2.4.1",
            py_version="py36",
            instance_count=1,
            # For training with p3dn instance use - ml.p3dn.24xlarge, with p4dn instance use - ml.p4d.24xlarge
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
            # Training using the SMModelParallel distributed training framework
            distribution={
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True
                    }
                }
            },
            checkpoint_local_path="/opt/ml/checkpoints",
            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
        )
        tf._prepare_for_training()
        # Debug Hook should be disabled
        assert tf.debugger_hook_config is False

        # Estimator with checkpointing enabled with Xgboost Estimator
        xg = XGBoost(
            base_job_name="test_xgboost",
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="1.2-1",
            py_version="py3",
            instance_count=2,
            # For training with p3dn instance use - ml.p3dn.24xlarge, with p4dn instance use - ml.p4d.24xlarge
            instance_type="ml.p3.16xlarge",
            sagemaker_session=sagemaker_session,
        )
        xg._prepare_for_training()
        # Debug Hook should be enabled
        assert xg.debugger_hook_config is not None
Code example #23
@pytest.mark.parametrize(
    "estimator",
    [
        SKLearn(
            framework_version="0.23-1",
            py_version="py3",
            instance_type=INSTANCE_TYPE,
            instance_count=1,
            role=sagemaker.get_execution_role(),
            entry_point="entry_point.py",
        ),
        PyTorch(
            role=sagemaker.get_execution_role(),
            instance_type=INSTANCE_TYPE,
            instance_count=1,
            framework_version="1.8.0",
            py_version="py36",
            entry_point="entry_point.py",
        ),
        TensorFlow(
            role=sagemaker.get_execution_role(),
            instance_type=INSTANCE_TYPE,
            instance_count=1,
            framework_version="2.0",
            py_version="py3",
            entry_point="entry_point.py",
        ),
        HuggingFace(
            transformers_version="4.6",
            pytorch_version="1.7",
            role=sagemaker.get_execution_role(),
            # The source snippet is cut off here; the remaining arguments are
            # assumed to mirror the estimators above.
            instance_type=INSTANCE_TYPE,
            instance_count=1,
            py_version="py36",
            entry_point="entry_point.py",
        ),
    ],
)
def test_framework_estimators(estimator):
    # Hypothetical test body; the original is not shown in the source.
    assert estimator.entry_point == "entry_point.py"
Code example #24
File: run_aws.py  Project: jkhouja/experimenter
    def my_aws_app(cfg: DictConfig) -> None:

        script_folder = "."  # todo. this is overriden by hydra
        script_folder = (hydra.utils.get_original_cwd()
                         )  # todo. this is overriden by hydra

        as_dict = OmegaConf.to_container(cfg, resolve=False)

        # Override s3 datapath
        aws_bucket = cfg.aws.bucket_prefix
        try:
            aws_root_path = aws_bucket + cfg.aws.root_path

        except errors.ConfigAttributeError:
            aws_root_path = aws_bucket + cfg.root_path

        # Get the s3 location to load /save to
        aws_out_path = aws_root_path + "/" + as_dict["output_subdir"]
        aws_data_path = aws_root_path + "/" + as_dict["data_subdir"]

        # Override the job json file with sagemaker local dirs
        as_dict["root_path"] = "/opt/ml/"
        as_dict["data_subdir"] = "input/data/train"
        as_dict["output_subdir"] = "output/data"

        # Set the local dir for tensorboard
        tb_log_dir = "/opt/ml/output/tensorboard/"
        as_dict["tb_log_dir"] = tb_log_dir
        tensorboard_output_config = TensorBoardOutputConfig(
            s3_output_path=aws_out_path,
            container_local_output_path=tb_log_dir,
        )

        print(OmegaConf.to_yaml(cfg))
        print("Overriden Root Path: " + aws_root_path)

        # Save json file to tmp location to be uploaded with script
        tmp_relative_path = "tmp/tmp_job.json"
        tmp_path = script_folder + "/" + tmp_relative_path

        with open(tmp_path, "w") as json_file:
            json.dump(as_dict, json_file)

        wait = cfg.aws.wait
        role = cfg.aws.role
        instance_count = cfg.aws.instance_count
        instance_type = cfg.aws.instance_type
        env = {
            "SAGEMAKER_REQUIREMENTS":
            "requirements.txt",  # path relative to `source_dir` below.
        }

        # Using Sagemaker prebuilt Pytorch container
        pytorch_estimator = PyTorch(
            entry_point="run.py",
            source_dir=script_folder,
            hyperparameters={"config_file": tmp_relative_path},
            role=role,
            env=env,
            instance_count=instance_count,
            py_version="py3",
            framework_version="1.5.0",
            output_path=aws_out_path,
            base_job_name=cfg.experiment_name,
            instance_type=instance_type,
            tensorboard_output_config=tensorboard_output_config,
        )

        pytorch_estimator.fit({"train": aws_data_path}, wait=wait)
        os.remove(tmp_path)
Code example #25
def exec_training(
    session, client, job_name, setting, pytorch, max_parallel_jobs, is_spot
):
    sagemaker_session = sagemaker.Session(
        boto_session=session, sagemaker_client=client
    )

    with open(setting) as f:
        conf = yaml.safe_load(f)

    # input data
    inputs = conf["inputs"]

    if "upload_data" in conf and isinstance(conf["upload_data"], list):
        for d in conf["upload_data"]:
            s3_dir = sagemaker_session.upload_data(
                path=d["path"],
                key_prefix=os.path.join(job_name, d["key_prefix"]),
            )
            inputs[d["name"]] = s3_dir

    estimator_args = conf["estimator"]
    estimator_args["sagemaker_session"] = sagemaker_session

    hyperparameters = estimator_args.pop("hyperparameters")
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args["hyperparameters"] = fixed

    if is_spot:
        estimator_args["train_use_spot_instances"] = True
        if "checkpoint_s3_uri" not in estimator_args:
            bucket_name = sagemaker_session.default_bucket()
            uri = os.path.join("s3://", bucket_name, job_name, "checkpoints")
            estimator_args["checkpoint_s3_uri"] = uri

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if len(targets) == 0:
        estimator.fit(inputs, wait=False, job_name=job_name)
    else:
        if "tuner" in conf:
            tuner_args = conf["tuner"]
            hyperparameter_ranges = {}
            for k, v in targets.items():
                hyperparameter_ranges[k] = hp_type[v["type"].lower()](
                    v["range"]
                )
        else:  # use default values
            tuner_args = {
                "objective_metric_name": "metric_name",
                "metric_definitions": [
                    {"Name": "metric_name", "Regex": "ignore"}
                ],
                "strategy": "Random",
                "objective_type": "Maximize",
                "early_stopping_type": "Off",
            }
            max_jobs = 1
            hyperparameter_ranges = {}
            for k, v in targets.items():
                if v["type"].lower() != "categorical":
                    raise ValueError(
                        "the default tuner only supports Categorigal params."
                    )
                max_jobs *= len(v["range"])
                hyperparameter_ranges[k] = hp_type[v["type"].lower()](
                    v["range"]
                )
            tuner_args["max_jobs"] = max_jobs

        tuner_args["estimator"] = estimator
        tuner_args["hyperparameter_ranges"] = hyperparameter_ranges
        tuner_args["max_parallel_jobs"] = max_parallel_jobs
        tuner_args["base_tuning_job_name"] = job_name
        tuner_args["warm_start_config"] = None  # not supported yet.

        tuner = HyperparameterTuner(**tuner_args)
        tuner.fit(inputs, job_name=job_name)
Code example #26
def exec_training(session, client, job_name, setting, pytorch,
                  max_parallel_jobs):
    sagemaker_session = sagemaker.Session(boto_session=session,
                                          sagemaker_client=client)

    with open(setting) as f:
        conf = yaml.safe_load(f)

    # input data
    inputs = conf['inputs']

    if 'upload_data' in conf and isinstance(conf['upload_data'], list):
        for d in conf['upload_data']:
            s3_dir = sagemaker_session.upload_data(path=d['path'],
                                                   key_prefix=os.path.join(
                                                       job_name,
                                                       d['key_prefix']))
            inputs[d['name']] = s3_dir

    estimator_args = conf['estimator']
    estimator_args['sagemaker_session'] = sagemaker_session

    hyperparameters = estimator_args.pop('hyperparameters')
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args['hyperparameters'] = fixed

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if len(targets) == 0:
        estimator.fit(inputs, job_name=job_name)
    else:
        if 'tuner' in conf:
            tuner_args = conf['tuner']
            hyperparameter_ranges = {}
            for k, v in targets.items():
                hyperparameter_ranges[k] = hp_type[v['type'].lower()](
                    v['range'])
        else:  # use default values
            tuner_args = {
                'objective_metric_name': 'metric_name',
                'metric_definitions': [{
                    'Name': 'metric_name',
                    'Regex': 'ignore'
                }],
                'strategy': 'Random',
                'objective_type': 'Maximize',
                'early_stopping_type': 'Off'
            }
            max_jobs = 1
            hyperparameter_ranges = {}
            for k, v in targets.items():
                if v['type'].lower() != 'categorical':
                    raise ValueError(
                        'the default tuner only supports Categorical params.')
                max_jobs *= len(v['range'])
                hyperparameter_ranges[k] = hp_type[v['type'].lower()](
                    v['range'])
            tuner_args['max_jobs'] = max_jobs

        tuner_args['estimator'] = estimator
        tuner_args['hyperparameter_ranges'] = hyperparameter_ranges
        tuner_args['max_parallel_jobs'] = max_parallel_jobs
        tuner_args['base_tuning_job_name'] = job_name
        tuner_args['warm_start_config'] = None  # not supported yet.

        tuner = HyperparameterTuner(**tuner_args)
        tuner.fit(inputs, job_name=job_name)
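Both variants of exec_training reference an hp_type mapping that is not shown. A minimal sketch of what it presumably looks like, assuming the standard sagemaker.tuner parameter classes; each entry is called with the "range" list from the YAML config:

from sagemaker.tuner import (CategoricalParameter, ContinuousParameter,
                             IntegerParameter)

# Hypothetical mapping from the YAML "type" field to tuner parameter classes.
hp_type = {
    "categorical": CategoricalParameter,              # takes the list of values
    "continuous": lambda r: ContinuousParameter(*r),  # takes (min, max)
    "integer": lambda r: IntegerParameter(*r),        # takes (min, max)
}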