def test_byo_estimator(sagemaker_session, region):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_name = registry(region) + "/factorization-machines:1"
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
    job_name = unique_name_from_base("byo")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = "test_byo_estimator"
        key = "recordio-pb-data"

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, "train",
                                                          key))

        estimator = Estimator(
            image_name=image_name,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type="binary_classifier")

        # training labels must be 'float32'
        estimator.fit({"train": s3_train_data}, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=job_name)
        predictor.serializer = fm_serializer
        predictor.content_type = "application/json"
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
def test_generic_to_deploy(sagemaker_session):
    e = Estimator(IMAGE_NAME,
                  ROLE,
                  INSTANCE_COUNT,
                  INSTANCE_TYPE,
                  output_path=OUTPUT_PATH,
                  sagemaker_session=sagemaker_session)

    e.set_hyperparameters(**HYPERPARAMS)

    e.fit({'train': 's3://bucket/training-prefix'})

    predictor = e.deploy(INSTANCE_COUNT, INSTANCE_TYPE)

    sagemaker_session.train.assert_called_once()
    assert len(sagemaker_session.train.call_args[0]) == 0
    args = sagemaker_session.train.call_args[1]
    assert args['job_name'].startswith(IMAGE_NAME)

    args.pop('job_name')
    args.pop('role')

    assert args == HP_TRAIN_CALL

    sagemaker_session.create_model.assert_called_once()
    args = sagemaker_session.create_model.call_args[0]
    assert args[0].startswith(IMAGE_NAME)
    assert args[1] == ROLE
    assert args[2]['Image'] == IMAGE_NAME
    assert args[2]['ModelDataUrl'] == MODEL_DATA

    assert isinstance(predictor, RealTimePredictor)
    assert predictor.endpoint.startswith(IMAGE_NAME)
    assert predictor.sagemaker_session == sagemaker_session
def test_generic_to_deploy(sagemaker_session):
    e = Estimator(IMAGE_NAME, ROLE, INSTANCE_COUNT, INSTANCE_TYPE, output_path=OUTPUT_PATH,
                  sagemaker_session=sagemaker_session)

    e.set_hyperparameters(**HYPERPARAMS)

    e.fit({'train': 's3://bucket/training-prefix'})

    predictor = e.deploy(INSTANCE_COUNT, INSTANCE_TYPE)

    sagemaker_session.train.assert_called_once()
    assert len(sagemaker_session.train.call_args[0]) == 0
    args = sagemaker_session.train.call_args[1]
    assert args['job_name'].startswith(IMAGE_NAME)

    args.pop('job_name')
    args.pop('role')

    assert args == HP_TRAIN_CALL

    sagemaker_session.create_model.assert_called_once()
    args = sagemaker_session.create_model.call_args[0]
    assert args[0].startswith(IMAGE_NAME)
    assert args[1] == ROLE
    assert args[2]['Image'] == IMAGE_NAME
    assert args[2]['ModelDataUrl'] == MODEL_DATA

    assert isinstance(predictor, RealTimePredictor)
    assert predictor.endpoint.startswith(IMAGE_NAME)
    assert predictor.sagemaker_session == sagemaker_session
Exemple #4
0
def create_blaxing_text_model(
        region_name: str,
        sm_session: Session,
        sm_role: str,
        s3_input_url: str,
        s3_output_url: str):
    """
    Create a BlazingText model.

    Args:
        - region_name: AWS Region Name to use SageMaker in.
        - sm_session: SageMaker Session Object.
        - sm_role: SageMaker role arn that allows SM to connect to s3.
        - s3_input_url: training data input path on s3
        - s3_output_url: model artifacts output path

    Return:
        - bt_model: instance of Estimator, can be used to deploy an inference endpoint
    """
    # define container
    container = get_image_uri(region_name, "blazingtext", "latest")

    # create estimator
    bt_model = Estimator(container,
                         sm_role,
                         train_instance_count=1,
                         train_instance_type='ml.c4.2xlarge',
                         train_volume_size=30,
                         train_max_run=360000,
                         input_mode='File',
                         output_path=s3_output_url,
                         sagemaker_session=sm_session)

    # set hyperparameters
    bt_model.set_hyperparameters(mode="skipgram",
                                 epochs=5,
                                 min_count=5,
                                 sampling_threshold=0.0001,
                                 learning_rate=0.05,
                                 window_size=5,
                                 vector_dim=100,
                                 negative_samples=5,
                                 subwords=True,
                                 min_char=3,
                                 max_char=6,
                                 batch_size=11,
                                 evaluation=True)

    # define data channels
    train_data = s3_input(s3_input_url, distribution='FullyReplicated',
                          content_type='text/plain', s3_data_type='S3Prefix')
    data_channels = {'train': train_data}

    # fit model
    bt_model.fit(inputs=data_channels, logs=True)

    return bt_model
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = unique_name_from_base("byo")
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
    job_name = unique_name_from_base("byo")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = "test_byo_estimator"
        key = "recordio-pb-data"

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, "train",
                                                          key))

        estimator = Estimator(
            image_name=image_name,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type="binary_classifier")

        # training labels must be 'float32'
        estimator.fit({"train": s3_train_data}, wait=False, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=job_name,
                                     sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1,
                                 "ml.m4.xlarge",
                                 endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = "application/json"
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None

        assert estimator.train_image() == image_name
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = unique_name_from_base('byo')
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, 'train',
                                                          key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole',
                              train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session,
                              base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name,
                                     sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1,
                                 'ml.m4.xlarge',
                                 endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
Exemple #7
0
def test_byo_estimator(sagemaker_session, region, cpu_instance_type,
                       training_set):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_uri = image_uris.retrieve("factorization-machines", region)
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
    job_name = unique_name_from_base("byo")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        prefix = "test_byo_estimator"
        key = "recordio-pb-data"

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, "train",
                                                          key))

        estimator = Estimator(
            image_uri=image_uri,
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type="binary_classifier")

        # training labels must be 'float32'
        estimator.fit({"train": s3_train_data}, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(
            1,
            cpu_instance_type,
            endpoint_name=job_name,
            serializer=_FactorizationMachineSerializer(),
            deserializer=sagemaker.deserializers.JSONDeserializer(),
        )

        result = predictor.predict(training_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
Exemple #9
0
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
def test_byo_estimator(sagemaker_session, region):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_name = registry(region) + "/factorization-machines:1"
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(prefix, 'train', key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data})

    endpoint_name = name_from_base('byo')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
Exemple #11
0
    def estimator(self, batch_n):
        ll_estimator = Estimator(self.container,
                                 role=self.role,
                                 instance_count=1,
                                 instance_type='ml.m5.large',
                                 output_path='s3://{}/{}/output'.format(
                                     self.bucket, self.prefix))

        ll_estimator.set_hyperparameters(predictor_type='regressor',
                                         mini_batch_size=batch_n)

        return ll_estimator
Exemple #12
0
def test_async_byo_estimator(sagemaker_session, region, cpu_instance_type,
                             training_set):
    image_uri = image_uris.retrieve("factorization-machines", region)
    endpoint_name = unique_name_from_base("byo")
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
    job_name = unique_name_from_base("byo")

    with timeout(minutes=5):
        prefix = "test_byo_estimator"
        key = "recordio-pb-data"

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, "train",
                                                          key))

        estimator = Estimator(
            image_uri=image_uri,
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type="binary_classifier")

        # training labels must be 'float32'
        estimator.fit({"train": s3_train_data}, wait=False, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=job_name,
                                     sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(
            1,
            cpu_instance_type,
            endpoint_name=endpoint_name,
            serializer=_FactorizationMachineSerializer(),
            deserializer=sagemaker.deserializers.JSONDeserializer(),
        )

        result = predictor.predict(training_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None

        assert estimator.training_image_uri() == image_uri
def test_generic_training_job_analytics(sagemaker_session):
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job',
        return_value={
            'TuningJobArn':
            'arn:aws:sagemaker:us-west-2:968277160000:hyper-parameter-tuning-job/mock-tuner',
            'TrainingStartTime': 1530562991.299,
        })
    sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job = Mock(
        name='describe_hyper_parameter_tuning_job',
        return_value={
            'TrainingJobDefinition': {
                "AlgorithmSpecification": {
                    "TrainingImage":
                    "some-image-url",
                    "TrainingInputMode":
                    "File",
                    "MetricDefinitions": [{
                        "Name":
                        "train:loss",
                        "Regex":
                        "train_loss=([0-9]+\\.[0-9]+)"
                    }, {
                        "Name":
                        "validation:loss",
                        "Regex":
                        "valid_loss=([0-9]+\\.[0-9]+)"
                    }]
                }
            }
        })

    e = Estimator(IMAGE_NAME,
                  ROLE,
                  INSTANCE_COUNT,
                  INSTANCE_TYPE,
                  output_path=OUTPUT_PATH,
                  sagemaker_session=sagemaker_session)

    with pytest.raises(ValueError) as err:  # noqa: F841
        # No training job yet
        a = e.training_job_analytics
        assert a is not None  # This line is never reached

    e.set_hyperparameters(**HYPERPARAMS)
    e.fit({'train': 's3://bucket/training-prefix'})
    a = e.training_job_analytics
    assert a is not None
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'

        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(prefix, 'train', key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
Exemple #15
0
    def create_estimator(self, role, output_path, hyperparameters,
                         sagemaker_session, **kwargs):

        estimator = Estimator(
            self.algo_image_uri,
            role=role,
            instance_count=self.training_resource_config["instance_count"],
            instance_type=self.training_resource_config["instance_type"],
            output_path=output_path,
            sagemaker_session=sagemaker_session,
            **kwargs,
        )
        hyperparameters.update(self.candidate_specific_static_hps)
        estimator.set_hyperparameters(**hyperparameters)

        return estimator
Exemple #16
0
def estimator_knn(sagemaker_session, cpu_instance_type):
    knn_image = image_uris.retrieve("knn", sagemaker_session.boto_region_name)

    estimator = Estimator(
        image_uri=knn_image,
        role=EXECUTION_ROLE,
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
    )

    estimator.set_hyperparameters(k=10,
                                  sample_size=500,
                                  feature_dim=784,
                                  mini_batch_size=100,
                                  predictor_type="regressor")
    return estimator
def test_generic_to_fit_with_hps(sagemaker_session):
    e = Estimator(IMAGE_NAME, ROLE, INSTANCE_COUNT, INSTANCE_TYPE, output_path=OUTPUT_PATH,
                  sagemaker_session=sagemaker_session)

    e.set_hyperparameters(**HYPERPARAMS)

    e.fit({'train': 's3://bucket/training-prefix'})

    sagemaker_session.train.assert_called_once()
    assert len(sagemaker_session.train.call_args[0]) == 0
    args = sagemaker_session.train.call_args[1]
    assert args['job_name'].startswith(IMAGE_NAME)

    args.pop('job_name')
    args.pop('role')

    assert args == HP_TRAIN_CALL
Exemple #18
0
def estimator_fm(sagemaker_session, cpu_instance_type):
    fm_image = image_uris.retrieve("factorization-machines",
                                   sagemaker_session.boto_region_name)

    estimator = Estimator(
        image_uri=fm_image,
        role=EXECUTION_ROLE,
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
    )

    estimator.set_hyperparameters(num_factors=10,
                                  feature_dim=784,
                                  mini_batch_size=100,
                                  predictor_type="regressor")

    return estimator
Exemple #19
0
def estimator_knn(sagemaker_session, cpu_instance_type):
    knn_image = get_image_uri(sagemaker_session.boto_session.region_name,
                              "knn",
                              repo_version="1")

    estimator = Estimator(
        image_name=knn_image,
        role=EXECUTION_ROLE,
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
    )

    estimator.set_hyperparameters(k=10,
                                  sample_size=500,
                                  feature_dim=784,
                                  mini_batch_size=100,
                                  predictor_type="regressor")
    return estimator
def test_generic_training_job_analytics(sagemaker_session):
    sagemaker_session.sagemaker_client.describe_training_job = Mock(name='describe_training_job', return_value={
        'TuningJobArn': 'arn:aws:sagemaker:us-west-2:968277160000:hyper-parameter-tuning-job/mock-tuner',
        'TrainingStartTime': 1530562991.299,
    })
    sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job = Mock(
        name='describe_hyper_parameter_tuning_job',
        return_value={
            'TrainingJobDefinition': {
                "AlgorithmSpecification": {
                    "TrainingImage": "some-image-url",
                    "TrainingInputMode": "File",
                    "MetricDefinitions": [
                        {
                            "Name": "train:loss",
                            "Regex": "train_loss=([0-9]+\\.[0-9]+)"
                        },
                        {
                            "Name": "validation:loss",
                            "Regex": "valid_loss=([0-9]+\\.[0-9]+)"
                        }
                    ]
                }
            }
        }
    )

    e = Estimator(IMAGE_NAME, ROLE, INSTANCE_COUNT, INSTANCE_TYPE, output_path=OUTPUT_PATH,
                  sagemaker_session=sagemaker_session)

    with pytest.raises(ValueError) as err:  # noqa: F841
        # No training job yet
        a = e.training_job_analytics
        assert a is not None  # This line is never reached

    e.set_hyperparameters(**HYPERPARAMS)
    e.fit({'train': 's3://bucket/training-prefix'})
    a = e.training_job_analytics
    assert a is not None
    def _get_model(self, hyperparameters: Dict = {}) -> Estimator:
        """
        Initializes the model. This can be used to train later or attach an existing model

        Arguments:
            hyperparameters: The hyperparameters for the Estimator model

        Returns:
            model: The initialized model
        """
        container = get_image_uri(self.executor.boto_session.region_name,
                                  self.container_name)
        model = Estimator(
            container,
            **self.executor.default_model_kwargs,
            output_path=self.output_path,
        )
        used_hyperparameters = self.default_hyperparameters
        used_hyperparameters["feature_dim"] = len(self.data.feature_columns)
        used_hyperparameters.update(hyperparameters)

        model.set_hyperparameters(**used_hyperparameters)
        return model
Exemple #22
0
    def tuning(self):

        s3_bucket, id, secret = s3_aws_engine(name=self.aws_env)

        s3_path = ModelTune._aws_s3_path(s3_bucket)

        boto_sess = ModelTune._boto_session(id, secret)

        logger.info('Getting algorithm image URI...')

        container = get_image_uri(boto_sess.region_name,
                                  'xgboost',
                                  repo_version='0.90-1')

        logger.info('Creating sagemaker session...')

        sage_sess = sagemaker.Session(boto_sess)

        s3_input_train, s3_input_val = self.fetch_data(s3_path)

        logger.info(
            'Creating sagemaker estimator to train using the supplied {} model...'
            .format(self.model_name))

        if self.model_name == 'clf':
            train_instance_type = 'ml.m5.4xlarge'
        else:
            train_instance_type = 'ml.m5.2xlarge'

        est = Estimator(container,
                        role=self.role,
                        train_instance_count=1,
                        train_instance_type=train_instance_type,
                        output_path=s3_path + 'tuning_' + self.model_name +
                        '/',
                        sagemaker_session=sage_sess,
                        base_job_name=self.model_name + '-tuning-job')

        logger.info('Setting hyper-parameters...')

        hyperparameter_ranges = {
            'num_round': IntegerParameter(1, 4000),
            'eta': ContinuousParameter(0, 0.5),
            'max_depth': IntegerParameter(1, 10),
            'min_child_weight': ContinuousParameter(0, 120),
            'subsample': ContinuousParameter(0.5, 1),
            'colsample_bytree': ContinuousParameter(0.5, 1),
            'gamma': ContinuousParameter(0, 5),
            'lambda': ContinuousParameter(0, 1000),
            'alpha': ContinuousParameter(0, 1000)
        }

        if self.model_name == 'clf':
            est.set_hyperparameters(
                objective='reg:logistic',
                scale_pos_weight=self._get_imb_ratio()['imb_ratio'])
            objective_metric_name = 'validation:f1'
            objective_type = 'Maximize'
        else:
            est.set_hyperparameters(objective='reg:linear')
            objective_metric_name = 'validation:rmse'
            objective_type = 'Minimize'

        if est.hyperparam_dict is None:
            raise ValueError('Hyper-parameters are missing')
        else:
            logger.info(est.hyperparam_dict)

        tuner = HyperparameterTuner(
            estimator=est,
            objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type=objective_type,
            max_jobs=100,
            max_parallel_jobs=10)

        sw = Stopwatch(start=True)

        tuner.fit({'train': s3_input_train, 'validation': s3_input_val})

        self.post_tune(sage_sess, tuner)

        logger.info('Elapsed time of tuning: {}'.format(
            sw.elapsed.human_str()))
Exemple #23
0
# create estimator
fm_estimator = Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sagemaker.session.Session(sess),
    train_instance_count=1,
    train_instance_type="ml.c5.4xlarge",
    train_volume_size=30,
    train_max_run=3600,
    output_path="s3://train/",  # replace
    base_job_name="trng-recommender")

# set hyperparameters for the estimator
fm_estimator.set_hyperparameters(feature_dim=178729,
                                 epochs=10,
                                 mini_batch_size=200,
                                 num_factors=64,
                                 predictor_type='regressor')

# train_config specifies SageMaker training configuration
train_config = training_config(estimator=fm_estimator,
                               inputs=config["train_model"]["inputs"])

# create tuner
fm_tuner = HyperparameterTuner(estimator=fm_estimator,
                               **config["tune_model"]["tuner_config"])

# create tuning config
tuner_config = tuning_config(tuner=fm_tuner,
                             inputs=config["tune_model"]["inputs"])
Exemple #24
0
def test_byo_estimator(sagemaker_session, region):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_name = registry(region) + "/factorization-machines:1"

    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data})

    endpoint_name = name_from_base('byo')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
Exemple #25
0
def test_tuning_byo_estimator(sagemaker_session):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.
    """
    image_name = registry(sagemaker_session.boto_session.region_name
                          ) + "/factorization-machines:1"
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = "test_byo_estimator"
        key = "recordio-pb-data"
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, "train",
                                                          key))

        estimator = Estimator(
            image_name=image_name,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type="binary_classifier")

        hyperparameter_ranges = {"mini_batch_size": IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name="test:binary_classification_accuracy",
            hyperparameter_ranges=hyperparameter_ranges,
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuner.fit(
            {
                "train": s3_train_data,
                "test": s3_train_data
            },
            include_cls_metadata=False,
            job_name=unique_name_from_base("byo", 32),
        )

        print("Started hyperparameter tuning job with name:" +
              tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1,
                                 "ml.m4.xlarge",
                                 endpoint_name=best_training_job)
        predictor.serializer = _fm_serializer
        predictor.content_type = "application/json"
        predictor.deserializer = json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.
    """
    image_uri = image_uris.retrieve("factorization-machines",
                                    sagemaker_session.boto_region_name)
    training_data_path = os.path.join(DATA_DIR, "dummy_tensor")

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        prefix = "test_byo_estimator"
        key = "recordio-pb-data"
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, "train",
                                                          key))

        estimator = Estimator(
            image_uri=image_uri,
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type="binary_classifier")

        hyperparameter_ranges = {"mini_batch_size": IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name="test:binary_classification_accuracy",
            hyperparameter_ranges=hyperparameter_ranges,
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuning_job_name = unique_name_from_base("byo", 32)
        print("Started hyperparameter tuning job with name {}:".format(
            tuning_job_name))
        tuner.fit(
            {
                "train": s3_train_data,
                "test": s3_train_data
            },
            include_cls_metadata=False,
            job_name=tuning_job_name,
        )

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(
            1,
            cpu_instance_type,
            endpoint_name=best_training_job,
            serializer=_FactorizationMachineSerializer(),
            deserializer=JSONDeserializer(),
        )

        result = predictor.predict(datasets.one_p_mnist()[0][:10])

        assert len(result["predictions"]) == 10
        for prediction in result["predictions"]:
            assert prediction["score"] is not None
def get_pipeline(
        region,
        role=None,
        default_bucket=None,
        model_package_group_name="CustomerChurnPackageGroup",  # Choose any name
        pipeline_name="CustomerChurnDemo-p-ewf8t7lvhivm",  # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
        base_job_prefix="CustomerChurn",  # Choose any name
):
    """Gets a SageMaker ML Pipeline instance working with on CustomerChurn data.
    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts
    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value=
        "PendingManualApproval",  # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval.
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=
        f"s3://sm-pipelines-demo-data-123456789/churn.txt",  # Change this to point to the s3 location of your raw input data.
    )

    # Processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=
        f"{base_job_prefix}/sklearn-CustomerChurn-preprocess",  # choose any name
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="CustomerChurnProcess",  # choose any name
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation",
                             source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/CustomerChurnTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework=
        "xgboost",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/CustomerChurn-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="binary:logistic",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="CustomerChurnTrain",
        estimator=xgb_train,
        inputs={
            "train":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["train"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # Processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-CustomerChurn-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="CustomerChurnEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.
                Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # Register model step that will be conditionally executed
    model_metrics = ModelMetrics(model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]
            ["S3Output"]["S3Uri"]),
        content_type="application/json",
    ))

    # Register model step that will be conditionally executed
    step_register = RegisterModel(
        name="CustomerChurnRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # Condition step for evaluating model quality and branching execution
    cond_lte = ConditionGreaterThanOrEqualTo(  # You can change the condition here
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path=
            "binary_classification_metrics.accuracy.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),
        right=0.8,  # You can change the threshold here
    )
    step_cond = ConditionStep(
        name="CustomerChurnAccuracyCond",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="TestPackageGroup",
    pipeline_name="TestPipeline",
    base_job_prefix="Test",
):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(name="ProcessingInstanceType",
                                               default_value="ml.m5.xlarge")
    training_instance_type = ParameterString(name="TrainingInstanceType",
                                             default_value="ml.m5.xlarge")
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval")
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=
        f"s3://sagemaker-servicecatalog-seedcode-{region}/dataset/abalone-dataset.csv",
    )

    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-test-preprocess",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="PreprocessTestData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation",
                             source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test",
                             source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/TestTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/test-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    step_train = TrainingStep(
        name="TrainTestModel",
        estimator=xgb_train,
        inputs={
            "train":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["train"].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation":
            TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.
                Outputs["validation"].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-test-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    evaluation_report = PropertyFile(
        name="TestEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateTestModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.
                Outputs["test"].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation",
                             source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]
            ["S3Output"]["S3Uri"]),
                                       content_type="application/json"))
    step_register = RegisterModel(
        name="RegisterTestModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(step=step_eval,
                     property_file=evaluation_report,
                     json_path="regression_metrics.mse.value"),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSETestEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
Exemple #29
0
def test_single_algo_tuning_step(sagemaker_session):
    data_source_uri_parameter = ParameterString(
        name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest")
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=1,
        instance_type="ml.c5.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
    )
    estimator.set_hyperparameters(
        num_layers=18,
        image_shape="3,224,224",
        num_classes=257,
        num_training_samples=15420,
        mini_batch_size=128,
        epochs=10,
        optimizer="sgd",
        top_k="2",
        precision_dtype="float32",
        augmentation_type="crop",
    )

    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(0.0001, 0.05),
        "momentum": ContinuousParameter(0.0, 0.99),
        "weight_decay": ContinuousParameter(0.0, 0.99),
    }

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name="val:accuracy",
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type="Maximize",
        max_jobs=5,
        max_parallel_jobs=2,
        early_stopping_type="OFF",
        strategy="Bayesian",
        warm_start_config=WarmStartConfig(
            warm_start_type=WarmStartTypes.IDENTICAL_DATA_AND_ALGORITHM,
            parents=set(["parent-hpo"]),
        ),
    )

    inputs = TrainingInput(s3_data=data_source_uri_parameter)

    tuning_step = TuningStep(
        name="MyTuningStep",
        tuner=tuner,
        inputs=inputs,
    )

    assert tuning_step.to_request() == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": {
            "HyperParameterTuningJobConfig": {
                "Strategy": "Bayesian",
                "ResourceLimits": {
                    "MaxNumberOfTrainingJobs": 5,
                    "MaxParallelTrainingJobs": 2
                },
                "TrainingJobEarlyStoppingType": "OFF",
                "HyperParameterTuningJobObjective": {
                    "Type": "Maximize",
                    "MetricName": "val:accuracy",
                },
                "ParameterRanges": {
                    "ContinuousParameterRanges": [
                        {
                            "Name": "learning_rate",
                            "MinValue": "0.0001",
                            "MaxValue": "0.05",
                            "ScalingType": "Auto",
                        },
                        {
                            "Name": "momentum",
                            "MinValue": "0.0",
                            "MaxValue": "0.99",
                            "ScalingType": "Auto",
                        },
                        {
                            "Name": "weight_decay",
                            "MinValue": "0.0",
                            "MaxValue": "0.99",
                            "ScalingType": "Auto",
                        },
                    ],
                    "CategoricalParameterRanges": [],
                    "IntegerParameterRanges": [],
                },
            },
            "TrainingJobDefinition": {
                "StaticHyperParameters": {
                    "num_layers": "18",
                    "image_shape": "3,224,224",
                    "num_classes": "257",
                    "num_training_samples": "15420",
                    "mini_batch_size": "128",
                    "epochs": "10",
                    "optimizer": "sgd",
                    "top_k": "2",
                    "precision_dtype": "float32",
                    "augmentation_type": "crop",
                },
                "RoleArn":
                "DummyRole",
                "OutputDataConfig": {
                    "S3OutputPath": "s3://my-bucket/"
                },
                "ResourceConfig": {
                    "InstanceCount": 1,
                    "InstanceType": "ml.c5.4xlarge",
                    "VolumeSizeInGB": 30,
                },
                "StoppingCondition": {
                    "MaxRuntimeInSeconds": 86400
                },
                "AlgorithmSpecification": {
                    "TrainingInputMode": "File",
                    "TrainingImage": "fakeimage",
                },
                "InputDataConfig": [{
                    "DataSource": {
                        "S3DataSource": {
                            "S3DataType": "S3Prefix",
                            "S3Uri": data_source_uri_parameter,
                            "S3DataDistributionType": "FullyReplicated",
                        }
                    },
                    "ChannelName": "training",
                }],
            },
            "WarmStartConfig": {
                "WarmStartType":
                "IdenticalDataAndAlgorithm",
                "ParentHyperParameterTuningJobs": [{
                    "HyperParameterTuningJobName":
                    "parent-hpo",
                }],
            },
        },
    }

    assert tuning_step.properties.HyperParameterTuningJobName.expr == {
        "Get": "Steps.MyTuningStep.HyperParameterTuningJobName"
    }
    assert tuning_step.properties.TrainingJobSummaries[
        0].TrainingJobName.expr == {
            "Get": "Steps.MyTuningStep.TrainingJobSummaries[0].TrainingJobName"
        }
    assert tuning_step.get_top_model_s3_uri(
        0, "my-bucket", "my-prefix"
    ).expr == {
        "Std:Join": {
            "On":
            "/",
            "Values": [
                "s3:/",
                "my-bucket",
                "my-prefix",
                {
                    "Get":
                    "Steps.MyTuningStep.TrainingJobSummaries[0].TrainingJobName"
                },
                "output/model.tar.gz",
            ],
        }
    }
Exemple #30
0
def test_multi_algo_tuning_step(sagemaker_session):
    data_source_uri_parameter = ParameterString(
        name="DataSourceS3Uri", default_value=f"s3://{BUCKET}/train_manifest")
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    estimator = Estimator(
        image_uri=IMAGE_URI,
        role=ROLE,
        instance_count=instance_count,
        instance_type="ml.c5.4xlarge",
        profiler_config=ProfilerConfig(system_monitor_interval_millis=500),
        rules=[],
        sagemaker_session=sagemaker_session,
        max_retry_attempts=10,
    )

    estimator.set_hyperparameters(
        num_layers=18,
        image_shape="3,224,224",
        num_classes=257,
        num_training_samples=15420,
        mini_batch_size=128,
        epochs=10,
        optimizer="sgd",
        top_k="2",
        precision_dtype="float32",
        augmentation_type="crop",
    )

    initial_lr_param = ParameterString(name="InitialLR",
                                       default_value="0.0001")
    hyperparameter_ranges = {
        "learning_rate": ContinuousParameter(initial_lr_param, 0.05),
        "momentum": ContinuousParameter(0.0, 0.99),
        "weight_decay": ContinuousParameter(0.0, 0.99),
    }

    tuner = HyperparameterTuner.create(
        estimator_dict={
            "estimator-1": estimator,
            "estimator-2": estimator,
        },
        objective_type="Minimize",
        objective_metric_name_dict={
            "estimator-1": "val:loss",
            "estimator-2": "val:loss",
        },
        hyperparameter_ranges_dict={
            "estimator-1": hyperparameter_ranges,
            "estimator-2": hyperparameter_ranges,
        },
    )

    inputs = TrainingInput(s3_data=data_source_uri_parameter)

    tuning_step = TuningStep(
        name="MyTuningStep",
        tuner=tuner,
        inputs={
            "estimator-1": inputs,
            "estimator-2": inputs,
        },
    )

    assert tuning_step.to_request() == {
        "Name": "MyTuningStep",
        "Type": "Tuning",
        "Arguments": {
            "HyperParameterTuningJobConfig": {
                "Strategy": "Bayesian",
                "ResourceLimits": {
                    "MaxNumberOfTrainingJobs": 1,
                    "MaxParallelTrainingJobs": 1
                },
                "TrainingJobEarlyStoppingType": "Off",
            },
            "TrainingJobDefinitions": [
                {
                    "StaticHyperParameters": {
                        "num_layers": "18",
                        "image_shape": "3,224,224",
                        "num_classes": "257",
                        "num_training_samples": "15420",
                        "mini_batch_size": "128",
                        "epochs": "10",
                        "optimizer": "sgd",
                        "top_k": "2",
                        "precision_dtype": "float32",
                        "augmentation_type": "crop",
                    },
                    "RoleArn":
                    "DummyRole",
                    "OutputDataConfig": {
                        "S3OutputPath": "s3://my-bucket/"
                    },
                    "ResourceConfig": {
                        "InstanceCount": 1,
                        "InstanceType": "ml.c5.4xlarge",
                        "VolumeSizeInGB": 30,
                    },
                    "StoppingCondition": {
                        "MaxRuntimeInSeconds": 86400
                    },
                    "AlgorithmSpecification": {
                        "TrainingInputMode": "File",
                        "TrainingImage": "fakeimage",
                    },
                    "InputDataConfig": [{
                        "DataSource": {
                            "S3DataSource": {
                                "S3DataType": "S3Prefix",
                                "S3Uri": data_source_uri_parameter,
                                "S3DataDistributionType": "FullyReplicated",
                            }
                        },
                        "ChannelName": "training",
                    }],
                    "DefinitionName":
                    "estimator-1",
                    "TuningObjective": {
                        "Type": "Minimize",
                        "MetricName": "val:loss"
                    },
                    "HyperParameterRanges": {
                        "ContinuousParameterRanges": [
                            {
                                "Name": "learning_rate",
                                "MinValue": initial_lr_param,
                                "MaxValue": "0.05",
                                "ScalingType": "Auto",
                            },
                            {
                                "Name": "momentum",
                                "MinValue": "0.0",
                                "MaxValue": "0.99",
                                "ScalingType": "Auto",
                            },
                            {
                                "Name": "weight_decay",
                                "MinValue": "0.0",
                                "MaxValue": "0.99",
                                "ScalingType": "Auto",
                            },
                        ],
                        "CategoricalParameterRanges": [],
                        "IntegerParameterRanges": [],
                    },
                    "RetryStrategy": {
                        "MaximumRetryAttempts": 10,
                    },
                },
                {
                    "StaticHyperParameters": {
                        "num_layers": "18",
                        "image_shape": "3,224,224",
                        "num_classes": "257",
                        "num_training_samples": "15420",
                        "mini_batch_size": "128",
                        "epochs": "10",
                        "optimizer": "sgd",
                        "top_k": "2",
                        "precision_dtype": "float32",
                        "augmentation_type": "crop",
                    },
                    "RoleArn":
                    "DummyRole",
                    "OutputDataConfig": {
                        "S3OutputPath": "s3://my-bucket/"
                    },
                    "ResourceConfig": {
                        "InstanceCount": 1,
                        "InstanceType": "ml.c5.4xlarge",
                        "VolumeSizeInGB": 30,
                    },
                    "StoppingCondition": {
                        "MaxRuntimeInSeconds": 86400
                    },
                    "AlgorithmSpecification": {
                        "TrainingInputMode": "File",
                        "TrainingImage": "fakeimage",
                    },
                    "InputDataConfig": [{
                        "DataSource": {
                            "S3DataSource": {
                                "S3DataType": "S3Prefix",
                                "S3Uri": data_source_uri_parameter,
                                "S3DataDistributionType": "FullyReplicated",
                            }
                        },
                        "ChannelName": "training",
                    }],
                    "DefinitionName":
                    "estimator-2",
                    "TuningObjective": {
                        "Type": "Minimize",
                        "MetricName": "val:loss"
                    },
                    "HyperParameterRanges": {
                        "ContinuousParameterRanges": [
                            {
                                "Name": "learning_rate",
                                "MinValue": initial_lr_param,
                                "MaxValue": "0.05",
                                "ScalingType": "Auto",
                            },
                            {
                                "Name": "momentum",
                                "MinValue": "0.0",
                                "MaxValue": "0.99",
                                "ScalingType": "Auto",
                            },
                            {
                                "Name": "weight_decay",
                                "MinValue": "0.0",
                                "MaxValue": "0.99",
                                "ScalingType": "Auto",
                            },
                        ],
                        "CategoricalParameterRanges": [],
                        "IntegerParameterRanges": [],
                    },
                    "RetryStrategy": {
                        "MaximumRetryAttempts": 10,
                    },
                },
            ],
        },
    }
Exemple #31
0
def get_pipeline(
    region,
    role=None,
    default_bucket=None,
    model_package_group_name="AbalonePackageGroup",
    pipeline_name="AbalonePipeline",
    base_job_prefix="Abalone",
):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Create cache configuration
    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

    # Create SKlean processor object
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=role,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name="credit-processing-job"
    )

    # Use the sklearn_processor in a Sagemaker pipelines ProcessingStep
    step_preprocess_data = ProcessingStep(
        name="PreprocessCreditData",
        processor=sklearn_processor,
        cache_config=cache_config,
        inputs=[
          ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),  
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/output/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/output/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/output/test"),
            ProcessingOutput(output_name="baseline_with_headers", source="/opt/ml/processing/output/baseline")
        ],
        code=os.path.join(BASE_DIR, "preprocessing.py"),
    )


    # Where to store the trained model
    model_path = f"s3://{default_bucket}/CreditTrain"

    # Fetch container to use for training
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.2-2",
        py_version="py3",
        instance_type=training_instance_type,
    )

    # Create XGBoost estimator object
    xgb_estimator = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        disable_profiler=True,
    )

    # Specify hyperparameters
    xgb_estimator.set_hyperparameters(max_depth=5,
                            eta=0.2,
                            gamma=4,
                            min_child_weight=6,
                            subsample=0.8,
                            objective='binary:logistic',
                            num_round=25)

    # Use the xgb_estimator in a Sagemaker pipelines ProcessingStep. 
    # NOTE how the input to the training job directly references the output of the previous step.
    step_train_model = TrainingStep(
        name="TrainCreditModel",
        estimator=xgb_estimator,
        cache_config=cache_config,
        inputs={
            "train": TrainingInput(
                s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv"
            ),
            "validation": TrainingInput(
                s3_data=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv"
            )
        },
    )

    # Create ScriptProcessor object.
    evaluate_model_processor = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name="script-credit-eval",
        role=role,
    )

    # Create a PropertyFile
    # We use a PropertyFile to be able to reference outputs from a processing step, for instance to use in a condition step, which we'll see later on.
    # For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
    evaluation_report = PropertyFile(
        name="EvaluationReport",
        output_name="evaluation",
        path="evaluation.json"
    )

    # Use the evaluate_model_processor in a Sagemaker pipelines ProcessingStep. 
    step_evaluate_model = ProcessingStep(
        name="EvaluateCreditModel",
        processor=evaluate_model_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(
                source=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model"
            ),
            ProcessingInput(
                source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test"
            )
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluation.py"),
        property_files=[evaluation_report],
    )


    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_evaluate_model.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json"
        )
    )

    # Crete a RegisterModel step, which registers your model with Sagemaker Model Registry.
    step_register_model = RegisterModel(
        name="RegisterCreditModel",
        estimator=xgb_estimator,
        model_data=step_train_model.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics
    )


    # Create Processor object using the model monitor image
    baseline_processor = sagemaker.processing.Processor(
        base_job_name="credit-risk-baseline-processor",
        image_uri=sagemaker.image_uris.retrieve(framework='model-monitor', region='eu-west-1'),
        role=role,
        instance_count=1,
        instance_type=processing_instance_type,
        env = {
            "dataset_format": "{\"csv\": {\"header\": true} }",
            "dataset_source": "/opt/ml/processing/sm_input",
            "output_path": "/opt/ml/processing/sm_output",
            "publish_cloudwatch_metrics": "Disabled"
        }
    )

    # Create a Sagemaker Pipeline step, using the baseline_processor.
    step_create_data_baseline = ProcessingStep(
        name="CreateModelQualityBaseline",
        processor=baseline_processor,
        cache_config=cache_config,
        inputs=[
            ProcessingInput(
                source=step_preprocess_data.properties.ProcessingOutputConfig.Outputs[
                    "baseline_with_headers"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/sm_input",
            )
        ],
        outputs=[
            ProcessingOutput(
                source="/opt/ml/processing/sm_output",
                destination="s3://{}/{}/baseline".format(default_bucket, base_job_prefix),
                output_name="baseline_result",
            )
        ],
    )



    # Create Condition
    cond_gte = ConditionGreaterThanOrEqualTo(
        left=JsonGet(
            step=step_evaluate_model,
            property_file=evaluation_report,
            json_path="binary_classification_metrics.accuracy.value"
        ),
        right=0.7
    )

    # Create a Sagemaker Pipelines ConditionStep, using the condition we just created.
    step_cond = ConditionStep(
        name="AccuracyCondition",
        conditions=[cond_gte],
        if_steps=[step_register_model],
        else_steps=[], 
    )

    from sagemaker.workflow.pipeline import Pipeline

    # Create a Sagemaker Pipeline
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type, 
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_preprocess_data, step_train_model, step_evaluate_model, step_create_data_baseline, step_cond],
    )
    
    return pipeline
def get_pipeline(
    region,
    security_group_ids,
    subnets,
    processing_role=None,
    training_role=None,
    data_bucket=None,
    model_bucket=None,
    model_package_group_name="AbalonePackageGroup",
    pipeline_name="AbalonePipeline",
    base_job_prefix="Abalone",

):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        processing_role: IAM role to create and run processing steps
        training_role: IAM role to create and run training steps
        data_bucket: the bucket to use for storing the artifacts

    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, data_bucket)

    if processing_role is None:
        processing_role = sagemaker.session.get_execution_role(sagemaker_session)
    if training_role is None:
        training_role = sagemaker.session.get_execution_role(sagemaker_session)
    if model_bucket is None:
        model_bucket = sagemaker_session.default_bucket()

    print(f"Creating the pipeline '{pipeline_name}':")
    print(f"Parameters:{region}\n{security_group_ids}\n{subnets}\n{processing_role}\n\
    {training_role}\n{data_bucket}\n{model_bucket}\n{model_package_group_name}\n\
    {pipeline_name}\n{base_job_prefix}")

    # parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus", default_value="PendingManualApproval"
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"s3://{sagemaker_session.default_bucket()}/datasets/abalone-dataset.csv",
    )

    # configure network for encryption, network isolation and VPC configuration
    # Since the preprocessor job takes the data from S3, enable_network_isolation must be set to False
    # see https://github.com/aws/amazon-sagemaker-examples/issues/1689
    network_config = NetworkConfig(
        enable_network_isolation=False, 
        security_group_ids=security_group_ids.split(","),
        subnets=subnets.split(","),
        encrypt_inter_container_traffic=True)
    
    # processing step for feature engineering
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-abalone-preprocess",
        sagemaker_session=sagemaker_session,
        role=processing_role,
        network_config=network_config
    )
    
    step_process = ProcessingStep(
        name="PreprocessAbaloneData",
        processor=sklearn_processor,
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=["--input-data", input_data],
    )

    # training step for generating model artifacts
    model_path = f"s3://{model_bucket}/{base_job_prefix}/AbaloneTrain"
    image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/abalone-train",
        sagemaker_session=sagemaker_session,
        role=training_role,
        subnets=network_config.subnets,
        security_group_ids=network_config.security_group_ids,
        encrypt_inter_container_traffic=True,
        enable_network_isolation=False
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    
    step_train = TrainingStep(
        name="TrainAbaloneModel",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # processing step for evaluation
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-abalone-eval",
        sagemaker_session=sagemaker_session,
        role=processing_role,
        network_config=network_config
    )
    
    evaluation_report = PropertyFile(
        name="AbaloneEvaluationReport",
        output_name="evaluation",
        path="evaluation.json",
    )
    step_eval = ProcessingStep(
        name="EvaluateAbaloneModel",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "evaluate.py"),
        property_files=[evaluation_report],
    )

    # register model step that will be conditionally executed
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json"
        )
    )

    """
    There is a bug in RegisterModel implementation
    The RegisterModel step is implemented in the SDK as two steps, a _RepackModelStep and a _RegisterModelStep. 
    The _RepackModelStep runs a SKLearn training step in order to repack the model.tar.gz to include any custom inference code in the archive. 
    The _RegisterModelStep then registers the repacked model.
    
    The problem is that the _RepackModelStep does not propagate VPC configuration from the Estimator object:
    https://github.com/aws/sagemaker-python-sdk/blob/cdb633b3ab02398c3b77f5ecd2c03cdf41049c78/src/sagemaker/workflow/_utils.py#L88

    This cause the AccessDenied exception because repacker cannot access S3 bucket (all access which is not via VPC endpoint is bloked by the bucket policy)
    
    The issue is opened against SageMaker python SDK: https://github.com/aws/sagemaker-python-sdk/issues/2302
    """

    vpc_config = {
        "Subnets":network_config.subnets,
        "SecurityGroupIds":network_config.security_group_ids
    }

    step_register = RegisterModel(
        name="RegisterAbaloneModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
        vpc_config_override=vpc_config
    )

    # condition step for evaluating model quality and branching execution
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step=step_eval,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value"
        ),
        right=6.0,
    )
    step_cond = ConditionStep(
        name="CheckMSEAbaloneEvaluation",
        conditions=[cond_lte],
        if_steps=[step_register],
        else_steps=[],
    )

    # pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
def test_byo_estimator(sagemaker_session, region):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_name = registry(region) + "/factorization-machines:1"

    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data})

    endpoint_name = name_from_base('byo')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
def test_tuning_byo_estimator(sagemaker_session):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.
    """
    image_name = registry(sagemaker_session.boto_session.region_name
                          ) + '/factorization-machines:1'
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(
                                                          prefix, 'train',
                                                          key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole',
                              train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session,
                              base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        hyperparameter_ranges = {'mini_batch_size': IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(
            estimator=estimator,
            base_tuning_job_name='byo',
            objective_metric_name='test:binary_classification_accuracy',
            hyperparameter_ranges=hyperparameter_ranges,
            max_jobs=2,
            max_parallel_jobs=2)

        tuner.fit({
            'train': s3_train_data,
            'test': s3_train_data
        },
                  include_cls_metadata=False)

        print('Started hyperparameter tuning job with name:' +
              tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1,
                                 'ml.m4.xlarge',
                                 endpoint_name=best_training_job)
        predictor.serializer = _fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
Exemple #35
0
def get_pipeline(
    region,
    sagemaker_project_arn=None,
    role=None,
    default_bucket=None,
    model_package_group_name="restatePackageGroup",  # Choose any name
    pipeline_name="restate-p-XXXXXXXXX",  # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
    base_job_prefix="restate",  # Choose any name
):
    """Gets a SageMaker ML Pipeline instance working with on RE data.
    Args:
        region: AWS region to create and run the pipeline.
        role: IAM role to create and run steps and pipeline.
        default_bucket: the bucket to use for storing the artifacts
    Returns:
        an instance of a pipeline
    """
    sagemaker_session = get_session(region, default_bucket)
    if role is None:
        role = sagemaker.session.get_execution_role(sagemaker_session)

    # Parameters for pipeline execution
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.2xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(
        name="ModelApprovalStatus",
        default_value="PendingManualApproval",  # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval.
    )
    input_data = ParameterString(
        name="InputDataUrl",
        default_value=f"",  # Change this to point to the s3 location of your raw input data.
    )

    data_sources = []
    # Sagemaker session
    sess = sagemaker_session

    # You can configure this with your own bucket name, e.g.
    # bucket = "my-bucket"
    bucket = sess.default_bucket()

    data_sources.append(
        ProcessingInput(
            input_name="restate-california",
            dataset_definition=DatasetDefinition(
                local_path="/opt/ml/processing/restate-california",
                data_distribution_type="FullyReplicated",
                # You can override below to point to other database or use different queries
                athena_dataset_definition=AthenaDatasetDefinition(
                    catalog="AwsDataCatalog",
                    database="restate",
                    query_string="SELECT * FROM restate.california_10",
                    output_s3_uri=f"s3://{bucket}/athena/",
                    output_format="PARQUET",
                ),
            ),
        )
    )

    print(f"Data Wrangler export storage bucket: {bucket}")

    # unique flow export ID
    flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}"
    flow_export_name = f"flow-{flow_export_id}"

    # Output name is auto-generated from the select node's ID + output name from the flow file.
    output_name = "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default"

    s3_output_prefix = f"export-{flow_export_name}/output"
    s3_output_path = f"s3://{bucket}/{s3_output_prefix}"
    print(f"Flow S3 export result path: {s3_output_path}")

    processing_job_output = ProcessingOutput(
        output_name=output_name,
        source="/opt/ml/processing/output",
        destination=s3_output_path,
        s3_upload_mode="EndOfJob",
    )

    # name of the flow file which should exist in the current notebook working directory
    flow_file_name = "sagemaker-pipeline/restate-athena-california.flow"

    # Load .flow file from current notebook working directory
    #!echo "Loading flow file from current notebook working directory: $PWD"

    with open(flow_file_name) as f:
        flow = json.load(f)

    # Upload flow to S3
    s3_client = boto3.client("s3")
    s3_client.upload_file(
        flow_file_name,
        bucket,
        f"data_wrangler_flows/{flow_export_name}.flow",
        ExtraArgs={"ServerSideEncryption": "aws:kms"},
    )

    flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow"

    print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}")

    ## Input - Flow: restate-athena-russia.flow
    flow_input = ProcessingInput(
        source=flow_s3_uri,
        destination="/opt/ml/processing/flow",
        input_name="flow",
        s3_data_type="S3Prefix",
        s3_input_mode="File",
        s3_data_distribution_type="FullyReplicated",
    )

    # IAM role for executing the processing job.
    iam_role = role

    # Unique processing job name. Give a unique name every time you re-execute processing jobs
    processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}"

    # Data Wrangler Container URL.
    container_uri = sagemaker.image_uris.retrieve(
        framework="data-wrangler",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
    )

    # Processing Job Instance count and instance type.
    instance_count = 2
    instance_type = "ml.m5.4xlarge"

    # Size in GB of the EBS volume to use for storing data during processing
    volume_size_in_gb = 30

    # Content type for each output. Data Wrangler supports CSV as default and Parquet.
    output_content_type = "CSV"

    # Network Isolation mode; default is off
    enable_network_isolation = False

    # List of tags to be passed to the processing job
    user_tags = []

    # Output configuration used as processing job container arguments
    output_config = {output_name: {"content_type": output_content_type}}

    # KMS key for per object encryption; default is None
    kms_key = None

    processor = Processor(
        role=iam_role,
        image_uri=container_uri,
        instance_count=instance_count,
        instance_type=instance_type,
        volume_size_in_gb=volume_size_in_gb,
        network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),
        sagemaker_session=sess,
        output_kms_key=kms_key,
        tags=user_tags,
    )

    data_wrangler_step = ProcessingStep(
        name="DataWranglerProcess",
        processor=processor,
        inputs=[flow_input] + data_sources,
        outputs=[processing_job_output],
        job_arguments=[f"--output-config '{json.dumps(output_config)}'"],
    )

    # Processing step for feature engineering
    # this processor does not have awswrangler installed
    sklearn_processor = SKLearnProcessor(
        framework_version="0.23-1",
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{base_job_prefix}/sklearn-restate-preprocess",  # choose any name
        sagemaker_session=sagemaker_session,
        role=role,
    )

    step_process = ProcessingStep(
        name="Preprocess",  # choose any name
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(
                source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[
                    output_name
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/data/raw-data-dir",
            )
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(BASE_DIR, "preprocess.py"),
        job_arguments=[
            "--input-data",
            data_wrangler_step.properties.ProcessingOutputConfig.Outputs[
                output_name
            ].S3Output.S3Uri,
        ],
    )

    # Training step for generating model artifacts
    model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain"
    model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/restateTrain"
    cache_config = CacheConfig(enable_caching=True, expire_after="30d")

    xgb_image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",  # we are using the Sagemaker built in xgboost algorithm
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=xgb_image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        base_job_name=f"{base_job_prefix}/restate-xgb-train",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    xgb_train.set_hyperparameters(
        #    #objective="binary:logistic",
        #    objective="reg:linear",
        num_round=50,
        #    max_depth=5,
        #    eta=0.2,
        #    gamma=4,
        #    min_child_weight=6,
        #    subsample=0.7,
        #    silent=0,
    )

    xgb_train.set_hyperparameters(grow_policy="lossguide")

    xgb_objective_metric_name = "validation:mse"
    xgb_hyperparameter_ranges = {
        "max_depth": IntegerParameter(2, 10, scaling_type="Linear"),
    }

    xgb_tuner_log = HyperparameterTuner(
        xgb_train,
        xgb_objective_metric_name,
        xgb_hyperparameter_ranges,
        max_jobs=3,
        max_parallel_jobs=3,
        strategy="Random",
        objective_type="Minimize",
    )

    xgb_step_tuning = TuningStep(
        name="XGBHPTune",
        tuner=xgb_tuner_log,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
        cache_config=cache_config,
    )

    # dtree_image_uri = '625467769535.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-decision-tree:latest'
    dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version(
        ImageName="restate-dtree"
    )["ContainerImage"]

    dtree_train = Estimator(
        image_uri=dtree_image_uri,
        role=role,
        instance_count=1,
        instance_type=training_instance_type,
        base_job_name=f"{base_job_prefix}/restate-dtree-train",
        output_path=model_path,
        sagemaker_session=sagemaker_session,
    )

    dtree_objective_metric_name = "validation:mse"
    dtree_metric_definitions = [{"Name": "validation:mse", "Regex": "mse:(\S+)"}]

    dtree_hyperparameter_ranges = {
        "max_depth": IntegerParameter(10, 50, scaling_type="Linear"),
        "max_leaf_nodes": IntegerParameter(2, 12, scaling_type="Linear"),
    }

    dtree_tuner_log = HyperparameterTuner(
        dtree_train,
        dtree_objective_metric_name,
        dtree_hyperparameter_ranges,
        dtree_metric_definitions,
        max_jobs=3,
        max_parallel_jobs=3,
        strategy="Random",
        objective_type="Minimize",
    )

    dtree_step_tuning = TuningStep(
        name="DTreeHPTune",
        tuner=dtree_tuner_log,
        inputs={
            "training": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
        cache_config=cache_config,
    )

    dtree_script_eval = ScriptProcessor(
        image_uri=dtree_image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-dtree-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    dtree_evaluation_report = PropertyFile(
        name="EvaluationReportDTree",
        output_name="dtree_evaluation",
        path="dtree_evaluation.json",
    )

    dtree_step_eval = ProcessingStep(
        name="DTreeEval",
        processor=dtree_script_eval,
        inputs=[
            ProcessingInput(
                # source=dtree_step_train.properties.ModelArtifacts.S3ModelArtifacts,
                source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(
                output_name="dtree_evaluation", source="/opt/ml/processing/evaluation"
            ),
        ],
        code=os.path.join(BASE_DIR, "dtree_evaluate.py"),
        property_files=[dtree_evaluation_report],
    )

    xgb_script_eval = ScriptProcessor(
        image_uri=xgb_image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{base_job_prefix}/script-xgb-eval",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    xgb_evaluation_report = PropertyFile(
        name="EvaluationReportXGBoost",
        output_name="xgb_evaluation",
        path="xgb_evaluation.json",
    )

    xgb_step_eval = ProcessingStep(
        name="XGBEval",
        processor=xgb_script_eval,
        inputs=[
            ProcessingInput(
                source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(BASE_DIR, "xgb_evaluate.py"),
        property_files=[xgb_evaluation_report],
    )

    xgb_model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/xgb_evaluation.json".format(
                xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )

    dtree_model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/dtree_evaluation.json".format(
                dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][
                    "S3Uri"
                ]
            ),
            content_type="application/json",
        )
    )

    xgb_eval_metrics = JsonGet(
        step=xgb_step_eval,
        property_file=xgb_evaluation_report,
        json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
    )

    dtree_eval_metrics = JsonGet(
        step=dtree_step_eval,
        property_file=dtree_evaluation_report,
        json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
    )

    # Register model step that will be conditionally executed
    dtree_step_register = RegisterModel(
        name="DTreeReg",
        estimator=dtree_train,
        model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=dtree_model_metrics,
    )

    # Register model step that will be conditionally executed
    xgb_step_register = RegisterModel(
        name="XGBReg",
        estimator=xgb_train,
        model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=xgb_model_metrics,
    )

    # Condition step for evaluating model quality and branching execution
    cond_lte = ConditionGreaterThanOrEqualTo(  # You can change the condition here
        left=JsonGet(
            step=dtree_step_eval,
            property_file=dtree_evaluation_report,
            json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),
        right=JsonGet(
            step=xgb_step_eval,
            property_file=xgb_evaluation_report,
            json_path="regression_metrics.r2s.value",  # This should follow the structure of your report_dict defined in the evaluate.py file.
        ),  # You can change the threshold here
    )

    step_cond = ConditionStep(
        name="AccuracyCond",
        conditions=[cond_lte],
        if_steps=[dtree_step_register],
        else_steps=[xgb_step_register],
    )
    create_date = time.strftime("%Y-%m-%d-%H-%M-%S")

    # Pipeline instance
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data
        ],
        pipeline_experiment_config=PipelineExperimentConfig(
            pipeline_name + "-" + create_date, "restate-{}".format(create_date)
        ),
        steps=[
            data_wrangler_step,
            step_process,
            dtree_step_tuning,
            xgb_step_tuning,
            dtree_step_eval,
            xgb_step_eval,
            step_cond,
        ],
        sagemaker_session=sagemaker_session,
    )
    return pipeline
Exemple #36
0
hyperparameters = {
    "epochs": "50",
    "time_freq": freq,
    "prediction_length": str(prediction_length),
    "context_length": str(context_length),
    "num_cells": "50",
    "num_layers": "2",
    "mini_batch_size": "128",
    "learning_rate": "0.001",
    "early_stopping_patience": "10"
}

# In[29]:

# set the hyperparams
estimator.set_hyperparameters(**hyperparameters)

# ## Training Job
#
# Now, we are ready to launch the training job! SageMaker will start an EC2 instance, download the data from S3, start training the model and save the trained model.
#
# If you provide the `test` data channel, as we do in this example, DeepAR will also calculate accuracy metrics for the trained model on this test data set. This is done by predicting the last `prediction_length` points of each time series in the test set and comparing this to the *actual* value of the time series. The computed error metrics will be included as part of the log output.
#
# The next cell may take a few minutes to complete, depending on data size, model complexity, and training options.

# In[30]:

get_ipython().run_cell_magic(
    'time', '',
    '# train and test channels\ndata_channels = {\n    "train": train_path,\n    "test": test_path\n}\n\n# fit the estimator\nestimator.fit(inputs=data_channels)'
)
def test_tuning_byo_estimator(sagemaker_session):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.
    """
    image_name = registry(sagemaker_session.boto_session.region_name) + '/factorization-machines:1'
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix=os.path.join(prefix, 'train', key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        hyperparameter_ranges = {'mini_batch_size': IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(estimator=estimator, base_tuning_job_name='byo',
                                    objective_metric_name='test:binary_classification_accuracy',
                                    hyperparameter_ranges=hyperparameter_ranges,
                                    max_jobs=2, max_parallel_jobs=2)

        tuner.fit({'train': s3_train_data, 'test': s3_train_data}, include_cls_metadata=False)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.m4.xlarge', endpoint_name=best_training_job)
        predictor.serializer = _fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None