def test_kmeans_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        kmeans = KMeans(
            role=ROLE,
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records)

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]
            ["S3Uri"],
        )
def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    tags = [{"Key": "some-tag", "Value": "value-for-tag"}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
    estimator._enable_network_isolation = True

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix
    )

    transformer = estimator.transformer(1, cpu_instance_type, tags=tags)
    transformer.transform(transform_input, content_type="text/csv")

    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name
        )
        assert model_desc["EnableNetworkIsolation"]

        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc["ModelArn"]
        )["Tags"]
        assert tags == model_tags
def test_kmeans(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterators = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('kmeans')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Esempio n. 4
0
def test_kmeans(sagemaker_session):
    job_name = unique_name_from_base("kmeans")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

    predictor.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
        assert "Could not find model" in str(exception.value)
def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False, job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_async_kmeans():

    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(
            region_name=REGION))
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterators = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=35):
        estimator = KMeans.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1,
                                 'ml.c4.xlarge',
                                 endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_kmeans_serverless_inference(sagemaker_session, cpu_instance_type,
                                     training_set):
    job_name = unique_name_from_base("kmeans-serverless")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(
            serverless_inference_config=ServerlessInferenceConfig(),
            endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(
                ModelName=model.name)
            assert "Could not find model" in str(exception.value)
Esempio n. 8
0
def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]),
                   wait=False,
                   job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Esempio n. 9
0
def test_transform_byo_estimator(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
    tags = [{'Key': 'some-tag', 'Value': 'value-for-tag'}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(
                        sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base('test-kmeans-attach')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    estimator = Estimator.attach(training_job_name=job_name,
                                 sagemaker_session=sagemaker_session)

    transformer = estimator.transformer(1, 'ml.m4.xlarge', tags=tags)
    transformer.transform(transform_input, content_type='text/csv')

    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc['ModelArn'])['Tags']
        assert tags == model_tags
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense='True',
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Esempio n. 11
0
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=20):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        kmeans.max_iterators = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges, objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
Esempio n. 12
0
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        kmeans.max_iterators = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges, objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
Esempio n. 13
0
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input, cpu_instance_type)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
Esempio n. 14
0
def kmeans_estimator(sagemaker_session):
    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))
    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    return kmeans
Esempio n. 15
0
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(
                        sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
Esempio n. 16
0
def kmeans_estimator(sagemaker_session, cpu_instance_type):
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )
    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    return kmeans
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(DATA_DIR, "one_p_mnist",
                                        "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input, cpu_instance_type)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
def test_kmeans_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(train_set[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records)

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]
            ["S3Uri"],
        )
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge', k=10, sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(path=transform_input_path,
                                                           key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input)

    attached_transformer = Transformer.attach(transformer.latest_transform_job.name,
                                              sagemaker_session=sagemaker_session)
    attached_transformer.wait()