def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        role = efs_fsx_setup["role_name"]
        subnets = [efs_fsx_setup["subnet_id"]]
        security_group_ids = efs_fsx_setup["security_group_ids"]
        kmeans = KMeans(
            role=role,
            instance_count=INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=K,
            sagemaker_session=sagemaker_session,
            subnets=subnets,
            security_group_ids=security_group_ids,
        )

        file_system_fsx_id = efs_fsx_setup["file_system_fsx_id"]
        records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        job_name = unique_name_from_base("kmeans-fsx")
        kmeans.fit(records, job_name=job_name)
        model_path, _ = kmeans.model_data.rsplit("/", 1)
        assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans-specific hyperparameters
        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hyperparameters to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges,
                                    objective_type='Minimize', max_jobs=2, max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name: ' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
def kmeans_estimator(sagemaker_session):
    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans-specific hyperparameters
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1
    return kmeans
def test_record_set(sagemaker_session, cpu_instance_type):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
    )
    record_set = kmeans.record_set(datasets.one_p_mnist()[0][:100], encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client("s3")
    head = s3_client.head_object(Bucket=parsed_url.netloc, Key=parsed_url.path.lstrip("/"))
    assert head["ServerSideEncryption"] == "AES256"
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    subnets = [efs_fsx_setup.subnet_id]
    security_group_ids = efs_fsx_setup.security_group_ids
    role = efs_fsx_setup.role_name
    kmeans = KMeans(
        role=role,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=cpu_instance_type,
        k=K,
        sagemaker_session=sagemaker_session,
        subnets=subnets,
        security_group_ids=security_group_ids,
    )

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner = HyperparameterTuner(
            estimator=kmeans,
            objective_metric_name=OBJECTIVE_METRIC_NAME,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type="Minimize",
            max_jobs=MAX_JOBS,
            max_parallel_jobs=MAX_PARALLEL_JOBS,
        )

        file_system_fsx_id = efs_fsx_setup.file_system_fsx_id
        train_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )
        test_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
            channel="test",
        )

        job_name = unique_name_from_base("tune-kmeans-fsx")
        tuner.fit([train_records, test_records], job_name=job_name)
        tuner.wait()
        best_training_job = tuner.best_training_job()
        assert best_training_job
def test_record_set(sagemaker_session):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
    with gzip.open(data_path, 'rb') as file_object:
        train_set, _, _ = pickle.load(file_object, **pickle_args)
    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10, sagemaker_session=sagemaker_session)
    record_set = kmeans.record_set(train_set[0][:100], encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client('s3')
    head = s3_client.head_object(Bucket=parsed_url.netloc, Key=parsed_url.path.lstrip('/'))
    assert head['ServerSideEncryption'] == 'AES256'
def test_record_set(sagemaker_session):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    with gzip.open(data_path, "rb") as file_object:
        train_set, _, _ = pickle.load(file_object, **pickle_args)
    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="ml.c4.xlarge",
        k=10,
        sagemaker_session=sagemaker_session,
    )
    record_set = kmeans.record_set(train_set[0][:100], encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client("s3")
    head = s3_client.head_object(Bucket=parsed_url.netloc, Key=parsed_url.path.lstrip("/"))
    assert head["ServerSideEncryption"] == "AES256"
def test_tuning_step(sfn_client, record_set_for_hyperparameter_tuning, sagemaker_role_arn, sfn_role_arn):
    job_name = generate_job_name()

    kmeans = KMeans(role=sagemaker_role_arn, instance_count=1, instance_type=INSTANCE_TYPE, k=10)

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    tuner = HyperparameterTuner(
        estimator=kmeans,
        objective_metric_name="test:msd",
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    # Build workflow definition
    tuning_step = TuningStep(
        'Tuning', tuner=tuner, job_name=job_name, data=record_set_for_hyperparameter_tuning
    )
    tuning_step.add_retry(SAGEMAKER_RETRY_STRATEGY)
    workflow_graph = Chain([tuning_step])

    with timeout(minutes=DEFAULT_TIMEOUT_MINUTES):
        # Create workflow and check definition
        workflow = create_workflow_and_check_definition(
            workflow_graph=workflow_graph,
            workflow_name=unique_name_from_base("integ-test-tuning-step-workflow"),
            sfn_client=sfn_client,
            sfn_role_arn=sfn_role_arn,
        )

        # Execute workflow
        execution = workflow.execute()
        execution_output = execution.get_output(wait=True)

        # Check workflow output
        assert execution_output.get("HyperParameterTuningJobStatus") == "Completed"

        # Cleanup
        state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
def kmeans_estimator(sagemaker_session, cpu_instance_type):
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )
    # set kmeans-specific hyperparameters
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1
    return kmeans
def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(train_set[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"],
        )
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge', k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans-specific hyperparameters
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name, sagemaker_session=sagemaker_session)
    attached_transformer.wait()
"http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz") f = gzip.open('mnist.pkl.gz', 'rb') train_set, valid_set, test_set = pickle.load(f, encoding="latin1") f.close() return train_set, valid_set, test_set if __name__ == "__main__": # get MNIST dataset train_set, valid_set, test_set = get_mnist_dataset() # create model using built-in k-means algorithm kmeans = KMeans( role=ROLE, train_instance_count=1, #train_instance_type='local', train_instance_type='ml.c4.4xlarge', output_path=OUTPUT_PATH, k=10) # train model kmeans.fit(kmeans.record_set(train_set[0])) # deploy model to endpoint kmeans_predictor = kmeans.deploy(initial_instance_count=2, instance_type='ml.m4.xlarge', endpoint_name=ENDPOINT_NAME) # test model input_set = test_set clustered_data = [[] for i in range(0, 10)] for i in range(0, len(input_set[0])):
def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False, job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_kmeans():
    with timeout(minutes=15):
        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('kmeans')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
        model = KMeansModel(kmeans.model_data, role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge', k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans-specific hyperparameters
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base('test-kmeans-attach')
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name, sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
        estimator = KMeans.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
# In[ ]:

# from time import gmtime, strftime
# job_name = 'KMeans-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# print("Training job", job_name)


# In[ ]:

from sagemaker import KMeans

kmeans = KMeans(role=role,
                train_instance_count=2,
                train_instance_type='ml.c4.8xlarge',
                output_path="s3://2018-10-08-batch-test",
                k=10,
                data_location=trainURL)

# Use the high-level SDK

# In[ ]:

get_ipython().run_cell_magic('time', '', '\nkmeans.fit(kmeans.record_set(train_set[0]))')

# In[ ]:
def process(ticker, local_data_folder, bucket, role, prefix, sagemaker_session):
    df = pd.read_pickle('{}/{}.{}'.format(local_data_folder, ticker, 'pkl'))
    df.dropna(inplace=True)
    df.drop(columns=["Date"], inplace=True)

    df.loc[df.Label >= threshold, 'direction'] = BUY
    df.loc[df.Label <= -threshold, 'direction'] = SELL
    df.loc[(df.Label < threshold) & (df.Label > -threshold), 'direction'] = NONE

    # Normalize
    scaler = MinMaxScaler()
    Y_df = pd.DataFrame(df["Label"]).astype('float64')
    X_df = df.drop(columns=["Label"]).astype('float64')
    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)
    X[:, X.shape[1] - 1] = X_df["direction"].to_numpy()

    # split data
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=.33, random_state=1, shuffle=True)

    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)

    # Remove direction column and train
    kmeans.fit(kmeans.record_set(x_train[:, 0:x_train.shape[1] - 1].astype('float32')))

    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

    create_dir('{}/s3/{}'.format(local_data_folder, ticker))

    # Label = change in price (+ve, -ve, none)
    # Direction = BUY, SELL, NONE
    # Cluster = cluster_0, cluster_1, cluster_2

    # train data
    y_train_df = pd.DataFrame(y_train, columns=["Label"])
    x_train_df = pd.DataFrame(
        x_train,
        columns=['col-{}'.format(i) for i in range(x_train.shape[1] - 1)] + ["direction"])
    dataset_with_cluster = pd.concat(
        [y_train_df.astype("float32"),
         x_train_df.astype("float32"),
         clustering(x_train_df.drop(columns=["direction"]).astype('float32').values,
                    kmeans_predictor)],
        axis=1)
    dataset_with_cluster.to_csv('{}/s3/{}/all-train.csv'.format(local_data_folder, ticker),
                                header=True, index=False)

    # test data
    y_test_df = pd.DataFrame(y_test, columns=["Label"])
    x_test_df = pd.DataFrame(
        x_test,
        columns=['col-{}'.format(i) for i in range(x_test.shape[1] - 1)] + ['direction'])
    pd.concat([y_test_df.astype("float32"), x_test_df.astype("float32")], axis=1) \
        .to_csv('{}/s3/{}/all-test.csv'.format(local_data_folder, ticker),
                header=True, index=False)

    # clean up the clustering endpoint
    kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)

    all_test_pred = pd.read_csv(
        "{}/s3/{}/all-test.csv".format(local_data_folder, ticker)).dropna()
    all_train_pred = pd.read_csv(
        "{}/s3/{}/all-train.csv".format(local_data_folder, ticker)).dropna()

    cluster0_df = dataset_with_cluster[
        dataset_with_cluster["Cluster"] == 0].drop(columns=["Cluster"])
    save_data(cluster0_df.drop(columns=["direction"]), ticker, local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket, key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role, sagemaker_session)
    all_test_pred["cluster0_pred"] = estimator.predict(
        all_test_pred.drop(columns=["Label", "direction"]).astype('float32').values)
    all_train_pred["cluster0_pred"] = estimator.predict(
        all_train_pred.drop(columns=["Label", "direction", "Cluster"]).astype('float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    cluster1_df = dataset_with_cluster[
        dataset_with_cluster["Cluster"] == 1].drop(columns=["Cluster"])
    save_data(cluster1_df.drop(columns=["direction"]), ticker, local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket, key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role, sagemaker_session)
    all_test_pred["cluster1_pred"] = estimator.predict(
        all_test_pred.drop(columns=["Label", "direction", "cluster0_pred"])
        .astype('float32').values)
    all_train_pred["cluster1_pred"] = estimator.predict(
        all_train_pred.drop(columns=["Label", "direction", "Cluster", "cluster0_pred"])
        .astype('float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    cluster2_df = dataset_with_cluster[
        dataset_with_cluster["Cluster"] == 2].drop(columns=["Cluster"])
    save_data(cluster2_df.drop(columns=["direction"]), ticker, local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket, key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role, sagemaker_session)
    all_test_pred["cluster2_pred"] = estimator.predict(
        all_test_pred.drop(columns=["Label", "direction", "cluster0_pred", "cluster1_pred"])
        .astype('float32').values)
    all_train_pred["cluster2_pred"] = estimator.predict(
        all_train_pred.drop(columns=["Label", "direction", "Cluster",
                                     "cluster0_pred", "cluster1_pred"])
        .astype('float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    os.remove(local_data_folder + '/s3/' + ticker + '/train.csv')
    os.remove(local_data_folder + '/s3/' + ticker + '/validation.csv')

    all_buys = pd.DataFrame(
        [cluster0_df[cluster0_df['direction'] == BUY].shape[0],
         cluster1_df[cluster1_df['direction'] == BUY].shape[0],
         cluster2_df[cluster2_df['direction'] == BUY].shape[0]],
        columns=["BUY"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])
    all_sells = pd.DataFrame(
        [cluster0_df[cluster0_df['direction'] == SELL].shape[0],
         cluster1_df[cluster1_df['direction'] == SELL].shape[0],
         cluster2_df[cluster2_df['direction'] == SELL].shape[0]],
        columns=["SELL"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])
    all_nones = pd.DataFrame(
        [cluster0_df[cluster0_df['direction'] == NONE].shape[0],
         cluster1_df[cluster1_df['direction'] == NONE].shape[0],
         cluster2_df[cluster2_df['direction'] == NONE].shape[0]],
        columns=["NONE"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    cluster_selection_df = pd.concat([all_buys, all_sells, all_nones], axis=1)
    # idxmax keeps the row labels aligned even after rows are dropped
    buy_cluster_name = cluster_selection_df['BUY'].idxmax()
    sell_cluster_name = cluster_selection_df.drop(index=[buy_cluster_name])['SELL'].idxmax()
    none_cluster_name = cluster_selection_df.drop(
        index=[buy_cluster_name, sell_cluster_name])['NONE'].idxmax()

    # Generate selected-cluster column based on max(cluster0, cluster1, cluster2)
    all_test_pred["selected-cluster"] = all_test_pred[
        ["cluster0_pred", "cluster1_pred", "cluster2_pred"]].idxmax(axis=1)
    all_train_pred["selected-cluster"] = all_train_pred[
        ["cluster0_pred", "cluster1_pred", "cluster2_pred"]].idxmax(axis=1)

    # convert selected-cluster to BUY, SELL, NONE
    all_test_pred.loc[all_test_pred["selected-cluster"] == buy_cluster_name,
                      "prediction"] = BUY
    all_test_pred.loc[all_test_pred["selected-cluster"] == sell_cluster_name,
                      "prediction"] = SELL
    all_test_pred.loc[all_test_pred["selected-cluster"] == none_cluster_name,
                      "prediction"] = NONE
    all_train_pred.loc[all_train_pred["selected-cluster"] == buy_cluster_name,
                       "prediction"] = BUY
    all_train_pred.loc[all_train_pred["selected-cluster"] == sell_cluster_name,
                       "prediction"] = SELL
    all_train_pred.loc[all_train_pred["selected-cluster"] == none_cluster_name,
                       "prediction"] = NONE

    # Benchmark results
    all_test_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_test_pred.shape[0])]
    all_train_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_train_pred.shape[0])]

    all_test_pred.to_csv('{}/s3/{}/all-test-pred.csv'.format(local_data_folder, ticker),
                         index=None)
    all_train_pred.to_csv('{}/s3/{}/all-train-pred.csv'.format(local_data_folder, ticker),
                          index=None)
    cluster_selection_df.to_csv('{}/s3/{}/cluster-selection.csv'.format(local_data_folder, ticker),
                                index=None)

    # remove NA
    all_test_pred = all_test_pred.dropna()
    all_train_pred = all_train_pred.dropna()

    # test accuracy
    test_accuracy = accuracy_score(all_test_pred["direction"],
                                   all_test_pred["prediction"], normalize=True)
    benchmark_test_accuracy = accuracy_score(all_test_pred["direction"],
                                             all_test_pred["random-prediction"], normalize=True)
    print('Test accuracy:', test_accuracy, ", Benchmark:", benchmark_test_accuracy)

    # train accuracy
    train_accuracy = accuracy_score(all_train_pred["direction"],
                                    all_train_pred["prediction"], normalize=True)
    benchmark_train_accuracy = accuracy_score(all_train_pred["direction"],
                                              all_train_pred["random-prediction"], normalize=True)
    print('Train accuracy:', train_accuracy, ", Benchmark:", benchmark_train_accuracy)

    accuracy_df = pd.DataFrame([ticker, test_accuracy, benchmark_test_accuracy,
                                train_accuracy, benchmark_train_accuracy]).T
    accuracy_df.columns = ["ticker", "test_accuracy", "benchmark_test_accuracy",
                           "train_accuracy", "benchmark_train_accuracy"]
    accuracy_file = "{}/accuracy.csv".format(local_data_folder)
    header = not os.path.exists(accuracy_file)
    accuracy_df.to_csv(accuracy_file, mode="a", header=header, index=False)
def cluster_helper(role, sagemaker_session, bucket, local_data_folder, prefix, ticker):
    A_df = pd.read_pickle(local_data_folder + ticker + '.pkl')
    A_df.dropna(inplace=True)
    A_df.drop(columns=["Date"], inplace=True)

    # Normalize
    scaler = MinMaxScaler()
    Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
    X_df = A_df.drop(columns=["Label"]).astype('float64')
    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)

    # split data
    print("Splitting data")
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=.33, random_state=1, shuffle=True)

    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    print("Clustering")
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)
    kmeans.fit(kmeans.record_set(pd.DataFrame(x_train).astype('float32').values))

    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

    create_dir('{}s3/{}'.format(local_data_folder, ticker))

    # upload train and test data to S3
    dataset_with_cluster = pd.concat(
        [pd.DataFrame(y_train, columns=["label"]).astype("float32"),
         pd.DataFrame(x_train).astype("float32"),
         clustering(x_train, kmeans_predictor)],
        axis=1)
    dataset_with_cluster.to_csv('{}s3/{}/all-train.csv'.format(local_data_folder, ticker),
                                header=False, index=False)

    # prepare cluster data sets
    create_dir('{}s3/{}/train'.format(local_data_folder, ticker))
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0],
              "{}/train/cluster-0".format(ticker), True, local_data_folder)
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1],
              "{}/train/cluster-1".format(ticker), True, local_data_folder)
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2],
              "{}/train/cluster-2".format(ticker), True, local_data_folder)

    # We have to predict the cluster for each test data set so that we can use it
    # for testing our next model
    dataset_with_cluster = pd.concat(
        [pd.DataFrame(y_test, columns=["label"]).astype("float32"),
         pd.DataFrame(x_test).astype("float32"),
         clustering(x_test, kmeans_predictor)],
        axis=1)
    dataset_with_cluster.to_csv('{}s3/{}/all-test.csv'.format(local_data_folder, ticker),
                                header=False, index=False)

    # # prepare cluster data sets
    # create_dir('{}s3/{}/test'.format(local_data_folder, ticker))
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/test/cluster-0".format(ticker), False, local_data_folder)
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/test/cluster-1".format(ticker), False, local_data_folder)
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/test/cluster-2".format(ticker), False, local_data_folder)

    # delete endpoint
    kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)
    print('Completed clustering for', ticker)
buf.seek(0)
boto3.resource('s3').Bucket(bucket).Object(data_key).upload_fileobj(buf)

## 3.3.2

from sagemaker import KMeans

data_location = 's3://{}/kmeans_highlevel_example/data'.format(bucket)
output_location = 's3://{}/kmeans_highlevel_example/output'.format(bucket)

print('training data will be uploaded to: {}'.format(data_location))
print('training artifacts will be uploaded to: {}'.format(output_location))

kmeans = KMeans(role=role,
                train_instance_count=2,
                train_instance_type='ml.c4.8xlarge',
                output_path=output_location,
                k=10,
                data_location=data_location)

%%time
kmeans.fit(kmeans.record_set(train_set[0]))

3.4.1

Deployment takes 3 steps: create a model, create an endpoint configuration, and create an endpoint.
High level - the `deploy` method takes care of all these steps in one call.
Low level - provides a corresponding method for each step (sketched below).
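A minimal sketch of both routes, assuming the trained `kmeans` estimator above; the resource names are hypothetical, and the low-level calls are the standard boto3 SageMaker client operations:

# High level: one call creates the model, the endpoint config, and the endpoint.
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

# Low level: the same three steps spelled out with the boto3 client.
import boto3

sm = boto3.client('sagemaker')
sm.create_model(
    ModelName='kmeans-model',  # hypothetical name
    ExecutionRoleArn=role,
    PrimaryContainer={'Image': kmeans.train_image(),
                      'ModelDataUrl': kmeans.model_data})
sm.create_endpoint_config(
    EndpointConfigName='kmeans-endpoint-config',  # hypothetical name
    ProductionVariants=[{'VariantName': 'AllTraffic',
                         'ModelName': 'kmeans-model',
                         'InitialInstanceCount': 1,
                         'InstanceType': 'ml.m4.xlarge'}])
sm.create_endpoint(
    EndpointName='kmeans-endpoint',  # hypothetical name
    EndpointConfigName='kmeans-endpoint-config')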
def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        kmeans = KMeans(
            role=ROLE,
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"],
        )
def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]), wait=False, job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_kmeans(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data, role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
            assert "Could not find model" in str(exception.value)
def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    tags = [{"Key": "some-tag", "Value": "value-for-tag"}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans-specific hyperparameters
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
    estimator._enable_network_isolation = True

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix
    )

    transformer = estimator.transformer(1, cpu_instance_type, tags=tags)
    transformer.transform(transform_input, content_type="text/csv")

    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name
        )
        assert model_desc["EnableNetworkIsolation"]

        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc["ModelArn"]
        )["Tags"]
        assert tags == model_tags
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans-specific hyperparameters
        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hyperparameters to optimize over
        hyperparameter_ranges = {
            'extra_center_factor': IntegerParameter(1, 10),
            'mini_batch_size': IntegerParameter(10, 100),
            'epochs': IntegerParameter(1, 2),
            'init_method': CategoricalParameter(['kmeans++', 'random'])
        }
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(
            estimator=kmeans, objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type='Minimize', max_jobs=2, max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name: ' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
def test_transform_byo_estimator(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
    tags = [{'Key': 'some-tag', 'Value': 'value-for-tag'}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge', k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans-specific hyperparameters
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base('test-kmeans-attach')
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    estimator = Estimator.attach(training_job_name=job_name,
                                 sagemaker_session=sagemaker_session)
    transformer = estimator.transformer(1, 'ml.m4.xlarge', tags=tags)
    transformer.transform(transform_input, content_type='text/csv')

    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc['ModelArn'])['Tags']
        assert tags == model_tags
subplot.axis('off')
subplot.imshow(imgr, cmap='gray')
plt.title(caption)

show_digit(train_set[0][30], 'This is a {}'.format(train_set[1][30]))

#4
from sagemaker import KMeans

data_location = 's3://{}/kmeans_highlevel_example/data'.format(bucket)
output_location = 's3://{}/kmeans_highlevel_example/output'.format(bucket)

print('training data will be uploaded to: {}'.format(data_location))
print('training artifacts will be uploaded to: {}'.format(output_location))

kmeans = KMeans(role=role,                            # IAM role used to read and write training results
                train_instance_count=2,               # number of instances to use for model training
                train_instance_type='ml.c4.8xlarge',  # instance type to use for model training
                output_path=output_location,          # where to store the training results
                k=10,                                 # number of clusters to create; 10 because this classifies the digits 0-9
                data_location=data_location)          # Amazon S3 location to which the converted training data is uploaded

#5
%%time
kmeans.fit(kmeans.record_set(train_set[0]))

#6
%%time
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.t2.medium')

#7
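Cell #7 is cut off above; a plausible next cell, following the usual shape of this walkthrough (the exact body is an assumption): query the deployed endpoint with a single validation image and inspect the cluster it lands in.

# Hypothetical cell #7: classify one image with the deployed endpoint.
result = kmeans_predictor.predict(valid_set[0][30:31])
print(result)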
def gtext2vec(text):
    return text2vec(model, text)


news_df['vectors'] = news_df.words.progress_apply(gtext2vec)

## Clustering and generating scatter
X = np.concatenate(news_df['vectors'].values)

## run sagemaker kmeans
role = get_execution_role()
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)
kmeans.fit(kmeans.record_set(X))

## deploy sagemaker kmeans endpoint
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.t2.medium")
news_df['cluster'] = kmeans_predictor.predict(X)

## Save News
news_df = news_df.drop(["ori_text", "words"], axis=1)
news_df.to_pickle('news_df.pkl')

## Save Model
# <img src='notebook_ims/elbow_graph.png' width=50% /> # # A distance elbow can be seen around 8 when the distance starts to increase and then decrease at a slower rate. This indicates that there is enough separation to distinguish the data points in each cluster, but also that you included enough clusters so that the data points aren’t *extremely* far away from each cluster. # In[40]: # define a KMeans estimator # Solution from sagemaker import KMeans NUM_CLUSTERS = 8 kmeans = KMeans(role=role, train_instance_count=1, train_instance_type='ml.c4.xlarge', output_path=output_path, # using the same output path as was defined, earlier k=NUM_CLUSTERS) # ### EXERCISE: Create formatted, k-means training data # # Just as before, you should convert the `counties_transformed` df into a numpy array and then into a RecordSet. This is the required format for passing training data into a `KMeans` model. # In[41]: # convert the transformed dataframe into record_set data #Solution kmeans_train_data_np = counties_transformed.values.astype('float32') kmeans_formatted_data = kmeans.record_set(kmeans_train_data_np)
def test_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data, role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
            assert "Could not find model" in str(exception.value)
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans-specific hyperparameters
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(DATA_DIR, "one_p_mnist", "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix
    )

    transformer = _create_transformer_and_transform_job(kmeans, transform_input, cpu_instance_type)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name, sagemaker_session=sagemaker_session
    )
    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        attached_transformer.wait()
train_data = counties_transformed.values.astype('float32')

# First, we define the hyperparameters of our KMeans model, just as we did with our PCA
# model. The KMeans algorithm lets the user specify how many clusters to identify; in
# this instance, let's try to find the top 7 clusters in our dataset.

# In[33]:

from sagemaker import KMeans

num_clusters = 7
kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type='ml.c4.xlarge',
                output_path='s3://' + bucket_name + '/counties/',
                k=num_clusters)

# Then we train the model on our training data.

# In[34]:

get_ipython().run_cell_magic('time', '', 'kmeans.fit(kmeans.record_set(train_data))')

# Now we deploy the model and pass in the original training set to get the labels for
# each entry. This tells us which cluster each county belongs to.

# In[35]:
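# Cell [35] is cut off above; a plausible continuation under the usual pattern (the
# exact cell body is an assumption): deploy the trained model, then predict on
# `train_data` to get a cluster label for every county.

# Hypothetical cell In[35]:
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.t2.medium')
result = kmeans_predictor.predict(train_data)
cluster_labels = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]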
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense='True',
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None