def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        role = efs_fsx_setup["role_name"]
        subnets = [efs_fsx_setup["subnet_id"]]
        security_group_ids = efs_fsx_setup["security_group_ids"]
        kmeans = KMeans(
            role=role,
            instance_count=INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=K,
            sagemaker_session=sagemaker_session,
            subnets=subnets,
            security_group_ids=security_group_ids,
        )

        file_system_fsx_id = efs_fsx_setup["file_system_fsx_id"]
        records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        job_name = unique_name_from_base("kmeans-fsx")
        kmeans.fit(records, job_name=job_name)
        model_path, _ = kmeans.model_data.rsplit("/", 1)
        assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
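# The FSx test above relies on an assert_s3_files_exist helper defined
# elsewhere in the suite. A minimal sketch of what such a helper could look
# like (the boto3 calls are real; the exact implementation may differ):
from urllib.parse import urlparse

import boto3


def assert_s3_files_exist(sagemaker_session, s3_url, files):
    # List objects under the S3 prefix and check each expected file name
    # appears as a key suffix.
    parsed_url = urlparse(s3_url)
    s3 = boto3.client("s3", region_name=sagemaker_session.boto_region_name)
    contents = s3.list_objects_v2(
        Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip("/")
    ).get("Contents", [])
    for f in files:
        if not any(obj["Key"].endswith(f) for obj in contents):
            raise ValueError("File {} is not found under {}".format(f, s3_url))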
def test_kmeans(sagemaker_session):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(
            kmeans.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
        assert "Could not find model" in str(exception.value)
def test_kmeans(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('kmeans')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data, role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    tags = [{"Key": "some-tag", "Value": "value-for-tag"}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
    # toggle network isolation on the attached estimator (private attribute)
    estimator._enable_network_isolation = True

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix
    )

    transformer = estimator.transformer(1, cpu_instance_type, tags=tags)
    transformer.transform(transform_input, content_type="text/csv")

    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name
        )
        assert model_desc["EnableNetworkIsolation"]

        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc["ModelArn"]
        )["Tags"]
        assert tags == model_tags
def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False, job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_async_kmeans():
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
        estimator = KMeans.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_kmeans_serverless_inference(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans-serverless")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(
            kmeans.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(
            serverless_inference_config=ServerlessInferenceConfig(), endpoint_name=job_name
        )
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
        assert "Could not find model" in str(exception.value)
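# test_kmeans_serverless_inference (and test_async_kmeans below) take a
# training_set fixture. A minimal sketch of such a fixture, assuming the same
# pickled MNIST archive and DATA_DIR used by the other tests here (the real
# definition lives in the suite's conftest):
import gzip
import os
import pickle

import pytest


@pytest.fixture(scope="module")
def training_set():
    data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
    with gzip.open(data_path, "rb") as f:
        train_set, _, _ = pickle.load(f, encoding="latin1")
    return train_set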
def test_transform_byo_estimator(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
    tags = [{'Key': 'some-tag', 'Value': 'value-for-tag'}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10, sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base('test-kmeans-attach')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    estimator = Estimator.attach(training_job_name=job_name,
                                 sagemaker_session=sagemaker_session)

    transformer = estimator.transformer(1, 'ml.m4.xlarge', tags=tags)
    transformer.transform(transform_input, content_type='text/csv')

    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc['ModelArn'])['Tags']
        assert tags == model_tags
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense='True',
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]), wait=False, job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input, cpu_instance_type)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name, sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10, sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name, sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(DATA_DIR, "one_p_mnist", "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input, cpu_instance_type)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name, sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10, sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name, sagemaker_session=sagemaker_session)
    attached_transformer.wait()
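# The attach/transform tests above call _create_transformer_and_transform_job,
# defined elsewhere in the suite. A sketch consistent with how it is used
# (one instance, CSV input); the instance_type parameter only appears in the
# newer variants, so it defaults here:
def _create_transformer_and_transform_job(estimator, transform_input, instance_type="ml.m4.xlarge"):
    # Build a transformer from the trained estimator and start a batch job.
    transformer = estimator.transformer(1, instance_type)
    transformer.transform(transform_input, content_type="text/csv")
    return transformer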
if __name__ == "__main__":
    # get MNIST dataset
    train_set, valid_set, test_set = get_mnist_dataset()

    # create model using built-in k-means algorithm
    kmeans = KMeans(
        role=ROLE,
        train_instance_count=1,
        # train_instance_type='local',
        train_instance_type='ml.c4.4xlarge',
        output_path=OUTPUT_PATH,
        k=10)

    # train model
    kmeans.fit(kmeans.record_set(train_set[0]))

    # deploy model to endpoint
    kmeans_predictor = kmeans.deploy(initial_instance_count=2,
                                     instance_type='ml.m4.xlarge',
                                     endpoint_name=ENDPOINT_NAME)

    # test model: bucket the test images by predicted cluster
    input_set = test_set
    clustered_data = [[] for i in range(0, 10)]
    for i in range(0, len(input_set[0])):
        result = kmeans_predictor.predict(input_set[0][i].reshape(1, 784))[0]
        predicted_cluster = int(result.label['closest_cluster'].float32_tensor.values[0])
        clustered_data[predicted_cluster].append(i)
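# get_mnist_dataset, ROLE, OUTPUT_PATH and ENDPOINT_NAME are defined elsewhere
# in this script. A plausible sketch of the loader, assuming the standard
# pickled MNIST archive (mnist.pkl.gz) used throughout this document; the
# default path is a placeholder:
import gzip
import pickle


def get_mnist_dataset(path="mnist.pkl.gz"):
    # Returns (train_set, valid_set, test_set); each split is a
    # (features, labels) pair of numpy arrays.
    with gzip.open(path, "rb") as f:
        return pickle.load(f, encoding="latin1")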
def process(ticker, local_data_folder, bucket, role, prefix, sagemaker_session):
    df = pd.read_pickle('{}/{}.{}'.format(local_data_folder, ticker, 'pkl'))
    df.dropna(inplace=True)
    df.drop(columns=["Date"], inplace=True)
    df.loc[df.Label >= threshold, 'direction'] = BUY
    df.loc[df.Label <= -threshold, 'direction'] = SELL
    df.loc[(df.Label < threshold) & (df.Label > -threshold), 'direction'] = NONE

    # Normalize
    scaler = MinMaxScaler()
    Y_df = pd.DataFrame(df["Label"]).astype('float64')
    X_df = df.drop(columns=["Label"]).astype('float64')
    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)
    # Restore the raw direction labels in the last column, which the scaler
    # would otherwise have normalized away.
    X[:, X.shape[1] - 1] = X_df["direction"].to_numpy()

    # split data
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=.33, random_state=1, shuffle=True)

    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)

    # Remove direction column and train
    kmeans.fit(kmeans.record_set(x_train[:, 0:x_train.shape[1] - 1].astype('float32')))

    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                     instance_type="ml.m4.xlarge")

    create_dir('{}/s3/{}'.format(local_data_folder, ticker))

    '''
    Label = Change in price (+ve, -ve, none)
    Direction = BUY, SELL, NONE
    Cluster = cluster_0, cluster_1, cluster_2
    '''
    # train data
    y_train_df = pd.DataFrame(y_train, columns=["Label"])
    x_train_df = pd.DataFrame(
        x_train,
        columns=['col-{}'.format(i) for i in range(x_train.shape[1] - 1)] + ["direction"])
    dataset_with_cluster = pd.concat([
        y_train_df.astype("float32"),
        x_train_df.astype("float32"),
        clustering(x_train_df.drop(columns=["direction"]).astype('float32').values,
                   kmeans_predictor)
    ], axis=1)
    dataset_with_cluster.to_csv('{}/s3/{}/all-train.csv'.format(local_data_folder, ticker),
                                header=True, index=False)

    # test data
    y_test_df = pd.DataFrame(y_test, columns=["Label"])
    x_test_df = pd.DataFrame(
        x_test,
        columns=['col-{}'.format(i) for i in range(x_test.shape[1] - 1)] + ['direction'])
    pd.concat([y_test_df.astype("float32"), x_test_df.astype("float32")], axis=1) \
        .to_csv('{}/s3/{}/all-test.csv'.format(local_data_folder, ticker),
                header=True, index=False)

    # clean clustering end point
    kmeans_predictor.delete_endpoint()

    all_test_pred = pd.read_csv(
        "{}/s3/{}/all-test.csv".format(local_data_folder, ticker)).dropna()
    all_train_pred = pd.read_csv(
        "{}/s3/{}/all-train.csv".format(local_data_folder, ticker)).dropna()

    cluster0_df = dataset_with_cluster[dataset_with_cluster["Cluster"] == 0].drop(
        columns=["Cluster"])
    save_data(cluster0_df.drop(columns=["direction"]), ticker, local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)

    estimator = generate_NN_predictor(ticker, bucket, prefix, role, sagemaker_session)
    all_test_pred["cluster0_pred"] = estimator.predict(
        all_test_pred.drop(columns=["Label", "direction"]).astype('float32').values)
    all_train_pred["cluster0_pred"] = estimator.predict(
        all_train_pred.drop(columns=["Label", "direction", "Cluster"]).astype('float32').values)
    estimator.delete_endpoint()

    cluster1_df = dataset_with_cluster[dataset_with_cluster["Cluster"] == 1].drop(
        columns=["Cluster"])
    save_data(cluster1_df.drop(columns=["direction"]), ticker, local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role, sagemaker_session)
    all_test_pred["cluster1_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction", "cluster0_pred"]).astype('float32').values)
    all_train_pred["cluster1_pred"] = estimator.predict(
        all_train_pred.drop(
            columns=["Label", "direction", "Cluster", "cluster0_pred"]).astype('float32').values)
    estimator.delete_endpoint()

    cluster2_df = dataset_with_cluster[dataset_with_cluster["Cluster"] == 2].drop(
        columns=["Cluster"])
    save_data(cluster2_df.drop(columns=["direction"]), ticker, local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)

    estimator = generate_NN_predictor(ticker, bucket, prefix, role, sagemaker_session)
    all_test_pred["cluster2_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction", "cluster0_pred",
                     "cluster1_pred"]).astype('float32').values)
    all_train_pred["cluster2_pred"] = estimator.predict(
        all_train_pred.drop(columns=[
            "Label", "direction", "Cluster", "cluster0_pred", "cluster1_pred"
        ]).astype('float32').values)
    estimator.delete_endpoint()

    os.remove(local_data_folder + '/s3/' + ticker + '/train.csv')
    os.remove(local_data_folder + '/s3/' + ticker + '/validation.csv')

    all_buys = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == BUY].shape[0],
            cluster1_df[cluster1_df['direction'] == BUY].shape[0],
            cluster2_df[cluster2_df['direction'] == BUY].shape[0]
        ],
        columns=["BUY"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])
    all_sells = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == SELL].shape[0],
            cluster1_df[cluster1_df['direction'] == SELL].shape[0],
            cluster2_df[cluster2_df['direction'] == SELL].shape[0]
        ],
        columns=["SELL"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])
    all_nones = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == NONE].shape[0],
            cluster1_df[cluster1_df['direction'] == NONE].shape[0],
            cluster2_df[cluster2_df['direction'] == NONE].shape[0]
        ],
        columns=["NONE"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    cluster_selection_df = pd.concat([all_buys, all_sells, all_nones], axis=1)
    # Assign each direction to the cluster that contains the most of it,
    # never reusing a cluster.
    buy_cluster_name = cluster_selection_df['BUY'].idxmax()
    sell_cluster_name = cluster_selection_df.drop(index=[buy_cluster_name])['SELL'].idxmax()
    none_cluster_name = cluster_selection_df.drop(
        index=[buy_cluster_name, sell_cluster_name])['NONE'].idxmax()

    # Generate selected-cluster column based on max(cluster0, cluster1, cluster2)
    all_test_pred["selected-cluster"] = all_test_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)
    all_train_pred["selected-cluster"] = all_train_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)

    # convert selected-cluster to BUY, SELL, NONE
    all_test_pred.loc[all_test_pred["selected-cluster"] == buy_cluster_name,
                      "prediction"] = BUY
    all_test_pred.loc[all_test_pred["selected-cluster"] == sell_cluster_name,
                      "prediction"] = SELL
    all_test_pred.loc[all_test_pred["selected-cluster"] == none_cluster_name,
                      "prediction"] = NONE
    all_train_pred.loc[all_train_pred["selected-cluster"] == buy_cluster_name,
                       "prediction"] = BUY
    all_train_pred.loc[all_train_pred["selected-cluster"] == sell_cluster_name,
                       "prediction"] = SELL
    all_train_pred.loc[all_train_pred["selected-cluster"] == none_cluster_name,
                       "prediction"] = NONE

    # Benchmark results
    all_test_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_test_pred.shape[0])
    ]
    all_train_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_train_pred.shape[0])
    ]

    all_test_pred.to_csv('{}/s3/{}/all-test-pred.csv'.format(local_data_folder, ticker),
                         index=None)
    all_train_pred.to_csv('{}/s3/{}/all-train-pred.csv'.format(local_data_folder, ticker),
                          index=None)
    cluster_selection_df.to_csv('{}/s3/{}/cluster-selection.csv'.format(local_data_folder, ticker),
                                index=None)

    # remove NA
    all_test_pred = all_test_pred.dropna()
    all_train_pred = all_train_pred.dropna()

    # test accuracy
    test_accuracy = accuracy_score(all_test_pred["direction"],
                                   all_test_pred["prediction"], normalize=True)
    benchmark_test_accuracy = accuracy_score(all_test_pred["direction"],
                                             all_test_pred["random-prediction"], normalize=True)
    print('Test accuracy:', test_accuracy, ", Benchmark:", benchmark_test_accuracy)

    # train accuracy
    train_accuracy = accuracy_score(all_train_pred["direction"],
                                    all_train_pred["prediction"], normalize=True)
    benchmark_train_accuracy = accuracy_score(all_train_pred["direction"],
                                              all_train_pred["random-prediction"], normalize=True)
    print('Train accuracy:', train_accuracy, ", Benchmark:", benchmark_train_accuracy)

    accuracy_df = pd.DataFrame([
        ticker, test_accuracy, benchmark_test_accuracy, train_accuracy,
        benchmark_train_accuracy
    ]).T
    accuracy_df.columns = [
        "ticker", "test_accuracy", "benchmark_test_accuracy", "train_accuracy",
        "benchmark_train_accuracy"
    ]
    accuracy_file = "{}/accuracy.csv".format(local_data_folder)
    header = not os.path.exists(accuracy_file)
    accuracy_df.to_csv(accuracy_file, mode="a", header=header, index=False)
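# generate_random_direction supplies the random benchmark used in process
# above. A sketch, assuming BUY, SELL and NONE are the module-level direction
# constants:
import random


def generate_random_direction():
    # Uniformly pick one of the three direction labels.
    return random.choice([BUY, SELL, NONE])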
def cluster_helper(role, sagemaker_session, bucket, local_data_folder, prefix, ticker):
    A_df = pd.read_pickle(local_data_folder + ticker + '.pkl')
    A_df.dropna(inplace=True)
    A_df.drop(columns=["Date"], inplace=True)

    # Normalize
    scaler = MinMaxScaler()
    Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
    X_df = A_df.drop(columns=["Label"]).astype('float64')
    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)

    # split data
    print("Splitting data")
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=.33, random_state=1, shuffle=True)

    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    print("Clustering")
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)
    kmeans.fit(kmeans.record_set(pd.DataFrame(x_train).astype('float32').values))

    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

    create_dir('{}s3/{}'.format(local_data_folder, ticker))

    # upload train and test data to S3
    dataset_with_cluster = pd.concat([
        pd.DataFrame(y_train, columns=["label"]).astype("float32"),
        pd.DataFrame(x_train).astype("float32"),
        clustering(x_train, kmeans_predictor)
    ], axis=1)
    dataset_with_cluster.to_csv('{}s3/{}/all-train.csv'.format(local_data_folder, ticker),
                                header=False, index=False)

    # prepare cluster data sets
    create_dir('{}s3/{}/train'.format(local_data_folder, ticker))
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0],
              "{}/train/cluster-0".format(ticker), True, local_data_folder)
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1],
              "{}/train/cluster-1".format(ticker), True, local_data_folder)
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2],
              "{}/train/cluster-2".format(ticker), True, local_data_folder)

    # We have to predict the clusters for each of the test data sets so that
    # we can use them for testing the next model
    dataset_with_cluster = pd.concat([
        pd.DataFrame(y_test, columns=["label"]).astype("float32"),
        pd.DataFrame(x_test).astype("float32"),
        clustering(x_test, kmeans_predictor)
    ], axis=1)
    dataset_with_cluster.to_csv('{}s3/{}/all-test.csv'.format(local_data_folder, ticker),
                                header=False, index=False)

    # # prepare cluster data sets
    # create_dir('{}s3/{}/test'.format(local_data_folder, ticker))
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/test/cluster-0".format(ticker), False, local_data_folder)
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/test/cluster-1".format(ticker), False, local_data_folder)
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/test/cluster-2".format(ticker), False, local_data_folder)

    # delete endpoint
    kmeans_predictor.delete_endpoint()

    print('Completed clustering for', ticker)
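# clustering and create_dir are helpers shared by process and cluster_helper
# but defined elsewhere. A sketch of each, assuming the deployed KMeans
# endpoint returns Record protobufs. Note the cluster column is named
# "Cluster" in process and "cat" in cluster_helper, so the two scripts likely
# carry their own variants; the column name is parameterized here:
def clustering(features, predictor, column="Cluster"):
    # Map each row of features to the id of its closest cluster.
    labels = [int(r.label['closest_cluster'].float32_tensor.values[0])
              for r in predictor.predict(features.astype('float32'))]
    return pd.DataFrame(labels, columns=[column])


def create_dir(path):
    # Create the directory (and parents) if it does not already exist.
    os.makedirs(path, exist_ok=True)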
news_df['vectors'] = news_df.words.progress_apply(gtext2vec)

## Clustering and generating scatter
X = np.concatenate(news_df['vectors'].values)

## run sagemaker kmeans
role = get_execution_role()
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)
kmeans.fit(kmeans.record_set(X))

## deploy sagemaker kmeans endpoint
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.t2.medium")
# The predictor returns Record protobufs; keep only the closest-cluster id.
results = kmeans_predictor.predict(X)
news_df['cluster'] = [
    int(r.label['closest_cluster'].float32_tensor.values[0]) for r in results
]

## Save News
news_df = news_df.drop(["ori_text", "words"], axis=1)
news_df.to_pickle('news_df.pkl')

## Save Model
import pickle

pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(kmeans, file)
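# gtext2vec above is assumed to map a tokenized article to a fixed-length
# vector. A common implementation averages pretrained word embeddings; a
# sketch using gensim, where the model path and vector size are placeholders
# and the real function may differ:
import numpy as np
from gensim.models import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin", binary=True)


def gtext2vec(words, dim=300):
    # Average the embeddings of known words; shape (1, dim) so that
    # np.concatenate over the column stacks rows into an (N, dim) matrix.
    vecs = [word_vectors[w] for w in words if w in word_vectors]
    if not vecs:
        return np.zeros((1, dim), dtype="float32")
    return np.mean(vecs, axis=0, dtype="float32").reshape(1, dim)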
data_path = "s3://ressonance/data/model_data/" output_path = "s3://ressonance/models/" # portfolio clustering port_kmeans = KMeans(role=role, train_instance_count=2, train_instance_type="ml.c4.xlarge", output_path=output_path + "portfolio", k=5, data_location=data_path + "portfolios.csv") port_training = pd.read_csv("data/training_data/portfolios.csv") port_kmeans.fit(port_kmeans.record_set(port_training)) port_predictor = port_kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge") ## Step 2: people # Substiuting portfolios def sub_port(port): return port_predictor(portfolio_processing(list(port))) clis = None clis_df = client_processing(clis) clis_df.portfolio = sub_port(clis_df.portfolio)