## Clustering and generating scatter
# Stack per-article vectors into one (n_samples, dim) matrix.
# NOTE(review): record_set expects float32 input — assumes the entries of
# news_df['vectors'] are already float32 row vectors; confirm upstream.
X = np.concatenate(news_df['vectors'].values)

## Run SageMaker KMeans
role = get_execution_role()
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)
kmeans.fit(kmeans.record_set(X))

## Deploy SageMaker KMeans endpoint
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type="ml.t2.medium")

# FIX: predict() returns a list of Record protobufs, not integer labels —
# assigning the raw result stored opaque Record objects in the column.
# Extract the 'closest_cluster' value from each record so 'cluster' holds
# plain cluster ids (same extraction pattern used elsewhere in this file).
news_df['cluster'] = [
    int(record.label['closest_cluster'].float32_tensor.values[0])
    for record in kmeans_predictor.predict(X)
]

## Save news (drop raw-text columns no longer needed downstream)
news_df = news_df.drop(["ori_text", "words"], axis=1)
news_df.to_pickle('news_df.pkl')

## Save model
# NOTE(review): this pickles the estimator *wrapper* only; the fitted
# model artifact itself lives on S3 at kmeans.model_data.
import pickle
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(kmeans, file)
train_set, valid_set, test_set = get_mnist_dataset() # create model using built-in k-means algorithm kmeans = KMeans( role=ROLE, train_instance_count=1, #train_instance_type='local', train_instance_type='ml.c4.4xlarge', output_path=OUTPUT_PATH, k=10) # train model kmeans.fit(kmeans.record_set(train_set[0])) # deploy model to endpoint kmeans_predictor = kmeans.deploy(initial_instance_count=2, instance_type='ml.m4.xlarge', endpoint_name=ENDPOINT_NAME) # test model input_set = test_set clustered_data = [[] for i in range(0, 10)] for i in range(0, len(input_set[0])): result = kmeans_predictor.predict(input_set[0][i].reshape(1, 784))[0] predicted_cluster = int( result.label['closest_cluster'].float32_tensor.values[0]) clustered_data[predicted_cluster].append(i) for i in range(0, 10): print("Cluster " + str(i) + "\n" + "=" * 80) cnt = [0 for i in range(0, 10)] for data in clustered_data[i]:
def cluster_helper(role, sagemaker_session, bucket, local_data_folder, prefix, ticker):
    """Cluster one ticker's rows into 3 groups with SageMaker KMeans.

    Loads <local_data_folder><ticker>.pkl, min-max scales features and
    label, splits train/test, trains and deploys a k=3 KMeans model,
    writes all-train.csv / all-test.csv (each with a cluster column
    appended by the external `clustering` helper) plus per-cluster
    training files, then deletes the endpoint.

    NOTE(review): rows are filtered on a "cat" column below — confirm
    that is actually the column name `clustering()` produces.
    """
    A_df = pd.read_pickle(local_data_folder + ticker + '.pkl')
    A_df.dropna(inplace=True)
    A_df.drop(columns=["Date"], inplace=True)
    # Normalize features and label into [0, 1].
    # NOTE(review): the same scaler instance is refit for Y after X —
    # harmless since fit_transform refits, but two scalers would be clearer.
    scaler = MinMaxScaler()
    Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
    X_df = A_df.drop(columns=["Label"]).astype('float64')
    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)
    # split data (fixed seed so the split is reproducible)
    print("Splitting data")
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.33,
                                                        random_state=1,
                                                        shuffle=True)
    # clustering: train a 3-cluster KMeans on the scaled training features
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    print("Clustering")
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)
    # record_set requires float32 input
    kmeans.fit(kmeans.record_set(pd.DataFrame(x_train).astype('float32').values))
    # deploy a real-time endpoint to obtain cluster assignments
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                     instance_type="ml.m4.xlarge")
    create_dir('{}s3/{}'.format(local_data_folder, ticker))
    # upload train and test data to S3: label + features + cluster column
    dataset_with_cluster = pd.concat([pd.DataFrame(y_train, columns=["label"]).astype("float32"),
                                      pd.DataFrame(x_train).astype("float32"),
                                      clustering(x_train, kmeans_predictor)], axis=1)
    dataset_with_cluster.to_csv('{}s3/{}/all-train.csv'.format(local_data_folder, ticker),
                                header=False, index=False)
    # prepare per-cluster training data sets
    create_dir('{}s3/{}/train'.format(local_data_folder, ticker))
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0],
              "{}/train/cluster-0".format(ticker), True, local_data_folder)
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1],
              "{}/train/cluster-1".format(ticker), True, local_data_folder)
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2],
              "{}/train/cluster-2".format(ticker), True, local_data_folder)
    # We have to predict the clusters for each of the test data sets so that
    # we can use them for testing the next model.
    dataset_with_cluster = pd.concat([pd.DataFrame(y_test, columns=["label"]).astype("float32"),
                                      pd.DataFrame(x_test).astype("float32"),
                                      clustering(x_test, kmeans_predictor)], axis=1)
    dataset_with_cluster.to_csv(local_data_folder + 's3/{}/all-test.csv'.format(ticker),
                                header=False, index=False)
    # # prepare cluster data sets
    # create_dir('{}s3/{}/test'.format(local_data_folder, ticker))
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/test/cluster-0".format(ticker), False, local_data_folder)
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/test/cluster-1".format(ticker), False, local_data_folder)
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/test/cluster-2".format(ticker), False, local_data_folder)
    # delete endpoint so we stop paying for it once predictions are done
    kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)
    print('Completed clustering for', ticker)
def process(ticker, local_data_folder, bucket, role, prefix, sagemaker_session):
    """End-to-end cluster-then-predict pipeline for one ticker.

    Steps (files live under <local_data_folder>/s3/<ticker>/):
      1. Load <ticker>.pkl and derive a BUY/SELL/NONE `direction` from
         `Label` using the module-level `threshold`.
      2. Min-max scale features and label; carry the raw direction code
         through as the last feature column.
      3. Train + deploy a k=3 SageMaker KMeans on the feature columns
         (direction excluded) and tag each training row with its cluster.
      4. Train one NN predictor per cluster (generate_NN_predictor) and
         score every train/test row with each cluster's model.
      5. Map each cluster to BUY/SELL/NONE by which direction dominates
         it, turn the per-row argmax into `prediction`, benchmark against
         random directions, and append accuracies to accuracy.csv.

    NOTE(review): relies on module-level names threshold, BUY, SELL, NONE,
    create_dir, clustering, save_data, generate_NN_predictor and
    generate_random_direction — confirm their contracts; in particular
    that clustering() names its output column "Cluster".
    """
    df = pd.read_pickle('{}/{}.{}'.format(local_data_folder, ticker, 'pkl'))
    df.dropna(inplace=True)
    df.drop(columns=["Date"], inplace=True)
    # Discretize the label into trade directions around +/- threshold.
    df.loc[df.Label >= threshold, 'direction'] = BUY
    df.loc[df.Label <= -threshold, 'direction'] = SELL
    df.loc[(df.Label < threshold) & (df.Label > -threshold), 'direction'] = NONE
    # Normalize
    scaler = MinMaxScaler()
    Y_df = pd.DataFrame(df["Label"]).astype('float64')
    X_df = df.drop(columns=["Label"]).astype('float64')
    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)
    # Restore the unscaled direction code into the last column
    # ('direction' was appended to df last above, so it is the final column).
    X[:, X.shape[1] - 1] = X_df["direction"].to_numpy()
    #### split data (fixed seed for reproducibility)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.33,
                                                        random_state=1,
                                                        shuffle=True)
    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)
    # Remove direction column and train (record_set requires float32)
    kmeans.fit(
        kmeans.record_set(x_train[:, 0:x_train.shape[1] - 1].astype('float32')))
    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                     instance_type="ml.m4.xlarge")
    create_dir('{}/s3/{}'.format(local_data_folder, ticker))
    '''
    Label = Change in price(+ve, -ve, none)
    Direction = BUY, SELL, NONE
    Cluster = cluster_0, cluster_1, cluster_2
    '''
    # train data: Label + named feature columns + direction + Cluster
    y_train_df = pd.DataFrame(y_train, columns=["Label"])
    x_train_df = pd.DataFrame(
        x_train,
        columns=['col-{}'.format(i) for i in range(x_train.shape[1] - 1)] +
        ["direction"])
    dataset_with_cluster = pd.concat([
        y_train_df.astype("float32"),
        x_train_df.astype("float32"),
        clustering(
            x_train_df.drop(columns=["direction"]).astype('float32').values,
            kmeans_predictor)
    ], axis=1)
    dataset_with_cluster.to_csv('{}/s3/{}/all-train.csv'.format(
        local_data_folder, ticker), header=True, index=False)
    # test data (no cluster column — clusters are only used for training)
    y_test_df = pd.DataFrame(y_test, columns=["Label"])
    x_test_df = pd.DataFrame(
        x_test,
        columns=['col-{}'.format(i) for i in range(x_test.shape[1] - 1)] +
        ['direction'])
    pd.concat([y_test_df.astype("float32"), x_test_df.astype("float32")], axis=1)\
        .to_csv('{}/s3/{}/all-test.csv'.format(local_data_folder, ticker),
                header=True, index=False)
    # clean clustering end point
    kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)
    # Reload the csvs so predictions are applied to exactly what was saved.
    all_test_pred = pd.read_csv("{}/s3/{}/all-test.csv".format(
        local_data_folder, ticker)).dropna()
    all_train_pred = pd.read_csv("{}/s3/{}/all-train.csv".format(
        local_data_folder, ticker)).dropna()
    # --- cluster 0 model ---
    cluster0_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       0].drop(columns=["Cluster"])
    save_data(cluster0_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster0_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction"]).astype('float32').values)
    all_train_pred["cluster0_pred"] = estimator.predict(
        all_train_pred.drop(columns=["Label", "direction", "Cluster"]).astype(
            'float32').values)
    estimator.delete_endpoint(estimator.endpoint)
    # --- cluster 1 model (earlier *_pred columns dropped before predicting) ---
    cluster1_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       1].drop(columns=["Cluster"])
    save_data(cluster1_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster1_pred"] = estimator.predict(
        all_test_pred.drop(columns=["Label", "direction", "cluster0_pred"
                                    ]).astype('float32').values)
    all_train_pred["cluster1_pred"] = estimator.predict(
        all_train_pred.drop(
            columns=["Label", "direction", "Cluster", "cluster0_pred"]).astype(
                'float32').values)
    estimator.delete_endpoint(estimator.endpoint)
    # --- cluster 2 model ---
    cluster2_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       2].drop(columns=["Cluster"])
    save_data(cluster2_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster2_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction", "cluster0_pred", "cluster1_pred"
                     ]).astype('float32').values)
    all_train_pred["cluster2_pred"] = estimator.predict(
        all_train_pred.drop(columns=[
            "Label", "direction", "Cluster", "cluster0_pred", "cluster1_pred"
        ]).astype('float32').values)
    estimator.delete_endpoint(estimator.endpoint)
    # Remove intermediate NN training files left behind by save_data.
    os.remove(local_data_folder + '/s3/' + ticker + '/train.csv')
    os.remove(local_data_folder + '/s3/' + ticker + '/validation.csv')
    # Count directions per cluster to decide which cluster means BUY/SELL/NONE.
    all_buys = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == BUY].shape[0],
            cluster1_df[cluster1_df['direction'] == BUY].shape[0],
            cluster2_df[cluster2_df['direction'] == BUY].shape[0]
        ],
        columns=["BUY"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])
    all_sells = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == SELL].shape[0],
            cluster1_df[cluster1_df['direction'] == SELL].shape[0],
            cluster2_df[cluster2_df['direction'] == SELL].shape[0]
        ],
        columns=["SELL"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])
    all_nones = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == NONE].shape[0],
            cluster1_df[cluster1_df['direction'] == NONE].shape[0],
            cluster2_df[cluster2_df['direction'] == NONE].shape[0]
        ],
        columns=["NONE"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])
    cluster_selection_df = pd.concat([all_buys, all_sells, all_nones], axis=1)
    # Greedy assignment: BUY first, then SELL among the rest, then NONE.
    # FIX: the original indexed the FULL index with argmax() positions that
    # were computed on a row-DROPPED frame, so SELL/NONE could resolve to
    # the wrong cluster (even re-selecting the BUY cluster) whenever the
    # dropped row was not the last one. idxmax() returns the index label
    # of the maximum directly, with no positional misalignment.
    buy_cluster_name = cluster_selection_df['BUY'].idxmax()
    sell_cluster_name = cluster_selection_df.drop(
        index=[buy_cluster_name])['SELL'].idxmax()
    none_cluster_name = cluster_selection_df.drop(
        index=[buy_cluster_name, sell_cluster_name])['NONE'].idxmax()
    # Generate selected-cluster column based on max(cluster0, cluster1, cluster2)
    all_test_pred["selected-cluster"] = all_test_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)
    all_train_pred["selected-cluster"] = all_train_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)
    # convert selected-cluster to BUY, SELL, NONE
    all_test_pred.loc[all_test_pred["selected-cluster"] == buy_cluster_name,
                      "prediction"] = BUY
    all_test_pred.loc[all_test_pred["selected-cluster"] == sell_cluster_name,
                      "prediction"] = SELL
    all_test_pred.loc[all_test_pred["selected-cluster"] == none_cluster_name,
                      "prediction"] = NONE
    all_train_pred.loc[all_train_pred["selected-cluster"] == buy_cluster_name,
                       "prediction"] = BUY
    all_train_pred.loc[all_train_pred["selected-cluster"] == sell_cluster_name,
                       "prediction"] = SELL
    all_train_pred.loc[all_train_pred["selected-cluster"] == none_cluster_name,
                       "prediction"] = NONE
    # Benchmark results: uniformly random directions as a baseline
    all_test_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_test_pred.shape[0])
    ]
    all_train_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_train_pred.shape[0])
    ]
    all_test_pred.to_csv('{}/s3/{}/all-test-pred.csv'.format(
        local_data_folder, ticker), index=None)
    all_train_pred.to_csv('{}/s3/{}/all-train-pred.csv'.format(
        local_data_folder, ticker), index=None)
    cluster_selection_df.to_csv('{}/s3/{}/cluster-selection.csv'.format(
        local_data_folder, ticker), index=None)
    # remove NA before scoring
    all_test_pred = all_test_pred.dropna()
    all_train_pred = all_train_pred.dropna()
    # test accuracy
    test_accuracy = accuracy_score(all_test_pred["direction"],
                                   all_test_pred["prediction"],
                                   normalize=True)
    benchmark_test_accuracy = accuracy_score(all_test_pred["direction"],
                                             all_test_pred["random-prediction"],
                                             normalize=True)
    print('Test accuracy:', test_accuracy, ", Benchmark:",
          benchmark_test_accuracy)
    # train accuracy
    train_accuracy = accuracy_score(all_train_pred["direction"],
                                    all_train_pred["prediction"],
                                    normalize=True)
    benchmark_train_accuracy = accuracy_score(
        all_train_pred["direction"],
        all_train_pred["random-prediction"],
        normalize=True)
    print('Train accuracy:', train_accuracy, ", Benchmark:",
          benchmark_train_accuracy)
    # Append this ticker's accuracies to the shared accuracy.csv
    # (header written only when the file does not exist yet).
    accuracy_df = pd.DataFrame([
        ticker, test_accuracy, benchmark_test_accuracy, train_accuracy,
        benchmark_train_accuracy
    ]).T
    accuracy_df.columns = [
        "ticker", "test_accuracy", "benchmark_test_accuracy", "train_accuracy",
        "benchmark_train_accuracy"
    ]
    accuracy_file = "{}/accuracy.csv".format(local_data_folder)
    header = not os.path.exists(accuracy_file)
    accuracy_df.to_csv(accuracy_file, mode="a", header=header, index=False)
k=10, data_location=data_location) #CODE-5--------------------------------------------------------------------------------------------- %%time kmeans.fit(kmeans.record_set(train_set[0])) #CODE-6--------------------------------------------------------------------------------------------- %%time kmeans_predictor = kmeans.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge') #CODE-7--------------------------------------------------------------------------------------------- import sagemaker from time import gmtime, strftime job_name = 'Batch-Transform-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) prefix = 'sagemaker/project_name' # Initialize the transformer object transformer =sagemaker.transformer.Transformer( model_name=model_name, instance_count=1, instance_type='ml.c4.xlarge',
data_path = "s3://ressonance/data/model_data/"
output_path = "s3://ressonance/models/"

# portfolio clustering: group portfolios into 5 clusters with SageMaker KMeans
# NOTE(review): data_location is normally an S3 *prefix* for the converted
# protobuf records, not a .csv object — confirm this path is intended.
port_kmeans = KMeans(role=role,
                     train_instance_count=2,
                     train_instance_type="ml.c4.xlarge",
                     output_path=output_path + "portfolio",
                     k=5,
                     data_location=data_path + "portfolios.csv")
port_training = pd.read_csv("data/training_data/portfolios.csv")
# FIX: record_set() requires a float32 numpy array; the original passed the
# DataFrame itself (every other pipeline in this file converts first).
port_kmeans.fit(port_kmeans.record_set(port_training.values.astype('float32')))
port_predictor = port_kmeans.deploy(initial_instance_count=1,
                                    instance_type="ml.m4.xlarge")


## Step 2: people
# Substituting portfolios with their cluster assignment
def sub_port(port):
    """Run the deployed KMeans endpoint on a processed portfolio.

    FIX: a SageMaker predictor object is not callable — the original
    `port_predictor(...)` raised TypeError; use .predict() instead.
    NOTE(review): .predict() returns a list of Record protobufs; downstream
    code may need the 'closest_cluster' label extracted — confirm against
    what client_processing expects back.
    """
    return port_predictor.predict(portfolio_processing(list(port)))


# NOTE(review): clis is None here and `key` is undefined in this snippet —
# both presumably come from surrounding notebook state; verify before running.
clis = None
clis_df = client_processing(clis)
clis_df.portfolio = sub_port(clis_df.portfolio)
clis_df.to_csv(key + "clients.csv")