def test_tsne_knn_parameters(dataset, type_knn_graph, method):

    X = dataset.data

    from sklearn.preprocessing import normalize

    X = normalize(X, norm='l1')

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS, metric="euclidean").fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance").astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    tsne = TSNE(n_components=2,
                random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    embed = tsne.fit_transform(X, True, knn_graph)
    validate_embedding(X, embed)

    embed = tsne.fit_transform(X, True, knn_graph.tocoo())
    validate_embedding(X, embed)

    embed = tsne.fit_transform(X, True, knn_graph.tocsc())
    validate_embedding(X, embed)
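# NOTE: several tests in this section call a `validate_embedding()` helper that is
# not shown here. A minimal sketch of what such a helper could look like, assuming
# it only checks for NaNs/inf and a trustworthiness floor (the actual helper in the
# source tree may differ; `n_neighbors=90` is an assumed stand-in for
# DEFAULT_N_NEIGHBORS):
import numpy as np
from sklearn.manifold import trustworthiness


def validate_embedding(X, Y, score=0.76, n_neighbors=90):
    """Hypothetical helper: embedding must be finite and preserve local
    neighborhoods reasonably well (trustworthiness above `score`)."""
    Y = Y.get() if hasattr(Y, "get") else np.asarray(Y)  # cupy -> numpy if needed
    assert np.isfinite(Y).all(), "embedding contains NaN or inf"
    assert trustworthiness(X, Y, n_neighbors=n_neighbors) > score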
def get_features(df):
    features_df = pd.DataFrame()
    sparse_matrix, index_values, columns_values = get_sparse_matrix(
        df.iloc[:5_000_000],
        index='content_id',
        columns='user_id',
        values='simple_count')
    content_matrix = sparse_matrix.toarray()

    tsne = TSNE(n_components=N_COMP, random_state=0)
    tsne_array = tsne.fit_transform(content_matrix)

    tsne_df = pd.DataFrame({'content_id': index_values})
    for i in range(N_COMP):
        tsne_df[f'content_id_tsne_{i}'] = tsne_array[:, i]
        le = dict(tsne_df[['content_id', f'content_id_tsne_{i}']].values)
        features_df[f'content_id_tsne_{i}'] = df['content_id'].map(le)
        dh.save(f'../data/processed/dropped___tsne_encoder_{i}.pkl', le)

    features_df.columns = [f'dropped___{col}' for col in features_df.columns]
    return features_df
def test_tsne_fit_transform_on_digits_sparse(input_type, method):

    digits = test_datasets['digits'].data

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    fitter = TSNE(n_components=2,
                  random_state=1,
                  method=method,
                  min_grad_norm=1e-12,
                  n_neighbors=DEFAULT_N_NEIGHBORS,
                  learning_rate_method="none",
                  perplexity=DEFAULT_PERPLEXITY)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits)).astype('float32')

    embedding = fitter.fit_transform(new_data, convert_dtype=True)

    if input_type == 'cupy':
        embedding = embedding.get()

    trust = trustworthiness(digits, embedding,
                            n_neighbors=DEFAULT_N_NEIGHBORS)
    assert trust >= 0.85
def test_tsne_transform_on_digits_sparse(input_type):

    digits = datasets.load_digits()

    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    fitter = TSNE(2, n_neighbors=15, random_state=1,
                  learning_rate=500, angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits.data[~digits_selection]))

    embedding = fitter.fit_transform(new_data, convert_dtype=True)

    if input_type == 'cupy':
        embedding = embedding.get()

    trust = trustworthiness(digits.data[~digits_selection], embedding, 15)
    assert trust >= 0.85
def apply_tsne(j):
    # Expects `pca_preprocessing`, `random_state`, `kwargs` and `return_model`
    # to be defined in the enclosing scope.
    idx, md5, x = j
    if pca_preprocessing:
        x = PCA(n_components=None, random_state=random_state).fit_transform(x)
    tsne = TSNE(**kwargs)
    return (idx, md5, tsne.fit_transform(x), tsne if return_model else None)
def test_tsne_default(name):
    X = eval("datasets.load_{}".format(name))().data
    for i in range(3):
        print("iteration = ", i)
        tsne = TSNE()
        Y = tsne.fit_transform(X)
        check_embedding(X, Y)
        del Y
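# NOTE: the older test variants above and below call a `check_embedding()` helper
# that is not included in this section. Judging from the variant further down that
# inlines the same checks, it presumably asserts there are no NaNs and that
# trustworthiness clears a threshold; a minimal sketch under that assumption:
import numpy as np
from sklearn.manifold import trustworthiness


def check_embedding(X, Y, min_trust=0.76):
    """Hypothetical helper mirroring the inline checks used elsewhere in these
    tests: no NaNs in the embedding and trustworthiness above a floor."""
    assert np.sum(np.isnan(Y)) == 0
    trust = trustworthiness(X, Y)
    print("Trust = ", trust)
    assert trust > min_trust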
def test_tsne_large(nrows, ncols):
    """
    This tests how TSNE handles large input
    """
    X, y = make_blobs(n_samples=nrows, centers=8, n_features=ncols,
                      random_state=0)
    X = X.astype(np.float32)

    tsne = TSNE(random_state=0, exaggeration_iter=1, n_iter=2)

    Y = tsne.fit_transform(X)
    nans = np.sum(np.isnan(Y))
    assert nans == 0
def compute_tsne(data, overwrite_cache=False):
    start = time.perf_counter()
    tsne_file = "citation_graph_deepwalk_tsne.pkl"
    if os.path.exists(tsne_file) and not overwrite_cache:
        with open(tsne_file, 'rb') as f:
            data_embedded = pickle.load(f)
    else:
        visualizer = TSNE(perplexity=30, n_neighbors=90)
        data_embedded = visualizer.fit_transform(data)
        with open(tsne_file, 'wb') as f:
            pickle.dump(data_embedded, f)
    end = time.perf_counter()
    print(f"Computing t-SNE took {end - start} seconds.")
    return data_embedded
def test_tsne_default(name):
    X = eval("datasets.load_{}".format(name))().data
    for i in range(3):
        print("iteration = ", i)
        tsne = TSNE()
        Y = tsne.fit_transform(X)

        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y
def test_tsne(name):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) cuDF DataFrames are passed in
    (2) Numpy arrays are passed in
    (3) Params are changed in the TSNE class
    (4) The class gets re-used across time
    (5) Trustworthiness is checked
    (6) Tests NAN in TSNE output for learning rate explosions
    (7) Tests verbosity
    """
    X = eval("datasets.load_{}".format(name))().data
    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X))

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE(2, random_state=i, verbose=0, learning_rate=2 + i)

        Y = tsne.fit_transform(X_cudf).to_pandas().values
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y

        # Reuse
        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y

        # Again
        tsne = TSNE(2, random_state=i + 2, verbose=1,
                    learning_rate=2 + i + 2)

        Y = tsne.fit_transform(X_cudf).to_pandas().values
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y

        # Reuse
        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type):
    digits = datasets.load_digits()

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

    selected_digits = digits.data[~digits_selection]

    neigh.fit(selected_digits)
    knn_graph = neigh.kneighbors_graph(selected_digits, mode="distance")

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    tsne = TSNE(2, n_neighbors=15, random_state=1,
                learning_rate=500, angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(selected_digits))

    Y = tsne.fit_transform(new_data, True, knn_graph)
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocoo())
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocsc())
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)
    del Y
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type, method):

    digits = test_datasets["digits"].data

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS,
                  metric="euclidean").fit(digits)
    knn_graph = neigh.kneighbors_graph(digits,
                                       mode="distance").astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    tsne = TSNE(n_components=2,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                random_state=1,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    new_data = sp_prefix.csr_matrix(scipy.sparse.csr_matrix(digits))

    Y = tsne.fit_transform(new_data, True, knn_graph)
    if input_type == 'cupy':
        Y = Y.get()
    validate_embedding(digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocoo())
    if input_type == 'cupy':
        Y = Y.get()
    validate_embedding(digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocsc())
    if input_type == 'cupy':
        Y = Y.get()
    validate_embedding(digits, Y, 0.85)
def test_tsne(name):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) Numpy arrays are passed in
    (2) Params are changed in the TSNE class
    (3) The class gets re-used across time
    (4) Trustworthiness is checked
    (5) Tests NAN in TSNE output for learning rate explosions
    (6) Tests verbosity
    """
    X = eval("datasets.load_{}".format(name))().data

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE(2, random_state=i, verbose=False,
                    learning_rate=2+i)

        Y = tsne.fit_transform(X)
        check_embedding(X, Y)
        del Y

        # Again, with new params
        tsne = TSNE(2, random_state=i+2, verbose=logger.level_debug,
                    learning_rate=2+i+2)

        Y = tsne.fit_transform(X)
        check_embedding(X, Y)
        del Y
def test_tsne_knn_graph_used(name, type_knn_graph):
    X = eval("datasets.load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)
    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")

    tsne = TSNE()

    # Perform tsne with normal knn_graph
    Y = tsne.fit_transform(X, True, knn_graph)
    trust_normal = trustworthiness(X, Y)
    print("Trust = ", trust_normal)

    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(X_garbage, mode="distance")

    # Perform tsne with garbage knn_graph
    Y = tsne.fit_transform(X, True, knn_graph_garbage)
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage.tocoo())
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage.tocsc())
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15
def test_tsne_knn_graph_used(dataset, type_knn_graph, method):

    X = dataset.data

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS, metric="euclidean").fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance").astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    tsne = TSNE(random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                method=method,
                perplexity=DEFAULT_PERPLEXITY,
                learning_rate_method='none',
                min_grad_norm=1e-12)

    # Perform tsne with normal knn_graph
    Y = tsne.fit_transform(X, True, knn_graph)

    trust_normal = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)

    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(
        X_garbage, mode="distance").astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph_garbage = cupyx.scipy.sparse.csr_matrix(knn_graph_garbage)

    tsne = TSNE(random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                method=method,
                perplexity=DEFAULT_PERPLEXITY,
                learning_rate_method='none',
                min_grad_norm=1e-12)

    # Perform tsne with garbage knn_graph
    Y = tsne.fit_transform(X, True, knn_graph_garbage)
    trust_garbage = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage)
    trust_garbage = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage)
    trust_garbage = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)
    assert (trust_normal - trust_garbage) > 0.15
def test_tsne(dataset, method):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) Numpy arrays are passed in
    (2) Params are changed in the TSNE class
    (3) The class gets re-used across time
    (4) Trustworthiness is checked
    (5) Tests NAN in TSNE output for learning rate explosions
    (6) Tests verbosity
    """
    X = dataset.data

    tsne = TSNE(n_components=2,
                random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    Y = tsne.fit_transform(X)
    validate_embedding(X, Y)
def test_tsne_knn_parameters(name, type_knn_graph):
    X = eval("datasets.load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)
    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")

    for i in range(3):
        print("iteration = ", i)
        tsne = TSNE()

        Y = tsne.fit_transform(X, True, knn_graph)
        check_embedding(X, Y)

        Y = tsne.fit_transform(X, True, knn_graph.tocoo())
        check_embedding(X, Y)

        Y = tsne.fit_transform(X, True, knn_graph.tocsc())
        check_embedding(X, Y)
        del Y
def test_tsne(name):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) Numpy arrays are passed in
    (2) Params are changed in the TSNE class
    (3) The class gets re-used across time
    (4) Trustworthiness is checked
    (5) Tests NAN in TSNE output for learning rate explosions
    (6) Tests verbosity
    """
    X = eval("datasets.load_{}".format(name))().data

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE(2, random_state=i, verbosity=logger.LEVEL_INFO,
                    learning_rate=2+i)

        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y

        # Again, with new params
        tsne = TSNE(2, random_state=i+2, verbosity=logger.LEVEL_DEBUG,
                    learning_rate=2+i+2)

        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y
import numpy as np
from cuml.manifold import TSNE
from sklearn.linear_model import LinearRegression

from helpers import load_mnist

train_images, train_labels = load_mnist("./fashion", subset="train")
test_images, test_labels = load_mnist("./fashion", subset="test")
all_images = np.concatenate((train_images, test_images))

tsne = TSNE(
    n_components=2,
    method="barnes_hut",
    random_state=23,
    learning_rate=200,
    perplexity=50,
    n_iter=3000,
)

train_X_hat = tsne.fit_transform(train_images)
test_X_hat = tsne.fit_transform(test_images)
all_X_hat = tsne.fit_transform(all_images)

np.save("trained_data/train_tsne", train_X_hat)
np.save("trained_data/test_tsne", test_X_hat)
np.save("trained_data/all_images_tsne", all_X_hat)

#################################
# infer approximate embeddings for new images:
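# The script imports LinearRegression and stops at the comment above; the remainder
# is not included in this section. One plausible continuation, sketched under the
# assumption that the intent is to regress from pixel space onto the saved t-SNE
# coordinates (variable names below the assignment to `new_images` are hypothetical):
reg = LinearRegression()
reg.fit(train_images, train_X_hat)  # linear map: pixels -> 2-D t-SNE coordinates

# 'new_images' is a placeholder for any images not seen during fit_transform.
new_images = test_images
approx_embeddings = reg.predict(new_images)
np.save("trained_data/test_tsne_approx", approx_embeddings)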
             norm_hist=True, bins=range(0, 100, 5))
sns.distplot(V2inV2, label="V2inV2", kde=False,
             norm_hist=True, bins=range(0, 100, 5))
plt.title(
    "Number of 100-nearest neighbors of the same type.\n"
    "Equal-ish retinotopy ($x>-10$) and equal number of neurons from V1 and V2"
)
plt.legend()
plt.show()

#%%
tsne = TSNE(n_components=2, perplexity=40)
X_hat = tsne.fit_transform(S_hat)

#%%
fig, ax = plt.subplots(figsize=(6, 6), dpi=300)
ax.scatter(X_hat[:cutoff, 0], X_hat[:cutoff, 1],
           c="C0", alpha=0.5, s=5, linewidth=0, label="V1")
ax.scatter(X_hat[cutoff:, 0], X_hat[cutoff:, 1], c="C1",
def test_components_exception():
    with pytest.raises(ValueError):
        TSNE(n_components=3)
def apply_tsne(j):
    # Expects `kwargs` and `return_model` to be defined in the enclosing scope.
    idx, md5, x = j
    tsne = TSNE(**kwargs)
    return (idx, md5, tsne.fit_transform(x), tsne if return_model else None)
samples = rnd_state.choice(len(train_ds), 5000, replace=False)
train_ds = Subset(train_ds, samples)

# Extract embedding vectors
load_kwargs = {'batch_size': 128, 'num_workers': os.cpu_count()}
# test_embs, _ = extract_embeddings(emb_net, DataLoader(test_ds, **load_kwargs))
embs, labels = extract_embeddings(emb_net, DataLoader(train_ds, **load_kwargs))

# translate them to cpu + numpy
embs = embs.cpu().numpy()
labels = labels.cpu().numpy()

# -----------------------------------------------------------------------------
print("Plotting T-sne....")
from cuml.manifold import TSNE

tsne = TSNE(n_iter=1000, metric="euclidean")
projected_emb = tsne.fit_transform(embs)

fig = plot_embeddings(projected_emb, labels)
png_fname = join(exp_folder, 't-sne.png')
fig.savefig(png_fname, bbox_inches='tight')
pdf_fname = join(exp_folder, 't-sne.pdf')
fig.savefig(pdf_fname, bbox_inches='tight')

# -----------------------------------------------------------------------------
print("Plotting PCA....")
from cuml import PCA

pca_float = PCA(n_components=2)
cudf = pca_float.fit_transform(embs)
projected_emb = cudf.to_pandas().to_numpy()

fig = plot_embeddings(projected_emb, labels)
png_fname = join(exp_folder, 'pca.png')
fig.savefig(png_fname, bbox_inches='tight')
# Step 3
model = SiameseNet(embedding_net)
margin = 1.
loss_fn = ContrastiveLoss(margin)
lr = 1e-3
if has_cuda:
    model.cuda()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1,
                                last_epoch=-1)
n_epochs = 20
log_interval = 50

fit(siamese_train_loader, siamese_test_loader, model, loss_fn, optimizer,
    scheduler, n_epochs, has_cuda, log_interval)

# ---------------------------------------------------------------------------
# Obtain the embeddings
embeddings, labels = extract_embeddings(embedding_net, test_loader)

tsne = TSNE(random_state=1, n_iter=1000, metric="euclidean")
projected_emb = tsne.fit_transform(embeddings)

fig = plot_embeddings(projected_emb, labels)
fig.savefig('siamese.png', bbox_inches='tight')
'''
To run TSNE from cuML you need to create a virtual environment that matches your
machine's configuration. Visit the RAPIDS release selector
(https://rapids.ai/start.html#rapids-release-selector), e.g.:

conda create -n rapids-0.18 -c rapidsai-nightly -c nvidia -c conda-forge \
    -c defaults blazingsql=0.18 cuml=0.18 python=3.7 cudatoolkit=11.0
'''
import numpy as np
import matplotlib.pyplot as plt
from cuml.manifold import TSNE
from glob import glob

files = glob('../feature_maps/cat_1/*.npy')

for file, i in zip(files, range(len(files))):
    print('[INFO] Processing file {}/{} . . .'.format(i + 1, len(files)))
    with open(file, 'rb') as f:
        x = np.load(f)
    x = np.transpose(x)
    tsne = TSNE(n_components=2)
    X_hat = tsne.fit_transform(x)
    plt.scatter(X_hat[:, 0], X_hat[:, 1])
    plt.show()
import os
import gzip
import pickle

import numpy as np
from cuml.manifold import TSNE


def load_mnist_train(path):
    """Load MNIST data from path"""
    labels_path = os.path.join(path, 'train-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, 'train-images-idx3-ubyte.gz')
    with gzip.open(labels_path, 'rb') as lbpath:
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)
    with gzip.open(images_path, 'rb') as imgpath:
        images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                               offset=16).reshape(len(labels), 784)
    return images, labels


images, labels = load_mnist_train("data/fashion")

tsne = TSNE(n_components=2, method='barnes_hut', random_state=23)
embedding = tsne.fit_transform(images)
print(embedding[:10], embedding.shape, type(embedding))

outdir = '/opt/dkube/output/'
if not os.path.exists(outdir + 'model'):
    os.makedirs(outdir + 'model')
with open(outdir + 'model/embeddings.pickle', 'wb') as handle:
    pickle.dump(embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    help='Angular size. This is the trade-off between speed and accuracy.')
parser.add_argument('--min-grad-norm', type=float, default=1e-7,
                    help='If the gradient norm is below this threshold, '
                         'the optimization is stopped.')
parser.add_argument('--random-state', type=int, default=1234)
params = bench.parse_args(parser)

# Load and convert data
X, _, _, _ = bench.load_data(params)

# Create our TSNE estimator
tsne = TSNE(n_components=params.n_components,
            early_exaggeration=params.early_exaggeration,
            learning_rate=params.learning_rate, angle=params.angle,
            min_grad_norm=params.min_grad_norm,
            random_state=params.random_state)

fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)

# Need to investigate how to compare sklearn and cuml metrics for tsne
bench.print_output(library='cuml', algorithm='tsne',
                   stages=['training'], params=params,
                   functions=['tsne.fit'], times=[fit_time],
                   metric_type=None, metrics=None, data=[X],
def GMVP_between_clusters(data_period, max_cluster_size, scaling_method='none',
                          dim_reduction_method='none', no_of_PCA_components=0,
                          no_of_tsne_components=0):
    rebalancing_period = 60

    ################## creating lists and dictionaries for storing outcomes ##################
    # 1) returns
    daily_portfolio_return_list = []  # daily returns of the portfolio
    # 2) stdev
    in_sample_stdev_list = []      # stdev of in-sample (252-day) daily portfolio returns
    out_of_sample_stdev_list = []  # stdev of out-of-sample (60-day) daily portfolio returns
    # 3) cluster & stock
    cluster_and_stock_dict = {}
    # 1. cluster_return : 252_daily_returns * 11_clusters
    # 2. cluster_weight : weights of each of the 11 clusters
    # 3. cluster_ticker : tickers belonging to each cluster
    # 4. stock_weight   : weights of each individual stock
    # 5. daily_return_for_viz
    ##########################################################################################

    if data_period == 'validation':
        daily_return_df_list = prepare_data.daily_return_df_list_val
    elif data_period == 'test':
        daily_return_df_list = prepare_data.daily_return_df_list_test

    for index_no, daily_return_df in enumerate(tqdm(daily_return_df_list)):
        cluster_weight_dict = {}               # weights of each of the 11 clusters
        cluster_ticker_dict = {}               # tickers belonging to each cluster
        stock_weight_within_cluster_dict = {}  # weights of each individual stock

        ################## 1. normalizing data ##################
        if scaling_method == 'standard_scale':
            scaled_daily_price_array = standard_scaler.fit_transform(daily_return_df)
            after_scaling_return_df = pd.DataFrame(scaled_daily_price_array,
                                                   columns=all_ticker_list).T
        elif scaling_method == 'none':
            after_scaling_return_df = daily_return_df.T

        ################## 2. dimensionality reduction ##################
        if dim_reduction_method == 'PCA':
            pca = PCA(n_components=no_of_PCA_components)
            scaled_daily_return_PCA_array = pca.fit_transform(after_scaling_return_df)
            after_dim_reduction_return_df = pd.DataFrame(scaled_daily_return_PCA_array,
                                                         index=all_ticker_list)  # shape: [stocks * PCs]
        elif dim_reduction_method == 'tsne':
            tsne = TSNE(n_components=no_of_tsne_components)
            scaled_daily_return_tsne_cudf = tsne.fit_transform(after_scaling_return_df.values)
            after_dim_reduction_return_df = pd.DataFrame(scaled_daily_return_tsne_cudf,
                                                         index=all_ticker_list)
        elif dim_reduction_method == 'none':
            after_dim_reduction_return_df = after_scaling_return_df

        ################## 3. bounded k-means clustering ##################
        n_clusters = 11
        n_iter = 30
        n_init = 15
        weights = np.ones(after_dim_reduction_return_df.shape[0])
        # The clustering implementation supports per-observation weights, which we do not
        # need here: stocks are judged solely on their return movements, so every stock
        # gets a weight of 1.
        cluster_maker = bounded.BoundedKMeansClustering(n_clusters, max_cluster_size,
                                                        n_iter, n_init)
        best_cost, best_clusters = cluster_maker.fit(after_dim_reduction_return_df.values,
                                                     weights)
        # placeholder label 0; the correct labels are assigned in the loop below
        after_dim_reduction_return_df.loc[:, 'cluster_label'] = 0
        for cluster_label in range(n_clusters):
            ticker_index_list = best_clusters[cluster_label]
            cluster_ticker_list = list(np.array(prepare_data.all_ticker_list)[ticker_index_list])
            after_dim_reduction_return_df.loc[cluster_ticker_list, 'cluster_label'] = cluster_label

        # storing clustering results in a dict: cluster_ticker_dict
        cluster_label_list = list(after_dim_reduction_return_df.loc[:, 'cluster_label'].unique())
        for cluster_label in cluster_label_list:
            cluster_ticker_dict[cluster_label] = list(
                after_dim_reduction_return_df[
                    after_dim_reduction_return_df.loc[:, 'cluster_label'] == cluster_label].index)

        ################## (Optional) For visualization ##################
        pca = PCA(n_components=2)
        daily_return_array_for_viz = pca.fit_transform(after_dim_reduction_return_df.iloc[:, :-1])
        daily_return_df_for_viz = pd.DataFrame(daily_return_array_for_viz,
                                               index=prepare_data.all_ticker_list)  # shape: [stocks * PCs]
        daily_return_df_for_viz.loc[:, 'cluster_label'] = after_dim_reduction_return_df.loc[:, 'cluster_label']
        daily_return_df_for_viz.rename(columns={0: 'PC_1', 1: 'PC_2'}, inplace=True)

        # --------------------- #
        #  GMVP within cluster  #
        # --------------------- #
        # GMVP on each cluster via 'GMVP_within_cluster' --> builds a matrix of daily cluster returns
        stock_weight_within_cluster_dict = {}
        daily_cluster_return_dict = {}

        ################## 4. computing stock weights within a cluster ##################
        for cluster_label in cluster_label_list:
            daily_cluster_return_series, stock_weight_within_cluster_array = GMVP_within_cluster(
                data_period, index_no, cluster_ticker_dict[cluster_label])
            daily_cluster_return_dict[cluster_label] = daily_cluster_return_series
            stock_weight_within_cluster_dict[cluster_label] = stock_weight_within_cluster_array

        daily_cluster_return_df = pd.DataFrame.from_dict(daily_cluster_return_dict)

        # ---------------------- #
        # GMVP between clusters  #
        # ---------------------- #
        cov_mat_of_cluster_df = daily_cluster_return_df.cov()
        cov_mat_of_cluster_array = cov_mat_of_cluster_df.values
        # use the pseudo-inverse in case the matrix is singular / ill-conditioned
        inv_cov_mat_array = np.linalg.pinv(cov_mat_of_cluster_array)

        ################## 5. computing cluster weights ##################
        one_vector_array = np.ones(len(inv_cov_mat_array))
        inv_dot_one_array = np.dot(inv_cov_mat_array, one_vector_array)
        cluster_weight_array = inv_dot_one_array / np.dot(inv_dot_one_array, one_vector_array)
        cluster_weight_df = pd.DataFrame(data=cluster_weight_array, columns=['weight'],
                                         index=cluster_label_list)

        # stdev of the portfolio, computed from the daily returns of the clusters
        in_sample_variance = np.dot(cluster_weight_array,
                                    np.dot(cov_mat_of_cluster_array, cluster_weight_array))
        in_sample_stdev = np.sqrt(in_sample_variance)

        ################## 6. computing stock weights in a portfolio (portfolio weights) ##################
        temp_portfolio_weight_list = []
        for cluster_label in cluster_label_list:
            stock_weight_within_sector_array = (cluster_weight_df.loc[cluster_label, 'weight']
                                                * stock_weight_within_cluster_dict[cluster_label])
            stock_weight_within_sector_df = pd.DataFrame(stock_weight_within_sector_array,
                                                         index=cluster_ticker_dict[cluster_label],
                                                         columns=['weight'])
            temp_portfolio_weight_list.append(stock_weight_within_sector_df)
        portfolio_weight_df = pd.concat(temp_portfolio_weight_list)

        # ----------------------------------------------------------- #
        # Calculating daily return based on GMV optimization results  #
        # ----------------------------------------------------------- #
        # we use 'index_no + 1': the portfolio is optimized at time point 'index_no' and
        # evaluated over the period from 'index_no' to 'index_no + 1'
        if (index_no + 1) < len(daily_return_df_list):
            # append only if we actually invest following the optimization, i.e. only if
            # this window is used to compute the next period's return
            in_sample_stdev_list.append(in_sample_stdev)

            future_daily_return_df = daily_return_df_list[index_no + 1]
            future_daily_cluster_return_dict = {}

            for cluster_label in cluster_label_list:
                # 1st optimization - cluster_weight calculated from the between-cluster GMV
                cluster_weight = cluster_weight_df.loc[cluster_label, 'weight']
                # 2nd optimization - stock_weight calculated from the within-cluster GMV
                stock_weight_within_cluster_array = stock_weight_within_cluster_dict[cluster_label]
                # combining both allocation results gives each stock's weight
                future_daily_cluster_return_df = (future_daily_return_df.loc[:, cluster_ticker_dict[cluster_label]]
                                                  * stock_weight_within_cluster_array * cluster_weight)
                future_daily_cluster_return_series = future_daily_cluster_return_df.sum(axis=1)[-rebalancing_period:]
                future_daily_cluster_return_dict[cluster_label] = future_daily_cluster_return_series

            # out-of-sample daily portfolio returns (60 days)
            future_daily_cluster_return_df = pd.DataFrame.from_dict(future_daily_cluster_return_dict)
            daily_portfolio_return_series_60days = future_daily_cluster_return_df.sum(axis=1)
            daily_portfolio_return_list.extend(daily_portfolio_return_series_60days)

            # standard deviation of out-of-sample portfolio returns
            out_of_sample_stdev = daily_portfolio_return_series_60days.std()
            out_of_sample_stdev_list.append(out_of_sample_stdev)

        # saving outcomes to a dictionary
        cluster_and_stock_dict[index_no] = {'cluster_return': daily_cluster_return_df,
                                            'cluster_weight': cluster_weight_df,
                                            'cluster_ticker': cluster_ticker_dict,
                                            'stock_weight': portfolio_weight_df,
                                            'daily_return_for_viz': daily_return_df_for_viz}

    daily_portfolio_return_array = np.array(daily_portfolio_return_list)
    in_sample_stdev_series = pd.Series(in_sample_stdev_list)
    out_of_sample_stdev_series = pd.Series(out_of_sample_stdev_list)

    return (daily_portfolio_return_array, in_sample_stdev_series,
            out_of_sample_stdev_series, cluster_and_stock_dict)
def get_tsne(X):
    X_reshape = X.reshape(len(X), -1)
    tsne = TSNE(n_components=2, init='pca', n_iter=2500, random_state=23)
    X_reshape_2D = tsne.fit_transform(X_reshape)
    return X_reshape_2D