Example #1
def test_tsne_knn_parameters(dataset, type_knn_graph, method):

    X = dataset.data

    from sklearn.preprocessing import normalize

    X = normalize(X, norm='l1')

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS, metric="euclidean").fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance").astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    tsne = TSNE(n_components=2,
                random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    embed = tsne.fit_transform(X, True, knn_graph)
    validate_embedding(X, embed)

    embed = tsne.fit_transform(X, True, knn_graph.tocoo())
    validate_embedding(X, embed)

    embed = tsne.fit_transform(X, True, knn_graph.tocsc())
    validate_embedding(X, embed)
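Note: the test snippets on this page are shown without their imports, pytest decorators, or helpers. A minimal harness they appear to assume could look like the sketch below; the DEFAULT_* values and the validate_embedding body are reconstructed from usage, not taken verbatim from the cuml test suite.

import numpy as np
import scipy.sparse
import cupyx.scipy.sparse
from cuml.manifold import TSNE
from cuml.neighbors import NearestNeighbors as cuKNN
from sklearn.neighbors import NearestNeighbors as skKNN
from sklearn.manifold import trustworthiness

# Placeholder values; the real test suite defines its own.
DEFAULT_N_NEIGHBORS = 90
DEFAULT_PERPLEXITY = 30


def validate_embedding(X, Y, score=0.76):
    """Assert the embedding is trustworthy and contains no NaNs."""
    trust = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)
    assert trust > score
    assert np.isnan(Y).sum() == 0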
Example #2
def get_features(df):
    features_df = pd.DataFrame()

    sparse_matrix, index_values, columns_values = get_sparse_matrix(
        df.iloc[:5_000_000],
        index='content_id',
        columns='user_id',
        values='simple_count')

    content_matrix = sparse_matrix.toarray()
    tsne = TSNE(n_components=N_COMP, random_state=0)
    tsne_array = tsne.fit_transform(content_matrix)

    tsne_df = pd.DataFrame({'content_id': index_values})

    for i in range(N_COMP):
        tsne_df[f'content_id_tsne_{i}'] = tsne_array[:, i]
        le = dict(tsne_df[['content_id', f'content_id_tsne_{i}']].values)
        features_df[f'content_id_tsne_{i}'] = df['content_id'].map(le)

        dh.save(f'../data/processed/dropped___tsne_encoder_{i}.pkl', le)

    features_df.columns = [f'dropped___{col}' for col in features_df.columns]

    return features_df
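For later inference, each pickled encoder above is just a {content_id: tsne_value} dict, so new rows can be mapped the same way. A sketch under that assumption (the loader below and the plain-pickle format of dh.save are assumptions, not part of the original code):

import pickle
import pandas as pd


def apply_tsne_features(df, n_comp, encoder_dir='../data/processed'):
    # Map content_id through the persisted {content_id: value} dicts.
    out = pd.DataFrame()
    for i in range(n_comp):
        with open(f'{encoder_dir}/dropped___tsne_encoder_{i}.pkl', 'rb') as f:
            le = pickle.load(f)
        out[f'dropped___content_id_tsne_{i}'] = df['content_id'].map(le)
    return out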
Example #3
def test_tsne_fit_transform_on_digits_sparse(input_type, method):

    digits = test_datasets['digits'].data

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    fitter = TSNE(n_components=2,
                  random_state=1,
                  method=method,
                  min_grad_norm=1e-12,
                  n_neighbors=DEFAULT_N_NEIGHBORS,
                  learning_rate_method="none",
                  perplexity=DEFAULT_PERPLEXITY)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits)).astype('float32')

    embedding = fitter.fit_transform(new_data, convert_dtype=True)

    if input_type == 'cupy':
        embedding = embedding.get()

    trust = trustworthiness(digits, embedding, n_neighbors=DEFAULT_N_NEIGHBORS)
    assert trust >= 0.85
Example #4
def test_tsne_transform_on_digits_sparse(input_type):

    from sklearn import datasets

    digits = datasets.load_digits()

    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    fitter = TSNE(2, n_neighbors=15,
                  random_state=1,
                  learning_rate=500,
                  angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(digits.data[~digits_selection]))

    embedding = fitter.fit_transform(new_data, convert_dtype=True)

    if input_type == 'cupy':
        embedding = embedding.get()

    trust = trustworthiness(digits.data[~digits_selection], embedding,
                            n_neighbors=15)
    assert trust >= 0.85
Example #5
def apply_tsne(j):
    # pca_preprocessing, random_state, kwargs and return_model are
    # captured from the enclosing scope in the original source.
    idx, md5, x = j
    if pca_preprocessing:
        x = PCA(n_components=None,
                random_state=random_state).fit_transform(x)
    tsne = TSNE(**kwargs)
    return (idx, md5, tsne.fit_transform(x),
            tsne if return_model else None)
Example #6
def test_tsne_default(name):

    from sklearn import datasets

    X = getattr(datasets, "load_{}".format(name))().data

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE()
        Y = tsne.fit_transform(X)
        check_embedding(X, Y)
        del Y
Example #7
def test_tsne_large(nrows, ncols):
    """
    This tests how TSNE handles large input
    """
    X, y = make_blobs(n_samples=nrows, centers=8,
                      n_features=ncols, random_state=0)

    X = X.astype(np.float32)

    tsne = TSNE(random_state=0, exaggeration_iter=1, n_iter=2)
    Y = tsne.fit_transform(X)
    nans = np.sum(np.isnan(Y))
    assert nans == 0
Example #8
def compute_tsne(data, overwrite_cache=False):
    start = time.perf_counter()

    tsne_file = f"citation_graph_deepwalk_tsne.pkl"
    if os.path.exists(tsne_file) and not overwrite_cache:
        data_embedded = pickle.load(open(tsne_file, 'rb'))
    else:
        visualizer = TSNE(perplexity=30, n_neighbors=90)
        data_embedded = visualizer.fit_transform(data)
        pickle.dump(data_embedded, open(tsne_file, 'wb'))
    end = time.perf_counter()
    print(f"Computing t-SNE took {end-start} seconds.")
    return data_embedded
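The cache key above is a fixed filename, so repeated calls are cheap but the cache will not notice changed input data; pass overwrite_cache=True to force recomputation. A usage sketch with hypothetical vectors:

import numpy as np

vectors = np.random.rand(1000, 128).astype(np.float32)  # hypothetical DeepWalk vectors
emb = compute_tsne(vectors)                          # computes and writes the pickle
emb = compute_tsne(vectors)                          # loads from the cache
emb = compute_tsne(vectors, overwrite_cache=True)    # recomputes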
Example #9
def test_tsne_default(name):

    from sklearn import datasets

    X = getattr(datasets, "load_{}".format(name))().data

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE()
        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y
Example #10
def test_tsne(name):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) cuDF DataFrames are passed in
    (2) Numpy arrays are passed in
    (3) Params are changed in the TSNE class
    (4) The class gets re-used across time
    (5) Trustworthiness is checked
    (6) Tests NAN in TSNE output for learning rate explosions
    (7) Tests verbosity
    """
    from sklearn import datasets

    X = getattr(datasets, "load_{}".format(name))().data
    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X))

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE(2, random_state=i, verbose=0, learning_rate=2 + i)

        Y = tsne.fit_transform(X_cudf).to_pandas().values
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y

        # Reuse
        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y

        # Again
        tsne = TSNE(2, random_state=i + 2, verbose=1, learning_rate=2 + i + 2)

        Y = tsne.fit_transform(X_cudf).to_pandas().values
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y

        # Reuse
        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y
Example #11
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type):

    from sklearn import datasets

    digits = datasets.load_digits()

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

    selected_digits = digits.data[~digits_selection]

    neigh.fit(selected_digits)
    knn_graph = neigh.kneighbors_graph(selected_digits, mode="distance")

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    tsne = TSNE(2, n_neighbors=15,
                random_state=1,
                learning_rate=500,
                angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(selected_digits))

    Y = tsne.fit_transform(new_data, True, knn_graph)
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocoo())
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocsc())
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)
    del Y
Example #12
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type, method):

    digits = test_datasets["digits"].data

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS,
                  metric="euclidean").fit(digits)
    knn_graph = neigh.kneighbors_graph(digits,
                                       mode="distance").astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    tsne = TSNE(n_components=2,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                random_state=1,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    new_data = sp_prefix.csr_matrix(scipy.sparse.csr_matrix(digits))

    Y = tsne.fit_transform(new_data, True, knn_graph)
    if input_type == 'cupy':
        Y = Y.get()
    validate_embedding(digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocoo())
    if input_type == 'cupy':
        Y = Y.get()
    validate_embedding(digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocsc())
    if input_type == 'cupy':
        Y = Y.get()
    validate_embedding(digits, Y, 0.85)
Example #13
def test_tsne(name):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) Numpy arrays are passed in
    (2) Params are changed in the TSNE class
    (3) The class gets re-used across time
    (4) Trustworthiness is checked
    (5) Tests NAN in TSNE output for learning rate explosions
    (6) Tests verbosity
    """
    from sklearn import datasets

    X = getattr(datasets, "load_{}".format(name))().data

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE(2, random_state=i, verbose=False,
                    learning_rate=2+i)

        # Reuse
        Y = tsne.fit_transform(X)
        check_embedding(X, Y)
        del Y

        # Again
        tsne = TSNE(2, random_state=i+2, verbose=logger.level_debug,
                    learning_rate=2+i+2)

        # Reuse
        Y = tsne.fit_transform(X)
        check_embedding(X, Y)
        del Y
Example #14
def test_tsne_knn_graph_used(name, type_knn_graph):

    from sklearn import datasets

    X = getattr(datasets, "load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")
    tsne = TSNE()

    # Perform tsne with normal knn_graph
    Y = tsne.fit_transform(X, True, knn_graph)
    trust_normal = trustworthiness(X, Y)
    print("Trust = ", trust_normal)

    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(X_garbage, mode="distance")

    # Perform tsne with garbage knn_graph
    Y = tsne.fit_transform(X, True, knn_graph_garbage)
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage.tocoo())
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage.tocsc())
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15
Example #15
def test_tsne_knn_graph_used(dataset, type_knn_graph, method):

    X = dataset.data

    neigh = cuKNN(n_neighbors=DEFAULT_N_NEIGHBORS, metric="euclidean").fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance").astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)

    tsne = TSNE(random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                method=method,
                perplexity=DEFAULT_PERPLEXITY,
                learning_rate_method='none',
                min_grad_norm=1e-12)

    # Perform tsne with normal knn_graph
    Y = tsne.fit_transform(X, True, knn_graph)

    trust_normal = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)

    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(
        X_garbage, mode="distance").astype('float32')

    if type_knn_graph == 'cuml':
        knn_graph_garbage = cupyx.scipy.sparse.csr_matrix(knn_graph_garbage)

    tsne = TSNE(random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                method=method,
                perplexity=DEFAULT_PERPLEXITY,
                learning_rate_method='none',
                min_grad_norm=1e-12)

    # Perform tsne with garbage knn_graph
    Y = tsne.fit_transform(X, True, knn_graph_garbage)

    trust_garbage = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage)
    trust_garbage = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage)
    trust_garbage = trustworthiness(X, Y, n_neighbors=DEFAULT_N_NEIGHBORS)
    assert (trust_normal - trust_garbage) > 0.15
Example #16
def test_tsne(dataset, method):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) Numpy arrays are passed in
    (2) Params are changed in the TSNE class
    (3) The class gets re-used across time
    (4) Trustworthiness is checked
    (5) Tests NAN in TSNE output for learning rate explosions
    (6) Tests verbosity
    """
    X = dataset.data

    tsne = TSNE(n_components=2,
                random_state=1,
                n_neighbors=DEFAULT_N_NEIGHBORS,
                learning_rate_method='none',
                method=method,
                min_grad_norm=1e-12,
                perplexity=DEFAULT_PERPLEXITY)

    Y = tsne.fit_transform(X)
    validate_embedding(X, Y)
Example #17
def test_tsne_knn_parameters(name, type_knn_graph):

    from sklearn import datasets

    X = getattr(datasets, "load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")

    for i in range(3):
        print("iteration = ", i)
        tsne = TSNE()
        Y = tsne.fit_transform(X, True, knn_graph)
        check_embedding(X, Y)

        Y = tsne.fit_transform(X, True, knn_graph.tocoo())
        check_embedding(X, Y)

        Y = tsne.fit_transform(X, True, knn_graph.tocsc())
        check_embedding(X, Y)
        del Y
Example #18
def test_tsne(name):
    """
    This tests how TSNE handles a lot of input data across time.
    (1) Numpy arrays are passed in
    (2) Params are changed in the TSNE class
    (3) The class gets re-used across time
    (4) Trustworthiness is checked
    (5) Tests NAN in TSNE output for learning rate explosions
    (6) Tests verbosity
    """
    from sklearn import datasets

    X = getattr(datasets, "load_{}".format(name))().data

    for i in range(3):
        print("iteration = ", i)

        tsne = TSNE(2, random_state=i, verbosity=logger.LEVEL_INFO,
                    learning_rate=2+i)

        # Reuse
        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y

        # Again
        tsne = TSNE(2, random_state=i+2, verbosity=logger.LEVEL_DEBUG,
                    learning_rate=2+i+2)

        # Reuse
        Y = tsne.fit_transform(X)
        nans = np.sum(np.isnan(Y))
        trust = trustworthiness(X, Y)
        print("Trust = ", trust)
        assert trust > 0.76
        assert nans == 0
        del Y
Example #19
import numpy as np
from cuml.manifold import TSNE
from sklearn.linear_model import LinearRegression

from helpers import load_mnist

train_images, train_labels = load_mnist("./fashion", subset="train")
test_images, test_labels = load_mnist("./fashion", subset="test")

all_images = np.concatenate((train_images, test_images))

tsne = TSNE(
    n_components=2,
    method="barnes_hut",
    random_state=23,
    learning_rate=200,
    perplexity=50,
    n_iter=3000,
)

train_X_hat = tsne.fit_transform(train_images)
test_X_hat = tsne.fit_transform(test_images)

all_X_hat = tsne.fit_transform(all_images)

np.save("trained_data/train_tsne", train_X_hat)
np.save("trained_data/test_tsne", test_X_hat)
np.save("trained_data/all_images_tsne", all_X_hat)

#################################
# infer approximate embeddings for new images:
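The excerpt ends here. Given the LinearRegression import and the comment above, a plausible continuation fits a linear map from pixel space to the t-SNE coordinates and uses it to place unseen images; this is only a sketch of that idea, not the original code:

reg = LinearRegression()
reg.fit(all_images, all_X_hat)  # learn pixels -> embedding coordinates

new_images = test_images[:100]  # hypothetical unseen images, same pixel layout
approx_embeddings = reg.predict(new_images)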
Example #20
             norm_hist=True,
             bins=range(0, 100, 5))
sns.distplot(V2inV2,
             label="V2inV2",
             kde=False,
             norm_hist=True,
             bins=range(0, 100, 5))
plt.title(
    "Numbers of 100-nearest neighbor of the same type. \nEqual-ish retinotopy ($x>-10$) and equal number of neurons from V1 and V2"
)

plt.legend()
plt.show()
#
#%%
tsne = TSNE(n_components=2, perplexity=40)
X_hat = tsne.fit_transform(S_hat)

#%%
fig, ax = plt.subplots(figsize=(6, 6), dpi=300)

ax.scatter(X_hat[:cutoff, 0],
           X_hat[:cutoff, 1],
           c="C0",
           alpha=0.5,
           s=5,
           linewidth=0,
           label="V1")
ax.scatter(X_hat[cutoff:, 0],
           X_hat[cutoff:, 1],
           c="C1",
Example #21
def test_components_exception():
    with pytest.raises(ValueError):
        TSNE(n_components=3)
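The test asserts that constructing TSNE with n_components=3 raises a ValueError, i.e. only 2-D embeddings are accepted. The same check can be run outside pytest:

from cuml.manifold import TSNE

try:
    TSNE(n_components=3)
except ValueError as err:
    print("Rejected as expected:", err)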
Example #22
def apply_tsne(j):
    # kwargs and return_model are captured from the enclosing scope.
    idx, md5, x = j
    tsne = TSNE(**kwargs)
    return (idx, md5, tsne.fit_transform(x),
            tsne if return_model else None)
Example #23
samples = rnd_state.choice(len(train_ds), 5000, replace=False)
train_ds = Subset(train_ds, samples)

# Extract embedding vectors
load_kwargs = {'batch_size': 128, 'num_workers': os.cpu_count()}

# test_embs, _ = extract_embeddings(emb_net, DataLoader(test_ds, **load_kwargs))
embs, labels = extract_embeddings(emb_net, DataLoader(train_ds, **load_kwargs))

# translate them to cpu + numpy
embs = embs.cpu().numpy()
labels = labels.cpu().numpy()
# -----------------------------------------------------------------------------
print("Plotting T-sne....")
from cuml.manifold import TSNE
tsne = TSNE(n_iter=1000, metric="euclidean")
projected_emb = tsne.fit_transform(embs)
fig = plot_embeddings(projected_emb, labels)
png_fname = join(exp_folder, 't-sne.png')
fig.savefig(png_fname, bbox_inches='tight')
pdf_fname = join(exp_folder, 't-sne.pdf')
fig.savefig(pdf_fname, bbox_inches='tight')
# -----------------------------------------------------------------------------
print("Plotting PCA....")
from cuml import PCA
pca_float = PCA(n_components=2)
pca_out = pca_float.fit_transform(embs)  # don't shadow the cudf module with the result
projected_emb = pca_out.to_pandas().to_numpy()
fig = plot_embeddings(projected_emb, labels)
png_fname = join(exp_folder, 'pca.png')
fig.savefig(png_fname, bbox_inches='tight')
Example #24
# Step 3
model = SiameseNet(embedding_net)

margin = 1.
loss_fn = ContrastiveLoss(margin)

lr = 1e-3
if has_cuda:
    model.cuda()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.StepLR(optimizer,
                                step_size=5,
                                gamma=0.1,
                                last_epoch=-1)
n_epochs = 20
log_interval = 50

fit(siamese_train_loader, siamese_test_loader, model, loss_fn, optimizer,
    scheduler, n_epochs, has_cuda, log_interval)
# ---------------------------------------------------------------------------
# Obtain the embeddings
embeddings, labels = extract_embeddings(embedding_net, test_loader)

tsne = TSNE(random_state=1, n_iter=1000, metric="euclidean")

projected_emb = tsne.fit_transform(embeddings)

fig = plot_embeddings(projected_emb, labels)

fig.savefig('siamese.png', bbox_inches='tight')
Example #25
'''
To run cuML's TSNE you need to create a conda environment that matches
your machine's configuration; see the RAPIDS release selector
(https://rapids.ai/start.html#rapids-release-selector), e.g.

conda create -n rapids-0.18 -c rapidsai-nightly -c nvidia -c conda-forge \
    -c defaults blazingsql=0.18 cuml=0.18 python=3.7 cudatoolkit=11.0
'''

import numpy as np
import matplotlib.pyplot as plt
from cuml.manifold import TSNE
from glob import glob

files = glob('../feature_maps/cat_1/*.npy')

for i, file in enumerate(files):
    print('[INFO] Processing file {}/{} . . .'.format(i + 1, len(files)))
    with open(file, 'rb') as f:
        x = np.load(f)

    x = np.transpose(x)

    tsne = TSNE(n_components=2)
    X_hat = tsne.fit_transform(x)

    plt.scatter(X_hat[:, 0], X_hat[:, 1])

plt.show()
Example #26
import gzip
import os
import pickle

import numpy as np
from cuml.manifold import TSNE


def load_mnist_train(path):
    """Load MNIST data from path"""
    labels_path = os.path.join(path, 'train-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, 'train-images-idx3-ubyte.gz')

    with gzip.open(labels_path, 'rb') as lbpath:
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)

    with gzip.open(images_path, 'rb') as imgpath:
        images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                               offset=16).reshape(len(labels), 784)
    return images, labels


images, labels = load_mnist_train("data/fashion")

tsne = TSNE(n_components=2, method='barnes_hut', random_state=23)
embedding = tsne.fit_transform(images)

print(embedding[:10], embedding.shape, type(embedding))

outdir = '/opt/dkube/output/'
if not os.path.exists(outdir + 'model'):
    os.makedirs(outdir + 'model')

with open(outdir + 'model/embeddings.pickle', 'wb') as handle:
    pickle.dump(embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)
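A downstream consumer can read the artifact back with the mirror of the dump call above:

import pickle

with open(outdir + 'model/embeddings.pickle', 'rb') as handle:
    embedding = pickle.load(handle)
print(embedding.shape)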
Example #27
    help='Angular size. This is the trade-off between speed and accuracy.')
parser.add_argument('--min-grad-norm',
                    type=float,
                    default=1e-7,
                    help='If the gradient norm is below this threshold, '
                    'the optimization is stopped.')
parser.add_argument('--random-state', type=int, default=1234)
params = bench.parse_args(parser)

# Load and convert data
X, _, _, _ = bench.load_data(params)

# Create our t-SNE estimator
tsne = TSNE(n_components=params.n_components,
            early_exaggeration=params.early_exaggeration,
            learning_rate=params.learning_rate,
            angle=params.angle,
            min_grad_norm=params.min_grad_norm,
            random_state=params.random_state)

fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)
# Need to investigate how to compare sklearn and cuml metrics for tsne

bench.print_output(library='cuml',
                   algorithm='tsne',
                   stages=['training'],
                   params=params,
                   functions=['tsne.fit'],
                   times=[fit_time],
                   metric_type=None,
                   metrics=None,
                   data=[X],
Example #28
def GMVP_between_clusters(data_period, max_cluster_size, scaling_method='none', dim_reduction_method='none', no_of_PCA_components = 0, no_of_tsne_components = 0):
    rebalancing_period = 60
    
    ################## creating lists and dictionaries for storing outcomes ##################
    # 1) returns
    daily_portfolio_return_list = []   # daily returns of portfolio
    # 2) stdev
    in_sample_stdev_list = []          # standard deviation of in-sample (252-day-long) daily portfolio returns
    out_of_sample_stdev_list = []      # standard deviation of out-of-sample (60-day-long) daily portfolio returns
    # 3) cluster & stock 
    cluster_and_stock_dict = {}        # 1. cluster_return : 252 daily returns * 11 clusters
                                       # 2. cluster_weight : weights of each of the 11 clusters
                                       # 3. cluster_ticker : tickers belonging to each cluster
                                       # 4. stock_weight   : weights of each individual stock
                                       # 5. daily_return_for_viz
    ##########################################################################################
    
    if data_period == 'validation':
        daily_return_df_list = prepare_data.daily_return_df_list_val
    elif data_period == 'test':
        daily_return_df_list = prepare_data.daily_return_df_list_test
    
    for index_no, daily_return_df in enumerate(tqdm(daily_return_df_list)):
        
        cluster_weight_dict = {}   # weights of each of the 11 clusters
        cluster_ticker_dict = {}   # tickers belonging to each cluster
        stock_weight_within_cluster_dict = {} # weights of each individual stock
        
        ################## 1. normalizing data ##################
        if scaling_method == 'standard_scale':    
            scaled_daily_price_array = (standard_scaler.fit_transform(daily_return_df))
            after_scaling_return_df = pd.DataFrame(scaled_daily_price_array, columns = all_ticker_list).T
        elif scaling_method == 'none':
            after_scaling_return_df = daily_return_df.T
        
        ################## 2. dimensionality reduction ##################
        if dim_reduction_method == 'PCA':
            pca = PCA(n_components=no_of_PCA_components)
            scaled_daily_return_PCA_array = pca.fit_transform(after_scaling_return_df)
            after_dim_reduction_return_df = pd.DataFrame(scaled_daily_return_PCA_array, index = all_ticker_list)  # shape : [stocks * PCs]
        elif dim_reduction_method == 'tsne':
            tsne = TSNE(n_components = no_of_tsne_components)
            scaled_daily_return_tsne_cudf = tsne.fit_transform(after_scaling_return_df.values)
            after_dim_reduction_return_df = pd.DataFrame(scaled_daily_return_tsne_cudf, index = all_ticker_list)
        elif dim_reduction_method == 'none':
            after_dim_reduction_return_df = after_scaling_return_df
            
        ################## 3. bounded k-means clustering  ################## 
        n_clusters = 11
        n_iter = 30
        n_init = 15

        weights = np.ones(after_dim_reduction_return_df.shape[0]) # The original code supports per-observation weights, which we do not need here:
                                                                  # stocks are judged solely on return movements, so every stock gets weight 1.

        cluster_maker = bounded.BoundedKMeansClustering(n_clusters, max_cluster_size, n_iter, n_init)
        best_cost, best_clusters = cluster_maker.fit(after_dim_reduction_return_df.values, weights)
        after_dim_reduction_return_df.loc[:,'cluster_label'] = 0  # placeholder label; the correct labels are assigned immediately below
        
        for cluster_label in range(n_clusters):
            ticker_index_list =  best_clusters[cluster_label]
            cluster_ticker_list = list(np.array(prepare_data.all_ticker_list)[ticker_index_list])
            after_dim_reduction_return_df.loc[cluster_ticker_list,'cluster_label'] = cluster_label
            
        # storing clustering results in a dict : cluster_ticker_dict
        cluster_label_list = list((after_dim_reduction_return_df.loc[:,'cluster_label'].unique()))
        for cluster_label in cluster_label_list:
            cluster_ticker_dict[cluster_label] = list(after_dim_reduction_return_df[after_dim_reduction_return_df.loc[:,'cluster_label'] == cluster_label].index)
        
        ################## (Optional) For visualization ##################
        pca = PCA(n_components=2)
        daily_return_array_for_viz = pca.fit_transform(after_dim_reduction_return_df.iloc[:,:-1])
        daily_return_df_for_viz = pd.DataFrame(daily_return_array_for_viz, index = prepare_data.all_ticker_list)  # shape : [stocks * PCs]
        daily_return_df_for_viz.loc[: ,'cluster_label'] = after_dim_reduction_return_df.loc[:,'cluster_label']
        daily_return_df_for_viz.rename(columns = {0: 'PC_1',1:'PC_2'}, inplace=True)
        
        # --------------------- #
        #  GMVP within cluster  #
        # --------------------- #
        # GMVP on each cluster using the function 'GMVP_within_cluster' --> to create a matrix of daily returns of clusters
        stock_weight_within_cluster_dict = {}
        daily_cluster_return_dict = {}
        
        ################## 4. computing stock weights within a cluster ##################
        for cluster_label in cluster_label_list:
            daily_cluster_return_series, stock_weight_within_cluster_array = GMVP_within_cluster(data_period, index_no, cluster_ticker_dict[cluster_label])
            daily_cluster_return_dict[cluster_label] = daily_cluster_return_series
            stock_weight_within_cluster_dict[cluster_label] = stock_weight_within_cluster_array

        daily_cluster_return_df = pd.DataFrame.from_dict(daily_cluster_return_dict)
        
        # ---------------------- #
        #  GMVP between clusters #
        # ---------------------- #
        cov_mat_of_cluster_df = daily_cluster_return_df.cov()
        cov_mat_of_cluster_array = cov_mat_of_cluster_df.values
        inv_cov_mat_array = np.linalg.pinv(cov_mat_of_cluster_array) # Use the pseudo-inverse in case the matrix is singular / ill-conditioned

        ################## 5. computing cluster weights ##################
        one_vector_array = np.ones(len(inv_cov_mat_array))
        inv_dot_one_array = np.dot(inv_cov_mat_array, one_vector_array)
        cluster_weight_array = inv_dot_one_array/ np.dot( inv_dot_one_array , one_vector_array)
        cluster_weight_df = pd.DataFrame(data= cluster_weight_array, columns = ['weight'], index = cluster_label_list)
        
        # compute stdev of portfolio, which can be calculated from daily returns of clusters
        in_sample_variance = np.dot(cluster_weight_array, np.dot(cov_mat_of_cluster_array, cluster_weight_array))
        in_sample_stdev = np.sqrt(in_sample_variance)
        
        ################## 6. computing stock weights in a portfolio (portfolio weights) ##################
        temp_portfolio_weight_list = []

        for cluster_label in cluster_label_list:
            stock_weight_within_sector_array =  cluster_weight_df.loc[cluster_label,'weight'] * stock_weight_within_cluster_dict[cluster_label]
            stock_weight_within_sector_df = pd.DataFrame(stock_weight_within_sector_array, index = cluster_ticker_dict[cluster_label], columns = ['weight'])
            temp_portfolio_weight_list.append(stock_weight_within_sector_df)

        portfolio_weight_df = pd.concat(temp_portfolio_weight_list)
        
        # ---------------------------------------------------------- #
        # Calculating daily return based on GMV optimization results #
        # ---------------------------------------------------------- #
        # we use 'index+1': the portfolio is optimized at time point 'index' and evaluated over the period from 'index' to 'index+1'
        if (index_no+1) < len(daily_return_df_list):
            
            # appending to the list only if we invest in the market, following the portfolio optimization
            in_sample_stdev_list.append(in_sample_stdev)
            
            # appending data only if the data is used for calculating next time's return 
            future_daily_return_df = daily_return_df_list[index_no+1]
            future_daily_cluster_return_dict = {}

            for cluster_label in cluster_label_list:

                # 1st optimization - cluster_weight calculated from 'between_cluster GMV'
                cluster_weight = cluster_weight_df.loc[cluster_label,'weight']
                # 2nd optimization - stock_weight calculated from 'inside_cluster GMV'
                stock_weight_within_cluster_array = stock_weight_within_cluster_dict[cluster_label]
                # based on the asset allocation results from the 1st and 2nd optimizations, we can now compute each stock's weight
                future_daily_cluster_return_df = (future_daily_return_df.loc[:, cluster_ticker_dict[cluster_label]] * stock_weight_within_cluster_array * cluster_weight)
                future_daily_cluster_return_series = future_daily_cluster_return_df.sum(axis=1)[-rebalancing_period:]
                future_daily_cluster_return_dict[cluster_label] = future_daily_cluster_return_series

            # out-of-sample daily portfolio returns (60 days)
            future_daily_cluster_return_df = pd.DataFrame.from_dict(future_daily_cluster_return_dict)
            daily_portfolio_return_series_60days = future_daily_cluster_return_df.sum(axis=1)
            daily_portfolio_return_list.extend(daily_portfolio_return_series_60days)
            
            # standard deviation of out-of-sample portfolio returns
            out_of_sample_stdev = daily_portfolio_return_series_60days.std()
            out_of_sample_stdev_list.append(out_of_sample_stdev)
            
            # saving outcomes to a dictionary
            cluster_and_stock_dict[index_no] = {'cluster_return' : daily_cluster_return_df,
                                                'cluster_weight' : cluster_weight_df,
                                                'cluster_ticker' : cluster_ticker_dict,
                                                'stock_weight' : portfolio_weight_df,
                                                'daily_return_for_viz' : daily_return_df_for_viz
                                                }
            
    daily_portfolio_return_array = np.array(daily_portfolio_return_list)
    in_sample_stdev_series = pd.Series(in_sample_stdev_list)
    out_of_sample_stdev_series = pd.Series(out_of_sample_stdev_list)
    
    return daily_portfolio_return_array, in_sample_stdev_series, out_of_sample_stdev_series, cluster_and_stock_dict
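Step 5 above implements the closed-form global-minimum-variance weights w = Σ⁻¹1 / (1ᵀ Σ⁻¹ 1). A minimal standalone check of that formula on a toy covariance matrix:

import numpy as np

cov = np.array([[0.04, 0.01],
                [0.01, 0.09]])
ones = np.ones(len(cov))
inv_dot_one = np.linalg.pinv(cov) @ ones      # Σ⁻¹ 1 (pseudo-inverse, as above)
weights = inv_dot_one / (inv_dot_one @ ones)  # normalize so the weights sum to 1

assert np.isclose(weights.sum(), 1.0)
print(weights)  # the lower-variance asset receives the larger weight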
Example #29
def get_tsne(X):
    X_reshape = X.reshape(len(X), -1)
    tsne = TSNE(n_components=2, init='pca', n_iter=2500, random_state=23)
    X_reshape_2D = tsne.fit_transform(X_reshape)
    return X_reshape_2D
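Usage sketch for get_tsne, assuming a TSNE import is in scope (e.g. from cuml.manifold or sklearn.manifold, depending on the original context); the input below is hypothetical:

import numpy as np

X = np.random.rand(500, 28, 28).astype(np.float32)  # e.g. a stack of images
X_2d = get_tsne(X)  # each sample is flattened to 784 features, embedded to 2-D
print(X_2d.shape)   # (500, 2)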