Example #1
def pcAnalysis(X, Xtest, w=None, ncomp=2, useTSNE=False):
    """
    PCA / t-SNE analysis of train vs. test feature distributions.
    """
    if useTSNE:
        print("TSNE analysis for train/test")
        # NOTE: scikit-learn's TSNE has no transform() or explained_variance_ratio_,
        # so this branch needs an implementation that provides them.
        pca = TSNE(n_components=ncomp)
    else:
        print("PC analysis for train/test")
        pca = TruncatedSVD(n_components=ncomp)
    print(pca)

    pca.fit(X)
    X_all = pd.concat([Xtest, X])
    X_r = pca.transform(X_all.values)
    plt.scatter(X_r[len(Xtest.index):, 0],
                X_r[len(Xtest.index):, 1],
                c='r',
                label="train",
                alpha=0.5)
    plt.scatter(X_r[:len(Xtest.index), 0],
                X_r[:len(Xtest.index), 1],
                c='g',
                label="test",
                alpha=0.5)
    print("Total variance:", np.sum(pca.explained_variance_ratio_))
    print("Explained variance:", pca.explained_variance_ratio_)
    plt.legend()
    plt.show()
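
A hedged usage sketch for the helper above, with synthetic data; it assumes the snippet's own imports (numpy, pandas, matplotlib, TruncatedSVD) are in scope, and keeps useTSNE=False since that is the only branch scikit-learn fully supports:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_df = pd.DataFrame(rng.normal(size=(200, 20)))          # "train"
Xtest_df = pd.DataFrame(rng.normal(0.5, 1, (100, 20)))   # shifted "test"

# TruncatedSVD supports transform(), so the default branch is safe.
pcAnalysis(X_df, Xtest_df, ncomp=2, useTSNE=False)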
Example #2
def process_data(train_df,
                 test_df,
                 ylabel='target',
                 standarization=False,
                 discretization=False,
                 transform=None):
    numerical_features = train_df.columns

    if standarization:
        standarized_features = numerical_features
        standarize_feature(train_df, test_df, standarized_features)

    if discretization:
        discretized_features = numerical_features
        discretize_feature(train_df,
                           test_df,
                           discretized_features,
                           num_bins=10,
                           how='equal_freq')

    X = train_df.drop(ylabel, axis=1).to_numpy()
    y = train_df[ylabel].to_numpy()
    X_submission = test_df.to_numpy()

    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif transform == 'tsne':
        # NOTE: scikit-learn's TSNE has no transform(); this branch (and 'tsne+'
        # below) needs an implementation with out-of-sample support.
        tsne = TSNE(n_components=3).fit(X)
        X = tsne.transform(X)
        X_submission = tsne.transform(X_submission)
    elif transform == 'pca+':
        pca = PCA(n_components=3).fit(X)
        X = np.hstack((X, pca.transform(X)))
        X_submission = np.hstack((X_submission, pca.transform(X_submission)))
    elif transform == 'tsne+':
        tsne = TSNE(n_components=3).fit(X)
        X = np.hstack((X, tsne.transform(X)))
        X_submission = np.hstack((X_submission, tsne.transform(X_submission)))
    return X, y, X_submission
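
A hedged usage sketch for process_data, with synthetic frames; the default flags are kept so the undefined standarize_feature/discretize_feature helpers are never invoked, and the 'pca' branch is chosen because scikit-learn supports it end to end:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
train = pd.DataFrame(rng.normal(size=(100, 4)), columns=list('abcd'))
train['target'] = rng.integers(0, 2, size=100)
test = pd.DataFrame(rng.normal(size=(40, 4)), columns=list('abcd'))

X, y, X_submission = process_data(train, test, ylabel='target', transform='pca')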
Example #3
def get_twenty_dataset(remove_stop_word=False, preprocessing_trick=None, n_components=2):
	twenty_train = fetch_20newsgroups(subset='train', \
		remove=['headers', 'footers', 'quotes'], shuffle=True)
	twenty_test = fetch_20newsgroups(subset='test', \
		remove=['headers', 'footers', 'quotes'], shuffle=True)

	if remove_stop_word:
		count_vect = CountVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))
	else:
		count_vect = CountVectorizer()
	X_train_counts = count_vect.fit_transform(twenty_train.data)
	X_test_counts = count_vect.transform(twenty_test.data)

	_, vocab_size = X_train_counts.shape

	tfidf_transformer = TfidfTransformer()
	X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
	X_test_tfidf = tfidf_transformer.transform(X_test_counts)

	X_train, X_test = X_train_tfidf, X_test_tfidf

	if preprocessing_trick == 'SVD':
		pca = TruncatedSVD(n_components = n_components) 
		X_train = pca.fit_transform(X_train)
		X_test = pca.transform(X_test)
	elif preprocessing_trick == 'LDA':
		lda = LinearDiscriminantAnalysis()
		X_train = lda.fit_transform(X_train.toarray(), twenty_train.target)
		X_test = lda.transform(X_test.toarray())
	elif preprocessing_trick == 'TSNE':
		# NOTE: the X_test line only works with a t-SNE implementation that
		# offers transform(); scikit-learn's does not.
		tsne = TSNE(n_components=n_components)
		X_train = tsne.fit_transform(X_train.toarray())
		X_test = tsne.transform(X_test.toarray())
	elif preprocessing_trick == 'autoencoder':
		#n_components = 256
		num_samples, feature_dim = X_train.shape
		print('autoencoder: ',num_samples, feature_dim,n_components)
		input_sample = Input(shape=(feature_dim,))
		encoded = Dense(1024, activation='relu')(input_sample)
		encoded = Dense(512, activation='relu')(encoded)
		encoded = Dense(256, activation='relu')(encoded)	

		decoded = Dense(512, activation='relu')(encoded)
		decoded = Dense(1024, activation='relu')(decoded)
		decoded = Dense(feature_dim, activation='sigmoid')(decoded)

		autoencoder = Model(input_sample, decoded)
		autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

		autoencoder.fit(X_train.todense(), X_train.todense(),
		                epochs=50, batch_size=256, shuffle=True,
		                validation_data=(X_test.todense(), X_test.todense()))
		X_train = autoencoder.predict(X_train.todense())
		X_test = autoencoder.predict(X_test.todense())


	# Calculate dimensions
	max_length = np.amax(X_train)
	embedding_dict = {'vocab_size': vocab_size, 'max_length': max_length}

	return X_train, twenty_train.target, X_test, twenty_test.target, embedding_dict
Example #4
def TSNE(X_train, y_train=None, X_test=None, n=100):
    # The local import shadows the enclosing function name within this scope.
    from sklearn.manifold import TSNE
    mod = TSNE(n_components=n)
    train = mod.fit_transform(X_train)
    if X_test is None:
        out = train
    else:
        # NOTE: scikit-learn's TSNE has no transform(); this call requires an
        # implementation with out-of-sample support.
        test = mod.transform(X_test)
        out = train, test
    return out
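
Since scikit-learn's TSNE cannot embed unseen rows, a common workaround is to embed train and test jointly and split the result afterwards. A minimal sketch under that assumption:

import numpy as np
from sklearn.manifold import TSNE

def tsne_joint(X_train, X_test, n=2):
    # Embed both sets together, then split: sidesteps the missing transform().
    X_all = np.vstack([X_train, X_test])
    emb = TSNE(n_components=n).fit_transform(X_all)
    return emb[:len(X_train)], emb[len(X_train):]

Example #5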
def plot_data(args, seq, original_seq=None):
    if args.delta:
        plt.figure()
        dist = np.sum((seq[1:, ...] - seq[:-1, ...])**2, axis=1)**0.5
        plt.hist(dist)
        if args.save:
            plt.savefig(args.save, dpi=120)
        else:
            plt.show()
        return

    if args.pca:
        pca = PCA(n_components=args.pca)
        if original_seq is None:
            seq = pca.fit_transform(seq)
        else:
            original_seq = pca.fit_transform(original_seq)
            seq = pca.transform(seq)

    if args.tsne:
        tsne = TSNE(n_components=2, perplexity=30.0, n_iter=2000, verbose=2)
        if original_seq is None:
            seq = tsne.fit_transform(seq)
        else:
            # NOTE: this fit/transform pair needs a t-SNE with out-of-sample support.
            tsne.fit(original_seq)
            seq = tsne.transform(seq)

    if seq.shape[1] == 2:
        plt.figure()
        x, y = zip(*seq[:, :])
        color_list = cm.get_cmap(name="viridis")
        if args.strip:
            n, m = tuple(args.strip)
            for i in range(0, seq.shape[0] - 1, m):
                plt.plot(x[i:(i + n)],
                         y[i:(i + n)],
                         '-',
                         color=color_list(i / (seq.shape[0] - 1)))
        else:
            for i in range(seq.shape[0] - 1):
                plt.plot(x[i:(i + 2)],
                         y[i:(i + 2)],
                         '.',
                         color=color_list(i / (seq.shape[0] - 1)))
        plt.axis('equal')
        if args.save:
            plt.savefig(args.save, dpi=120)
        else:
            plt.show()
    else:
        print("Cannot plot sequence: data is of size {}".format(seq.shape))
Example #6
def do_tsne(train_std, val_std=np.array([]), num_dim=2):
    '''
    DESCRIPTION: Perform tSNE dimensionality reduction on training and validation sets

    INPUT:
        |--- train_std: [array] 2D array of standardized train feature vectors for each training sample
        |--- val_std: [array] 2D array of validation feature vectors for each validation sample, standardized using training statistics
        |--- num_dim: [int] dimensions of the final subspace
    OUTPUT:
        |--- tsne_train: [array] 2D array nb training samples x nb of final dimensions, stores principal components of training matrix
        |--- tsne_val: [array] 2D array nb validation samples x nb of final dimensions, projection of validation matrix onto training tSNE subspace
    '''

    tsne = TSNE(n_components=num_dim, random_state=0)
    tsne_train = tsne.fit_transform(train_std)
    if val_std.any(): tsne_val = tsne.transform(val_std)
    else: tsne_val = np.array([])

    return tsne_train, tsne_val
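
scikit-learn's TSNE has no transform(), so the val_std branch above will raise AttributeError. One heuristic workaround, sketched here as an assumption rather than as part of the original code, is to place each validation point at the mean embedding of its nearest training neighbours:

import numpy as np
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

def do_tsne_knn(train_std, val_std=np.array([]), num_dim=2, k=5):
    tsne_train = TSNE(n_components=num_dim, random_state=0).fit_transform(train_std)
    if val_std.any():
        # Approximate out-of-sample mapping: average the embeddings of the
        # k nearest training points in the original feature space.
        nn = NearestNeighbors(n_neighbors=k).fit(train_std)
        _, idx = nn.kneighbors(val_std)
        tsne_val = tsne_train[idx].mean(axis=1)
    else:
        tsne_val = np.array([])
    return tsne_train, tsne_val

Example #7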
class TSNERepresentation(Representation):
    @staticmethod
    def default_config():
        default_config = Representation.default_config()

        # parameters
        default_config.parameters = Dict()
        default_config.parameters.perplexity = 30.0
        default_config.parameters.init = "random"
        default_config.parameters.random_state = None

        return default_config

    def __init__(self, n_features=28 * 28, n_latents=10, config={}, **kwargs):
        Representation.__init__(self, config=config, **kwargs)

        # input size (flatten)
        self.n_features = n_features
        # latent size
        self.n_latents = n_latents
        # feature range
        self.feature_range = (0.0, 1.0)

        self.algorithm = TSNE(n_components=self.n_latents)
        self.update_algorithm_parameters()

    def fit(self, X_train, update_range=True):
        ''' 
        X_train: array-like (n_samples, n_features)
        '''
        X_train = np.nan_to_num(X_train)
        if update_range:
            self.feature_range = (X_train.min(axis=0), X_train.max(axis=0))  # save (min, max) for normalization
        X_train = (X_train - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        self.algorithm.fit(X_train)

    def calc_embedding(self, x):
        x = (x - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        # NOTE: assumes the algorithm can project new points; sklearn's TSNE cannot.
        x = self.algorithm.transform(x)
        return x

    def update_algorithm_parameters(self):
        self.algorithm.set_params(**self.config.parameters, verbose=False)
Example #8
def get_IMDB_dataset(remove_stop_word=False, preprocessing_trick=None, n_components=2):
	with open('./dataset/IMDB.pickle', 'rb') as data:
		dataset = pickle.load(data)
	train_x_raw, train_y = dataset['train']
	test_x_raw, test_y = dataset['test']
	
	if remove_stop_word:
		count_vect = CountVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))
	else:
		count_vect = CountVectorizer()
	X_train_counts = count_vect.fit_transform(train_x_raw)
	X_test_counts = count_vect.transform(test_x_raw)

	_, vocab_size = X_train_counts.shape

	tfidf_transformer = TfidfTransformer()
	X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
	X_test_tfidf = tfidf_transformer.transform(X_test_counts)

	X_train, X_test = X_train_tfidf, X_test_tfidf

	if preprocessing_trick == 'PCA':
		pca = TruncatedSVD(n_components = n_components) 
		X_train = pca.fit_transform(X_train)
		X_test = pca.transform(X_test)
	elif preprocessing_trick == 'LDA':
		lda = LinearDiscriminantAnalysis()
		X_train = lda.fit_transform(X_train.toarray(), train_y)
		X_test = lda.transform(X_test.toarray())
	elif preprocessing_trick == 'TSNE':
		tsne = TSNE(n_components=n_components)
		X_train = tsne.fit_transform(X_train.toarray())
		X_test = tsne.transform(X_test.toarray())

	# Calculate dimensions
	max_length = np.amax(X_train)
	embedding_dict = {'vocab_size': vocab_size, 'max_length': max_length}

	return X_train, train_y, X_test, test_y, embedding_dict
Example #9
class Visual(object):
    def __init__(self, mode='pca', dim=2, full=True, save=False):

        self._mode = mode
        self._model = None
        self._dim = dim
        self._data = None
        self._labels = None
        self._sizes = []
        self._counter = 0
        self._result = None
        self.size = 1  # size of dots
        self._full = full
        self._save = save

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, new_data):
        self._data = join_data(self._data, new_data, np.vstack)

    @property
    def labels(self):
        return self._labels

    @labels.setter
    def labels(self, new_labels):
        self._labels = join_data(self._labels, new_labels, np.hstack)
        self._sizes += [self.size] * len(new_labels)

    @timing
    def fit_data(self, reduce=None):
        if self._mode == 'pca':
            self._model = PCA(n_components=self._dim, random_state=opt.seed)
        if self._mode == 'tsne':
            self._model = TSNE(n_components=self._dim,
                               perplexity=15,
                               random_state=opt.seed)
        if self._full:
            self._result = self._model.fit_transform(self._data)
        else:
            self._model.fit(self._data[:reduce])
            self._result = self._model.transform(self._data)

    def plot(self, iter=0, show=True, gt_plot=0, prefix=''):
        if iter is not None:
            self._counter = iter
        plt.scatter(self._result[..., 0],
                    self._result[..., 1],
                    c=self._labels,
                    s=self._sizes,
                    alpha=0.5)
        plt.grid(True)
        if self._save:
            # plt.figure(figsize=(1))
            dir_check(join(opt.dataset_root, 'plots'))
            dir_check(join(opt.dataset_root, 'plots', opt.subaction))
            pose_segm = ['!pose_', ''][opt.pose_segm]
            name = ['iter%d' % self._counter, 'gt', 'time'][gt_plot]
            name += '_%s.png' % self._mode
            name = prefix + '%s_%s_' % (opt.subaction, opt.tr_type) + name
            # if opt.grid_search:
            weight = ['w%d_' % int(opt.time_weight), ''][opt.time_weight == 1]
            folder_name = '%s_%slr_%.1e_dim_%d_ep_%d' % \
                          (opt.prefix, pose_segm, opt.lr, opt.embed_dim, opt.epochs)
            folder_name = opt.prefix + weight + folder_name
            dir_check(
                join(opt.dataset_root, 'plots', opt.subaction, folder_name))
            plt.savefig(join(opt.dataset_root, 'plots', opt.subaction,
                             folder_name, name),
                        dpi=400)
            # else:
            #     plt.savefig(join(opt.dataset_root, 'plots', opt.subaction, name), dpi=400)
        if show:
            plt.show()

    def reset(self):
        plt.clf()
        self._counter += 1
        self._data = None
        self._labels = None
        self._sizes = []
        self.size = 1
#### T-Distributed Stochastic Neighbor Embedding ####

model = TSNE(learning_rate=100)
transformed = model.fit_transform(data2)
x = transformed[:,0]
y = transformed[:,1]
plt.scatter(x, y, c=color_list)
plt.xlabel('pelvic_radius')
plt.ylabel('degree_spondylolisthesis')
plt.show()

#### Principal Component Analysis ####
model = PCA()
model.fit(data3)
transformed = model.transform(data3)
print('Principal components: ', model.components_)

#### PCA variance ####

scaler = StandardScaler()
pca = PCA()
pipeline = make_pipeline(scaler,pca)
pipeline.fit(data3)

plt.bar(range(pca.n_components_), pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

#### PCA ####
print(reconstruct_model.summary())

# %%

predict_model = K.models.Sequential()
for i in reconstruct_model.layers[6:]:
    predict_model.add(i)

predict_model.build(input_shape=(None, 14, 14, 1))
print(predict_model.summary())
# %%
from sklearn.manifold import TSNE

t_sne = TSNE(n_components=2)
# NOTE: sklearn's TSNE exposes the training embedding as t_sne.embedding_ after
# fit(), but has no transform(), so new points such as cluster centers cannot
# be projected with it.
t_sne.fit(tX_test)
pr_test = t_sne.transform(tX_test)
pr_clusters = t_sne.transform(k_means.cluster_centers_)

# %%

centroids = k_means.cluster_centers_.reshape(10, 14, 14, 1)
cent_img = predict_model.predict(centroids)

# %%
import numpy as np


def centroid_dist(sample):
    return np.array(
        [np.linalg.norm(sample - c) for c in k_means.cluster_centers_])
class Visual(object):
    def __init__(self,
                 mode='pca',
                 dim=2,
                 reduce=None,
                 save=False,
                 svg=False,
                 saved_dots=''):

        # mpl.rcParams['image.cmap'] = 'cool'
        self._mode = mode
        self._model = None
        self._dim = dim
        self._data = None
        self._labels = None
        self._sizes = []
        self._counter = 0
        self._result = None
        self.size = 1  # size of dots
        self.reduce = reduce
        self._save = save
        self.svg = svg
        self.saved_dots = saved_dots

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, new_data):
        self._data = join_data(self._data, new_data, np.vstack)

    @property
    def labels(self):
        return self._labels

    @labels.setter
    def labels(self, new_labels):
        self._labels = join_data(self._labels, new_labels, np.hstack)
        self._sizes += [self.size] * len(new_labels)

    @timing
    def fit_data(self):
        if self.saved_dots:
            self._result = np.loadtxt(self.saved_dots)
        else:
            if self._mode == 'pca':
                self._model = PCA(n_components=self._dim,
                                  random_state=opt.seed)
            if self._mode == 'tsne':
                self._model = TSNE(n_components=self._dim,
                                   perplexity=15,
                                   random_state=opt.seed)

            if self.reduce is None:
                self._result = self._model.fit_transform(self._data)
            else:
                fraction = int(self._data.shape[0] * self.reduce / 100)
                self._model.fit(self._data[:fraction])
                self._result = self._model.transform(self._data)

    def plot(self, iter=0, show=True, prefix=''):
        if iter is not None:
            self._counter = iter
        if 20 in self._labels:
            self._labels = np.array(self._labels)
            mask = self._labels == 20
            self._labels[mask] = 10
        plt.axis('off')

        plt.scatter(self._result[..., 0],
                    self._result[..., 1],
                    c=self._labels,
                    s=self._sizes,
                    alpha=1)
        plt.grid(True)
        if prefix == 'time_':
            plt.colorbar()
        if self._save:
            # plt.figure(figsize=(1))
            dir_check(join(opt.dataset_root, 'plots'))
            dir_check(join(opt.dataset_root, 'plots', opt.subaction))
            # name = ['iter%d_' % self._counter, 'gt_'][gt_plot]
            name = prefix + '%s_%s_' % (opt.subaction, opt.model_name)
            folder_name = opt.log_str
            dir_check(
                join(opt.dataset_root, 'plots', opt.subaction, folder_name))
            folder_name = join(opt.log_str, opt.vis_mode)
            dir_check(
                join(opt.dataset_root, 'plots', opt.subaction, folder_name))
            if self.svg:
                name += '_%s.svg' % self._mode
            else:
                name += '_%s.png' % self._mode
                # plt.savefig(join(opt.dataset_root, 'plots', opt.subaction,
                #                  folder_name, name), dpi=400)
            plt.savefig(join(opt.dataset_root, 'plots', opt.subaction,
                             folder_name, name),
                        transparent=True,
                        dpi=300)
            np.savetxt(
                join(opt.dataset_root, 'plots', opt.subaction, folder_name,
                     '%s.txt' % opt.vis_mode), self._result)
        if show:
            plt.show()

    def reset(self):
        plt.clf()
        self._counter += 1
        self._data = None
        self._labels = None
        self._sizes = []
        self.size = 1

    def color(self, labels, prefix, reset=False):
        plt.clf()
        self._labels = labels
        self.plot(show=False, prefix=prefix)
        if reset:
            self.reset()

    def fit(self, data, labels, prefix, reset=True):
        self._data = data
        self._labels = labels
        self._sizes += [self.size] * len(labels)
        self.fit_data()
        self.plot(show=False, prefix=prefix)
        if reset:
            self.reset()
Example #13
def tsne(X, n_components):
    model = TSNE(n_components=n_components, perplexity=40)
    return model.fit_transform(X)
Example #14
def train_delta(matrix):

    import numpy as np
    import tensorflow as tf
    import pandas as pd
    import matplotlib.pyplot as plt

    from sklearn.preprocessing import MinMaxScaler
    from tensorflow.keras import Model, Input
    from tensorflow.keras.layers import (Dense, Dropout, GRU, Flatten,
                                         GaussianNoise, concatenate)

    from tensorflow.keras.models import load_model

    from tensorflow.keras.callbacks import Callback
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.callbacks import ModelCheckpoint
    from tensorflow.keras.optimizers import Adam

    import supplemental_functions

    from supplemental_functions import (sampling_fix, prepareinput,
                                        prepareinput_nozero, prepareoutput)

    tf.keras.backend.clear_session()
    [
        newdim, percent_drilled, start, stop, inc_layer1, inc_layer2,
        data_layer1, data_layer2, dense_layer, range_max, memory, predictions,
        drop1, drop2, lr, bs, ensemble_count
    ] = matrix
    drop1 = drop1 / 100
    drop2 = drop2 / 100
    inc_layer2 = inc_layer2 / 1000
    lr = lr / 10000
    percent_drilled = percent_drilled / 100
    df = pd.read_csv('F9ADepth.csv')

    df_target = df.copy()

    droplist = [
        'nameWellbore', 'name', 'Pass Name unitless',
        'MWD Continuous Inclination dega', 'Measured Depth m',
        'MWD Continuous Azimuth dega', "Unnamed: 0", "Unnamed: 0.1"
    ]
    for i in droplist:
        df = df.drop(columns=i)

    for i in list(df):
        if df[i].count() < 1000:
            del df[i]
            info(f'dropped {i}')

    step = 0.230876

    X = np.arange(start, stop, step)
    X = X.reshape(X.shape[0], 1)

    my_data1 = sampling_fix(df_target, 'MWD Continuous Inclination dega',
                            start, stop, 1.7, 1, 0).predict(X)

    data_array = []

    for i in list(df):
        sampled = sampling_fix(df_target, i, start, stop, 1.7, 3, 0).predict(X)
        if not np.isnan(np.sum(sampled)):
            data_array.append(sampled)
            info(f'Using {i}')

    data_array = np.asarray(data_array)
    dftemp = pd.DataFrame()
    dftemp['dinc'] = my_data1
    dftemp['dinc'] = dftemp['dinc'].diff(1).rolling(3, center=True).mean()

    my_data1 = dftemp['dinc'].ffill().bfill()

    data_array = data_array.T

    pre_PCA_scaler = MinMaxScaler()
    data_array = pre_PCA_scaler.fit_transform(data_array)

    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE
    # =============================================================================
    #     pca = PCA().fit(data_array)
    #     plt.plot(np.cumsum(pca.explained_variance_ratio_))
    #     plt.xlabel('number of components')
    #     plt.ylabel('cumulative explained variance');
    #
    #     plt.show()
    # =============================================================================

    sampcount = int(len(data_array) * percent_drilled)

    # NOTE: sklearn's TSNE cannot transform() rows outside the fitted set; only
    # a reducer with out-of-sample support (e.g. the PCA commented out above)
    # works with this fit-then-project pattern.
    pca = TSNE(n_components=newdim).fit(data_array[:sampcount])
    projected = pca.transform(data_array)

    my_data = []

    for i in range(newdim):
        my_data.append(projected[:, i])

    my_data1 = my_data1[:, np.newaxis]

    my_data_newaxis = []
    for i in my_data:
        my_data_newaxis.append(i[:, np.newaxis])

    temp_data1 = pd.DataFrame(my_data1)

    range1 = temp_data1[0].diff(memory + predictions)

    range2 = np.amax(range1)

    RNN_scaler = MinMaxScaler()

    my_data1 = RNN_scaler.fit_transform(my_data1)

    my_data_scaled = []
    for i in my_data_newaxis:
        my_data_scaled.append(MinMaxScaler().fit_transform(i))

    X1 = prepareinput(my_data1, memory)

    Xdata = []

    for i in my_data_scaled:
        Xn = prepareinput_nozero(i, memory, predictions)
        Xdata.append(Xn)

    y_temp = prepareoutput(my_data1, memory, predictions)

    stack = []
    for i in range(memory):
        stack.append(np.roll(my_data1, -i))

    X_temp = np.hstack(stack)

    y = y_temp

    data_length = len(my_data1) - memory - predictions

    testing_cutoff = 0.80

    border1 = int((data_length) * (percent_drilled * 0.8))
    border2 = int((data_length) * (percent_drilled))
    border3 = int((data_length) * (percent_drilled + 0.2))

    X1_train = X1[:border1]
    X1_test = X1[border1:border2]
    X1_test2 = X1[border2:border3]

    Xdata_train = []
    Xdata_test = []
    Xdata_test2 = []

    for i in Xdata:
        Xdata_train.append(i[:border1])
        Xdata_test.append(i[border1:border2])
        Xdata_test2.append(i[border2:border3])

    y_train, y_test, y_test2 = y[:border1], y[border1:border2], y[
        border2:border3]

    X1_train = X1_train.reshape((X1_train.shape[0], X1_train.shape[1], 1))
    X1_test = X1_test.reshape((X1_test.shape[0], X1_test.shape[1], 1))
    X1_test2 = X1_test2.reshape((X1_test2.shape[0], X1_test2.shape[1], 1))

    Xdata_train_r = []
    Xdata_test_r = []
    Xdata_test2_r = []

    for i in range(newdim):
        Xdata_train_r.append(Xdata_train[i].reshape(
            (Xdata_train[i].shape[0], Xdata_train[i].shape[1], 1)))
        Xdata_test_r.append(Xdata_test[i].reshape(
            (Xdata_test[i].shape[0], Xdata_test[i].shape[1], 1)))
        Xdata_test2_r.append(Xdata_test2[i].reshape(
            (Xdata_test2[i].shape[0], Xdata_test2[i].shape[1], 1)))

    X_train_con = np.concatenate(Xdata_train_r, axis=2)
    X_test_con = np.concatenate(Xdata_test_r, axis=2)
    X_test2_con = np.concatenate(Xdata_test2_r, axis=2)

    X_train = [X1_train, X_train_con]
    X_test = [X1_test, X_test_con]
    X_test2 = [X1_test2, X_test2_con]

    input1 = Input(shape=(memory, 1))
    input2 = Input(shape=(memory + predictions, newdim))

    x1 = GaussianNoise(inc_layer2, input_shape=(memory, 1))(input1)

    x1 = GRU(units=inc_layer1,
             kernel_initializer='glorot_uniform',
             recurrent_initializer='orthogonal',
             bias_initializer='zeros',
             kernel_regularizer='l2',
             recurrent_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             recurrent_constraint=None,
             bias_constraint=None,
             return_sequences=False,
             return_state=False,
             stateful=False)(x1)
    x1 = Dropout(drop1)(x1)

    x1 = Model(inputs=input1, outputs=x1)

    x2 = Dense(data_layer1, input_shape=(memory + predictions, 3))(input2)
    x2 = Dropout(drop2)(x2)
    x2 = Flatten()(x2)
    x2 = Dense(data_layer2)(x2)
    x2 = Model(inputs=input2, outputs=x2)

    combined = concatenate([x1.output, x2.output])

    z = Dense(dense_layer, activation="relu")(combined)
    z = Dense(predictions, activation="linear")(z)

    #define the model

    myadam = Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999, amsgrad=False)

    class PlotResults(Callback):
        def on_train_begin(self, logs={}):
            self.i = 0
            self.x = []
            self.losses = []
            self.val_losses = []

            #self.fig = plt.figure()

            self.logs = []

        def on_epoch_end(self, epoch, logs={}):
            self.logs.append(logs)
            self.x.append(self.i)
            self.losses.append(logs.get('loss'))
            self.val_losses.append(logs.get('val_loss'))
            self.i += 1

            #print (".", end = '')
            if (epoch % 14999 == 0) & (epoch > 0):
                print(epoch)

                plt.plot(self.x, np.log(self.losses), label="loss")
                plt.plot(self.x, np.log(self.val_losses), label="val_loss")
                plt.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)
                plt.title("Loss")
                plt.legend()
                plt.show()
                #mymanyplots(epoch, data, model)

    #data = [X1, X2, X3, X4, y, X1_train,X_train, X_test, X1_test, border1, border2, y_train, y_test, memory, y_temp, predictions]
    plot_results = PlotResults()

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=25)
    ens_val_array = np.zeros(ensemble_count)
    ens_test_array = np.zeros(ensemble_count)

    for ens_no in range(ensemble_count):
        tf.keras.backend.clear_session()
        mc = ModelCheckpoint(f'best_model_ens_{ens_no}.h5',
                             monitor='val_loss',
                             mode='min',
                             save_best_only=True,
                             verbose=0)
        model = Model(inputs=[x1.input, x2.input], outputs=z)
        model.compile(optimizer=myadam, loss='mean_squared_error')
        history = model.fit(X_train,
                            y_train,
                            validation_data=(X_test, y_test),
                            epochs=2000,
                            verbose=0,
                            batch_size=bs,
                            callbacks=[plot_results, es, mc])

        model = load_model(f'best_model_ens_{ens_no}.h5')
        valresult = np.log(model.evaluate(x=X_test, y=y_test, verbose=0))
        testresult = np.log(model.evaluate(x=X_test2, y=y_test2, verbose=0))

        ens_val_array[ens_no] = valresult
        ens_test_array[ens_no] = testresult

    winner = ens_val_array.argmin()
    model = load_model(f'best_model_ens_{winner}.h5')

    info(ens_val_array)
    info(ens_test_array)
    info(f'Validation winner {winner}')
    sample_count = len(X_test2[0])
    y_pred = model.predict(X_test2)

    plt.plot(np.log(history.history['loss']), label='loss')
    plt.plot(np.log(history.history['val_loss']), label='test')
    plt.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)
    plt.legend()
    plt.show()
    plt.clf()

    for i in range(5):
        rand = np.random.randint(0, len(X_test[0]))
        y_test_descaled = RNN_scaler.inverse_transform(y_test[rand,
                                                              np.newaxis])
        y_in_descaled = RNN_scaler.inverse_transform(X_test[0][rand, :])
        y_in_descaled = y_in_descaled.flatten()
        y_test_descaled = y_test_descaled.flatten()

        y_pred = model.predict(X_test)

        y_pred_descaled = RNN_scaler.inverse_transform(y_pred[rand,
                                                              np.newaxis])
        y_pred_descaled = y_pred_descaled.flatten()

        plt.plot(y_test_descaled, label="true")
        plt.plot(y_pred_descaled, label="predicted")

        plt.title('Inclination delta')
        #plt.ylim(0,1)
        plt.legend()
        plt.show()

        plt.figure(figsize=(5, 4))
        x_after = np.linspace(0, 23, 100)
        x_before = np.linspace(-23, -0.23, 100)

        plt.plot(x_before,
                 np.cumsum(y_in_descaled),
                 label="measured",
                 linestyle="-",
                 c="black")
        commonpoint = np.cumsum(y_in_descaled)[-1]
        plt.plot(x_after,
                 commonpoint + np.cumsum(y_test_descaled),
                 label="actual",
                 linestyle='-.',
                 c='black')
        plt.plot(x_after,
                 commonpoint + np.cumsum(y_pred_descaled),
                 label="predicted",
                 linestyle=':',
                 c='black')
        #plt.title('')
        plt.ylim(-1, 7)
        plt.grid()
        plt.tight_layout()
        #plt.hlines(0, -23, 23, linewidth=0.5)
        plt.xlabel("Distance to sensor [m]")
        plt.ylabel("Inclination, local coordinates, [deg]")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'Sample, {percent_drilled}, no.{i}.pdf')
        plt.show()

    # #### Different ensemble, voting ######
    # ypred_array = []
    # for i in range(ensemble_count):
    #     model = load_model(f'best_model_ens_{i}.h5')
    #     y_pred = model.predict(X_test2)
    #     ypred_array.append(y_pred)

    # y_pred = np.average(ypred_array, axis=0)

    # ######## Different ensemble ends here #

    y_test_descaled = RNN_scaler.inverse_transform(y_test2)
    y_pred = model.predict(X_test2)
    y_pred_descaled = RNN_scaler.inverse_transform(y_pred)

    error_matrix = np.cumsum(y_pred_descaled, axis=1) - np.cumsum(
        y_test_descaled, axis=1)

    def rand_jitter(arr):
        stdev = .004 * (max(arr) - min(arr))
        return arr + np.random.randn(len(arr)) * stdev

    def jitter(x,
               y,
               s=20,
               c='b',
               marker='o',
               cmap=None,
               norm=None,
               vmin=None,
               vmax=None,
               alpha=None,
               linewidths=None,
               verts=None,
               **kwargs):
        return plt.scatter(rand_jitter(x),
                           rand_jitter(y),
                           s=s,
                           c=c,
                           marker=marker,
                           cmap=cmap,
                           norm=norm,
                           vmin=vmin,
                           vmax=vmax,
                           alpha=alpha,
                           linewidths=linewidths,
                           verts=verts,
                           **kwargs)

    plt.figure(figsize=(5, 5), dpi=200)
    for i in range(sample_count):
        _ = jitter(x_after,
                   error_matrix[i],
                   alpha=1,
                   s=0.5,
                   marker=".",
                   c="black")
    plt.title(f"delta, drilled {percent_drilled}")
    plt.xlabel("Distance to sensor [m]")
    plt.ylabel("Prediction error [deg]")
    plt.grid()
    plt.tight_layout()
    plt.savefig(f'Birdflock, {percent_drilled}.pdf')
    plt.show()
    #plt.plot(np.median(error_matrix, axis=0), linewidth=8, alpha=1, c="white")
    #plt.plot(np.median(error_matrix, axis=0), linewidth=2, alpha=1, c="black")
    plt.scatter(np.arange(0, 100, 1),
                np.average(np.abs(error_matrix), axis=0),
                marker="o",
                s=40,
                alpha=0.7,
                c="white",
                zorder=2)

    c_array = np.empty(100, dtype=object)
    aae = np.average(np.abs(error_matrix), axis=0)
    for i in range(100):
        if aae[i] <= 0.4:
            c_array[i] = "green"
        elif aae[i] <= 0.8:
            c_array[i] = "orange"
        else:
            c_array[i] = "red"

    plt.scatter(np.arange(0, 100, 1),
                aae,
                marker=".",
                s=20,
                alpha=1,
                c=c_array,
                zorder=3,
                label="Average Absolute Error")
    plt.ylim((-3, 3))
    plt.axhline(y=0, xmin=0, xmax=1, linewidth=2, c="black")
    plt.axhline(y=0.4, xmin=0, xmax=1, linewidth=1, c="black")
    plt.axhline(y=0.8, xmin=0, xmax=1, linewidth=1, c="black")
    plt.legend()
    plt.show()

    model = load_model(f'best_model_ens_{winner}.h5')
    #mymanyplots(-1, data, model)
    #myerrorplots(data, model)
    valresult = np.log(model.evaluate(x=X_test, y=y_test, verbose=0))
    testresult = np.log(model.evaluate(x=X_test2, y=y_test2, verbose=0))

    return valresult, testresult, aae
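
train_delta receives all of its hyper-parameters as a single positional list, unpacked at the top of the function. A hedged usage sketch with illustrative values (the /100, /1000 and /10000 rescalings happen inside the function, and F9ADepth.csv plus the supplemental_functions module must be available):

matrix = [
    3,    # newdim: t-SNE output dimensions
    50,   # percent_drilled (divided by 100)
    200,  # start depth
    400,  # stop depth
    64,   # inc_layer1: GRU units
    10,   # inc_layer2: GaussianNoise stddev (divided by 1000)
    64,   # data_layer1
    32,   # data_layer2
    64,   # dense_layer
    1,    # range_max (unused in the body shown)
    100,  # memory: input window length
    100,  # predictions: output window length
    20,   # drop1 (divided by 100)
    20,   # drop2 (divided by 100)
    10,   # lr (divided by 10000)
    32,   # bs: batch size
    3,    # ensemble_count
]
valresult, testresult, aae = train_delta(matrix)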
Example #15
# 2 t-SNE ---------------------------------------------------------------------------

# Set the parameters
n_components = 2
learning_rate = 300
perplexity = 30
early_exaggeration = 12
init = 'random'
random_state = 2018

# Create the instance
tSNE = TSNE(n_components=n_components,
            learning_rate=learning_rate,
            perplexity=perplexity,
            early_exaggeration=early_exaggeration,
            init=init,
            random_state=random_state)

# Fit and apply the estimator (sklearn's TSNE has no separate fit/transform
# pair, so fit_transform replaces the original two steps)
X_train_tSNE = tSNE.fit_transform(X_train_PCA.loc[:5000, :9])

# Convert to a DataFrame
X_train_tSNE = pd.DataFrame(data=X_train_tSNE, index=train_index[:5001])

# Display the plot
scatterPlot(X_train_tSNE, y_train, "t-SNE")
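
The snippet assumes a scatterPlot helper that is not shown. A minimal stand-in, illustrative rather than the original, that colors the first two embedded dimensions by label:

import matplotlib.pyplot as plt

def scatterPlot(xDF, yDF, algoName):
    # Align labels to the embedded rows by index, then scatter dim 0 vs dim 1.
    plt.scatter(xDF.iloc[:, 0], xDF.iloc[:, 1], c=yDF.loc[xDF.index], s=5, alpha=0.7)
    plt.title(algoName)
    plt.show()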
Example #16
def get_representation(embeddings):
  tsne = TSNE()
  # sklearn's TSNE has no separate transform(); fit and embed in one step.
  return tsne, tsne.fit_transform(embeddings)
            """for row_id in range(0, 15):
                target_word = word2vec.words()[row_id]
                x = reduced_matrix[row_id, 0]
                y = reduced_matrix[row_id, 1]
                print("{} = ({}, {})".format(target_word, x, y))
                plt.annotate(target_word, (x, y))
            # end for"""
            """for index, pred in enumerate(predictions):
                for word in pred.keys():
                    reducted_vector = model.transform(pred[word])
                    plt.scatter(reducted_vector[:, 0], reducted_vector[:, 1], 10)
                    plt.annotate(word, (reducted_vector[0, 0], reducted_vector[0, 1]))
                # end for
            # end for"""
            for word in average_vectors:
                reducted_vector = model.transform(average_vectors[word])
                plt.scatter(reducted_vector[0, 0], reducted_vector[0, 1], 10)
                plt.annotate(word,
                             (reducted_vector[0, 0], reducted_vector[0, 1]),
                             arrowprops=dict(facecolor='red', shrink=0.025))
            # end for
            plt.show()
        # end if

        # Continue
        answer = input("Continue? ").lower()
        if answer == "n":
            cont = False
        # end if

        # Reset reservoir
Example #18
def main(cfg):
    model_dir = os.path.abspath(cfg["model_dir"])
    X_test, Y_test = get_data(cfg)
    print(f"Data loaded. X_test shape: {X_test.shape}, Y_test shape: " \
            f"{Y_test.shape}")
    # Binarize outcome if need be
    Y_test[Y_test >= 0.5] = 1
    Y_test[Y_test < 0.5] = 0

    model = load_model(model_dir)
    model.summary()
    print("Model loaded")

    if cfg["task"].startswith("dpsom"):
        probas_test = model.predict(X_test)
    else:
        probas_test = model.predict([X_test[:, :, 7:], X_test[:, :, :7]])
    ix_pred_a = (probas_test < 0.5).flatten()
    ix_pred_d = (probas_test >= 0.5).flatten()
    ix_a = (Y_test == 0).flatten()
    ix_d = (Y_test == 1).flatten()
    ix_tn = ix_a & ix_pred_a
    ix_fp = ix_a & ix_pred_d
    ix_fn = ix_d & ix_pred_a
    ix_tp = ix_d & ix_pred_d
    X_anl, Y_anl = get_analysis_subsets(X_test, Y_test,
                                        cfg["num_for_analysis"])

    if cfg["write_out"]:
        pickle.dump(X_test, open(pj(bm_config.output_dir, "X_test.pkl"), "wb"))
        pickle.dump(Y_test, open(pj(bm_config.output_dir, "Y_test.pkl"), "wb"))
        # Note, data are *right-padded*, i.e. padded with zeros to the right
        # if there are < 200 actual data samples
        # Y_test is {0,1}, 1 = death, about 12% mortality

    if cfg["cluster"]:
        bilstm_name = "bilstm_2"
        bilstm_layer = model.get_layer(bilstm_name)
        bilstm_layer.return_sequences = True
        bilstm_model = Model(inputs=model.input, outputs=bilstm_layer.output)
        if cfg["task"].startswith("dpsom"):
            bilstm_seqs = bilstm_model.predict(X_test)
        else:
            bilstm_seqs = bilstm_model.predict(
                [X_test[:, :, 7:], X_test[:, :, :7]])
        print("Shape of BiLSTM output:", bilstm_seqs.shape)
        bilstm_seqs = np.concatenate(
            [bilstm_seqs[:, :, :64], bilstm_seqs[:, ::-1, 64:]], axis=2)

        reducer = cfg["reducer"]
        if reducer == "tsne":
            reducer_model = TSNE(n_components=2)
        elif reducer == "isomap":
            reducer_model = Isomap(n_components=2,
                                   n_neighbors=cfg["n_neighbors"])
        else:
            raise NotImplementedError(reducer)
        probas_out = bilstm_seqs[:, -1, :]
        print("Shape of final probas matrix:", probas_out.shape)
        print(f"Fitting {reducer} model...")
        proj_X = reducer_model.fit_transform(probas_out)
        # Should really be training tsne with training data but oh well
        print("...Done")

        plt.figure(figsize=(16, 16))
        plt.scatter(proj_X[ix_tn, 0], proj_X[ix_tn, 1], s=12, c="r")
        plt.scatter(proj_X[ix_fn, 0], proj_X[ix_fn, 1], s=12, c="g")
        plt.scatter(proj_X[ix_fp, 0], proj_X[ix_fp, 1], s=12, c="y")
        plt.scatter(proj_X[ix_tp, 0], proj_X[ix_tp, 1], s=12, c="b")
        plt.savefig(pj(model_dir, f"{reducer}.png"))
        plt.close()

        inc = cfg["plot_every_nth"]
        slices_dir = pj(model_dir, f"{reducer}_slices")
        if not pe(slices_dir):
            os.makedirs(slices_dir)
        seq_len = bilstm_seqs.shape[1]
        start_idx = seq_len - cfg["plot_last_n"]

        bilstm_seqs = bilstm_seqs[::inc, start_idx:]
        print("Creating sequence projections...")
        data_mat = np.zeros((bilstm_seqs.shape[0], bilstm_seqs.shape[1], 2))
        # NOTE: transform() exists for the Isomap reducer but not for sklearn's
        # TSNE, so this loop only works with reducer == "isomap".
        for j in range(seq_len - start_idx):
            slice_j = bilstm_seqs[:, j, :]
            data_mat[:, j, :] = reducer_model.transform(slice_j)
        print("...Done")
        color_d = {
            "r": (ix_tn[::inc], 12),
            "g": (ix_fn[::inc], 24),
            "y": (ix_fp[::inc], 12),
            "b": (ix_tp[::inc], 24)
        }
        trajectories = Trajectories(data_mat,
                                    color_dict=color_d,
                                    final_extra=20)
        trajectories.save(pj(model_dir, f"{reducer}_{len(data_mat)}.gif"))
        plt.show()

    # Uses all subjects
    if cfg["confusion_matrix"]:
        print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}")
        print(f"Inferred probabilities, output shape {probas_test.shape}")

        fpr_mort, tpr_mort, thresholds = roc_curve(Y_test, probas_test)
        roc_auc_mort = auc(fpr_mort, tpr_mort)
        TN, FP, FN, TP = confusion_matrix(Y_test, probas_test.round()).ravel()
        PPV = TP / (TP + FP)
        NPV = TN / (TN + FN)

        cm = np.array([[TN, FP], [FN, TP]])
        save_path = pj(cfg["model_dir"], "confusion_matrix.png")
        classes = ["False", "True"]
        plot_confusion_matrix(cm,
                              save_path,
                              classes,
                              normalize=False,
                              title='Confusion matrix')

        print("Inference:")
        print(f"PPV: {PPV:0.4f}, NPV: {NPV:0.4f}, roc_auc: " \
                "{roc_auc_mort:0.4f}")
Example #19
print('te_pca shape*****', te_pca.shape)
print(te_pca.head())

X_train = tr_pca.values
X_test = te_pca.values

'''
tsne time
'''

tsne = TSNE(n_components=3, perplexity=40, verbose=2)
X_train_embedded = tsne.fit_transform(X_train)


# X_test_embedded = tsne.transform(X_test)  # transform() does not exist on sklearn's TSNE;
# we will need to change it to a parametric t-SNE, e.g.
# https://github.com/kylemcdonald/Parametric-t-SNE/blob/master/Parametric%20t-SNE%20(Keras).ipynb
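
A minimal sketch of that parametric idea, assuming the X_train / X_test / X_train_embedded variables above: train a small regressor to map raw features onto the fitted embedding, then use it as a stand-in for transform(). The MLP architecture is an illustrative choice, not the one from the linked notebook.

from sklearn.neural_network import MLPRegressor

# Approximate tsne.transform(): learn features -> embedding on the train set.
tsne_mapper = MLPRegressor(hidden_layer_sizes=(256, 64), max_iter=500)
tsne_mapper.fit(X_train, X_train_embedded)
X_test_embedded = tsne_mapper.predict(X_test)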


train_principalDf = pd.DataFrame(data=X_train_embedded, columns=['tsne_1', 'tsne_2', 'tsne_3'])
tr = pd.concat([train_principalDf, train[['SK_ID_CURR']]], axis=1)
print('tr shape', tr.shape)
print(tr.head())


test_principalDf = pd.DataFrame(data=X_test_embedded, columns=['tsne_1', 'tsne_2', 'tsne_3'])
te = pd.concat([test_principalDf, test[['SK_ID_CURR']]], axis=1)
print('te shape', te.shape)
print(te.head())

tr_te = pd.concat([tr, te]).reset_index()
Example #20
    def visualize_clusters(self,
                           algorithm=None,
                           fit: bool = True,
                           xlim=None,
                           ylim=None,
                           cmap=None,
                           markers=None,
                           markersize=None,
                           filename: str = None,
                           block: bool = True):
        """Visualize the clusters from create_clusters().

        Args:
            algorithm: the visualization algorithm to map data into 2D (default TSNE).
            fit: True means fit the data, False means algorithm is pre-trained, so use it
                to just transform the data into 2D without fitting the data first.
                Note that TSNE does not support fit=False yet.
                If you want fit=False, use another dimension-reduction algorithm like PCA(...).
            xlim (Pair[float,float]): optional axis limits for the X axis.
            ylim (Pair[float,float]): optional axis limits for the Y axis.
            cmap (Union[ColorMap,str]): optional color map for the cluster colors,
                or the name of a color map.
                See https://matplotlib.org/3.1.1/tutorials/colors/colormaps.html.
                Default is 'brg', which has a wide range of colors going from blue
                through red to green; it also prints acceptably in black and white
                (though very non-linearly) because it does not go all the way to white.
            markers (matplotlib.markers.MarkerStyle): optional marker styles for clusters.
                If this is a string, then the i'th character in the string will be used for
                the i'th marker style.  See https://matplotlib.org/3.1.1/api/markers_api.html
                for the available marker characters.  Note that clusters will be drawn from 0
                up to n-1, so later clusters will be on top.  Also, the earlier clusters tend
                to have more elements.  One approach to improve readability is to use line-based
                shapes (from "1234+x|_") for the first few clusters (which have many points),
                and then filled shapes (from ".o<^>vsphPXd*") for the later clusters
                (which have few points).  Note also that you can use a space for the marker
                character of a cluster if you want to not display that cluster at all.
                However, if your markers string is shorter than the number of clusters,
                all remaining clusters will be displayed using the "o" marker.
            markersize (float): size of the markers in points (only when markers is a str).
                The default seems to be about 6 points.
            filename (str): optional file name to save image into, as well as displaying it.
            block (bool): True (the default) means wait for user to close figure before
                returning.  False means non-blocking.

            Limitations: if you call this multiple times with different numbers of clusters,
                the color map will not be exactly the same.
        """
        data = self._cluster_data
        if data is None or self._clusters is None:
            raise Exception(
                "You must call create_clusters() before visualizing them!")
        num_clusters = max(self._clusters) + 1
        if algorithm is None:
            if not fit:
                raise Exception(
                    "You must supply pre-fitted algorithm when fit=False")
            algorithm = TSNE()
        alg_name = str(algorithm).split("(")[0]
        self.message(f"running {alg_name} on {len(data)} traces.")
        if fit:
            tsne_obj = algorithm.fit_transform(data)
        else:
            tsne_obj = algorithm.transform(data)
        # print(tsne_obj[0:5])

        # All the following complex stuff is for adding a 'show label on mouse over' feature
        # to the visualisation scatter graph.
        # It works when run from command line, but not in Jupyter/Spyder!
        # Surely there must be an easier way than doing all this...
        # Code adapted from:
        # https://stackoverflow.com/questions/55891285/how-to-make-labels-appear-
        #     when-hovering-over-a-point-in-multiple-axis/55892690#55892690
        fig, ax = plt.subplots(
        )  # figsize=(8, 6))  # 25% larger, for better printing
        if xlim:
            ax.set_xlim(xlim)
        if ylim:
            ax.set_ylim(ylim)
        if cmap is None:
            # Choose a default colormap.  See bottom of the matplotlib page:
            #   https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html
            cmap = pltcm.get_cmap(
                'brg')  # sequential map with nice b&w printing.
        elif isinstance(cmap, str):
            cmap = pltcm.get_cmap(
                cmap)  # it is the name of a matplotlib color map
        if markers is None:
            markers = "o"
        if isinstance(markers, str) and len(markers) > 1:
            # loop through the marker styles
            clusters = np.ma.array(self._clusters)
            markchars = markers + "o" * num_clusters
            for curr in range(max(num_clusters, len(markers))):
                #prepare for masking arrays - 'conventional' arrays won't do it
                mask = clusters != curr  # False means unmasked
                x_masked = np.ma.array(tsne_obj[:, 0], mask=mask)
                y_masked = np.ma.array(tsne_obj[:, 1], mask=mask)
                color = cmap(curr / num_clusters)
                # c_masked = np.ma.array(clusters, mask=mask)
                # print(f"DEBUG:  mark {curr} is '{markers[curr]}' x={x_masked[0:10]} cl={c_masked[0:10]} color={color}")
                sc = ax.plot(x_masked,
                             y_masked,
                             color=color,
                             linewidth=0,
                             label=f"c{curr}",
                             marker=markchars[curr],
                             markersize=markersize)
            leg = ax.legend(
                loc='best'
            )  #, ncol=2, mode="expand", shadow=True, fancybox=True)
            leg.get_frame().set_alpha(0.5)
        else:
            sc = plt.scatter(tsne_obj[:, 0],
                             tsne_obj[:, 1],
                             c=self._clusters,
                             cmap=cmap,
                             marker=markers)

        if filename:
            plt.savefig(filename)
        names = [str(tr) for tr in self.traces
                 ]  # these are in same order as tsne_df rows.

        annot = ax.annotate(
            "",
            xy=(0, 0),
            xytext=(0, 20),
            textcoords="offset points",
            bbox=dict(boxstyle="round", fc="w"),
            arrowprops=dict(arrowstyle="->"),
        )
        annot.set_visible(False)

        def update_annot(ind):
            pos = sc.get_offsets()[ind["ind"][0]]
            annot.xy = pos
            # text = "{}, {}".format(" ".join(list(map(str, ind["ind"]))),
            #                        " ".join([str(names[n]) for n in ind["ind"]]))
            anns = [
                f"{n} ({self._clusters[n]}): {str(names[n])}"
                for n in ind["ind"]
            ]
            text = "\n".join(anns)
            annot.set_text(text)
            # annot.get_bbox_patch().set_facecolor(cmap(norm(c[ind["ind"][0]])))
            # annot.get_bbox_patch().set_alpha(0.4)

        def hover(event):
            vis = annot.get_visible()
            if event.inaxes == ax:
                cont, ind = sc.contains(event)
                if cont:
                    update_annot(ind)
                    annot.set_visible(True)
                    fig.canvas.draw_idle()
                else:
                    if vis:
                        annot.set_visible(False)
                        fig.canvas.draw_idle()

        fig.canvas.mpl_connect("motion_notify_event", hover)
        plt.show(block=block)
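
A hedged usage sketch of the fit=False path described in the docstring; the object and data names are hypothetical, standing in for whatever instance create_clusters() was called on:

from sklearn.decomposition import PCA

# Pre-fit a reducer on reference data, then reuse it without refitting,
# which TSNE cannot do (it has no transform()).
reducer = PCA(n_components=2).fit(reference_data)   # reference_data: hypothetical
traces.visualize_clusters(algorithm=reducer,        # traces: hypothetical instance
                          fit=False,
                          markers="1234+x|_o",
                          filename="clusters.png")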