def tsne(self, learning_rate=100):
    """Embed ``self.X`` with t-SNE and return the embedded coordinates.

    The embedding dimension is taken from ``self.n_components``.

    :param learning_rate: learning rate forwarded to ``TSNE``
    :return: the array produced by ``TSNE.fit_transform(self.X)``
    """
    print('Calculating t-distributed stochastic neighbor embedding....\n')
    model = TSNE(n_components=self.n_components, learning_rate=learning_rate)
    return model.fit_transform(self.X)
def learn_tsne(data, **kwargs):
    """
    Calculates TSNE transform for given matrix features

    :param data: pd.DataFrame of features, one row per sample; its index is
        reused for the result
    :param kwargs: arguments for sklearn.manifold.TSNE; keys that TSNE does not
        accept are silently dropped, and ``random_state`` defaults to 0 when
        it is not supplied
    :return: pd.DataFrame with calculated TSNE transform, indexed like ``data``
    """
    # Ask a default-constructed estimator for its parameter names. get_params
    # is an instance method, so calling it with the class itself as ``self``
    # depends on scikit-learn internals.
    accepted = TSNE().get_params()
    tsne_kwargs = {key: value for key, value in kwargs.items() if key in accepted}
    # ``random_state`` passes the filter above, so hard-coding it in the call
    # would raise "got multiple values for keyword argument" whenever a caller
    # supplies one. Keep 0 as the default but let the caller override it.
    tsne_kwargs.setdefault('random_state', 0)
    embedding = TSNE(**tsne_kwargs).fit_transform(data.values)
    return pd.DataFrame(embedding, index=data.index.values)
def show_clusters(data, y, name, params=None):
    """Project ``data`` to 2-D with t-SNE, plot it coloured by ``y`` and save it.

    The figure is written to ``ClustersUntrained<name>.png`` and then shown.

    :param data: feature matrix passed to ``TSNE.fit_transform``
    :param y: per-sample values used as scatter colours; must expose ``.shape``
    :param name: suffix inserted into the output file name
    :param params: optional dict of TSNE parameters applied via ``set_params``
    :return: the parameter dict of the fitted TSNE model
    """
    model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    if params is not None:
        model.set_params(**params)

    X = model.fit_transform(data)
    fitted_params = model.get_params()

    # Single-argument print(...) calls are valid on both Python 2 and 3 and
    # keep the text the original print statements produced.
    print(X)
    print("X.shape =  {}".format(X.shape))
    print("y.shape =  {}".format(y.shape))
    print(y)

    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.gray()
    plt.axis('off')
    # Save before show(): once show() returns the figure may already have been
    # torn down, in which case savefig would write an empty canvas.
    plt.savefig("ClustersUntrained{}.png".format(name), dpi=600)
    plt.show()
    plt.clf()
    return fitted_params
def learn_tsne(data, **kwargs):
    """
    Calculates TSNE transformation for given matrix features.

    Parameters
    --------
    data: pd.DataFrame
        Features, one row per sample. The index is reused for the result.
    kwargs: optional
        Parameters for ``sklearn.manifold.TSNE()``. Keys that TSNE does not
        accept are silently dropped; ``random_state`` defaults to 0 when it is
        not supplied.

    Returns
    -------
    pd.DataFrame
        Calculated TSNE transform, indexed like ``data``.
    """
    # Ask a default-constructed estimator for its parameter names. get_params
    # is an instance method, so calling it with the class itself as ``self``
    # depends on scikit-learn internals.
    accepted = TSNE().get_params()
    tsne_kwargs = {key: value for key, value in kwargs.items() if key in accepted}
    # ``random_state`` passes the filter above, so hard-coding it in the call
    # would raise "got multiple values for keyword argument" whenever a caller
    # supplies one. Keep 0 as the default but let the caller override it.
    tsne_kwargs.setdefault('random_state', 0)
    embedding = TSNE(**tsne_kwargs).fit_transform(data.values)
    return pd.DataFrame(embedding, index=data.index.values)
cmap=CMAP, s=40) ax.set_title(title, fontsize=20, y=1.03) #fsize = 14 ax.set_xlabel("1st eigenvector") ax.set_ylabel("2nd eigenvector") ax.set_zlabel("3rd eigenvector") #ax.legend(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']) ax.w_xaxis.set_ticklabels([]) ax.w_yaxis.set_ticklabels([]) ax.w_zaxis.set_ticklabels([]) tsne = TSNE(n_components=2) print(tsne.get_params()) points = tsne.fit_transform(iris_df[features]) plot_iris_2d(x = points[:, 0],y = points[:, 1],title = 'Iris dataset visualized with t-SNE') tsne = TSNE(n_components=3) print(tsne.get_params()) points = tsne.fit_transform(iris_df[features]) plot_iris_3d( x = points[:,0], y = points[:,1], z = points[:,2], title = "Iris dataset visualized with tSNE") tsne = TSNE(n_components=3,metric='correlation') print(tsne.get_params()) #tsne.set_params('metric':'correlation')
------------------------------------------------------------------------------- -------------------------------------t-SNE------------------------------------- ------------------------------------------------------------------------------- ''' #Build TSNE model, learning rate defaults to 1000 but usually best around 200 #Perplexity balances local and global aspects of neighbors, usually best between 5 and 50 tsne_model = TSNE(n_components=2, perplexity=30.0, learning_rate=100.0, n_iter=2000, n_iter_without_progress=30, random_state=seed, method='barnes_hut') tsne_model.fit_transform(x_std) print(tsne_model.get_params()) tsne_dim = tsne_model.embedding_ print(tsne_dim.shape) #There should be 2 latent variables represented #print('Kullback-Leibler divergence:', tsne_model.kl_divergence_) #Plot first 2 extracted features and the observation class plt.figure(figsize=(10, 5)) plt.xlabel('Latent Variable 1') plt.ylabel('Latent Variable 2') plt.title( 't-SNE 2-Dimension Plot with Observation Class \nperplexity=30, learning_rate=100' ) plt.scatter(tsne_dim[:, 0], tsne_dim[:, 1], c=y) plt.colorbar() plt.show()
"""
Created on Fri Oct 10 14:30:42 2014

@author: junhao

Embed the iris measurements in 2-D with t-SNE and plot them by class.
"""
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# iris.data has no header row: columns 0-3 are measurements, column 4 the class.
df = pd.read_csv('iris.data', delimiter=',', header=None)
df = df.dropna()
print(df.head())

# Encode the class names as integer codes. sort=True numbers the classes in
# sorted order, as the removed Categorical.from_array(...).labels API did.
species = df.iloc[:, 4]
codes, classes = pd.factorize(species, sort=True)
print(codes)

model = TSNE(n_components=2, init='random', early_exaggeration=4,
             learning_rate=200, random_state=1)
# Slice end is exclusive: 0:4 selects all four measurement columns (0:3
# silently dropped the last one).
xy = model.fit_transform(df.iloc[:, 0:4])

# One scatter per class so each gets a labelled legend entry; the previous
# ``ax.legend()`` referenced an undefined ``ax`` and raised NameError.
for code, label in enumerate(classes):
    members = codes == code
    plt.scatter(xy[members, 0], xy[members, 1], label=label)
plt.legend()
plt.show()

print(model.get_params())
def get(self, _) -> Tuple[dict, int]:
    """Run t-SNE over the saved alloy compositions and return the embedding.

    Every saved alloy is flattened into one row of element weights, the rows
    are embedded into two dimensions with ``sklearn.manifold.TSNE``, and the
    coordinates are returned together with the alloy names, a per-name colour
    code and the parameters of the model.

    Because the t-SNE cost function is not convex, different initialisations
    can give different embeddings.

    Returns:
        A ``(body, status)`` tuple: the response dictionary and ``200``.
    """
    # Each user document holds its alloys as
    #   saved_alloys: [{ name, compositions: [{ symbol, weight }, ...] }, ...]
    # The stages below turn that into one flat document per alloy whose keys
    # are the element symbols plus `name`.

    # Stage 1 - one document per saved alloy, keeping only the fields we need.
    unwind_alloys = {'$unwind': '$saved_alloys'}
    project_alloy = {
        '$project': {
            '_id': 1,
            'name': '$saved_alloys.name',
            'compositions': '$saved_alloys.compositions',
        }
    }

    # Stage 2 - one document per element, then regroup per (user id, alloy
    # name) collecting the symbol/weight pairs.
    unwind_elements = {'$unwind': '$compositions'}
    group_elements = {
        '$group': {
            '_id': {'id': '$_id', 'name': '$name'},
            'items': {
                '$addToSet': {
                    'name': '$compositions.symbol',
                    'value': '$compositions.weight',
                }
            },
        }
    }

    # Stage 3 - pivot: each symbol becomes a key and its weight the value.
    pivot_elements = {
        '$project': {
            'result': {
                '$arrayToObject': {
                    '$zip': {'inputs': ["$items.name", "$items.value"]}
                }
            }
        }
    }

    # Stage 4 - attach the alloy name and make `result` the document root.
    attach_name = {'$addFields': {'result.name': '$_id.name'}}
    promote_result = {'$replaceRoot': {'newRoot': '$result'}}

    pipeline = [
        unwind_alloys,
        project_alloy,
        unwind_elements,
        group_elements,
        pivot_elements,
        attach_name,
        promote_result,
    ]

    # The aggregation result comes back as a `pandas.DataFrame`.
    df = MongoService().read_aggregation(
        db_name=DATABASE, collection='users', pipeline=pipeline
    )

    # Alloys do not all list the same elements, so the pivot leaves NaN where
    # an alloy lacks an element. Impute those gaps with 0.
    df = df.fillna(0)

    # Split the alloy names (labels) from the element weights (features).
    alloy_names = df['name'].values
    labels_df = pd.DataFrame(data=alloy_names, columns=['name'])
    X = df.drop(['name'], axis=1).values

    # t-SNE settings (see the sklearn.manifold.TSNE documentation):
    #   n_components  - dimension of the embedded space
    #   perplexity    - related to the number of nearest neighbours considered
    #   learning_rate - step size of the optimisation
    #   n_iter        - maximum number of optimisation iterations
    # NOTE(review): perplexity is fixed at 40 regardless of how many alloys
    # were fetched - confirm that small collections are acceptable here.
    tsne_model = TSNE(
        n_components=2, perplexity=40, learning_rate=200.0, n_iter=250
    )
    embedded = tsne_model.fit_transform(X)

    # Put coordinates and names side by side in one frame.
    coords_df = pd.DataFrame(data=embedded, columns=['x', 'y'])
    tsne_df = pd.concat([coords_df, labels_df], axis=1, ignore_index=True)
    tsne_df.columns = ['x', 'y', 'name']

    # Give every distinct alloy name an integer code; the client uses it as
    # the marker colour.
    tsne_df['color'] = tsne_df['name'].astype('category').cat.codes

    data = {
        'x': tsne_df['x'].tolist(),
        'y': tsne_df['y'].tolist(),
        'label': tsne_df['name'].tolist(),
        'color': tsne_df['color'].tolist(),
    }
    # Echo the model parameters back so the caller can see what was used.
    response = {
        'status': 'success',
        'parameters': tsne_model.get_params(),
        'data': data,
    }
    return response, 200
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 10 14:30:42 2014

@author: junhao

Embed the iris measurements in 2-D with t-SNE and plot them by class.
"""
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# iris.data has no header row: columns 0-3 are measurements, column 4 the class.
df = pd.read_csv('iris.data', delimiter=',', header=None)
df = df.dropna()
print(df.head())

# Encode the class names as integer codes. sort=True numbers the classes in
# sorted order, as the removed Categorical.from_array(...).labels API did.
species = df.iloc[:, 4]
codes, classes = pd.factorize(species, sort=True)
print(codes)

model = TSNE(n_components=2, init='random', early_exaggeration=4,
             learning_rate=200, random_state=1)
# Slice end is exclusive: 0:4 selects all four measurement columns (0:3
# silently dropped the last one).
xy = model.fit_transform(df.iloc[:, 0:4])

# One scatter per class so each gets a labelled legend entry; the previous
# ``ax.legend()`` referenced an undefined ``ax`` and raised NameError.
for code, label in enumerate(classes):
    members = codes == code
    plt.scatter(xy[members, 0], xy[members, 1], label=label)
plt.legend()
plt.show()

print(model.get_params())
if "scalarMap" not in color_group: color_group.require_dataset( "scalarMap", shape=(1, ), dtype=vlen_uint8)[0] = pickle4h5( scalarMap) color_group.require_dataset( "colors", shape=point_colors.shape, dtype=point_colors.dtype)[ ...] = point_colors ncomp_group = sample_group.require_group( f"{n_components}D") embed_group = ncomp_group for p_name, p_value in tsne.get_params( ).items(): if p_name in useful_tsne_params: embed_group = embed_group.require_group( f"{p_name}: {p_value}") embed_group.require_dataset( "tsne", shape=(1, ), dtype=vlen_uint8)[0] = pickle4h5(tsne) embed_group.require_dataset( "embeded_points", shape=embeded_points.shape, dtype=embeded_points.dtype)[ ...] = embeded_points embed_group.attrs[ "embeded_pt_mean_dist"] = embeded_pt_mean_dist except Exception as e: print(f"Failed for some reason {e}")