Beispiel #1
0
 def tsne(self, learning_rate=100):
     """Embed ``self.X`` with t-SNE and return the embedded array.

     Parameters
     ----------
     learning_rate : float, optional
         Forwarded to ``TSNE`` as ``learning_rate`` (default 100).

     Returns
     -------
     The value returned by ``TSNE.fit_transform(self.X)``; the model is
     built with ``n_components=self.n_components``.
     """
     print('Calculating t-distributed stochastic neighbor embedding....\n')
     # Named ``model`` so the local does not shadow this method's own name.
     model = TSNE(n_components=self.n_components,
                  learning_rate=learning_rate)
     return model.fit_transform(self.X)
Beispiel #2
0
def learn_tsne(data, **kwargs):
    """
    Calculates TSNE transform for given matrix features

    :param data: table of features; must expose ``.values`` and ``.index``
        (e.g. a ``pandas.DataFrame``)
    :param kwargs: arguments for sklearn.manifold.TSNE; keys that are not
        TSNE parameters are silently dropped. ``random_state`` defaults to
        0 and may be overridden by the caller
    :return: pd.DataFrame holding the embedding, indexed like ``data``
    """
    # Ask a default-constructed estimator for its parameter names instead of
    # calling the instance method ``get_params`` on the class itself.
    valid_params = TSNE().get_params()
    tsne_kwargs = {key: value for key, value in kwargs.items()
                   if key in valid_params}
    # ``random_state`` passes the filter above, so passing it a second time
    # as a literal keyword would raise TypeError; only supply the default
    # when the caller did not choose one.
    tsne_kwargs.setdefault('random_state', 0)
    res = TSNE(**tsne_kwargs).fit_transform(data.values)
    return pd.DataFrame(res, index=data.index.values)
def show_clusters(data, y, name, params=None):
    """Embed ``data`` in 2-D with t-SNE, plot it coloured by ``y`` and save it.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``TSNE.fit_transform``.
    y : array-like exposing ``.shape``
        Per-sample values used as the scatter colours.
    name : str
        Inserted into the output file name ``ClustersUntrained{name}.png``.
    params : dict, optional
        Extra TSNE parameters applied with ``set_params`` before fitting.

    Returns
    -------
    dict
        The parameters of the fitted model, as reported by ``get_params()``.

    Notes
    -----
    Calls ``np.set_printoptions(suppress=True)``, which changes NumPy's
    process-wide print settings.
    """
    model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    if params is not None:
        model.set_params(**params)
    X = model.fit_transform(data)

    # Single-argument print calls produce the same text on Python 2 and 3.
    print(X)
    print("X.shape =  {}".format(X.shape))
    print("y.shape =  {}".format(y.shape))
    print(y)

    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.gray()
    plt.axis('off')
    # Save before show(): with an interactive backend the figure can be
    # destroyed once the window is closed, which would leave savefig()
    # writing an empty image.
    plt.savefig("ClustersUntrained{}.png".format(name), dpi=600)
    plt.show()
    plt.clf()
    return model.get_params()
Beispiel #4
0
def learn_tsne(data, **kwargs):
    """
    Calculates TSNE transformation for given matrix features.

    Parameters
    ----------
    data: pandas.DataFrame
        Table of features; must expose ``.values`` and ``.index``.
    kwargs: optional
        Parameters for ``sklearn.manifold.TSNE()``. Keys that are not TSNE
        parameters are silently dropped. ``random_state`` defaults to 0 and
        may be overridden by the caller.

    Returns
    -------
    Calculated TSNE transform, indexed like ``data``

    Return type
    -----------
    pandas.DataFrame
    """
    # Ask a default-constructed estimator for its parameter names instead of
    # calling the instance method ``get_params`` on the class itself.
    valid_params = TSNE().get_params()
    tsne_kwargs = {key: value for key, value in kwargs.items()
                   if key in valid_params}
    # ``random_state`` passes the filter above, so passing it a second time
    # as a literal keyword would raise TypeError; only supply the default
    # when the caller did not choose one.
    tsne_kwargs.setdefault('random_state', 0)
    res = TSNE(**tsne_kwargs).fit_transform(data.values)
    return pd.DataFrame(res, index=data.index.values)
               cmap=CMAP,
               s=40)
    
    ax.set_title(title, fontsize=20, y=1.03)
    
    #fsize = 14
    ax.set_xlabel("1st eigenvector")
    ax.set_ylabel("2nd eigenvector")
    ax.set_zlabel("3rd eigenvector")
    #ax.legend(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])

# Two-component embedding of the iris features, drawn with the 2-D helper.
tsne = TSNE(n_components=2)
print(tsne.get_params())
points = tsne.fit_transform(iris_df[features])
plot_iris_2d(
    x=points[:, 0],
    y=points[:, 1],
    title='Iris dataset visualized with t-SNE',
)

# Three-component embedding of the same features, drawn with the 3-D helper.
tsne = TSNE(n_components=3)
print(tsne.get_params())
points = tsne.fit_transform(iris_df[features])
plot_iris_3d(
    x=points[:, 0],
    y=points[:, 1],
    z=points[:, 2],
    title="Iris dataset visualized with tSNE",
)

# Same model with a correlation metric; only its parameters are printed here.
tsne = TSNE(n_components=3, metric='correlation')
print(tsne.get_params())
# The metric could also be changed on an existing model:
# tsne.set_params(metric='correlation')
-------------------------------------------------------------------------------
-------------------------------------t-SNE-------------------------------------
-------------------------------------------------------------------------------
'''

# Configure the t-SNE model. The learning rate is set explicitly to 100.
# Perplexity balances local and global aspects of the neighbourhoods and is
# usually best somewhere between 5 and 50.
tsne_model = TSNE(
    n_components=2,
    perplexity=30.0,
    learning_rate=100.0,
    n_iter=2000,
    n_iter_without_progress=30,
    random_state=seed,
    method='barnes_hut',
)

# Fit on the standardised data; the embedding is read back from the model.
tsne_model.fit(x_std)
print(tsne_model.get_params())
tsne_dim = tsne_model.embedding_
# Two latent variables are expected, i.e. a shape of (n_samples, 2).
print(tsne_dim.shape)
# The final Kullback-Leibler divergence is available if needed:
# print('Kullback-Leibler divergence:', tsne_model.kl_divergence_)

# Scatter the two latent variables, coloured by the observation class.
plt.figure(figsize=(10, 5))
plt.xlabel('Latent Variable 1')
plt.ylabel('Latent Variable 2')
plt.title(
    't-SNE 2-Dimension Plot with Observation Class \nperplexity=30, learning_rate=100'
)
plt.scatter(
    tsne_dim[:, 0],
    tsne_dim[:, 1],
    c=y,
)
plt.colorbar()
plt.show()
Beispiel #7
0
"""
Created on Fri Oct 10 14:30:42 2014

@author: junhao
"""
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load the headerless iris table and drop incomplete rows.
df = pd.read_csv('iris.data', delimiter=',', header=None)
df = df.dropna()
print(df.head())
# Column 4 holds the class label.
c = df.iloc[:, 4]

# Encode the class labels as integer codes for colouring.
# ``Categorical.from_array`` / ``.labels`` are removed pandas APIs; the
# constructor and ``.codes`` are their replacements.
f = pd.Categorical(c)
print(f.codes)

model = TSNE(n_components=2,
             init='random',
             early_exaggeration=4,
             learning_rate=200,
             random_state=1)
# The label is column 4, so the features are columns 0..3. The previous
# slice 0:3 stopped one column short and silently dropped the last feature.
xy = model.fit_transform(df.iloc[:, 0:4])

scatter = plt.scatter(xy[:, 0], xy[:, 1], c=f.codes)
# ``ax`` was never defined, so the old ``ax.legend()`` raised NameError
# before the plot could be shown. Build the legend from the scatter's
# colour codes; they map one-to-one, in order, onto the categories.
handles, _ = scatter.legend_elements()
plt.legend(handles, list(f.categories))
plt.show()

print(model.get_params())
Beispiel #8
0
    def get(self, _) -> Tuple[dict, int]:
        """Embed all saved alloy compositions in 2-D with t-SNE.

        The saved alloys of every user document are pivoted into one row per
        alloy (one column per element symbol, holding its weight), embedded
        with ``sklearn.manifold.TSNE`` and returned together with the alloy
        names and a per-name colour code.

        t-SNE's cost function is not convex, so different initialisations
        can give different embeddings.

        Returns:
            A ``(body, status)`` tuple. ``body`` holds ``status``, the model
            ``parameters`` and ``data`` with the ``x``, ``y``, ``label`` and
            ``color`` lists; ``status`` is always 200 here.
        """

        # User documents store their alloys as:
        #   saved_alloys: [
        #       { "name": "Alloy name",
        #         "compositions": [ { "symbol": "C", "weight": 0.044 }, ... ] }
        #   ]
        # The stages below flatten that into one flat document per alloy.

        # One document per saved alloy, keeping only the fields needed.
        unwind_alloys = {'$unwind': '$saved_alloys'}
        pick_fields = {
            '$project': {
                '_id': 1,
                'name': '$saved_alloys.name',
                'compositions': '$saved_alloys.compositions',
            }
        }

        # One document per element, regrouped by (user id, alloy name) into
        # a set of {name, value} pairs.
        unwind_elements = {'$unwind': '$compositions'}
        group_elements = {
            '$group': {
                '_id': {'id': '$_id', 'name': '$name'},
                'items': {
                    '$addToSet': {
                        'name': '$compositions.symbol',
                        'value': '$compositions.weight',
                    }
                },
            }
        }

        # Pivot the pairs: each element symbol becomes a field whose value
        # is that element's weight.
        pivot_elements = {
            '$project': {
                'result': {
                    '$arrayToObject': {
                        '$zip': {'inputs': ['$items.name', '$items.value']}
                    }
                }
            }
        }

        # Attach the alloy name and make the pivoted object the new root.
        attach_name = {'$addFields': {'result.name': '$_id.name'}}
        promote_result = {'$replaceRoot': {'newRoot': '$result'}}

        pipeline = [
            unwind_alloys,
            pick_fields,
            unwind_elements,
            group_elements,
            pivot_elements,
            attach_name,
            promote_result,
        ]

        # The service returns the aggregation result as a pandas DataFrame.
        alloys = MongoService().read_aggregation(db_name=DATABASE,
                                                 collection='users',
                                                 pipeline=pipeline)

        # Alloys that lack an element another alloy has come back as NaN in
        # that column; impute those gaps with 0.
        alloys = alloys.fillna(0)

        # Split the frame into the name labels and the numeric feature matrix.
        labels = pd.DataFrame(data=alloys['name'].values, columns=['name'])
        features = alloys.drop(['name'], axis=1).values

        # See the scikit-learn TSNE reference for the meaning of
        # n_components, perplexity, learning_rate and n_iter.
        tsne_model = TSNE(n_components=2,
                          perplexity=40,
                          learning_rate=200.,
                          n_iter=250)
        embedding = tsne_model.fit_transform(features)

        # Put the coordinates and the labels side by side.
        coords = pd.DataFrame(data=embedding, columns=['x', 'y'])
        combined = pd.concat([coords, labels], axis=1, ignore_index=True)
        combined.columns = ['x', 'y', 'name']

        # Give every distinct name an integer code, used as the marker
        # colour in Plotly.
        name_codes = combined['name'].astype('category').cat.codes
        combined = combined.assign(color=name_codes)

        # The parameters the model actually used are echoed in the response.
        return {
            'status': 'success',
            'parameters': tsne_model.get_params(),
            'data': {
                'x': combined['x'].tolist(),
                'y': combined['y'].tolist(),
                'label': combined['name'].tolist(),
                'color': combined['color'].tolist(),
            }
        }, 200
Beispiel #9
0
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 10 14:30:42 2014

@author: junhao
"""
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load the headerless iris table and drop incomplete rows.
df = pd.read_csv('iris.data', delimiter=',', header=None)
df = df.dropna()
print(df.head())
# Column 4 holds the class label.
c = df.iloc[:, 4]

# Encode the class labels as integer codes for colouring.
# ``Categorical.from_array`` / ``.labels`` are removed pandas APIs; the
# constructor and ``.codes`` are their replacements.
f = pd.Categorical(c)
print(f.codes)

model = TSNE(n_components=2, init='random', early_exaggeration=4,
             learning_rate=200, random_state=1)
# The label is column 4, so the features are columns 0..3. The previous
# slice 0:3 stopped one column short and silently dropped the last feature.
xy = model.fit_transform(df.iloc[:, 0:4])

scatter = plt.scatter(xy[:, 0], xy[:, 1], c=f.codes)
# ``ax`` was never defined, so the old ``ax.legend()`` raised NameError
# before the plot could be shown. Build the legend from the scatter's
# colour codes; they map one-to-one, in order, onto the categories.
handles, _ = scatter.legend_elements()
plt.legend(handles, list(f.categories))
plt.show()

print(model.get_params())
Beispiel #10
0
                                    if "scalarMap" not in color_group:
                                        color_group.require_dataset(
                                            "scalarMap",
                                            shape=(1, ),
                                            dtype=vlen_uint8)[0] = pickle4h5(
                                                scalarMap)
                                        color_group.require_dataset(
                                            "colors",
                                            shape=point_colors.shape,
                                            dtype=point_colors.dtype)[
                                                ...] = point_colors

                                    ncomp_group = sample_group.require_group(
                                        f"{n_components}D")
                                    embed_group = ncomp_group
                                    for p_name, p_value in tsne.get_params(
                                    ).items():
                                        if p_name in useful_tsne_params:
                                            embed_group = embed_group.require_group(
                                                f"{p_name}: {p_value}")
                                    embed_group.require_dataset(
                                        "tsne", shape=(1, ),
                                        dtype=vlen_uint8)[0] = pickle4h5(tsne)
                                    embed_group.require_dataset(
                                        "embeded_points",
                                        shape=embeded_points.shape,
                                        dtype=embeded_points.dtype)[
                                            ...] = embeded_points
                                    embed_group.attrs[
                                        "embeded_pt_mean_dist"] = embeded_pt_mean_dist
                            except Exception as e:
                                print(f"Failed for some reason {e}")