def dimensionality_reduction(
        sample: pd.Series,
        background_df: pd.DataFrame,
        genes: List[str],
        col: str,
        method='trimap') -> Tuple[pd.DataFrame, hv.Scatter]:
    """
    Embed a background dataset plus one n-of-1 sample in 2D and build a scatter plot of it.

    The sample is added as the last row, labelled ``N-of-1 - <sample[col]>`` in
    the `col` column and given a larger value in the `size` column so it can be
    told apart from the background points.

    Args:
        sample: n-of-1 sample. Gets own label
        background_df: Background dataset
        genes: Genes (columns) to use in dimensionality reduction
        col: Column to use for color_index
        method: Method of dimensionality reduction. `trimap` or `tsne`

    Returns:
        Tuple of (DataFrame with columns `x`, `y`, `col` and `size`,
        Holoviews Scatter object of plot with associated vdims)
    """
    assert method == 'trimap' or method == 'tsne', '`method` must be either `trimap` or `tsne`'

    # `DataFrame.append` no longer exists in pandas >= 2.0, so the sample is
    # turned into a one-row frame and concatenated instead. `.to_frame().T`
    # produces object-dtype columns for a mixed-type Series; `infer_objects`
    # restores numeric dtypes so the gene columns stay numeric.
    sample_row = sample.to_frame().T.infer_objects()
    combined = pd.concat([background_df, sample_row])
    features = combined[genes]

    if method == 'trimap':
        reduced = trimap.TRIMAP().fit_transform(features)
    else:
        reduced = t_sne.TSNE().fit_transform(features)

    df = pd.DataFrame(reduced, columns=['x', 'y'])
    # The sample is the last row of `combined`, so its label and size go last.
    df[col] = background_df[col].tolist() + [f'N-of-1 - {sample[col]}']
    df['size'] = [1] * len(background_df) + [5]
    return df, hv.Scatter(data=df, kdims=['x'], vdims=['y', col, 'size'])
def quest_16(_data):
    """
    Question 16: compare EM clustering with k-means on a 2D tSNE embedding.

    Keeps the 300 rows of `_data` with the highest variance, fits both a
    `SingleCellExpressionModel` (EM, 100 iterations max) and `KMeans` using
    `K[0]` types/clusters, then shows two scatter plots of the same tSNE
    embedding of log(1 + x): the first coloured by the k-means labels, the
    second by the most probable type under the model's posterior.
    """
    # Rows sorted by variance; the last 300 are the most variable ones.
    top_variance_rows = np.argsort(np.var(_data, axis=1))[-300:]
    _x = _data[top_variance_rows, :]

    n_types = K[0]
    em_model = SingleCellExpressionModel(C=_x.shape[1], G=_x.shape[0], T=n_types)
    kmeans = KMeans(n_clusters=n_types).fit(_x.T)
    em_model.learn(_x, maxiter=100)
    posterior = em_model.type_posterior(_x)

    embedding = t_sne.TSNE(n_components=2).fit_transform(np.log(1 + _x.T))

    def _show_clusters(assignments):
        # Axis limits run from the 1st percentile to the maximum of each axis.
        plt.scatter(embedding[:, 0], embedding[:, 1], c=assignments, s=4)
        plt.xlim(np.percentile(embedding[:, 0], [1, 100]))
        plt.ylim(np.percentile(embedding[:, 1], [1, 100]))
        plt.show()

    # tSNE of k-means clustering colored by centroids
    _show_clusters(kmeans.labels_)
    # tSNE of EM clustering colored by cell types
    _show_clusters(np.argmax(posterior, axis=1))
def _do_tsne(data, nr_components=2, init='random', plex=30, n_iter=1000, lr=200, rs=None):
    """Reduce `data` with sklearn's TSNE and return the embedding and the model.

    Parameters
    ----------
    data : ndarray
        data to embed
    nr_components : int, optional
        dimension of the embedding, by default 2
    init : str, optional
        passed to TSNE as `init`, by default 'random'
    plex : int, optional
        passed to TSNE as `perplexity`, by default 30
    n_iter : int, optional
        passed to TSNE as `n_iter`, by default 1000
    lr : int, optional
        passed to TSNE as `learning_rate`, by default 200
    rs : int, optional
        passed to TSNE as `random_state`, by default None

    Returns
    -------
    tuple, (transformed data, fitted TSNE model)
    """
    tsne_options = {
        'n_components': nr_components,
        'init': init,
        'perplexity': plex,
        'random_state': rs,
        'n_iter': n_iter,
        'learning_rate': lr,
    }
    model = t_sne.TSNE(**tsne_options)
    embedded = model.fit_transform(data)
    return embedded, model
def dim_reduction(X, y, n_comp=2, fault_nodes=None):
    """Embed `X` with t-SNE (perplexity 5) and optionally plot selected classes.

    Args:
        X: data to embed.
        y: labels, one per row of `X`; compared against the entries of
            `fault_nodes` with `==`.
        n_comp: number of t-SNE components. Plotting uses the first two.
        fault_nodes: labels to plot. When None or empty nothing is plotted.

    Returns:
        The embedded data when `fault_nodes` is None or empty. Otherwise the
        selected classes are drawn in one scatter plot (one colour and one
        legend entry ``f<node>`` per label), the plot is shown, and None is
        returned.
    """
    tsne = t_sne.TSNE(n_components=n_comp, perplexity=5)
    X_red = tsne.fit_transform(X)

    # Materialise once: works for lists, tuples and arrays alike, and avoids
    # the ambiguous truth value of a multi-element NumPy array.
    nodes = [] if fault_nodes is None else list(fault_nodes)
    if not nodes:
        return X_red

    for node in nodes:
        # Pair every embedded row with its label positionally.
        node_points = [point for point, label in zip(X_red, y) if label == node]
        plt.scatter(
            [point[0] for point in node_points],
            [point[1] for point in node_points],
            label="f{}".format(node),
        )
    plt.legend()
    plt.show()
    return None
def quest_11(_data):
    """
    Plots of question 11 supplementary code (tSNE vs. PCA).

    Both methods embed log(1 + `_data`.T) in two dimensions. The embeddings
    are drawn side by side, each point coloured by a Gaussian kernel density
    estimate evaluated on the embedding itself.
    """
    log_values = np.log(1 + _data.T)
    tSNE = t_sne.TSNE(n_components=2).fit_transform(log_values)
    pca = PCA(n_components=2).fit(log_values).transform(log_values)

    plt.figure(figsize=(8, 4))
    panels = ((121, 'tSNE', tSNE), (122, 'PCA', pca))
    for position, label, points in panels:
        plt.subplot(position)
        density = stats.gaussian_kde(points.T)(points.T)
        plt.scatter(points[:, 0], points[:, 1], c=density, s=3)
        plt.title(label)
        # Limits run from the 1st percentile to the maximum of each axis.
        plt.xlim(np.percentile(points[:, 0], [1, 100]))
        plt.ylim(np.percentile(points[:, 1], [1, 100]))
        plt.xticks([])
        plt.yticks([])
    plt.show()
def _test_Tsne(synthetic_data):
    '''
    Example code: run T-SNE on the digits dataset and on `synthetic_data`
    and plot both embeddings.

    The digits embedding is drawn with `plot_with_images`; the synthetic
    embedding is drawn as a plain scatter plot, which is then shown.
    '''
    # load the digits data returned by `datasets.load_digits()` and rescale it:
    pixels = datasets.load_digits().data / 255.

    reducer = tsne.TSNE()
    digits_embedding = reducer.fit_transform(pixels)
    plot_with_images(digits_embedding, pixels, "Digits example- T-SNE")

    # the same TSNE object is fitted again, this time on the synthetic data
    synthetic_embedding = reducer.fit_transform(synthetic_data)
    xs = synthetic_embedding[:, 0]
    ys = synthetic_embedding[:, 1]
    plt.figure()
    plt.title("Synthetic data- TSNE")
    plt.scatter(xs, ys)
    plt.show()
def make_tsne(data, inital_player):
    """Build a plotly figure of a 2D t-SNE embedding with one row highlighted.

    `data` is standardized with `StandardScaler` and embedded with t-SNE
    (2 components, learning rate 750). Every row is drawn as a marker whose
    hover text is the first level of `data.index`; the row located by
    `data.index.get_loc(inital_player)` is drawn again as a separate, larger
    marker named after `inital_player`.

    Returns the resulting `go.Figure`.
    """
    tsne = t_sne.TSNE(n_components=2, learning_rate=750)
    standardized = StandardScaler().fit_transform(data)
    coords = tsne.fit_transform(standardized)

    # All rows; kept out of the legend.
    all_rows = go.Scatter(
        showlegend=False,
        x=coords[:, 0],
        y=coords[:, 1],
        text=data.index.get_level_values(0).values,
        hoverinfo='text',
        mode='markers',
        marker={'size': 12, 'color': '#1F77AA'},
    )

    # The highlighted row, drawn on top as its own trace.
    position = data.index.get_loc(inital_player)
    highlighted = go.Scatter(
        name=inital_player,
        x=[coords[position, 0]],
        y=[coords[position, 1]],
        text=inital_player,
        hoverinfo='text',
        mode='markers',
        marker={'size': 14},
    )

    layout_spec = {
        'hovermode': 'closest',
        'margin': {'t': 0, 'r': 0, 'l': 0, 'b': 0},
        'legend': {'x': 0, 'y': 1},
        'paper_bgcolor': '#F8F3F1',
        'plot_bgcolor': '#F8F3F1',
    }
    return go.Figure(data=[all_rows, highlighted], layout=go.Layout(layout_spec))
def patches_tsne_visualization(patches_vector, patches_mask, ratio_thold=0.1, savedir=None, n_iter=5000):
    """Plot t-SNE embeddings of patch vectors, coloured by how much of each mask is set.

    Each mask is labelled from the ratio ``mask.sum() / mask.size``:
    ``'good'`` when the ratio is 0, ``'defect'`` when it is at least
    `ratio_thold`, ``'neutral'`` otherwise. One figure is produced for each
    perplexity in 30, 40 and 50.

    Args:
        patches_vector: 2D array, one row per patch (its shape is unpacked
            as ``(points_num, source_dims)``).
        patches_mask: iterable of mask arrays, one per patch.
        ratio_thold: ratio at or above which a patch is labelled 'defect'.
        savedir: directory the figures are written to as ``tsne_p<perplexity>.png``.
            When None (the default) the figures are created but not saved.
        n_iter: number of t-SNE iterations.
    """
    # The class is imported from the public `sklearn.manifold` namespace; the
    # `sklearn.manifold.t_sne` module path is not available in newer
    # scikit-learn releases.
    # NOTE(review): newer scikit-learn renames TSNE's `n_iter` to `max_iter`;
    # confirm against the pinned version before upgrading.
    from sklearn.manifold import TSNE
    import pandas as pd

    def _mask_label(mask):
        ratio = mask.sum() / float(mask.size)
        if ratio == 0:
            return 'good'
        if ratio >= ratio_thold:
            return 'defect'
        return 'neutral'

    mask_labels = np.array([_mask_label(mask) for mask in patches_mask])

    points_num, source_dims = patches_vector.shape

    sp = sns.color_palette('muted')
    color_palette = {'good': sp[2], 'defect': sp[3], 'neutral': sp[7]}

    for perplexity in [30, 40, 50]:
        sk_tsne = TSNE(n_components=2, perplexity=perplexity, n_iter=n_iter,
                       init='random', verbose=1)
        embedded = sk_tsne.fit_transform(patches_vector)

        df = pd.DataFrame(embedded, columns=('x', 'y'))
        df['label'] = mask_labels

        plt.figure(figsize=(12, 8))
        plt.title('Dimension:%d Perplexity:%d Iteration:%d' % (source_dims, perplexity, n_iter))
        ax = sns.scatterplot(x='x', y='y', hue='label', data=df, palette=color_palette)
        # Hide the 'x' / 'y' axis titles. The previous `ax.xaxis.info` is not
        # a matplotlib Axis attribute; the axis title lives on `.label`.
        ax.xaxis.label.set_visible(False)
        ax.yaxis.label.set_visible(False)

        # `savedir` defaults to None, which `path.join` cannot handle.
        if savedir is not None:
            plt.savefig(path.join(savedir, 'tsne_p%d.png' % perplexity))
def tsne_plot(X, y, perplexity=20, title='t-SNE', label_idxs=[0, 1, 2, 3], label_names=['Right', 'Left', 'Tongue', 'Feet'], fig=None, ax_idx=0):
    '''
    Visualize the EEG trials in optimized 2D embedding space.

    Input:
    - X: EEG trials (numpy array of shape (n_trials, n_channels, n_samples));
      each trial is flattened to one row before embedding.
    - y: Labels (numpy array of shape (n_trials,)).
    - perplexity: Hyperparameter for t-SNE (int).
    - title: Title of the plot (string).
    - label_idxs: Label values to plot; each value also indexes the colour
      list and `label_names`. The default list is never mutated.
    - label_names: Legend names, indexed by label value. Never mutated.
    - fig: Existing figure to draw into. A new figure is created when None.
    - ax_idx: Index into `fig.get_axes()` selecting the axes to draw on
      (only used when `fig` is given).

    Output:
    - The figure holding the 2D scatter plot (feature 1 vs feature 2).
    '''
    # The class is imported from the public `sklearn.manifold` namespace; the
    # `sklearn.manifold.t_sne` module path is not available in newer
    # scikit-learn releases.
    # NOTE(review): newer scikit-learn renames TSNE's `n_iter` to `max_iter`;
    # confirm against the pinned version before upgrading.
    from sklearn.manifold import TSNE

    if fig is not None:
        ax = fig.get_axes()[ax_idx]
    else:
        fig, ax = plt.subplots()

    n_trials = X.shape[0]
    out = TSNE(n_components=2, perplexity=perplexity, n_iter=500,
               random_state=0).fit_transform(X.reshape((n_trials, -1)))

    colors = ('b', 'r', 'g', 'orange')
    # Classes usually have different numbers of trials, so the per-class point
    # sets are plotted one at a time instead of being stacked into a single
    # array (a ragged stack is rejected by recent NumPy versions).
    for c in label_idxs:
        points = out[y == c]
        ax.scatter(points[:, 0], points[:, 1], c=colors[c], alpha=0.7,
                   label=label_names[c])

    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.legend(loc=0)
    ax.set_title(title)
    return fig
# Use the 2017 data and fill any NaNs recents = data[data.Year == 2017] recents = recents.dropna(axis=1, how="all") recents = recents.fillna(recents.median()) # Use only these specific features columns = [ 'Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Positive affect', 'Negative affect', 'Confidence in national government', 'Democratic Quality', 'Delivery Quality' ] # Transform the data with TSNE tsne = t_sne.TSNE() transformed = pd.DataFrame(tsne.fit_transform(recents[columns])) # Create the data object cluster_data = oe.data(transformed, [0, 1]) # Create the ensemble ensemble = oe.cluster(cluster_data) for i in range(20): name = f'kmeans({i}-tsne' ensemble.cluster('parent', 'kmeans', name, 10) # Create the cluster labels preds = ensemble.finish_co_occ_linkage(threshold=0.5) # Add Life Ladder to columns columns = [
create_scatter(key, 4, 3, i) i += 1 t = data[data['Year'] == 2005].copy() countries = list(t['Country name'].values) filtered = data[data['Country name'].isin(countries)] filtered[['Year', 'Life Ladder']].set_index('Year').boxplot(by='Year', grid=False) plt.suptitle("") plt.title('Life Ladder - Same Countries') plt.xlabel('Year') from sklearn.manifold import t_sne t = t_sne.TSNE() data = data.fillna(data.median()) transformed = t.fit_transform(data[[ 'Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Positive affect', 'Negative affect', 'Confidence in national government', 'Democratic Quality', 'Delivery Quality' ]].values) plt.scatter(transformed[:, 0], transformed[:, 1], c=data['Life Ladder'].values) regions = {x: 0 for x in regs.Region.unique()} i = 0 for r in regions: regions[r] = i i += 1
keep='last')] stats_for_similar[ 'FGA/3A'] = stats_for_similar['FGA'] / stats_for_similar['3PA'] stats_for_similar = stats_for_similar[cols_for_sim] stats_for_similar = stats_for_similar.select_dtypes(['number']) #fig = make_tsne(s, 'Stephen Curry') #fig1 = make_tsne(s, 'Kevin Durant') cph = CoxPHFitter() cph.fit(stats_for_surv, 'NBA_Experience', event_col='active') #fit model once at the begining tsne = t_sne.TSNE(n_components=2, learning_rate=750) #fit tsne at begining X_std = StandardScaler().fit_transform(stats_for_tsne) fit = tsne.fit_transform(X_std) kn = NearestNeighbors(n_neighbors=5) drop = ['FG_pg', '2P_pg', '3P_pg', 'FT_pg'] stand = StandardScaler() scaled = stand.fit_transform(stats_for_surv) kn.fit(scaled) counter = 0 stats_for_similar = stats_for_surv.copy() stats_for_similar.index = stats_for_similar.index.str.replace('.', '') stats_for_similar.index = stats_for_similar.index.str.replace("'", '')