def test_init_ndarray_precomputed():
    """Fit TSNE with an ndarray ``init`` and ``metric='precomputed'``.

    The fit is meant to run without ``_fit`` raising a FutureWarning.
    """
    n_samples = 100
    estimator = TSNE(
        init=np.zeros((n_samples, 2)),
        metric="precomputed",
        square_distances=True,
    )
    distances = np.zeros((n_samples, n_samples))
    estimator.fit(distances)
def pcAnalysis(X, Xtest, w=None, ncomp=2, useTSNE=False):
    """Project train and test samples to ``ncomp`` dimensions and plot them.

    Train rows are drawn in red and test rows in green, using the first two
    projected components.

    Parameters
    ----------
    X : pandas object holding the training samples.
    Xtest : pandas object holding the test samples.
    w : unused; kept so existing callers keep working.
    ncomp : number of components to project to.
    useTSNE : embed with t-SNE instead of TruncatedSVD.

    Notes
    -----
    TruncatedSVD is fitted on ``X`` and then applied to test + train rows.
    t-SNE has no ``transform`` method and no ``explained_variance_ratio_``,
    so in that mode test + train rows are embedded jointly and the
    explained-variance report is skipped.
    """
    X_all = pd.concat([Xtest, X])
    n_test = len(Xtest.index)

    if useTSNE:
        print("TSNE analysis for train/test")
        pca = TSNE(n_components=ncomp)
        print(pca)
        # t-SNE cannot project unseen rows, so embed everything in one pass.
        X_r = pca.fit_transform(X_all.values)
    else:
        print("PC analysis for train/test")
        pca = TruncatedSVD(n_components=ncomp)
        print(pca)
        pca.fit(X)
        X_r = pca.transform(X_all.values)

    # Rows [0, n_test) are the test samples, the remainder the train samples.
    plt.scatter(X_r[n_test:, 0], X_r[n_test:, 1], c='r', label="train", alpha=0.5)
    plt.scatter(X_r[:n_test, 0], X_r[:n_test, 1], c='g', label="test", alpha=0.5)

    if not useTSNE:
        print("Total variance:", np.sum(pca.explained_variance_ratio_))
        print("Explained variance:", pca.explained_variance_ratio_)

    plt.legend()
    plt.show()
def tsne_plot(gwbow, corp_dic):
    """Embed ``gwbow`` in two dimensions with t-SNE and draw a labelled scatter.

    ``corp_dic`` supplies the ``corp_name`` column, which is merged against
    the ``EDINETコード`` field of ``EdinetcodeDlInfo.csv`` so that every point
    can be annotated with its ``提出者名`` value.
    """
    # Always print floats as plain decimals, never in exponent notation.
    np.set_printoptions(suppress=True)

    # Reduce the high-dimensional (300*60) vectors to 2-D with t-SNE.
    reducer = TSNE(n_components=2, random_state=0, verbose=2)
    reducer.fit(gwbow)

    # Build the scatter-plot table from the embedded rows in [first, last).
    first, last = 0, 4100
    coords = reducer.embedding_[first:last]
    points = pd.DataFrame(coords[:, 0], columns=["x"])
    points["y"] = pd.DataFrame(coords[:, 1])
    points['corp_name'] = pd.Series(corp_dic)

    edinet_info = pd.read_csv('EdinetcodeDlInfo.csv', encoding='cp932',
                              header=1, index_col=0)
    merged = pd.merge(points, edinet_info,
                      left_on='corp_name', right_on='EDINETコード')
    labelled = merged[['x', 'y', '提出者名']].copy()

    axes = labelled.plot.scatter(x="x", y="y", figsize=(10, 10), s=30)
    # Put the label next to each point.
    for _, row in labelled.iterrows():
        axes.annotate(row[2], xy=(row[0], row[1]), size=15)
def project_tsne(self, projection_attrs):
    """Project ``self._A_matrix`` to two dimensions with Barnes-Hut t-SNE.

    Parameters
    ----------
    projection_attrs : mapping or None
        Optional settings. Recognised keys are ``'pca'`` (reduce to 50
        principal components first), ``'perplexity'`` and ``'theta'`` (the
        Barnes-Hut angle). Any key that is missing or holds a falsy value
        falls back to the standard configuration below.

    Returns
    -------
    The first two columns of the fitted ``embedding_``.
    """
    # Standard configuration. Assigning it up front keeps every setting bound
    # even when projection_attrs provides only some of the keys.
    perplexity = 30.0
    theta = 0.5
    pca = False
    if projection_attrs:
        pca = projection_attrs.get('pca') or pca
        perplexity = projection_attrs.get('perplexity') or perplexity
        theta = projection_attrs.get('theta') or theta

    data = self._A_matrix
    if pca and data.shape[0] > 50:
        # Fit PCA on the transposed matrix and keep the first 50 components.
        reducer = PCA(n_components=50)
        reducer.fit(data.T)
        data = reducer.components_[0:50, :].T

    tsne = TSNE(n_components=2, perplexity=perplexity, method='barnes_hut',
                angle=theta, learning_rate=1000)
    tsne.fit(data)
    return tsne.embedding_[:, 0:2]
class TSNE( BaseDR ):
    '''
    tSNE dimensionality reduction backed by ``sklearn.manifold.TSNE``.

    Only ``fit`` and ``fit_transform`` do work; ``transform``,
    ``inverse_transform`` and ``project`` return None.
    '''

    def __init__( self, n_components=2, perplexity=30.0 ):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super( TSNE, self ).__init__( n_components, Alg.TSNE, True )
        # Imported under an alias so the local import does not shadow this
        # class's own name inside __init__.
        from sklearn.manifold import TSNE as SklearnTSNE
        self.tsne = SklearnTSNE( n_components=n_components, perplexity=perplexity )

    def fit( self, X ):
        '''Fit the wrapped estimator on X and return that estimator.'''
        self.tsne.fit( X )
        return self.tsne

    def transform( self, X ):
        '''Not supported; always returns None.'''
        return None

    def fit_transform( self, X ):
        '''Fit the wrapped estimator on X and return its embedding of X.'''
        return self.tsne.fit_transform( X )

    def inverse_transform( self, A ):
        '''Not supported; always returns None.'''
        return None

    def project( self, X ):
        '''Not supported; always returns None.'''
        return None
def performDimensionalityReduction(context_vector, n_component, perplexity):
    '''
    Apply TSNE to the feature vectors of every word instance, fitting one
    model per word type.

    ``context_vector`` is read as
    ``{word_type: {data_type: {instance: context_details}}}``, where each
    ``context_details`` provides ``'Sense'`` (an iterable of sense ids) and
    ``'Feature_Vector'``.

    Returns a tuple ``(feature_vector_word_type, word_type_model)``: the
    ``{(instance, sense, data_type): feature_vector}`` mapping of the LAST
    word type processed, and ``{word_type: fitted TSNE model}``.

    NOTE(review): the reduced vectors are collected in ``feature_vector_data``
    but that mapping is never returned; the first return value looks like it
    was meant to be ``feature_vector_data``. Left unchanged because callers
    may rely on the current shape - confirm.
    '''
    feature_vector_data = defaultdict(dict)
    word_type_model = {}
    # Bound before the loop so an empty context_vector returns an empty
    # mapping instead of failing on an undefined name.
    feature_vector_word_type = OrderedDict()

    # .items() and list(...) work on both Python 2 and Python 3.
    for word_type, word_type_data in context_vector.items():
        feature_vector_word_type = OrderedDict()

        # Read in all the feature vectors for the given word type.
        for data_type, instance_details in word_type_data.items():
            for instance, context_details in instance_details.items():
                # Training data carries the sense ids, test data carries
                # ['<UNKNOWN>'].
                senses = context_details.get('Sense')
                for sense in senses:
                    feature_vector_word_type[(instance, sense, data_type)] = \
                        context_details["Feature_Vector"]

        # Apply TSNE to all the feature vectors of this word type.
        feature_vector_array = np.array(list(feature_vector_word_type.values()))
        model = TSNE(n_components=n_component, random_state=0,
                     perplexity=perplexity, metric="cosine")
        model.fit(feature_vector_array)

        # Keep the model per word type for later use.
        word_type_model[word_type] = model

        # {WordType: {(instanceID, senseID, dataType): reduced vector}}.
        # Walking keys and embedding rows together avoids rebuilding the key
        # list on every iteration.
        for key, embedded in zip(feature_vector_word_type, model.embedding_):
            feature_vector_data[word_type][key] = list(embedded)

    return feature_vector_word_type, word_type_model
def perform_tSNE_analys(n_samples=10e10, n_variables=10000, data_type='psi',
                        filter_tissues=True, n_dimensions=2, perplexity=30,
                        learning_rate=200, n_iter=1000):
    """
    Run t-SNE on the PSI/TPM values and plot the embedding grouped by tissue.

    t-SNE is used to visualise high-dimensional data; it converts affinities
    between data points to probabilities using Student-t distributions.

    Parameters
    ----------
    n_samples, n_variables, data_type, filter_tissues :
        Forwarded to ``read_psi_and_recover_tissue``.
    n_dimensions : number of embedding dimensions; columns are named
        ``'1D'``, ``'2D'``, ...
    perplexity, learning_rate, n_iter : forwarded to ``TSNE``.
    """
    # Forward the caller's n_variables instead of a fixed 10000.
    data, labels = read_psi_and_recover_tissue(n_samples=n_samples,
                                               n_variables=n_variables,
                                               data_type=data_type,
                                               filter_tissues=filter_tissues)
    X_train, y_train = generate_sets(data, labels, do_not_split=True)

    tsne = TSNE(n_components=n_dimensions, perplexity=perplexity,
                early_exaggeration=12.0, learning_rate=learning_rate,
                n_iter=n_iter, n_iter_without_progress=300,
                min_grad_norm=1e-07, metric='euclidean', init='random',
                verbose=1, random_state=None)
    tsne.fit(X_train.values)

    results = pandas.DataFrame(
        tsne.embedding_,
        columns=[str(x) + 'D' for x in range(1, n_dimensions + 1)],
        index=y_train.index)
    # idxmax(1) picks, per row of y_train, the column label holding the
    # maximum; it arrives as column 0 and is renamed to 'Tissue'.
    results = pandas.concat([results, y_train.idxmax(1)], axis=1)
    results = results.rename(columns={0: 'Tissue'})
    plot_by_group(results.groupby('Tissue'), '1D', '2D')
def tSNE_method(axes, user_xaxis, user_yaxis, clusters):
    """Embed ``iris_df.data`` with t-SNE, cluster the result and plot it.

    The embedded points are clustered with K-Means (``clusters`` clusters)
    and drawn on ``axes[1][0]``, using the embedding columns ``user_xaxis``
    and ``user_yaxis`` and colouring each point by its predicted cluster.
    Returns a message string that contains the predicted labels.
    """
    # Train a default-configured t-SNE model and take its embedding.
    embedding = TSNE().fit_transform(iris_df.data)

    # Cluster the embedding, then predict a label for the whole data set.
    clusterer = KMeans(n_clusters=clusters)
    clusterer.fit(embedding)
    predicted = clusterer.predict(embedding)

    # Select the two requested embedding columns for the plot.
    panel = axes[1][0]
    panel.scatter(embedding[:, user_xaxis], embedding[:, user_yaxis], c=predicted)
    panel.set_xlabel('Метод К-середніх зі зменш. розм.')
    return 'Передбачені міткі (Метод К-cередніх зі зменш. розм.):\n {}'.format(
        predicted)
def k_means(data_set, output_file, png_file, t_labels, score_file, set_name):
    """Cluster ``data_set`` into four groups with K-Means and report on it.

    Writes the labelled data to ``output_file`` (Excel), appends the
    Fowlkes-Mallows and adjusted Rand scores against ``t_labels`` to
    ``score_file``, and saves a t-SNE scatter of the clusters to ``png_file``.
    """
    cluster_col = u'聚类类别'

    model = cluster.KMeans(n_clusters=4, max_iter=100, n_jobs=4, init="k-means++")
    model.fit(data_set)
    p_labels = list(model.labels_)

    # Original data with the assigned cluster appended as the last column.
    labelled = pd.concat(
        [data_set, pd.Series(model.labels_, index=data_set.index)], axis=1)
    labelled.columns = list(data_set.columns) + [cluster_col]
    print(labelled)
    labelled.to_excel(output_file)

    with open(score_file, "a") as sf:
        fm_score = metrics.fowlkes_mallows_score(t_labels, p_labels)
        sf.write("By k-means, the f-m_score of " + set_name + " is: "
                 + str(fm_score) + "\n")
        rand_score = metrics.adjusted_rand_score(t_labels, p_labels)
        sf.write("By k-means, the rand_score of " + set_name + " is: "
                 + str(rand_score) + "\n")

    reducer = TSNE()
    reducer.fit(data_set)
    embedding = pd.DataFrame(reducer.embedding_, index=data_set.index)

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # One plot call per cluster id, each with its own marker style.
    for cluster_id, marker in enumerate(('r.', 'go', 'b*', 'o')):
        members = embedding[labelled[cluster_col] == cluster_id]
        plt.plot(members[0], members[1], marker)
    plt.savefig(png_file)
    plt.clf()
def S_cached_kl(perp, X=np.array([])):
    """Fit t-SNE on ``X`` at perplexity ``perp`` and score the fit.

    Prints how long the fit took and returns
    ``2 * kl_divergence_ + log(n) * perp / n`` with ``n = X.shape[0]``.
    """
    model = TSNE(perplexity=perp, random_state=42)

    started = time.perf_counter()
    model.fit(X)
    finished = time.perf_counter()
    print('Last t-SNE took {} seconds'.format(finished - started))

    n_samples = X.shape[0]
    penalty = math.log(n_samples) * perp / n_samples
    return 2 * model.kl_divergence_ + penalty
def test_init_ndarray_precomputed():
    """Fit TSNE with an ndarray ``init`` and ``metric='precomputed'``.

    The fit is meant to run without ``_fit`` raising a FutureWarning.
    """
    n_samples = 100
    settings = {
        "init": np.zeros((n_samples, 2)),
        "metric": "precomputed",
        "learning_rate": 50.0,
    }
    estimator = TSNE(**settings)
    distances = np.zeros((n_samples, n_samples))
    estimator.fit(distances)
def Tsne(self,):
    """Reduce the CSV at ``self.data_set_name`` with t-SNE and save the result.

    The file is read without header or index column and transposed before
    embedding. The embedding (``self.components`` columns) is written to
    ``self.Dred_data`` without header or index. Returns 0.
    """
    data_set = pd.read_csv(self.data_set_name, header=None, index_col=None)
    samples = data_set.T

    tsne = TSNE(n_components=self.components)
    # fit_transform() fits the model itself; the separate fit() that used to
    # precede it only ran the whole optimisation a second time.
    embedded = tsne.fit_transform(samples)

    print("Generate Dre_data.csv." )
    pd.DataFrame(embedded).to_csv(self.Dred_data, header=False, index=False)
    return 0
def drawing(word_vector, word_dict):
    """Plot a 2-D t-SNE embedding of the first 1000 rows of ``word_vector``.

    Each embedded point is drawn as a marker and labelled with
    ``word_dict[idx]``, where ``idx`` is the row index.
    """
    tsne = TSNE(n_components=2)
    tsne.fit(word_vector[0:1000, :])
    word_embedding = tsne.embedding_
    # Parenthesised so the line is valid on both Python 2 and Python 3.
    print(word_embedding.shape)

    plt.figure()
    for idx, (x, y) in enumerate(word_embedding):
        plt.plot(x, y, 'o-', color='#ef4136')
        plt.text(x, y, word_dict[idx], color='black', ha='left')
    plt.show()
class Tsne:
    """
    This transformer transforms all vectors in an
    [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] by means of tsne.
    This implementation uses
    [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE).

    Important:
        TSNE does not allow you to train a transformation and re-use it. It
        must retrain every time it sees data. You may also notice that it is
        relatively slow. This unfortunately is a fact of life.

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the Tsne implementation, includes
            things like `perplexity`
            [link](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import Tsne

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "niece", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]

    emb.transform(Tsne(3)).plot_interactive_matrix('tsne_0', 'tsne_1', 'tsne_2')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        self.is_fitted = False
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = TSNE(n_components=n_components, **kwargs)

    def __call__(self, embset):
        # transform() retrains from scratch on every call, so a separate
        # fit() beforehand would only run t-SNE twice on the same data.
        transformed = self.transform(embset)
        self.is_fitted = True
        return transformed

    def fit(self, embset):
        """Fit the underlying TSNE on the vectors of `embset`."""
        names, X = embset_to_X(embset=embset)
        self.tfm.fit(X)
        self.is_fitted = True

    def transform(self, embset):
        """Retrain on `embset` and return a new EmbeddingSet of the embedding.

        One extra unit vector named `tsne_<i>` is appended per component.
        """
        names, X = embset_to_X(embset=embset)
        new_vecs = self.tfm.fit_transform(X)
        names_out = names + [f"tsne_{i}" for i in range(self.n_components)]
        vectors_out = np.concatenate([new_vecs, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict, name=f"{embset.name}.tsne_{self.n_components}()")
def viz_js_stations(df, manifold='MDS'):
    """Embed a precomputed distance table in 2-D and show it as a bokeh plot.

    Parameters
    ----------
    df : table whose ``.values`` is the precomputed distance matrix and whose
        index holds the station keys (each must convert with ``int``).
    manifold : ``'TSNE'`` or ``'MDS'``.

    Returns
    -------
    The bokeh figure, after it has been passed to ``show``.

    Raises
    ------
    ValueError
        If ``manifold`` is neither ``'TSNE'`` nor ``'MDS'``.
    """
    if manifold == 'TSNE':
        model = TSNE(metric='precomputed')
    elif manifold == 'MDS':
        model = MDS(dissimilarity='precomputed')
    else:
        raise ValueError('Unknown manifold method: {}'.format(manifold))
    model.fit(df.values)

    p = figure()
    p = _axis_adjust(p)

    # Two groups of highlighted stations; all others get the neutral style.
    special_stations_1 = {789, 625, 248, 658, 404, 719, 785, 252, 111, 191, 307}
    special_stations_2 = {433, 393, 392, 361, 331, 214, 215, 193, 154, 140,
                          66, 41, 12}
    palette = brewer['PRGn'][11]

    sizes = []
    colours = []
    for station in df.index:
        key = int(station)
        if key in special_stations_1:
            sizes.append(20)
            colours.append(palette[10])
        elif key in special_stations_2:
            sizes.append(20)
            colours.append(palette[0])
        else:
            sizes.append(10)
            colours.append(palette[5])

    source = ColumnDataSource({
        'x': model.embedding_[:, 0],
        'y': model.embedding_[:, 1],
        'station_key': df.index,
        'sizes': sizes,
        'colours': colours
    })
    p.circle(x='x', y='y', source=source, fill_color='colours',
             line_color=brewer['PRGn'][7][0], size='sizes', fill_alpha=0.6)

    #labels = LabelSet(x='x', y='y', text='station_key', level='glyph',
    #                  x_offset=5, y_offset=5, source=source, render_mode='canvas',
    #                  text_font_size='8px')
    p.add_tools(HoverTool(tooltips=[('station', '@station_key')]))

    # Name the method that was actually used rather than always "MDS".
    p.xaxis.axis_label = '{} Embedded Coordinate 1'.format(manifold)
    p.yaxis.axis_label = '{} Embedded Coordinate 2'.format(manifold)
    p.yaxis.major_tick_line_color = None
    p.xaxis.major_tick_line_color = None

    #p.add_layout(labels)
    # p.add_layout(citation)
    show(p)
    return p
def tsne(G, vectors):
    """Lay out the nodes of ``G`` from a 2-D t-SNE embedding of ``vectors``.

    The i-th value of ``vectors`` (in iteration order) is paired with the
    i-th entry of ``list(G.nodes)``.

    Returns a dict mapping node -> embedded 2-D point.
    """
    vector_list = list(vectors.values())
    nodes = list(G.nodes)

    # fit_transform() fits the model itself; the extra fit() that used to
    # precede it only ran the whole optimisation twice.
    embedded = TSNE(n_components=2).fit_transform(vector_list)

    return {nodes[i]: point for i, point in enumerate(embedded)}
def tSNE_tackle(train_X, n_components):
    """Return the t-SNE embedding of ``train_X`` with ``n_components`` columns.

    The model is created with ``verbose=1``.
    """
    tsne = TSNE(n_components=n_components, verbose=1)
    # fit_transform() fits the model itself; calling fit() first as well only
    # ran the whole optimisation twice.
    X_new = tsne.fit_transform(train_X)
    return X_new
def _save_or_show(args):
    """Save the current figure to ``args.save`` if set, otherwise show it."""
    if args.save:
        plt.savefig(args.save, dpi=120)
    else:
        plt.show()


def plot_data(args, seq, original_seq=None):
    """Plot a sequence of points, optionally after PCA and/or t-SNE.

    Parameters
    ----------
    args : options object; the attributes read are ``delta``, ``pca``,
        ``tsne``, ``strip`` and ``save``.
    seq : 2-D array, one point per row.
    original_seq : optional reference sequence. When given, PCA is fitted on
        it and applied to ``seq``; for t-SNE both sequences are embedded
        together and the rows belonging to ``seq`` are kept.

    With ``args.delta`` set, only a histogram of the distances between
    consecutive points is produced. Otherwise the (reduced) sequence is
    plotted if it has exactly two columns; if not, a message is printed.
    """
    if args.delta:
        plt.figure()
        dist = np.sum((seq[1:, ...] - seq[:-1, ...])**2, axis=1)**0.5
        plt.hist(dist)
        _save_or_show(args)
        return

    if args.pca:
        pca = PCA(n_components=args.pca)
        if original_seq is None:
            seq = pca.fit_transform(seq)
        else:
            original_seq = pca.fit_transform(original_seq)
            seq = pca.transform(seq)

    if args.tsne:
        tsne = TSNE(n_components=2, perplexity=30.0, n_iter=2000, verbose=2)
        if original_seq is None:
            seq = tsne.fit_transform(seq)
        else:
            # t-SNE has no transform() for unseen points, so embed the
            # reference and the new sequence in one run and keep the rows
            # that belong to ``seq``.
            n_reference = len(original_seq)
            joint = tsne.fit_transform(
                np.concatenate([original_seq, seq], axis=0))
            seq = joint[n_reference:]

    if seq.shape[1] != 2:
        print("Cannot plot sequence: data is of size {}".format(seq.shape))
        return

    plt.figure()
    x, y = zip(*seq[:, :])
    color_list = cm.get_cmap(name="viridis")
    last = seq.shape[0] - 1
    if args.strip:
        # Draw line segments of n points, starting every m points.
        n, m = tuple(args.strip)
        for i in range(0, last, m):
            plt.plot(x[i:(i + n)], y[i:(i + n)], '-', color=color_list(i / last))
    else:
        for i in range(last):
            plt.plot(x[i:(i + 2)], y[i:(i + 2)], '.', color=color_list(i / last))
    plt.axis('equal')
    _save_or_show(args)
def viz_js_stations_two(df, manifold='TSNE'):
    """Embed a precomputed distance table in 2-D, coloured by city.

    Parameters
    ----------
    df : table whose ``.values`` is the precomputed distance matrix and whose
        index has the station key at level 0 and the city at level 1.
    manifold : ``'TSNE'`` or ``'MDS'``.

    Returns
    -------
    The bokeh figure (it is not shown here).

    Raises
    ------
    ValueError
        If ``manifold`` is neither ``'TSNE'`` nor ``'MDS'``.
    """
    if manifold == 'TSNE':
        model = TSNE(metric='precomputed')
    elif manifold == 'MDS':
        model = MDS(dissimilarity='precomputed')
    else:
        raise ValueError('Unknown manifold method: {}'.format(manifold))
    model.fit(df.values)

    p = figure()
    p = _axis_adjust(p)

    source = ColumnDataSource({
        'x': model.embedding_[:, 0],
        'y': model.embedding_[:, 1],
        'station_key': df.index.get_level_values(0),
        'city': df.index.get_level_values(1)
    })

    city_palette = [brewer['PRGn'][7][0], brewer['PRGn'][7][6]]
    city_names = ['London', 'Taipei']
    p.circle(
        x='x', y='y', source=source,
        fill_color=factor_cmap('city', city_palette, city_names),
        line_color=factor_cmap('city', city_palette, city_names),
        size=10.0, fill_alpha=0.6)

    # Built but not attached: the add_layout call below is disabled.
    labels = LabelSet(x='x', y='y', text='station_key', level='glyph',
                      x_offset=5, y_offset=5, source=source,
                      render_mode='canvas', text_font_size='6px')
    p.add_tools(HoverTool(tooltips=[('station', '@station_key')]))

    # Name the method that was actually used rather than always "MDS".
    p.xaxis.axis_label = '{} Embedded Coordinate 1'.format(manifold)
    p.yaxis.axis_label = '{} Embedded Coordinate 2'.format(manifold)
    p.yaxis.major_tick_line_color = None
    p.xaxis.major_tick_line_color = None

    #p.add_layout(labels)
    return p
def _fit_embedding(self, method = 'tsne', n_components = 2, random_state = 1,
                   verbose = 2, n_neighbors = 15, min_dist = 0.1, **kwargs):
    """
    Embed ``self.dist_matrix`` and store the result on the instance.

    parameters
    -----------------
    method: {'tsne', 'umap', 'mds'}, algorithm to embed high-D to 2D
    n_components, random_state, verbose: forwarded to the chosen algorithm
    n_neighbors, min_dist: forwarded to UMAP only
    kwargs: the extra parameters for the corresponding algorithm. ``metric``
        (default 'precomputed') goes to TSNE/UMAP and is dropped for MDS;
        ``dissimilarity`` (default 'precomputed') goes to MDS.

    Sets ``self.embedded`` (the fitted estimator) and ``self.df_embedding``
    (columns 'x' and 'y' indexed by ``self.flist``, joined with
    ``self.bitsinfo`` on 'IDs', plus a 'Channels' copy of 'Subtypes').

    raises
    -----------------
    ValueError: if ``method`` is not one of the supported names.
    """
    dist_matrix = self.dist_matrix
    metric = kwargs.pop('metric', 'precomputed')

    if method == 'tsne':
        embedded = TSNE(n_components=n_components,
                        random_state=random_state,
                        metric = metric,
                        verbose = verbose,
                        **kwargs)
    elif method == 'umap':
        embedded = UMAP(n_components = n_components,
                        n_neighbors = n_neighbors,
                        min_dist = min_dist,
                        verbose = verbose,
                        random_state=random_state,
                        metric = metric,
                        **kwargs)
    elif method == 'mds':
        # MDS takes ``dissimilarity`` where the others take ``metric``.
        dissimilarity = kwargs.pop('dissimilarity', 'precomputed')
        embedded = MDS(metric = True,
                       n_components= n_components,
                       verbose = verbose,
                       dissimilarity = dissimilarity,
                       random_state = random_state,
                       **kwargs)
    else:
        # Fail clearly instead of hitting an unbound variable below.
        raise ValueError('Unknown embedding method: {}'.format(method))

    embedded = embedded.fit(dist_matrix)

    df = pd.DataFrame(embedded.embedding_, index = self.flist, columns=['x', 'y'])
    typemap = self.bitsinfo.set_index('IDs')
    df = df.join(typemap)
    df['Channels'] = df['Subtypes']
    self.df_embedding = df
    self.embedded = embedded
def labtest_TSNE(PID):
    """Return the 2-D t-SNE embedding of the scaled lab tests of ``PID``.

    For every id in ``PID`` the ``'tests'`` entry of ``patients[id]`` is
    collected; the collection is passed through ``pp.scale`` before fitting.
    """
    tests = []
    for pid in PID:
        tests.append(patients[pid]['tests'])
    scaled = pp.scale(tests)

    model = TSNE(
        n_components=2,
        perplexity=30.0,
        learning_rate=1000.0,
        n_iter=1000,
        n_iter_without_progress=30,
        min_grad_norm=1e-07,
        angle=0.5,
    )
    fitted = model.fit(scaled)
    return fitted.embedding_
def test_reduction_to_one_component():
    """t-SNE should allow reduction to one component (issue #4154)."""
    rng = check_random_state(0)
    model = TSNE(n_components=1)
    samples = rng.randn(5, 2)
    embedding = model.fit(samples).embedding_
    # Every embedded coordinate must be a finite number.
    assert np.isfinite(embedding).all()
def _fit_embedding(self, dist_matrix, method='umap', n_components=2,
                   random_state=32, verbose=2, n_neighbors=15, min_dist=0.1,
                   **kwargs):
    """
    Fit an embedding estimator on ``dist_matrix`` and return it.

    parameters
    -----------------
    dist_matrix: distance matrix to fit
    method: {'tsne', 'umap', 'mds'}, algorithm to embed high-D to 2D
    n_components, random_state, verbose: forwarded to the chosen algorithm
    n_neighbors, min_dist: forwarded to UMAP only
    kwargs: the extra parameters for the corresponding algorithm. ``metric``
        (default 'precomputed') goes to TSNE/UMAP and is dropped for MDS;
        ``dissimilarity`` (default 'precomputed') goes to MDS.

    returns
    -----------------
    The value returned by the estimator's ``fit``.

    raises
    -----------------
    ValueError: if ``method`` is not one of the supported names.
    """
    metric = kwargs.pop('metric', 'precomputed')

    if method == 'tsne':
        embedded = TSNE(n_components=n_components,
                        random_state=random_state,
                        metric=metric,
                        verbose=verbose,
                        **kwargs)
    elif method == 'umap':
        embedded = UMAP(n_components=n_components,
                        n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        verbose=verbose,
                        random_state=random_state,
                        metric=metric,
                        **kwargs)
    elif method == 'mds':
        # MDS takes ``dissimilarity`` where the others take ``metric``.
        dissimilarity = kwargs.pop('dissimilarity', 'precomputed')
        embedded = MDS(metric=True,
                       n_components=n_components,
                       verbose=verbose,
                       dissimilarity=dissimilarity,
                       random_state=random_state,
                       **kwargs)
    else:
        # Fail clearly instead of hitting an unbound variable below.
        raise ValueError('Unknown embedding method: {}'.format(method))

    embedded = embedded.fit(dist_matrix)
    return embedded
def embed(self, M):
    """Embed a distance matrix using TSNE.

    Parameters
    ----------
    M : :obj:`ndarray`
        The precomputed distance matrix to embed.

    Returns
    -------
    :obj:`ndarray`
        The embedding, with ``self.num_components`` dimensions.
    """
    reducer = TSNE(metric="precomputed", n_components=self.num_components)
    reducer.fit(M)
    return reducer.embedding_
def train_tsne(training_size=2000, metric='cosine', n_components=3,
               perplexity=100, angle=.12):
    """Fit t-SNE on LSA topic vectors of the labelled tweets.

    Loads ``tweets.csv.gz`` and a saved LSI model from ``BIGDATA_PATH``,
    converts every tweet text to an LSA vector (via TF-IDF), drops rows with
    missing topic values, splits into train/test and fits t-SNE on the
    pairwise distances of the training vectors.

    Parameters
    ----------
    training_size : a value below 1 is treated as the fraction of rows to
        train on, otherwise as the number of training rows.
    metric : distance metric handed to ``positive_distances``.
    n_components, perplexity, angle : forwarded to ``TSNE``. The original
        author's note "adjust this downward to see if it affects accuracy"
        sat next to the signature, presumably about ``angle``.

    Returns
    -------
    ``(tsne, X, Xtest, y, ytest)``: the fitted model, the train/test topic
    vectors, and the train/test ``isbot`` values.
    """
    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]
    gc.collect()  # reclaim RAM released above

    lsa = LsiModel.load(
        os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)

    # A plain list of bag-of-words documents. The old ``pd.np.array(...)``
    # relied on the ``pd.np`` alias (removed in pandas 2.0) and on NumPy
    # accepting ragged nested lists.
    bows = [lsa.id2word.doc2bow(txt.split()) for txt in tweets.text]

    X = pd.DataFrame(
        [pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
        index=tweets.index)

    # Keep only the rows that have a value for every topic.
    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    X = X[mask]
    y = tweets.isbot[mask]

    if training_size < 1:
        test_size = 1.0 - training_size
    else:
        test_size = float(len(X) - training_size) / len(X)

    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest = X.loc[Xindex], X.loc[Xindex_test]
    y, ytest = y.loc[yindex], y.loc[yindex_test]

    tsne = TSNE(metric='precomputed', n_components=n_components, angle=angle,
                perplexity=perplexity)
    tsne = tsne.fit(positive_distances(X.values, metric=metric))
    return tsne, X, Xtest, y, ytest
def dimension_reduction_TSNE(arr0, n_components=2):
    """Embed ``arr0`` with t-SNE (``random_state=0``).

    Also turns off NumPy's scientific-notation printing as a side effect.

    Returns
    -------
    ``(label, kl_divergence)``: the embedding as a nested list and the
    ``kl_divergence_`` of the fit.
    """
    matrix = np.array(arr0)
    t_sne = TSNE(n_components=n_components, random_state=0)
    np.set_printoptions(suppress=True)

    result = t_sne.fit(matrix)
    kl_divergence = result.kl_divergence_
    # The fitted model already holds the embedding. Calling fit_transform()
    # again repeated the whole optimisation with the same fixed seed.
    label = result.embedding_.tolist()
    return label, kl_divergence
class TSNERepresentation(Representation):
    """Representation whose embedding algorithm is a ``TSNE`` estimator.

    Inputs are min-max normalised with the range recorded at fit time before
    they reach the algorithm.
    """

    @staticmethod
    def default_config():
        """Return the parent's default config plus the TSNE parameters."""
        default_config = Representation.default_config()

        # parameters
        default_config.parameters = Dict()
        default_config.parameters.perplexity = 30.0
        default_config.parameters.init = "random"
        default_config.parameters.random_state = None

        return default_config

    def __init__(self, n_features=28 * 28, n_latents=10, config=None, **kwargs):
        # A fresh dict per call: a ``{}`` default would be one object shared
        # by every instance created without an explicit config.
        config = {} if config is None else config
        Representation.__init__(self, config=config, **kwargs)

        # input size (flatten)
        self.n_features = n_features
        # latent size
        self.n_latents = n_latents
        # feature range
        self.feature_range = (0.0, 1.0)

        self.algorithm = TSNE(n_components=self.n_latents)
        self.update_algorithm_parameters()

    def fit(self, X_train, update_range=True):
        '''
        Fit the algorithm on the normalised training data.

        X_train: array-like (n_samples, n_features)
        update_range: when True, record the per-feature (min, max) of
            X_train and use it for normalisation from now on.
        '''
        X_train = np.nan_to_num(X_train)
        if update_range:
            # save (min, max) for normalization
            self.feature_range = (X_train.min(axis=0), X_train.max(axis=0))
        X_train = (X_train - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        self.algorithm.fit(X_train)

    def calc_embedding(self, x):
        """Normalise ``x`` with the stored range and transform it.

        NOTE(review): this calls ``self.algorithm.transform``; if ``TSNE`` is
        scikit-learn's estimator it has no such method - confirm which
        implementation is imported here.
        """
        x = (x - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        x = self.algorithm.transform(x)
        return x

    def update_algorithm_parameters(self):
        """Push ``self.config.parameters`` to the algorithm, with verbose off."""
        self.algorithm.set_params(**self.config.parameters, verbose=False)
def TSNE(X_train, y_train=None, X_test=None, n=100):
    """Embed the training data (and optionally test data) with t-SNE.

    Parameters
    ----------
    X_train : array-like of training samples.
    y_train : accepted for interface parity and passed to ``fit_transform``
        when only the training data is embedded.
    X_test : optional array-like of test samples.
    n : number of embedding dimensions.

    Returns
    -------
    The training embedding, or ``(train_embedding, test_embedding)`` when
    ``X_test`` is given.

    Notes
    -----
    t-SNE cannot project unseen samples, so train and test rows are embedded
    together in one run and then split. The Barnes-Hut solver only supports
    fewer than 4 dimensions, so the exact solver is used for larger ``n``.
    """
    import numpy as np
    # Aliased so the import does not shadow this function's own name.
    from sklearn.manifold import TSNE as SklearnTSNE

    method = 'barnes_hut' if n < 4 else 'exact'
    mod = SklearnTSNE(n_components=n, method=method)

    if X_test is None:
        return mod.fit_transform(X_train, y_train)

    n_train = len(X_train)
    joint = mod.fit_transform(
        np.vstack([np.asarray(X_train), np.asarray(X_test)]))
    return joint[:n_train], joint[n_train:]
def input_stats(df=None):
    '''Interactively cluster ``df`` and return per-cluster column means.

    The numeric columns are standardised and embedded with t-SNE. The user
    is prompted for a range of k; inertia and silhouette score are plotted
    for each k; the user then enters the chosen k. K-Means labels from that
    k are attached to a copy of ``df`` as ``kmeans_labels``.

    Returns the transposed table of per-cluster means of the numeric columns.
    '''
    df_num = df.select_dtypes('number')
    ds = StandardScaler().fit_transform(df_num)
    data_scaled = pd.DataFrame(ds, columns=df_num.columns)
    print('Numeric shape of dataframe is: ', df_num.shape)

    tsne = TSNE()
    tsne.fit(data_scaled)
    tsne_df = pd.DataFrame(tsne.embedding_, columns=['e1', 'e2'])

    s, e = int(input('k range start:')), int(input('k range end:'))
    krange = list(range(s, e + 1))
    inertia = []
    silo = []
    for k in krange:
        kmodel = KMeans(k)
        k_labs = kmodel.fit_predict(tsne_df)
        inertia.append(kmodel.inertia_)
        silo.append(silhouette_score(tsne_df, k_labs))

    print('Be advice! You will have to choose k from below two graphs for next process!')
    # x/y passed by keyword: recent seaborn no longer accepts them
    # positionally.
    sns.lineplot(x=krange, y=inertia)
    plt.title('k value and inertia')
    plt.show()
    sns.lineplot(x=krange, y=silo)
    plt.title('k value and silhouette score')
    plt.show()

    dfcopy = df.copy()
    k = int(input('input optimal k:'))
    km = KMeans(k)
    k_labs = km.fit_predict(tsne_df)
    dfcopy['kmeans_labels'] = k_labs
    # numeric_only keeps the old "skip non-numeric columns" result; without
    # it pandas 2 raises on text columns.
    return dfcopy.groupby('kmeans_labels').mean(numeric_only=True).T
def compute_cluster_color(nodes, vectors, k):
    """Cluster the node vectors into ``k`` groups and colour every node.

    Returns
    -------
    ``(clusters, color_list)``: the result of ``kmeans(vectors, K=k)`` and,
    for each node in ``nodes`` order, ``COLOR_MAP[clusters[node]]``.
    """
    vector_list = list(vectors.values())

    # fit_transform() fits the model itself; the extra fit() that used to
    # precede it only ran the whole optimisation twice.
    embedded = TSNE(n_components=3).fit_transform(vector_list)

    # 3-D t-SNE vectors keyed by node. NOTE(review): only the disabled
    # alternatives below consume these; the active kmeans call clusters the
    # original vectors - confirm whether that is intended.
    temp_nodes = list(nodes)
    temp_vectors = {temp_nodes[i]: point for i, point in enumerate(embedded)}

    clusters = kmeans(vectors, K=k)
    # clusters = mean_shift(temp_vectors)
    # clusters = dbscan(vectors)
    # clusters = dbscan(temp_vectors)
    # clusters = optics(vectors)

    color_list = [COLOR_MAP[clusters[node]] for node in nodes]
    return clusters, color_list
def data_embedding(self, type='TSNE'):
    '''
    Fit ``self.distance_matrix`` into a two-dimensional embedded space using
    the TSNE or MDS model.

    type: 'TSNE' or 'MDS'; both treat the matrix as precomputed distances.

    Returns the positions of the points in the embedding space.
    Raises ValueError for any other ``type``.
    '''
    if type == 'TSNE':
        model = TSNE(n_components=2, metric='precomputed')
    elif type == 'MDS':
        model = MDS(n_components=2, max_iter=3000, eps=1e-9,
                    dissimilarity="precomputed", n_jobs=1)
    else:
        # Fail clearly instead of hitting an unbound variable below.
        raise ValueError('Unknown embedding type: {}'.format(type))

    # position of points in embedding space
    pos = model.fit(self.distance_matrix).embedding_
    return pos
def topic_classification_gensim_fit(filename_2, topic_number, top_idf_number,
                                    lda_model, common_dictionary):
    """Compute the normalised topic distribution of a document collection.

    The texts from ``process_doc(filename_2, top_idf_number)`` are converted
    to bag-of-words with ``common_dictionary`` and passed through
    ``lda_model``. The per-document topic vectors are plotted as a 2-D t-SNE
    embedding; the topic weights summed over all documents are divided by
    their Euclidean norm, printed and returned.

    Parameters
    ----------
    filename_2 : input handed to ``process_doc``; also used in the printout.
    topic_number : number of topics; sets the length of every topic vector.
    top_idf_number : forwarded to ``process_doc``.
    lda_model : indexable by a bag-of-words document, yielding
        ``(topic_id, weight)`` pairs.
    common_dictionary : provides ``doc2bow``.
    """
    topic_1 = [0.00 for n in range(topic_number)]
    common_texts = process_doc(filename_2, top_idf_number)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    Y = []
    for unseen_doc in common_corpus:
        # Sized from topic_number rather than a fixed 35, so models with more
        # topics no longer index past the end of the row.
        y = np.zeros(topic_number)
        for topic_id, weight in lda_model[unseen_doc]:
            topic_1[topic_id] += weight
            y[topic_id] = weight
        Y.append(y)
    Y = np.array(Y)

    tsne = TSNE(n_components=2)
    tsne.fit(Y)
    plt.plot(tsne.embedding_[:, 0], tsne.embedding_[:, 1])
    plt.show()

    topic_1 = np.array(topic_1) / np.linalg.norm(topic_1)
    print(filename_2 + " word distribution:")
    print(topic_1)
    return topic_1
# Report the shapes of the auto-encoded train and test features.
print("prefilter_train: ", prefilter_train.shape)
print("prefilter_test: ", prefilter_test.shape)

# 2-D PCA view of the auto-encoded training features.
print("Performing PCA")
X_pca = pca(prefilter_train)
plotScatter(
    X_pca,
    y_train,
    title="6_PCA reduction (2d) of auto-encoded data (%dd)" % prefilter_train.shape[1],
)

# 2-D t-SNE view of the first 1000 auto-encoded training samples.
print("Performing TSNE")
model = TSNE(n_components=2, random_state=0, init="pca")
toPlot = model.fit_transform(prefilter_train[:1000])
plotTSNE(
    toPlot,
    y_train[:1000],
    nb_classes,
    "7_t-SNE embedding of auto-encoded data ",
)

print("Classifying and comparing")

# Classify results from Autoencoder: a single dense layer followed by a
# softmax, trained on the auto-encoded features.
print("Building classical fully connected layer for classification")
model = Sequential()
model.add(Dense(prefilter_train.shape[1], nb_classes, activation=activation))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(
    prefilter_train,
    Y_train,
    batch_size=batch_size,
    nb_epoch=nb_epoch,
    show_accuracy=False,
    verbose=0,
    validation_data=(prefilter_test, Y_test),
)

score = model.evaluate(prefilter_test, Y_test, verbose=0, show_accuracy=True)
print('\nscore:', score)

# Relative change, in percent, against the classical model's loss (index 0)
# and accuracy (index 1).
for _caption, _slot in (('Loss change:', 0), ('Accuracy change:', 1)):
    print(_caption, 100*(score[_slot] - classical_score[_slot])/classical_score[_slot], '%')
def __plot_samples__(self, dfs, fold):
    """
    Plot MDS and t-SNE embeddings of the training and test samples of a fold
    side by side, and save the figure under ``../figs/``.

    :type dfs: List[pandas DataFrame]   # [training df, testing df]
    :type fold: int
    :rtype: None
    """
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              dissimilarity='euclidean', n_jobs=-1)
    tsne = TSNE(n_components=2)

    # change label to color index
    # author 1 train (0 = light blue), author 1 test (1 = dark blue)
    # author 2 train (2 = light green), author 2 test (3 = dark green)
    df0_copy = dfs[0].copy()
    df0_copy.loc[(df0_copy.label == 1).values, 'label'] = 0
    df0_copy.loc[(df0_copy.label == -1).values, 'label'] = 2

    df1_copy = dfs[1].copy()
    df1_copy.loc[(df1_copy.label == 1).values, 'label'] = 1
    df1_copy.loc[(df1_copy.label == -1).values, 'label'] = 3

    # DataFrame.append was removed in pandas 2.0; concat stacks the same rows.
    df_all = pd.concat([df0_copy, df1_copy])

    legend = {0: 'Author 1 Training Sample',
              1: 'Author 1 Test Sample',
              2: 'Author 2 Training Sample',
              3: 'Author 2 Test Sample'}

    # Both embeddings are fitted on the combined train + test features.
    features = df_all.drop('label', axis=1)
    pos_lst = [('Multi-Dimensional Scaling (MDS)',
                mds.fit(features).embedding_),
               ('t-Distributed Stochastic Neighbor Embedding (TSNE)',
                tsne.fit(features).embedding_)]

    # plot
    colors = sns.color_palette('Paired', 4)
    fig = plt.figure(figsize=(16, 7))

    # plt.hold only exists in matplotlib < 3.0; later versions always keep
    # earlier artists, so the call is made only where it is available.
    legacy_hold = getattr(plt, 'hold', None)
    if legacy_hold is not None:
        legacy_hold(True)

    for k, (title, pos) in enumerate(pos_lst, 1):
        ## fig.add_subplot() works in ipython notebook but creates a
        ## mysterious 3rd axes in python...
        ax = plt.subplot(1, 2, k)
        ax.set_title(title)
        for i, color in enumerate(colors):
            samples = pos[(df_all.label == i).values, :]
            ax.scatter(samples[:, 0], samples[:, 1], c=color,
                       edgecolor='none', label=legend[i])
        ax.legend()

    if legacy_hold is not None:
        legacy_hold(False)

    # File name: the part of the stats table name after its first "_",
    # followed by the fold number.
    table_suffix = self.__PG_STATS_TBL__[self.__PG_STATS_TBL__.find("_")+1:]
    plt.savefig('../figs/' + table_suffix + 'fold' + str(fold) + '.png',
                dpi=300, transparent=True)
    plt.close(fig)
img = imread(df.local_path.loc[i]) if img.shape[0] < 200 or img.shape[1] < 200: df.drop(i) else: img_gray = color.rgb2gray(img) fd = hog(img_gray, orientations=9, pixels_per_cell=(8, 8),cells_per_block=(4, 4)) vector_list.append(fd) print i, len(fd), df.local_path.loc[i] X = np.vstack(vector_list) from sklearn.manifold import TSNE as tsne tsne = tsne(n_components=2) tsne.fit(X) subspace_tsne = pd.DataFrame(tsne.fit_transform(X),columns=["x","y"]) num_bins = 64 subspace_tsne['grid_x'] = pd.cut(subspace_tsne['x'],num_bins,labels=False) subspace_tsne['grid_y'] = pd.cut(subspace_tsne['y'],num_bins,labels=False) subspace_tsne['local_path'] = df.local_path[:len(subspace_tsne)] # I should save the dataframe here, so later maybe I can use full images thumb_side = 128 from PIL import Image
import matplotlib.pyplot as plt  # matplotlib 1.4.3
from sklearn.manifold import TSNE  # scikit-learn 0.17
import pandas  # pandas 0.16.2

# Read data
data = pandas.read_csv("data.csv", sep=",")

# Fit model: the table is transposed, so every column becomes one sample.
model = TSNE(
    n_components=2,
    perplexity=10,
    verbose=2,
    method='barnes_hut',
    init='pca',
    n_iter=1000,
)
model.fit(data.values.T)
embedding = model.embedding_

# Plot results: one grey dot per column, annotated with the column name.
hFig, hAx = plt.subplots()
hAx.scatter(embedding[:, 0], embedding[:, 1], 20, color="grey")
for txt, point in zip(data.keys(), embedding):
    hAx.annotate(txt, (point[0], point[1]))