def tsne_kmeans_clusters(tfidf, num_clusters=None):
    '''
    Save one t-SNE projection, coloured by KMeans cluster, per candidate k.

    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are not
    normalized, we have to redo the normalization.

    Parameters
    ----------
    tfidf : matrix exposing ``shape``
        Document-term matrix to reduce and cluster; ``tfidf.shape[0]`` is
        used as the document count in the output file name.
    num_clusters : iterable of int, optional
        Cluster counts to try. Defaults to ``[3, 5, 7, 9, 11]``.

    Returns
    -------
    None
        Each plot is written under ``images/tsne_projections/`` and the
        file name and elapsed time are printed.
    '''
    # A list literal as default would be shared between calls.
    if num_clusters is None:
        num_clusters = [3, 5, 7, 9, 11]

    print(
        '\nUse sklearn tSNE to visualize viability of cluster estimates to inform n topic choices: {}'
        .format(num_clusters))

    # The LSA reduction does not depend on k and is seeded, so it is
    # computed once instead of once per cluster count.
    svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced = lsa.fit_transform(tfidf)

    for k in num_clusters:
        start = datetime.now()

        # next, apply kmeans to the corpus to get labels
        clusters = KMeans(n_clusters=k, init='k-means++')
        clusters.fit(reduced)

        # decompose=None: the data has already been reduced above.
        tsne = TSNEVisualizer(decompose=None)
        tsne.fit(reduced, ["cluster {}".format(c) for c in clusters.labels_])
        tsne.finalize()

        filename = r'images/tsne_projections/tSNE_wKMeans_SVD_' + str(
            k) + '_clusters_' + str(tfidf.shape[0]) + '_docs.png'
        plt.savefig(filename)
        plt.close()

        end = datetime.now()
        print(' ' + filename)
        print(" Time taken: {}".format(end - start))
def tsne_visualization(dataset_object, num_examples):
    """
    Plot, save and display a T-SNE projection of a sample of feature vectors.

    Parameters
    ----------
    dataset_object: tv_net.dataset.Dataset
        dataset object containing feature vectors and class names
    num_examples: int
        number of examples to plot
    """
    # Shuffle first so the leading slice is not all one class.
    dataset_object.shuffle_examples()
    sample = dataset_object.items[:num_examples]

    vectors = []
    class_names = []
    for example in sample:
        vectors.append(example.feature_vector)
        class_names.append(example.class_name)
    feature_vectors = np.array(vectors)

    plot_title = 'T-SNE of feature vectors extracted from baseline classifier - using random sample of {} images'.format(
        num_examples)
    visualizer = TSNEVisualizer(colormap='rainbow', title=plot_title)
    visualizer.fit(feature_vectors, class_names)

    destination = os.path.join(dataset_object.config.OUTPUT_DIR,
                               'visualizations', 'feature_vector_tsne.png')
    # First call writes the image to disk, second call displays it.
    visualizer.show(outpath=destination)
    visualizer.show()
def result(self):
    """
    Save a 2-D t-SNE scatter plot and draw a TF-IDF based TSNEVisualizer.

    The scatter plot is built from ``self.clean_data.getSpambase_data()``,
    coloured by ``self.clean_data.target()`` and written to
    ``self.file_png``. The visualizer is fitted on TF-IDF vectors of
    ``self.clean_data.data()``.
    """
    data_df = self.clean_data.data()
    all_data_df = self.clean_data.getSpambase_data()
    target_df = self.clean_data.target()

    # Fit the t-SNE model and split the embedding into its two axes.
    embedding = TSNE(learning_rate=100).fit_transform(all_data_df)
    x_axis = embedding[:, 0]
    pprint.pprint(x_axis)
    y_axis = embedding[:, 1]
    pprint.pprint(y_axis)

    plt.scatter(x_axis, y_axis, c=target_df)
    plt.savefig(self.file_png)

    # Create the visualizer and draw the vectors
    docs = TfidfVectorizer().fit_transform(data_df)
    visualizer = TSNEVisualizer()
    visualizer.fit(docs, target_df)
    visualizer.poof()
def tsne():
    """Fit a TSNEVisualizer on the TF-IDF hobbies corpus and save it as "corpus_tsne"."""
    hobbies = load_hobbies()
    vectors = TfidfVectorizer().fit_transform(hobbies.data)

    visualizer = TSNEVisualizer(ax=newfig())
    visualizer.fit(vectors, hobbies.target)
    savefig(visualizer, "corpus_tsne")
def text_cluster_tsne(text_vector,
                      TextVectorizer=TfidfVectorizer,
                      text_kwargs=text_kwargs,
                      n_clusters=10,
                      labels=None):
    '''Vectorize text and draw a TSNE visualization of the result.

    The points are labelled with ``labels`` when given; otherwise KMeans
    clusters are derived from the vectorized text and used as labels.

    ARGS:
        text_vector <np.array>: Vector of text units. Must be type str.
    KWARGS:
        TextVectorizer <sklearn.feature_extraction.text>: Transformer.
        text_kwargs <dict>: kwargs to pass to TextVectorizer
        n_clusters <int>: If not using labels, number of clusters in KMeans
        labels <np.array>: True categorical labels. Discrete.
    RETURNS:
        None, the visualization is displayed.
    '''
    docs = TextVectorizer(**text_kwargs).fit_transform(text_vector)
    visualizer = TSNEVisualizer()

    if labels is not None:
        # Caller supplied the labels.
        visualizer.fit(docs, labels)
    else:
        # No labels: fall back to KMeans cluster assignments.
        kmeans = KMeans(n_clusters=n_clusters)
        kmeans.fit(docs)
        cluster_names = []
        for cluster_id in kmeans.labels_:
            cluster_names.append("cluster_{}".format(cluster_id))
        visualizer.fit(docs, cluster_names)

    sns.despine()
    visualizer.poof()
def perform_tsne(X, Y, vec=None, outpath="", clusterLabels=False, savePlot=False):
    """
    Vectorize ``X`` and draw a yellowbrick t-SNE plot labelled by ``Y``.

    Parameters
    ----------
    X : iterable
        Documents handed to ``vec.fit_transform``.
    Y : iterable
        One label per document. With ``clusterLabels=True`` each value is
        rendered as ``"c<value>"`` (e.g. ``Y = clusters.labels_``).
    vec : vectorizer, optional
        Object with ``fit_transform``. Defaults to a ``TfidfVectorizer``
        that uses ``identity`` as both preprocessor and tokenizer.
    outpath : str
        Destination passed to ``poof`` when ``savePlot`` is true.
    clusterLabels : bool
        Prefix labels with ``"c"`` as described above.
    savePlot : bool
        Save to ``outpath`` instead of only displaying the plot.
    """
    # Identity comparison: `== None` can be hijacked by a custom __eq__.
    if vec is None:
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
    docs = vec.fit_transform(X)

    labels = ["c{}".format(c) for c in Y] if clusterLabels else Y

    tsne = TSNEVisualizer()
    tsne.fit(docs, labels)

    if savePlot:
        tsne.poof(outpath=outpath)
    else:
        tsne.poof()
def tsne_pack(c, l):
    """
    Draw a t-SNE plot of the ``df`` columns whose name contains ``c``.

    Parameters
    ----------
    c : str
        Substring used to select columns of the module-level ``df``
        (``df.filter(like=c)``); also inserted into the plot title.
    l : array-like
        Labels passed as ``y`` to ``TSNEVisualizer.fit``.
    """
    my_title = "t-SNE Plot of " + c + " feature"
    data = df.filter(like=c)

    # The previous version also fitted a TfidfVectorizer on the global
    # `corpus` and discarded the result; that dead work has been removed.
    visualizer = TSNEVisualizer(title=my_title)
    visualizer.fit(data, l)
    visualizer.poof()
def clusters_tsne(self, labels: pd.Series, title: str = 'title'):
    """
    Fit a seeded TSNEVisualizer on ``self.vectors`` and return its figure.

    Parameters
    ----------
    labels : pd.Series
        Labels passed as ``y`` to the visualizer.
    title : str
        Text used as the figure's suptitle.

    Returns
    -------
    The figure owning the object returned by ``show()``, resized to
    15 x 15.
    """
    visualizer = TSNEVisualizer(random_state=42)
    visualizer.fit(self.vectors, labels)

    figure = visualizer.show().figure
    figure.set_figheight(15)
    figure.set_figwidth(15)
    figure.suptitle(title)
    return figure
def tsne(docs, target, outpath, **kwargs):
    """
    Draw a t-SNE projection of ``docs`` on a new figure and save it.

    Parameters
    ----------
    docs : matrix
        Document vectors to project.
    target : array-like
        Labels passed as ``y`` to the visualizer.
    outpath : str
        Where ``poof`` writes the image.
    **kwargs
        Forwarded to ``TSNEVisualizer``.
    """
    # Dedicated figure with a single axes for this plot.
    axes = plt.figure().add_subplot(111)

    # Visualize the t-SNE projection of the documents.
    projection = TSNEVisualizer(ax=axes, **kwargs)
    projection.fit(docs, target)
    projection.poof(outpath=outpath)
def analyse_2_step_model():
    """
    Save silhouette and t-SNE plots for the first 1000 test samples.

    Reads ``test_prepared.npy``, takes the first 1000 rows and the first
    1000 predictions of ``test_entire_model()``, then writes
    ``silhouette.png`` and ``tsne.png``.
    """
    # this is our Single point of truth
    # `.item()` unwraps a 0-d object array, which NumPy >= 1.16.3 refuses to
    # load unless allow_pickle=True.
    # NOTE(review): unpickling can execute arbitrary code - only load a
    # trusted file here.
    X_test = np.load("test_prepared.npy", allow_pickle=True).item()

    test = X_test[0:1000]
    prediction = test_entire_model()[0:1000]

    vis_shilouette(test, prediction)
    plt.savefig("silhouette.png")

    # One colour per distinct predicted cluster.
    tsne = TSNEVisualizer(colormap=cm.get_cmap('jet', len(set(prediction))))
    # `test` is already limited to 1000 rows; no second slice is needed.
    tsne.fit(test, ["c{}".format(c) for c in prediction])
    tsne.poof(outpath="tsne.png")
def tsne(c, l):
    """
    Draw a t-SNE plot titled "t-SNE Plot of final model".

    Parameters
    ----------
    c : matrix
        Data passed as ``X`` to ``TSNEVisualizer.fit``.
    l : array-like
        Labels passed as ``y`` to ``TSNEVisualizer.fit``.
    """
    my_title = "t-SNE Plot of final model"

    # The previous version also fitted a TfidfVectorizer on the global
    # `corpus` and discarded the result; that dead work has been removed.
    visualizer = TSNEVisualizer(title=my_title)
    visualizer.fit(c, l)
    visualizer.poof()


# %%time
figure(figsize=(20, 10))
tsne(final, label_bias3)

# %%time
figure(figsize=(20, 10))
tsne(final, label_fact)
def plot_tsne_clusters(corpus, fileids=None, labels=None):
    """
    Draw a t-SNE plot of the TF-IDF vectors of a corpus' normalized titles.

    Parameters
    ----------
    corpus : object with ``title_tagged(fileids=...)``
        Source of the tagged titles.
    fileids : optional
        Forwarded to ``corpus.title_tagged``.
    labels : iterable, optional
        When given, each value is rendered as ``"c<value>"`` and used to
        colour the points; otherwise the plot is unlabelled.
    """
    from yellowbrick.text import TSNEVisualizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    tagged_titles = corpus.title_tagged(fileids=fileids)
    titles = Corpus_Vectorizer.TextNormalizer().transform(tagged_titles)

    def _sentences():
        # Flatten the normalized titles into a stream of sentences.
        for title in titles:
            for sent in title:
                yield sent

    vectors = TfidfVectorizer().fit_transform(_sentences())

    if labels is None:
        target = None
    else:
        target = ["c{}".format(c) for c in labels]

    visualizer = TSNEVisualizer()
    visualizer.fit(vectors, target)
    visualizer.poof()
def tsne(ax, classes=True):
    """
    Fit a TSNEVisualizer on the hobbies corpus and return it.

    Parameters
    ----------
    ax : matplotlib axes
        Axes handed to the visualizer.
    classes : bool
        When false, the targets are dropped and the title is prefixed
        with "Unlabeled ".

    Returns
    -------
    The fitted ``TSNEVisualizer``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from yellowbrick.text import TSNEVisualizer

    paths, target = load_data("hobbies", text=True)

    title = "t-SNE Projection of the Hobbies Corpus"
    if not classes:
        target = None
        title = "Unlabeled " + title

    # input='filename': the loaded items are read from disk by the vectorizer.
    vectors = TfidfVectorizer(input='filename',
                              stop_words='english').fit_transform(paths)

    visualizer = TSNEVisualizer(ax=ax)
    visualizer.title = title
    visualizer.fit(vectors, target)
    return visualizer
def generate_tsne(title, X, labels):
    """
    Render a t-SNE plot of ``X`` and save it as ``OUTPUT/<title>``.

    Parameters
    ----------
    title : str
        Plot title; also used as the output file name.
    X : matrix
        Data to project (passed to the visualizer with ``decompose=None``).
    labels : array-like
        Labels passed as ``y`` to the visualizer.
    """
    fig, ax1 = plt.subplots(1, 1, figsize=(4, 2))
    try:
        title_dic = {'fontsize': 7, 'fontweight': 'bold'}
        # Two palettes are concatenated to provide 21 colours.
        colors = resolve_colors(11, 'Spectral_r') + resolve_colors(10, 'BrBG_r')

        tsne = TSNEVisualizer(ax1, colors=colors, decompose=None)
        tsne.fit(X, labels)
        tsne.finalize()
        # finalize() sets its own title, so ours is applied afterwards.
        tsne.ax.set_title(title, title_dic)

        fig.savefig(os.path.join(OUTPUT, title))
    finally:
        # The figure is not returned, so release it; otherwise every call
        # leaves one more open matplotlib figure behind.
        plt.close(fig)
def cluster(corpus, k):
    """
    Show a t-SNE plot of TF-IDF vectors for ``(label, text)`` pairs.

    Parameters
    ----------
    corpus : sequence
        Items whose element 0 is the label and element 1 the text.
    k : unused
        Accepted for interface compatibility; not read by this function.
    """
    targets = []
    texts = []
    for entry in corpus:
        targets.append(entry[0])
        texts.append(entry[1])

    # English stop words plus corpus-specific terms.
    extra_stop_words = [
        'wall', 'president', 'trump', 'loss', 'yes', 'sorry', 'mr', 'build',
        'thank', 'people'
    ]
    stop_list = list(set(stopwords.words('english'))) + extra_stop_words

    features = TfidfVectorizer(stop_words=stop_list).fit_transform(texts)

    visualizer = TSNEVisualizer()
    visualizer.fit(features, targets)
    visualizer.show()
def tsne_plot(self, outpath, sample_size=1000, tfidf=True):
    """
    Creates a png file at `outpath` with t-SNE visualization.
    `sample_size` determines the size of the random sample from each label.
    Uses TfidfVectorizer by default; if `tfidf` is set to False,
    CountVectorizer is used.

    Returns None in every case; when no tokenizer is loaded a message is
    printed and nothing is plotted.
    -----------------------------------------------------------------------
    More info:
    https://www.scikit-yb.org/en/latest/api/text/tsne.html
    https://lvdmaaten.github.io/tsne/
    """
    if self.tokenizer is None:
        print('No tokenizer was loaded.')
        return None

    # Collect one seeded sample per label and concatenate once.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every iteration.
    samples = []
    for label in self.labels:
        samp_df = self.data \
            .query("Label == @label") \
            .sample(sample_size, random_state=19)
        samples.append(samp_df)
    if samples:
        df = pd.concat(samples, ignore_index=True)
    else:
        df = pd.DataFrame(columns=self.data.columns)

    # vectorize
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    vectorizer = vectorizer_cls(tokenizer=self.tokenizer.tokenize)
    X = vectorizer.fit_transform(df.Text)
    y = df.Label

    # create the visualizer and draw the vectors
    tsne = TSNEVisualizer()
    tsne.fit(X, y)
    tsne.show(outpath=outpath)
    return None
def analyse_results():
    """
    Write silhouette and t-SNE plots for every ``results/*.npy`` file.

    A result file is skipped when its ``<name>tsne.png`` already exists in
    ``results``, unless ``rerun`` is present in ``sys.argv``.
    """
    rerun = "rerun" in sys.argv
    if rerun:
        print("Redo everything")

    # `.item()` unwraps a 0-d object array, which NumPy >= 1.16.3 refuses to
    # load unless allow_pickle=True.
    # NOTE(review): unpickling can execute arbitrary code - only load a
    # trusted file here.
    X_test = np.load("test_prepared.npy", allow_pickle=True).item()

    # List the directory once; a set makes the "already plotted" check O(1).
    listing = os.listdir("results")
    existing = set(listing)

    pending = []
    for filename in listing:
        if not filename.endswith(".npy"):
            continue
        name = filename[:-4]
        if not rerun and name + "tsne.png" in existing:
            continue
        pending.append((name, np.load("results/" + filename)))

    total = len(pending)
    for position, (name, labels) in enumerate(pending, start=1):
        print("iteration " + str(position) + " of " + str(total) + " : " +
              name)

        vis_shilouette(X_test, labels)
        plt.savefig("results/" + name + "silhouette.png")
        plt.close()

        plt.figure()
        subset = labels[0:5000]
        tsne = TSNEVisualizer(colormap=cm.get_cmap('jet', len(set(subset))),
                              alpha=0.5,
                              random_state=45)  # make it deterministic
        tsne.fit(X_test[0:5000], ["c{}".format(c) for c in subset])
        tsne.poof(outpath="results/" + name + "tsne.png", clear_figure=True)
# freq_dist_viz(vectorizer, df_train['Lyrics'],
#               "images/tfid_stopwords_train.png")
# freq_dist_viz(vectorizer, df_test['Lyrics'], "images/tfid_stopwords_test.png")


def get_sentence_embedding(w2v_model, sentence, size=3000):
    """
    Return the mean word vector of a whitespace-tokenized sentence.

    Parameters
    ----------
    w2v_model : object exposing ``wv.get_vector(word)``
        Word-vector model; a ``KeyError`` from ``get_vector`` marks an
        unknown word.
    sentence : str
        Text to embed.
    size : int
        Length of the embedding. Defaults to 3000, the value previously
        hard-coded; it must match the length of the model's vectors.

    Returns
    -------
    numpy.ndarray
        Sum of the known word vectors divided by the total number of words
        (unknown words count as zero vectors). A sentence with no words
        yields an all-zero vector instead of NaNs from a division by zero.
    """
    words = sentence.split()
    embedding = np.zeros(size)
    if not words:
        return embedding

    for word in words:
        try:
            vector = w2v_model.wv.get_vector(word)
        except KeyError:
            # Unknown word: contributes nothing to the sum.
            continue
        embedding += vector
    return embedding / len(words)


w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
docs = np.array([
    get_sentence_embedding(w2v_model, sentence)
    for sentence in df_train['Lyrics']
])
# tfidf = TfidfVectorizer()
# docs = tfidf.fit_transform(X)
labels = df_train['Genre']

tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof(outpath="images/w2v_tsne.png")
#!/usr/bin/env python3
"""Load pickled TF-IDF vectors and categories and show their t-SNE plot."""
import pickle

from yellowbrick.text import TSNEVisualizer


def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): `agora` is loaded but not used below - confirm it is needed.
agora = _unpickle('data/agorb.csv')
X = _unpickle('data/tno/tfidf_vectors_webiq.pkl')
c = _unpickle('data/tno/categorieen.pkl')

# Project the vectors, coloured by category, and display the plot.
tsne = TSNEVisualizer()
tsne.fit(X, c)
tsne.show()
def _kmeans_scores(X, cluster_counts):
    """Fit seeded KMeans per k; return silhouette, Calinski-Harabasz and Davies-Bouldin score lists."""
    silhouette, calinski, davies = [], [], []
    for num_clusters in cluster_counts:
        k_means = KMeans(n_clusters=num_clusters, random_state=27)
        k_means.fit(X)
        prediction = k_means.predict(X)
        silhouette.append(silhouette_score(X, prediction))
        calinski.append(calinski_harabasz_score(X, prediction))
        davies.append(davies_bouldin_score(X, prediction))
    return silhouette, calinski, davies


def _plot_scores(axis, cluster_counts, silhouette, calinski, davies):
    """Plot the three score curves on ``axis``, rescaled to share one y-axis."""
    axis.plot(cluster_counts, silhouette, 'b-', label='Silhouette',
              linewidth=1)
    axis.plot(cluster_counts, [x / 500 for x in calinski], 'r--',
              label='Calinski-Harabasz / 500', linewidth=1)
    axis.plot(cluster_counts, [x / 5 for x in davies], 'g-.',
              label='Davies-Bouldin / 5', linewidth=1)
    axis.set(xlabel='K (# of clusters)', ylabel='Scores')
    axis.set_title('Clustering Scores')
    axis.legend()


def _plot_cluster_tsne(axis, X, n_clusters):
    """Cluster ``X`` with seeded KMeans, draw its t-SNE on ``axis`` and return the cluster labels."""
    k_means = KMeans(n_clusters=n_clusters, random_state=27)
    k_means.fit(X)
    prediction = k_means.predict(X)

    tsne = TSNEVisualizer(decompose_by=X.shape[1] - 1, ax=axis,
                          random_state=27)
    tsne.fit(X, ["c{}".format(c) for c in prediction])
    axis.set_title('tSNE Projection (clusters = {0})'.format(n_clusters))
    axis.set_xticklabels([])
    axis.set_yticklabels([])
    return prediction


def _report_boosting(description, dataset, X_train, y_train, X_test):
    """Print AdaBoost fit time, predict time and mean 10-fold CV score."""
    learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)

    started = time()
    learner.fit(X_train, y_train)
    fit_time = time() - started
    print('Boosting {} fit time ({}): '.format(description, dataset) +
          str(fit_time))

    started = time()
    learner.predict(X_test)
    predict_time = time() - started
    print('Boosting {} predict time ({}): '.format(description, dataset) +
          str(predict_time))

    scores = cross_val_score(learner, X_train, y_train, cv=10)
    print('Boosting {} cross validation score ({}): '.format(
        description, dataset) + str(np.mean(scores)))


def main(X_train_smart, X_test_smart, y_train_smart, y_test_smart,
         X_train_bank, X_test_bank, y_train_bank, y_test_bank, args):
    """
    Analyse KMeans cluster counts on the smart-grid and bank-loan data.

    For k in 2..24 the silhouette, Calinski-Harabasz and Davies-Bouldin
    scores are computed for both training sets and plotted next to a t-SNE
    projection of the clustering whose k is read from
    ``experiment_best.json``. Afterwards AdaBoost is timed and
    cross-validated on the true labels ("baseline") and on the cluster
    assignments ("DR + cluster") of each data set.

    Parameters
    ----------
    X_train_smart, X_test_smart, y_train_smart, y_test_smart
        Smart-grid split. ``y_test_smart`` is not read here.
    X_train_bank, X_test_bank, y_train_bank, y_test_bank
        Bank-loan split. ``y_test_bank`` is not read here.
    args : object with ``dimensionality``
        When ``args.dimensionality`` is not None, its first element selects
        the sub-dictionary of ``experiment_best.json`` to read k from.
    """
    num_clusters_list = np.arange(2, 25)
    smart_scores = _kmeans_scores(X_train_smart, num_clusters_list)
    bank_scores = _kmeans_scores(X_train_bank, num_clusters_list)

    with open('experiment_best.json') as f:
        params = json.load(f)
    if args.dimensionality is not None:
        params = params[args.dimensionality[0]]
    num_clusters_smart = params['k_means']['smart']
    num_clusters_bank = params['k_means']['bank']

    plt.rc("font", size=8)
    plt.rc("axes", titlesize=12)
    plt.rc("axes", labelsize=10)
    plt.rc("xtick", labelsize=8)
    plt.rc("ytick", labelsize=8)
    plt.rc("legend", fontsize=8)
    plt.rc("figure", titlesize=11)

    fig, ax = plt.subplots(1, 4, figsize=(15, 4))
    fig.suptitle(
        'K-Means Clusters - # of clusters Analysis (Left: Smart Grid, Right: Bank Loan)',
        fontsize=14)

    # Left pair of axes: smart grid. Right pair: bank loan.
    _plot_scores(ax[0], num_clusters_list, *smart_scores)
    prediction_smart = _plot_cluster_tsne(ax[1], X_train_smart,
                                          num_clusters_smart)
    _plot_scores(ax[2], num_clusters_list, *bank_scores)
    prediction_bank = _plot_cluster_tsne(ax[3], X_train_bank,
                                         num_clusters_bank)
    plt.show()

    # Boosting validation
    # Smart grid
    _report_boosting('baseline', 'smart', X_train_smart, y_train_smart,
                     X_test_smart)
    _report_boosting('DR + cluster', 'smart', X_train_smart,
                     prediction_smart, X_test_smart)
    # Bank loan
    _report_boosting('baseline', 'bank', X_train_bank, y_train_bank,
                     X_test_bank)
    _report_boosting('DR + cluster', 'bank', X_train_bank, prediction_bank,
                     X_test_bank)
n_jobs=num_threads) kmeans = kmeans.fit(X) pickle.dump(kmeans, open(f"cache/{thread_id}/{thread_id}_fitted_kmeans.pk", 'wb')) y = kmeans.labels_ if os.path.exists(f"cache/{thread_id}/{thread_id}_tsne_fitted.pk"): print(f"Loading pre-trained tsne comments.") tsne = pickle.load( open(f"cache/{thread_id}/{thread_id}_tsne_fitted.pk", 'rb')) else: tsne = TSNEVisualizer(decompose_by=tsne_svd, n_iter=tsne_iterations, verbose=2) tsne = tsne.fit(X, y) pickle.dump(tsne, open(f"cache/{thread_id}/{thread_id}_tsne_fitted.pk", 'wb')) tsne.colors = """#33000e, #660029, #bf0080, #660080, #0088ff, #00708c, #008066, #4cbf00, #735c00, #ff8800, #995200, #402200, #ff4400, #590000, #ff4073, #ff40f2, #7736d9, #101040, #233f8c, #36ced9, #36d98d, #538020, #b6bf30, #b22d2d, #733960, #8959b3, #1a2033, #46628c, #73bfe6, #1a2e33, #204020, #ffd580, #f29979, #8c5946, #cc99c2, #bfbfff, #698c8a, #eaffbf, #8c8569, #4d4439, #bfa38f, #e6acac""".split( ', ') tsne.draw(tsne.vecs, y, point_annotations=[ f"{document}\n{author}" for document, author in zip(documents, authors) ]) with open(f'cache/{thread_id}/{thread_id}_clustered.tsv', 'w') as file: csv_writer = csv.writer(file, delimiter='\t',
def plotData(base, labels=None):
    """
    Draw a t-SNE plot of the bag-of-words vectors of ``base['Tweet']``.

    Parameters
    ----------
    base : mapping / DataFrame
        Must provide the tweets under the ``'Tweet'`` key.
    labels : list, optional
        Passed as ``y`` to ``TSNEVisualizer.fit``. Defaults to
        ``[-1, 0, 1]``.
        NOTE(review): ``fit`` receives this as the per-document target;
        confirm that a 3-element default is what callers intend.
    """
    # A list literal as default would be shared between calls.
    if labels is None:
        labels = [-1, 0, 1]

    vectorizer = CountVectorizer(lowercase=False)
    tweets = vectorizer.fit_transform(base['Tweet'])

    tsne = TSNEVisualizer()
    tsne.fit(tweets, labels)
    tsne.poof()
# NOTE(review): this fragment starts in the middle of a larger script; the
# nesting of the first two statements relative to the code before it is not
# visible here - confirm before relying on this layout.
color = "#000000"
colormap.append(color)
# Look up one colour per label in `mycolormap`.
for label in labels:
    big_colormap.append(mycolormap[label])
# Time the t-SNE fit together with its rendering.
t6 = time.time()
tsne = TSNEVisualizer(colormap='RdYlGn')
tsne.fit(tfidf_matrix, labels)
tsne.poof()
t7 = time.time()
print("time for TSNE and vis: " + str(t7-t6))
# NOTE(review): second poof() on the same visualizer - looks redundant, confirm.
tsne.poof()
cbar=False, fmt='g')
# (The line above closes a call whose beginning is outside this view.)

# -------------------- Visualising clusters --------------------
# Dendrogram for TF-IDF features
from scipy.cluster.hierarchy import dendrogram, linkage
np.set_printoptions(precision=6, suppress=True)
# Hierarchical clustering of the TF-IDF matrix using the 'ward' method.
H_cluster = linkage(tfidf_matrix, 'ward')
plt.title('Dendogram')
plt.xlabel('Data')
plt.ylabel('Distance bewteen data points')
dendrogram(
    H_cluster,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=13,  # value of p used by truncate_mode='lastp'
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted= True,  # to get a distribution impression in truncated branches
)
plt.show()

# Scatter plot to visualise k-means clusters
from yellowbrick.text import TSNEVisualizer
tsne = TSNEVisualizer()
# Each label value is rendered as "c<value>".
tsne.fit(tfidf_matrix, ["c{}".format(c) for c in labels])
tsne.poof()
# Build a (len(t) x number of galaxies) matrix: column g holds, for each node
# row, how often the node occurs in galaxy g divided by the galaxy's size.
matrix = np.zeros([len(t), len(liste_galaxies)])
# The context manager closes the shelf even if a lookup fails.
with shelve.open(path + '/BDs/listeGalaxies') as dirGalaxies:
    for galaxie, galaxie_id in enumerate(liste_galaxies):
        # Read the shelf entry once; every access goes back to the shelf.
        nodes = dirGalaxies[str(galaxie_id)]
        for node in nodes:
            matrix[index[node]][galaxie] += 1
        # An empty galaxy keeps its all-zero column instead of turning
        # into NaNs through a division by zero.
        if len(nodes):
            matrix[:, galaxie] = matrix[:, galaxie] / len(nodes)

# One distinct label per row.
label = np.arange(len(t))

# yellowbrick view: SVD to 15 components, then t-SNE.
tsne = TSNEVisualizer(decompose='svd', decompose_by=15)
tsne.fit(matrix, label)
print(tsne.transformer_)
tsne.poof()

# Same reduction done by hand so the embedding can be clustered.
svd = TruncatedSVD(n_components=15)
svd_matrix = svd.fit_transform(matrix)
tsne = ts.TSNE()
y = tsne.fit_transform(svd_matrix)

kmeans = Kmeans(5, 200, 0.1)
kmeans.fit(y)
# Print the row indices assigned to each cluster.
for i in range(kmeans.nb_cluster):
    print("Cluster ", i)
    print((np.where(kmeans.which_cluster == i))[0])
    print()

plt.scatter(y[:, 0], y[:, 1], c=kmeans.which_cluster.reshape(-1, 1), s=50,
            cmap='viridis')
plt.title("Resultat du clustering")
files=files, data=data, target=target, )
# (The line above closes a call whose beginning is outside this view.)

# Load the data and create document vectors
corpus = load_corpus('hobbies')
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(corpus.data)
labels = corpus.target

# Create a visualizer to simply see the vectors plotted in 2D
# (no targets are passed to fit).
tsne = TSNEVisualizer()
tsne.fit(docs)
tsne.poof()

# Create a visualizer to see how k-means clustering grouped the docs
from sklearn.cluster import KMeans
clusters = KMeans(n_clusters=5)
clusters.fit(docs)
tsne = TSNEVisualizer()
# Each cluster id is rendered as "c<id>".
tsne.fit(docs, ["c{}".format(c) for c in clusters.labels_])
tsne.poof()

# Create a visualizer to see how the classes are distributed
from sklearn.cluster import KMeans
from yellowbrick.text import TSNEVisualizer

# LDA prediction
pred_score = predict_lda(scene_docs)
result = pd.DataFrame(pred_score)

# Group the prediction scores into 10 KMeans clusters.
clusters = KMeans(n_clusters=10).fit(result.values)

# Show the scores in 2-D, labelled "c<cluster id>".
plt.figure(figsize=(10,10))
tsne = TSNEVisualizer()
tsne.fit(result.values, list(map("c{}".format, clusters.labels_)))
tsne.poof()
def load_corpus():
    """Return the ``Corpus`` built from ``all_posts01.txt``."""
    return Corpus("all_posts01.txt")


corpus = load_corpus()
#tfidf = TfidfVectorizer(stop_words='english')
from sklearn.cluster import KMeans
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, use_idf=True)
#transformer = TfidfTransformer()
#tfidf = make_pipeline(hasher,transformer)
docs = vectorizer.fit_transform(corpus.documents)
print(docs)

true_k = 500
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(docs)

print("Top terms per cluster:")
# Feature indices per centroid, sorted by descending weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back for releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

tsne = TSNEVisualizer(labels=["documents"])
tsne.fit(docs)
tsne.poof()
from sklearn.feature_extraction.text import CountVectorizer

# One binary-style column per ingredient: the ingredient lists are joined
# with ',' and split again by the custom tokenizer.
vect = CountVectorizer(tokenizer=lambda x: [i.strip() for i in x.split(',')],
                       lowercase=False)
dummies = vect.fit_transform(df['ingredients'].apply(','.join))

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back for releases that predate it. The vocabulary
# is fetched once instead of twice.
if hasattr(vect, "get_feature_names_out"):
    vocabulary = list(vect.get_feature_names_out())
else:
    vocabulary = vect.get_feature_names()

df = pd.DataFrame(dummies.todense(), columns=vocabulary)
print("Vocab Length: ", len(vocabulary))
print("All Data Shape: ", df.shape)
df.index = df_index
# NOTE(review): df.shape[0] is the row count - confirm the label is intended.
print("Number of Predictors: ", df.shape[0])
df.head()

# Create the visualizer and draw the vectors
plt.figure(figsize=[15, 9])
tsne = TSNEVisualizer()
# Only the first 7000 training rows are plotted.
tsne.fit(df.loc[traindex, :][:7000], y[:7000])
tsne.poof()

X = df.loc[traindex, :]
print("Number of Cuisine Types: ", y.nunique())
print("X Shape: ", X.shape)
test_df = df.loc[testdex, :]
print("Test DF Shape: ", test_df.shape)
# Drop the name for the full frame and trigger a collection.
del df
gc.collect()

LogisticRegression().get_params().keys()

model = LogisticRegression(multi_class='ovr')
score = cross_validate(model, X, y, return_train_score=False)
score["test_score"].mean()
def TSNE_graph(X_train, y_train):
    """
    Display a t-SNE projection of ``X_train`` labelled by ``y_train``.

    Parameters
    ----------
    X_train : matrix
        Vectors passed as ``X`` to ``TSNEVisualizer.fit``.
    y_train : array-like
        Labels passed as ``y`` to ``TSNEVisualizer.fit``.
    """
    visualizer = TSNEVisualizer()
    visualizer.fit(X_train, y_train)
    visualizer.poof()